From bf668e99662d0692faa3ad3e57b10ad5d890c722 Mon Sep 17 00:00:00 2001
From: Azim Afroozeh <afroozeh3@gmail.com>
Date: Thu, 9 May 2024 19:11:25 +0200
Subject: [PATCH] init V_0_1

---
 .github/workflows/CI.yaml                     |   165 +
 .gitignore                                    |     6 +
 CMakeLists.txt                                |   112 +
 LICENSE                                       |    21 +
 README.md                                     |    40 +
 crystal-opt/CMakeLists.txt                    |     3 +
 crystal-opt/README.md                         |    29 +
 crystal-opt/src/CMakeLists.txt                |    43 +
 crystal-opt/src/crystal/crystal.cuh           |    32 +
 crystal-opt/src/crystal/join.cuh              |   333 +
 crystal-opt/src/crystal/load.cuh              |   166 +
 crystal-opt/src/crystal/pred.cuh              |   357 +
 crystal-opt/src/crystal/reduce.cuh            |    75 +
 crystal-opt/src/crystal/store.cuh             |   120 +
 crystal-opt/src/crystal/term.cuh              |    33 +
 crystal-opt/src/ops/join.cu                   |   242 +
 crystal-opt/src/ops/project.cu                |   198 +
 crystal-opt/src/ops/utils/generator.h         |   399 +
 crystal-opt/src/ops/utils/gpu_utils.h         |    57 +
 crystal-opt/src/ssb/all.cu                    |  2734 ++
 crystal-opt/src/ssb/gpu_utils.h               |    57 +
 crystal-opt/src/ssb/q11.cu                    |   204 +
 crystal-opt/src/ssb/q12.cu                    |   206 +
 crystal-opt/src/ssb/q13.cu                    |   207 +
 crystal-opt/src/ssb/q21.cu                    |   336 +
 crystal-opt/src/ssb/q22.cu                    |   334 +
 crystal-opt/src/ssb/q23.cu                    |   326 +
 crystal-opt/src/ssb/q31.cu                    |   349 +
 crystal-opt/src/ssb/q32.cu                    |   343 +
 crystal-opt/src/ssb/q33.cu                    |   317 +
 crystal-opt/src/ssb/q34.cu                    |   365 +
 crystal-opt/src/ssb/q41.cu                    |   438 +
 crystal-opt/src/ssb/q42.cu                    |   411 +
 crystal-opt/src/ssb/q43.cu                    |   405 +
 crystal-opt/src/ssb/ssb_utils.h               |   177 +
 crystal/CMakeLists.txt                        |     3 +
 crystal/LICENSE                               |    21 +
 crystal/README.md                             |    79 +
 crystal/src/CMakeLists.txt                    |    43 +
 crystal/src/crystal/crystal.cuh               |     9 +
 crystal/src/crystal/join.cuh                  |   311 +
 crystal/src/crystal/load.cuh                  |    97 +
 crystal/src/crystal/pred.cuh                  |   335 +
 crystal/src/crystal/reduce.cuh                |    53 +
 crystal/src/crystal/store.cuh                 |    98 +
 crystal/src/ops/join.cu                       |   220 +
 crystal/src/ops/project.cu                    |   176 +
 crystal/src/ops/utils/generator.h             |   377 +
 crystal/src/ops/utils/gpu_utils.h             |    35 +
 crystal/src/ssb/gpu_utils.h                   |    35 +
 crystal/src/ssb/q11.cu                        |   168 +
 crystal/src/ssb/q12.cu                        |   167 +
 crystal/src/ssb/q13.cu                        |   167 +
 crystal/src/ssb/q21.cu                        |   286 +
 crystal/src/ssb/q22.cu                        |   286 +
 crystal/src/ssb/q23.cu                        |   279 +
 crystal/src/ssb/q31.cu                        |   296 +
 crystal/src/ssb/q32.cu                        |   290 +
 crystal/src/ssb/q33.cu                        |   291 +
 crystal/src/ssb/q34.cu                        |   316 +
 crystal/src/ssb/q41.cu                        |   371 +
 crystal/src/ssb/q42.cu                        |   346 +
 crystal/src/ssb/q43.cu                        |   343 +
 crystal/src/ssb/ssb_utils.h                   |   107 +
 data/README.md                                |    17 +
 data/result_of_queries/q11                    |     1 +
 data/result_of_queries/q21                    |   281 +
 data/result_of_queries/q31                    |   151 +
 data/result_of_queries/q41                    |    36 +
 data/ssb/.gitignore                           |     4 +
 data/ssb/SSB.md                               |   372 +
 data/ssb/dbgen/.gitignore                     |     6 +
 data/ssb/dbgen/BUGS                           |   987 +
 data/ssb/dbgen/CHANGES                        |    33 +
 data/ssb/dbgen/HISTORY                        |   535 +
 data/ssb/dbgen/PORTING.NOTES                  |   220 +
 data/ssb/dbgen/README                         |    88 +
 data/ssb/dbgen/TPCH_README                    |   425 +
 data/ssb/dbgen/bcd2.c                         |   237 +
 data/ssb/dbgen/bcd2.h                         |    11 +
 data/ssb/dbgen/bcd2.o                         |   Bin 0 -> 4536 bytes
 data/ssb/dbgen/bm_utils.c                     |   589 +
 data/ssb/dbgen/bm_utils.o                     |   Bin 0 -> 12856 bytes
 data/ssb/dbgen/build.c                        |   800 +
 data/ssb/dbgen/build.o                        |   Bin 0 -> 23320 bytes
 data/ssb/dbgen/config.h                       |   179 +
 data/ssb/dbgen/dists.dss                      |   817 +
 data/ssb/dbgen/driver.c                       |  1144 +
 data/ssb/dbgen/driver.o                       |   Bin 0 -> 41400 bytes
 data/ssb/dbgen/dss.ddl                        |    70 +
 data/ssb/dbgen/dss.h                          |   610 +
 data/ssb/dbgen/dss.ri                         |   100 +
 data/ssb/dbgen/dsstypes.h                     |   312 +
 data/ssb/dbgen/history.html                   |   586 +
 data/ssb/dbgen/load_stub.c                    |   281 +
 data/ssb/dbgen/load_stub.o                    |   Bin 0 -> 6680 bytes
 data/ssb/dbgen/makefile                       |   127 +
 data/ssb/dbgen/makefile.suite                 |   127 +
 data/ssb/dbgen/makefile_win                   |    85 +
 data/ssb/dbgen/permute.c                      |   175 +
 data/ssb/dbgen/permute.h                      |    47 +
 data/ssb/dbgen/permute.o                      |   Bin 0 -> 3248 bytes
 data/ssb/dbgen/print.c                        |  1006 +
 data/ssb/dbgen/print.o                        |   Bin 0 -> 19760 bytes
 data/ssb/dbgen/qgen                           |   Bin 0 -> 78240 bytes
 data/ssb/dbgen/qgen.c                         |   469 +
 data/ssb/dbgen/qgen.o                         |   Bin 0 -> 33552 bytes
 data/ssb/dbgen/rnd.c                          |   262 +
 data/ssb/dbgen/rnd.h                          |    80 +
 data/ssb/dbgen/rnd.o                          |   Bin 0 -> 10608 bytes
 data/ssb/dbgen/shared.h                       |   140 +
 data/ssb/dbgen/speed_seed.c                   |   325 +
 data/ssb/dbgen/speed_seed.o                   |   Bin 0 -> 7776 bytes
 data/ssb/dbgen/tags                           |  1078 +
 data/ssb/dbgen/text.c                         |   313 +
 data/ssb/dbgen/text.o                         |   Bin 0 -> 4312 bytes
 data/ssb/dbgen/tpcd.h                         |   103 +
 data/ssb/dbgen/varsub.c                       |   314 +
 data/ssb/dbgen/varsub.o                       |   Bin 0 -> 17936 bytes
 data/ssb/loader/.metadata                     |   913 +
 data/ssb/loader/Makefile                      |    17 +
 data/ssb/loader/columnSort.c                  |   302 +
 data/ssb/loader/convert.py                    |   106 +
 data/ssb/loader/convert_old.py                |   102 +
 data/ssb/loader/dict.c                        |   235 +
 data/ssb/loader/include/common.h              |   245 +
 data/ssb/loader/include/schema.h              |    77 +
 data/ssb/loader/load.c                        |  1091 +
 data/ssb/loader/load_modified.c               |  1096 +
 data/ssb/loader/rle.c                         |   151 +
 data/ssb/loader/soa.c                         |    89 +
 data/ssb/loader/sort.py                       |    43 +
 data/ssb/loader/sort_other_way.py             |    43 +
 data/ssb/queries/original/load.sql            |     5 +
 data/ssb/queries/original/q11.sql             |     7 +
 data/ssb/queries/original/q12.sql             |     7 +
 data/ssb/queries/original/q13.sql             |     8 +
 data/ssb/queries/original/q21.sql             |    10 +
 data/ssb/queries/original/q22.sql             |    11 +
 data/ssb/queries/original/q23.sql             |    10 +
 data/ssb/queries/original/q31.sql             |    10 +
 data/ssb/queries/original/q32.sql             |    11 +
 data/ssb/queries/original/q33.sql             |    13 +
 data/ssb/queries/original/q34.sql             |    13 +
 data/ssb/queries/original/q41.sql             |    12 +
 data/ssb/queries/original/q42.sql             |    14 +
 data/ssb/queries/original/q43.sql             |    13 +
 data/ssb/queries/original/schema.sql          |    77 +
 data/ssb/queries/transformed/load.sql         |     5 +
 data/ssb/queries/transformed/p1.sql           |    10 +
 data/ssb/queries/transformed/q11.sql          |     6 +
 data/ssb/queries/transformed/q12.sql          |     7 +
 data/ssb/queries/transformed/q13.sql          |     9 +
 data/ssb/queries/transformed/q21.sql          |     9 +
 data/ssb/queries/transformed/q22.sql          |    10 +
 data/ssb/queries/transformed/q23.sql          |     9 +
 data/ssb/queries/transformed/q31.sql          |    10 +
 data/ssb/queries/transformed/q32.sql          |    10 +
 data/ssb/queries/transformed/q33.sql          |    10 +
 data/ssb/queries/transformed/q34.sql          |    10 +
 data/ssb/queries/transformed/q41.sql          |    11 +
 data/ssb/queries/transformed/q42.sql          |    12 +
 data/ssb/queries/transformed/q43.sql          |    12 +
 data/ssb/queries/transformed/schema.sql       |    72 +
 data/ssb/queries/transformed/schema_no_pk.sql |    72 +
 data/util.py                                  |    80 +
 fastlanes/CMakeLists.txt                      |     8 +
 fastlanes/example/CMakeLists.txt              |     6 +
 fastlanes/example/fastlanes_bench_bitpack.cu  |    86 +
 fastlanes/example/fastlanes_bench_delta.cu    |   194 +
 fastlanes/generate.py                         |    15 +
 fastlanes/generated/CMakeLists.txt            |     1 +
 fastlanes/generated/cuda/CMakeLists.txt       |     2 +
 .../cuda/fused_t32_uf1/CMakeLists.txt         |    38 +
 .../cuda_fused_t32_1024_uf1_unpack_bench.cu   |  1047 +
 .../cuda_fused_t32_1024_uf1_unpack_helper.hpp |  2894 ++
 .../cuda_fused_t32_1024_uf1_unpack_src.cu     |     0
 .../cuda_fused_t32_1024_uf1_unpack_test.cu    |   366 +
 .../generated/cuda/fused_t32_uf1/unpack.cmake |    19 +
 .../cuda/normal_t32_uf1/CMakeLists.txt        |    38 +
 .../cuda_normal_t32_1024_uf1_unpack_bench.cu  |  1047 +
 ...cuda_normal_t32_1024_uf1_unpack_helper.hpp |  2899 ++
 .../cuda_normal_t32_1024_uf1_unpack_src.cu    |     0
 .../cuda_normal_t32_1024_uf1_unpack_test.cu   |   366 +
 .../cuda/normal_t32_uf1/unpack.cmake          |    19 +
 fastlanes/generated/generated_files.txt       |     8 +
 fastlanes/generated_files.txt                 |     4 +
 fastlanes/src/CMakeLists.txt                  |    57 +
 fastlanes/src/bitpack_register.cu             |   144 +
 fastlanes/src/bitpack_shared_memory.cu        |   148 +
 fastlanes/src/delta_global_memory.cu          |   160 +
 fastlanes/src/delta_shared_memory.cu          |   180 +
 fastlanes/src/fastlanes_gpu.cpp               |     3 +
 fastlanes/src/include/common.cuh              |    12 +
 fastlanes/src/include/crystal-opt/crystal.cuh |    32 +
 fastlanes/src/include/crystal-opt/join.cuh    |   334 +
 fastlanes/src/include/crystal-opt/load.cuh    |   147 +
 fastlanes/src/include/crystal-opt/pred.cuh    |   459 +
 fastlanes/src/include/crystal-opt/reduce.cuh  |    75 +
 fastlanes/src/include/crystal-opt/store.cuh   |   120 +
 fastlanes/src/include/crystal-opt/term.cuh    |    33 +
 fastlanes/src/include/crystal/crystal.cuh     |     9 +
 fastlanes/src/include/crystal/join.cuh        |   275 +
 fastlanes/src/include/crystal/load.cuh        |   210 +
 fastlanes/src/include/crystal/pred.cuh        |   246 +
 fastlanes/src/include/crystal/reduce.cuh      |    45 +
 fastlanes/src/include/crystal/store.cuh       |    82 +
 fastlanes/src/include/crystal/term.cuh        |    33 +
 fastlanes/src/include/crystal_ssb_utils.h     |   136 +
 fastlanes/src/include/debug.cuh               |    26 +
 fastlanes/src/include/debug.hpp               |    94 +
 fastlanes/src/include/error.cuh               |    26 +
 fastlanes/src/include/fastlanes.cuh           |    12 +
 fastlanes/src/include/fastlanes/join.cuh      |    82 +
 fastlanes/src/include/fastlanes/pred.cuh      |     0
 fastlanes/src/include/fls_gen/macros.hpp      |     4 +
 fastlanes/src/include/fls_gen/pack/pack.hpp   |    27 +
 fastlanes/src/include/fls_gen/rle/rle.hpp     |    45 +
 fastlanes/src/include/fls_gen/rsum/rsum.cuh   |   107 +
 .../include/fls_gen/transpose/transpose.hpp   |    24 +
 .../include/fls_gen/unpack/hardcoded_16.cuh   |  1276 +
 .../src/include/fls_gen/unpack/unpack.cuh     |  3451 ++
 .../src/include/fls_gen/unpack/unpack.hpp     |    13 +
 .../include/fls_gen/unpack/unpack_fused.cuh   |  3461 ++
 .../src/include/fls_gen/unrsum/unrsum.hpp     |    58 +
 fastlanes/src/include/gpu_utils.h             |    93 +
 fastlanes/src/include/query/query_2.hpp       |    37 +
 fastlanes/src/include/query/query_21.hpp      |   295 +
 fastlanes/src/include/query/query_3.hpp       |    44 +
 fastlanes/src/include/query/query_31.hpp      |   165 +
 fastlanes/src/include/query/query_4.hpp       |    37 +
 fastlanes/src/include/query/query_41.hpp      |    50 +
 fastlanes/src/include/ssb_utils.h             |   248 +
 fastlanes/src/include/util.cuh                |    92 +
 fastlanes/src/pack.cpp                        | 29910 ++++++++++++++++
 fastlanes/src/ssb/READMe.md                   |    33 +
 fastlanes/src/ssb/compress_ssb.cu             |   530 +
 fastlanes/src/ssb/compress_ssb_sorted.cu      |   533 +
 fastlanes/src/ssb/fls_q11.cu                  |   241 +
 fastlanes/src/ssb/fls_q11_bitpacked_opt_v2.cu |   233 +
 fastlanes/src/ssb/fls_q11_bitpacked_opt_v3.cu |   207 +
 fastlanes/src/ssb/fls_q11_bitpacked_opt_v4.cu |   194 +
 fastlanes/src/ssb/fls_q11_bp_crystal_opt.cu   |   264 +
 fastlanes/src/ssb/fls_q21.cu                  |   482 +
 fastlanes/src/ssb/fls_q21_bitpacked_opt_v4.cu |   394 +
 fastlanes/src/ssb/fls_q31.cu                  |   626 +
 fastlanes/src/ssb/fls_q31_bitpacked_opt_v5.cu |   393 +
 fastlanes/src/ssb/fls_q41.cu                  |   603 +
 fastlanes/src/ssb/fls_q41_bitpacked_opt_v3.cu |   531 +
 fastlanes/src/ssb/fls_q41_bitpacked_opt_v4.cu |   538 +
 fastlanes/src/test_g.cu                       |   209 +
 fastlanes/src/tmp/fls_q41_bitpacked_opt_v2.cu |   485 +
 fastlanes/src/transpose.cpp                   |  8215 +++++
 fastlanes/src/unrsum.cpp                      |   523 +
 include/cub/test/CMakeLists.txt               |   367 +
 include/cub/test/README.md                    |   125 +
 include/cub/test/bfloat16.h                   |   249 +
 include/cub/test/c2h/custom_type.cuh          |   200 +
 include/cub/test/c2h/generators.cu            |   417 +
 include/cub/test/c2h/generators.cuh           |   103 +
 include/cub/test/catch2_runner.cu             |     3 +
 .../catch2_test_block_adjacent_difference.cu  |   425 +
 .../cub/test/catch2_test_block_histogram.cu   |   216 +
 include/cub/test/catch2_test_block_load.cu    |   326 +
 .../cub/test/catch2_test_block_merge_sort.cu  |   520 +
 .../cub/test/catch2_test_block_radix_sort.cu  |   394 +
 .../cub/test/catch2_test_block_radix_sort.cuh |   457 +
 .../catch2_test_block_radix_sort_custom.cu    |  1060 +
 include/cub/test/catch2_test_block_reduce.cu  |   362 +
 .../catch2_test_block_run_length_decode.cu    |   638 +
 include/cub/test/catch2_test_block_scan.cu    |   536 +
 include/cub/test/catch2_test_block_shuffle.cu |   427 +
 include/cub/test/catch2_test_block_store.cu   |   327 +
 include/cub/test/catch2_test_cdp_helper.h     |   170 +
 include/cub/test/catch2_test_cdp_wrapper.cu   |   229 +
 .../catch2_test_device_decoupled_look_back.cu |   168 +
 .../catch2_test_device_radix_sort_custom.cu   |  1693 +
 include/cub/test/catch2_test_helper.h         |   206 +
 include/cub/test/catch2_test_printing.cu      |    36 +
 .../cub/test/catch2_test_radix_operations.cu  |   686 +
 include/cub/test/catch2_test_util_type.cu     |    70 +
 include/cub/test/catch2_test_warp_exchange.cu |   354 +
 include/cub/test/catch2_test_warp_load.cu     |   387 +
 include/cub/test/catch2_test_warp_mask.cu     |   108 +
 .../cub/test/catch2_test_warp_merge_sort.cu   |   594 +
 include/cub/test/catch2_test_warp_reduce.cu   |   608 +
 include/cub/test/catch2_test_warp_scan.cu     |   689 +
 include/cub/test/catch2_test_warp_store.cu    |   314 +
 include/cub/test/cmake/CMakeLists.txt         |    24 +
 .../cub/test/cmake/check_source_files.cmake   |   178 +
 .../test/cmake/test_install/CMakeLists.txt    |    93 +
 include/cub/test/fill_striped.cuh             |   163 +
 include/cub/test/half.h                       |   345 +
 include/cub/test/link_a.cu                    |    11 +
 include/cub/test/link_b.cu                    |    11 +
 include/cub/test/link_main.cpp                |    10 +
 include/cub/test/mersenne.h                   |   162 +
 include/cub/test/test_allocator.cu            |   452 +
 include/cub/test/test_block_radix_rank.cu     |   343 +
 include/cub/test/test_cdp_variant_state.cu    |    34 +
 .../test/test_device_adjacent_difference.cu   |   701 +
 include/cub/test/test_device_batch_copy.cu    |   523 +
 include/cub/test/test_device_batch_memcpy.cu  |   733 +
 include/cub/test/test_device_histogram.cu     |  1684 +
 include/cub/test/test_device_merge_sort.cu    |   362 +
 include/cub/test/test_device_radix_sort.cu    |  2251 ++
 include/cub/test/test_device_reduce.cu        |  1916 +
 include/cub/test/test_device_reduce_by_key.cu |   747 +
 .../cub/test/test_device_run_length_encode.cu |   839 +
 include/cub/test/test_device_scan.cu          |  1275 +
 include/cub/test/test_device_scan_by_key.cu   |  1099 +
 .../cub/test/test_device_segmented_sort.cu    |  1946 +
 include/cub/test/test_device_select_if.cu     |  1118 +
 include/cub/test/test_device_select_unique.cu |   616 +
 .../test/test_device_select_unique_by_key.cu  |   631 +
 include/cub/test/test_device_spmv.cu          |   594 +
 .../test/test_device_three_way_partition.cu   |   594 +
 include/cub/test/test_grid_barrier.cu         |   152 +
 include/cub/test/test_iterator.cu             |   544 +
 include/cub/test/test_iterator_deprecated.cu  |   306 +
 include/cub/test/test_namespace_wrapped.cu    |    76 +
 .../cub/test/test_temporary_storage_layout.cu |   219 +
 include/cub/test/test_thread_operators.cu     |   259 +
 include/cub/test/test_thread_sort.cu          |   150 +
 include/cub/test/test_util.h                  |  1655 +
 include/cub/test/test_util_vec.h              |   320 +
 .../T4/crystal-fls/crystal_fls_q11_sf10.txt   |   185 +
 results/T4/crystal-fls/crystal_fls_q21.txt    |  1037 +
 .../T4/crystal-fls/crystal_fls_q21_sf10.txt   |  1034 +
 .../T4/crystal-fls/crystal_fls_q31_sf10.txt   |   939 +
 .../T4/crystal-fls/crystal_fls_q41_sf10.txt   |   976 +
 results/T4/crystal-opt/crystal_opt_q21.txt    |   984 +
 .../T4/crystal-opt/crystal_opt_q21_sf10.txt   |  1008 +
 .../T4/crystal-opt/crystal_opt_q31_sf10.txt   |   913 +
 .../T4/crystal-opt/crystal_opt_q41_sf10.txt   |   952 +
 results/T4/crystal/crystal_q21.txt            |  1008 +
 results/T4/crystal/crystal_q31_sf10.txt       |   906 +
 results/T4/crystal/crystal_q41_sf10.txt       |   951 +
 .../crystal-fls/crystal_fls_q21_sf10_2.txt    |  1037 +
 .../V100/crystal-fls/crystal_fls_q31_sf10.txt |   955 +
 .../V100/crystal-fls/crystal_fls_q41_sf10.txt |   983 +
 .../crystal_opt_fls_q11_sf10.txt              |   178 +
 .../V100/crystal-opt/crystal_opt_q11_sf10.txt |   186 +
 .../V100/crystal-opt/crystal_opt_q21_sf10.txt |  1045 +
 .../V100/crystal-opt/crystal_opt_q31_sf10.txt |   926 +
 .../crystal-opt/crystal_opt_q41_sf10_v100.txt |   987 +
 results/V100/crystal/crystal_q21_sf10.txt     |  1056 +
 results/V100/crystal/crystal_q31_sf10.txt     |   920 +
 .../V100/crystal/crystal_q41_sf10_v100.txt    |   971 +
 scripts/ssb_on_duckdb.py                      |     9 +
 scripts/ssb_on_duckdb/__init__.py             |    33 +
 scripts/ssb_on_duckdb/load.py                 |    93 +
 scripts/ssb_on_duckdb/query_11.py             |    26 +
 scripts/ssb_on_duckdb/query_12.py             |    13 +
 scripts/ssb_on_duckdb/query_13.py             |    15 +
 scripts/ssb_on_duckdb/query_21.py             |    15 +
 tile_based/CMakeLists.txt                     |     1 +
 tile_based/README.md                          |     1 +
 tile_based/src/CMakeLists.txt                 |    43 +
 tile_based/src/config.hpp                     |   213 +
 tile_based/src/include/binpack_kernel.cuh     |    84 +
 tile_based/src/include/crystal/crystal.cuh    |     9 +
 tile_based/src/include/crystal/join.cuh       |   311 +
 tile_based/src/include/crystal/load.cuh       |    97 +
 tile_based/src/include/crystal/pred.cuh       |   335 +
 tile_based/src/include/crystal/reduce.cuh     |    45 +
 tile_based/src/include/crystal/store.cuh      |    98 +
 .../src/include/deltabinpack_kernel.cuh       |   103 +
 tile_based/src/include/econfig.h              |     8 +
 tile_based/src/include/kernel.cuh             |     5 +
 tile_based/src/include/rlebinpack_kernel.cuh  |   146 +
 tile_based/src/include/ssb_gpu_utils.h        |    67 +
 tile_based/src/include/ssb_utils.h            |   239 +
 tile_based/src/include/utils/gpu_utils.h      |    17 +
 tile_based/src/rlebinpack.cpp                 |   235 +
 tile_based/src/rlebinpack_kernel.cuh          |   146 +
 tile_based/src/test_match_rle.cu              |   137 +
 tile_based/src/test_perf_rle.cu               |   126 +
 tile_based/src/tile_based.cu                  |     3 +
 tile_based/src/tile_based_bench_bitpack.cu    |   182 +
 tile_based/src/tile_based_bench_bp_sum.cu     |   225 +
 tile_based/src/tile_based_bench_delta.cu      |   254 +
 tile_based/src/tile_based_bench_delta_sum.cu  |   270 +
 tile_based/src/tile_based_bench_rle.cu        |   138 +
 .../src/tile_based_bench_rle_all_memory.cu    |   327 +
 tile_based/src/tile_based_binpack_query_11.cu |   281 +
 .../src/tile_based_bitpack_shared_memory.cu   |   256 +
 tool/CMakeLists.txt                           |     1 +
 tool/device_query.cu                          |    87 +
 toolchains/T4.cmake                           |     7 +
 toolchains/gtx1080.cmake                      |     7 +
 391 files changed, 162236 insertions(+)
 create mode 100644 .github/workflows/CI.yaml
 create mode 100644 .gitignore
 create mode 100644 CMakeLists.txt
 create mode 100644 LICENSE
 create mode 100644 README.md
 create mode 100644 crystal-opt/CMakeLists.txt
 create mode 100644 crystal-opt/README.md
 create mode 100644 crystal-opt/src/CMakeLists.txt
 create mode 100644 crystal-opt/src/crystal/crystal.cuh
 create mode 100644 crystal-opt/src/crystal/join.cuh
 create mode 100644 crystal-opt/src/crystal/load.cuh
 create mode 100644 crystal-opt/src/crystal/pred.cuh
 create mode 100644 crystal-opt/src/crystal/reduce.cuh
 create mode 100644 crystal-opt/src/crystal/store.cuh
 create mode 100644 crystal-opt/src/crystal/term.cuh
 create mode 100644 crystal-opt/src/ops/join.cu
 create mode 100644 crystal-opt/src/ops/project.cu
 create mode 100644 crystal-opt/src/ops/utils/generator.h
 create mode 100644 crystal-opt/src/ops/utils/gpu_utils.h
 create mode 100644 crystal-opt/src/ssb/all.cu
 create mode 100644 crystal-opt/src/ssb/gpu_utils.h
 create mode 100644 crystal-opt/src/ssb/q11.cu
 create mode 100644 crystal-opt/src/ssb/q12.cu
 create mode 100644 crystal-opt/src/ssb/q13.cu
 create mode 100644 crystal-opt/src/ssb/q21.cu
 create mode 100644 crystal-opt/src/ssb/q22.cu
 create mode 100644 crystal-opt/src/ssb/q23.cu
 create mode 100644 crystal-opt/src/ssb/q31.cu
 create mode 100644 crystal-opt/src/ssb/q32.cu
 create mode 100644 crystal-opt/src/ssb/q33.cu
 create mode 100644 crystal-opt/src/ssb/q34.cu
 create mode 100644 crystal-opt/src/ssb/q41.cu
 create mode 100644 crystal-opt/src/ssb/q42.cu
 create mode 100644 crystal-opt/src/ssb/q43.cu
 create mode 100644 crystal-opt/src/ssb/ssb_utils.h
 create mode 100644 crystal/CMakeLists.txt
 create mode 100644 crystal/LICENSE
 create mode 100644 crystal/README.md
 create mode 100644 crystal/src/CMakeLists.txt
 create mode 100644 crystal/src/crystal/crystal.cuh
 create mode 100644 crystal/src/crystal/join.cuh
 create mode 100644 crystal/src/crystal/load.cuh
 create mode 100644 crystal/src/crystal/pred.cuh
 create mode 100644 crystal/src/crystal/reduce.cuh
 create mode 100644 crystal/src/crystal/store.cuh
 create mode 100644 crystal/src/ops/join.cu
 create mode 100644 crystal/src/ops/project.cu
 create mode 100644 crystal/src/ops/utils/generator.h
 create mode 100644 crystal/src/ops/utils/gpu_utils.h
 create mode 100644 crystal/src/ssb/gpu_utils.h
 create mode 100644 crystal/src/ssb/q11.cu
 create mode 100644 crystal/src/ssb/q12.cu
 create mode 100644 crystal/src/ssb/q13.cu
 create mode 100644 crystal/src/ssb/q21.cu
 create mode 100644 crystal/src/ssb/q22.cu
 create mode 100644 crystal/src/ssb/q23.cu
 create mode 100644 crystal/src/ssb/q31.cu
 create mode 100644 crystal/src/ssb/q32.cu
 create mode 100644 crystal/src/ssb/q33.cu
 create mode 100644 crystal/src/ssb/q34.cu
 create mode 100644 crystal/src/ssb/q41.cu
 create mode 100644 crystal/src/ssb/q42.cu
 create mode 100644 crystal/src/ssb/q43.cu
 create mode 100644 crystal/src/ssb/ssb_utils.h
 create mode 100644 data/README.md
 create mode 100644 data/result_of_queries/q11
 create mode 100644 data/result_of_queries/q21
 create mode 100644 data/result_of_queries/q31
 create mode 100644 data/result_of_queries/q41
 create mode 100644 data/ssb/.gitignore
 create mode 100644 data/ssb/SSB.md
 create mode 100644 data/ssb/dbgen/.gitignore
 create mode 100644 data/ssb/dbgen/BUGS
 create mode 100644 data/ssb/dbgen/CHANGES
 create mode 100644 data/ssb/dbgen/HISTORY
 create mode 100644 data/ssb/dbgen/PORTING.NOTES
 create mode 100644 data/ssb/dbgen/README
 create mode 100644 data/ssb/dbgen/TPCH_README
 create mode 100644 data/ssb/dbgen/bcd2.c
 create mode 100644 data/ssb/dbgen/bcd2.h
 create mode 100644 data/ssb/dbgen/bcd2.o
 create mode 100644 data/ssb/dbgen/bm_utils.c
 create mode 100644 data/ssb/dbgen/bm_utils.o
 create mode 100644 data/ssb/dbgen/build.c
 create mode 100644 data/ssb/dbgen/build.o
 create mode 100644 data/ssb/dbgen/config.h
 create mode 100644 data/ssb/dbgen/dists.dss
 create mode 100644 data/ssb/dbgen/driver.c
 create mode 100644 data/ssb/dbgen/driver.o
 create mode 100644 data/ssb/dbgen/dss.ddl
 create mode 100644 data/ssb/dbgen/dss.h
 create mode 100644 data/ssb/dbgen/dss.ri
 create mode 100644 data/ssb/dbgen/dsstypes.h
 create mode 100644 data/ssb/dbgen/history.html
 create mode 100644 data/ssb/dbgen/load_stub.c
 create mode 100644 data/ssb/dbgen/load_stub.o
 create mode 100644 data/ssb/dbgen/makefile
 create mode 100644 data/ssb/dbgen/makefile.suite
 create mode 100644 data/ssb/dbgen/makefile_win
 create mode 100644 data/ssb/dbgen/permute.c
 create mode 100644 data/ssb/dbgen/permute.h
 create mode 100644 data/ssb/dbgen/permute.o
 create mode 100644 data/ssb/dbgen/print.c
 create mode 100644 data/ssb/dbgen/print.o
 create mode 100755 data/ssb/dbgen/qgen
 create mode 100644 data/ssb/dbgen/qgen.c
 create mode 100644 data/ssb/dbgen/qgen.o
 create mode 100644 data/ssb/dbgen/rnd.c
 create mode 100644 data/ssb/dbgen/rnd.h
 create mode 100644 data/ssb/dbgen/rnd.o
 create mode 100644 data/ssb/dbgen/shared.h
 create mode 100644 data/ssb/dbgen/speed_seed.c
 create mode 100644 data/ssb/dbgen/speed_seed.o
 create mode 100644 data/ssb/dbgen/tags
 create mode 100644 data/ssb/dbgen/text.c
 create mode 100644 data/ssb/dbgen/text.o
 create mode 100644 data/ssb/dbgen/tpcd.h
 create mode 100644 data/ssb/dbgen/varsub.c
 create mode 100644 data/ssb/dbgen/varsub.o
 create mode 100644 data/ssb/loader/.metadata
 create mode 100644 data/ssb/loader/Makefile
 create mode 100644 data/ssb/loader/columnSort.c
 create mode 100644 data/ssb/loader/convert.py
 create mode 100644 data/ssb/loader/convert_old.py
 create mode 100644 data/ssb/loader/dict.c
 create mode 100644 data/ssb/loader/include/common.h
 create mode 100644 data/ssb/loader/include/schema.h
 create mode 100644 data/ssb/loader/load.c
 create mode 100644 data/ssb/loader/load_modified.c
 create mode 100644 data/ssb/loader/rle.c
 create mode 100644 data/ssb/loader/soa.c
 create mode 100644 data/ssb/loader/sort.py
 create mode 100644 data/ssb/loader/sort_other_way.py
 create mode 100644 data/ssb/queries/original/load.sql
 create mode 100644 data/ssb/queries/original/q11.sql
 create mode 100644 data/ssb/queries/original/q12.sql
 create mode 100644 data/ssb/queries/original/q13.sql
 create mode 100644 data/ssb/queries/original/q21.sql
 create mode 100644 data/ssb/queries/original/q22.sql
 create mode 100644 data/ssb/queries/original/q23.sql
 create mode 100644 data/ssb/queries/original/q31.sql
 create mode 100644 data/ssb/queries/original/q32.sql
 create mode 100644 data/ssb/queries/original/q33.sql
 create mode 100644 data/ssb/queries/original/q34.sql
 create mode 100644 data/ssb/queries/original/q41.sql
 create mode 100644 data/ssb/queries/original/q42.sql
 create mode 100644 data/ssb/queries/original/q43.sql
 create mode 100644 data/ssb/queries/original/schema.sql
 create mode 100644 data/ssb/queries/transformed/load.sql
 create mode 100644 data/ssb/queries/transformed/p1.sql
 create mode 100644 data/ssb/queries/transformed/q11.sql
 create mode 100644 data/ssb/queries/transformed/q12.sql
 create mode 100644 data/ssb/queries/transformed/q13.sql
 create mode 100644 data/ssb/queries/transformed/q21.sql
 create mode 100644 data/ssb/queries/transformed/q22.sql
 create mode 100644 data/ssb/queries/transformed/q23.sql
 create mode 100644 data/ssb/queries/transformed/q31.sql
 create mode 100644 data/ssb/queries/transformed/q32.sql
 create mode 100644 data/ssb/queries/transformed/q33.sql
 create mode 100644 data/ssb/queries/transformed/q34.sql
 create mode 100644 data/ssb/queries/transformed/q41.sql
 create mode 100644 data/ssb/queries/transformed/q42.sql
 create mode 100644 data/ssb/queries/transformed/q43.sql
 create mode 100644 data/ssb/queries/transformed/schema.sql
 create mode 100644 data/ssb/queries/transformed/schema_no_pk.sql
 create mode 100755 data/util.py
 create mode 100644 fastlanes/CMakeLists.txt
 create mode 100644 fastlanes/example/CMakeLists.txt
 create mode 100644 fastlanes/example/fastlanes_bench_bitpack.cu
 create mode 100644 fastlanes/example/fastlanes_bench_delta.cu
 create mode 100644 fastlanes/generate.py
 create mode 100644 fastlanes/generated/CMakeLists.txt
 create mode 100644 fastlanes/generated/cuda/CMakeLists.txt
 create mode 100644 fastlanes/generated/cuda/fused_t32_uf1/CMakeLists.txt
 create mode 100644 fastlanes/generated/cuda/fused_t32_uf1/cuda_fused_t32_1024_uf1_unpack_bench.cu
 create mode 100644 fastlanes/generated/cuda/fused_t32_uf1/cuda_fused_t32_1024_uf1_unpack_helper.hpp
 create mode 100644 fastlanes/generated/cuda/fused_t32_uf1/cuda_fused_t32_1024_uf1_unpack_src.cu
 create mode 100644 fastlanes/generated/cuda/fused_t32_uf1/cuda_fused_t32_1024_uf1_unpack_test.cu
 create mode 100644 fastlanes/generated/cuda/fused_t32_uf1/unpack.cmake
 create mode 100644 fastlanes/generated/cuda/normal_t32_uf1/CMakeLists.txt
 create mode 100644 fastlanes/generated/cuda/normal_t32_uf1/cuda_normal_t32_1024_uf1_unpack_bench.cu
 create mode 100644 fastlanes/generated/cuda/normal_t32_uf1/cuda_normal_t32_1024_uf1_unpack_helper.hpp
 create mode 100644 fastlanes/generated/cuda/normal_t32_uf1/cuda_normal_t32_1024_uf1_unpack_src.cu
 create mode 100644 fastlanes/generated/cuda/normal_t32_uf1/cuda_normal_t32_1024_uf1_unpack_test.cu
 create mode 100644 fastlanes/generated/cuda/normal_t32_uf1/unpack.cmake
 create mode 100644 fastlanes/generated/generated_files.txt
 create mode 100644 fastlanes/generated_files.txt
 create mode 100644 fastlanes/src/CMakeLists.txt
 create mode 100644 fastlanes/src/bitpack_register.cu
 create mode 100644 fastlanes/src/bitpack_shared_memory.cu
 create mode 100644 fastlanes/src/delta_global_memory.cu
 create mode 100644 fastlanes/src/delta_shared_memory.cu
 create mode 100644 fastlanes/src/fastlanes_gpu.cpp
 create mode 100644 fastlanes/src/include/common.cuh
 create mode 100644 fastlanes/src/include/crystal-opt/crystal.cuh
 create mode 100644 fastlanes/src/include/crystal-opt/join.cuh
 create mode 100644 fastlanes/src/include/crystal-opt/load.cuh
 create mode 100644 fastlanes/src/include/crystal-opt/pred.cuh
 create mode 100644 fastlanes/src/include/crystal-opt/reduce.cuh
 create mode 100644 fastlanes/src/include/crystal-opt/store.cuh
 create mode 100644 fastlanes/src/include/crystal-opt/term.cuh
 create mode 100644 fastlanes/src/include/crystal/crystal.cuh
 create mode 100644 fastlanes/src/include/crystal/join.cuh
 create mode 100644 fastlanes/src/include/crystal/load.cuh
 create mode 100644 fastlanes/src/include/crystal/pred.cuh
 create mode 100644 fastlanes/src/include/crystal/reduce.cuh
 create mode 100644 fastlanes/src/include/crystal/store.cuh
 create mode 100644 fastlanes/src/include/crystal/term.cuh
 create mode 100644 fastlanes/src/include/crystal_ssb_utils.h
 create mode 100644 fastlanes/src/include/debug.cuh
 create mode 100644 fastlanes/src/include/debug.hpp
 create mode 100644 fastlanes/src/include/error.cuh
 create mode 100644 fastlanes/src/include/fastlanes.cuh
 create mode 100644 fastlanes/src/include/fastlanes/join.cuh
 create mode 100644 fastlanes/src/include/fastlanes/pred.cuh
 create mode 100644 fastlanes/src/include/fls_gen/macros.hpp
 create mode 100644 fastlanes/src/include/fls_gen/pack/pack.hpp
 create mode 100644 fastlanes/src/include/fls_gen/rle/rle.hpp
 create mode 100644 fastlanes/src/include/fls_gen/rsum/rsum.cuh
 create mode 100644 fastlanes/src/include/fls_gen/transpose/transpose.hpp
 create mode 100644 fastlanes/src/include/fls_gen/unpack/hardcoded_16.cuh
 create mode 100644 fastlanes/src/include/fls_gen/unpack/unpack.cuh
 create mode 100644 fastlanes/src/include/fls_gen/unpack/unpack.hpp
 create mode 100644 fastlanes/src/include/fls_gen/unpack/unpack_fused.cuh
 create mode 100644 fastlanes/src/include/fls_gen/unrsum/unrsum.hpp
 create mode 100644 fastlanes/src/include/gpu_utils.h
 create mode 100644 fastlanes/src/include/query/query_2.hpp
 create mode 100644 fastlanes/src/include/query/query_21.hpp
 create mode 100644 fastlanes/src/include/query/query_3.hpp
 create mode 100644 fastlanes/src/include/query/query_31.hpp
 create mode 100644 fastlanes/src/include/query/query_4.hpp
 create mode 100644 fastlanes/src/include/query/query_41.hpp
 create mode 100644 fastlanes/src/include/ssb_utils.h
 create mode 100644 fastlanes/src/include/util.cuh
 create mode 100644 fastlanes/src/pack.cpp
 create mode 100644 fastlanes/src/ssb/READMe.md
 create mode 100644 fastlanes/src/ssb/compress_ssb.cu
 create mode 100644 fastlanes/src/ssb/compress_ssb_sorted.cu
 create mode 100644 fastlanes/src/ssb/fls_q11.cu
 create mode 100644 fastlanes/src/ssb/fls_q11_bitpacked_opt_v2.cu
 create mode 100644 fastlanes/src/ssb/fls_q11_bitpacked_opt_v3.cu
 create mode 100644 fastlanes/src/ssb/fls_q11_bitpacked_opt_v4.cu
 create mode 100644 fastlanes/src/ssb/fls_q11_bp_crystal_opt.cu
 create mode 100644 fastlanes/src/ssb/fls_q21.cu
 create mode 100644 fastlanes/src/ssb/fls_q21_bitpacked_opt_v4.cu
 create mode 100644 fastlanes/src/ssb/fls_q31.cu
 create mode 100644 fastlanes/src/ssb/fls_q31_bitpacked_opt_v5.cu
 create mode 100644 fastlanes/src/ssb/fls_q41.cu
 create mode 100644 fastlanes/src/ssb/fls_q41_bitpacked_opt_v3.cu
 create mode 100644 fastlanes/src/ssb/fls_q41_bitpacked_opt_v4.cu
 create mode 100644 fastlanes/src/test_g.cu
 create mode 100644 fastlanes/src/tmp/fls_q41_bitpacked_opt_v2.cu
 create mode 100644 fastlanes/src/transpose.cpp
 create mode 100644 fastlanes/src/unrsum.cpp
 create mode 100644 include/cub/test/CMakeLists.txt
 create mode 100644 include/cub/test/README.md
 create mode 100644 include/cub/test/bfloat16.h
 create mode 100644 include/cub/test/c2h/custom_type.cuh
 create mode 100644 include/cub/test/c2h/generators.cu
 create mode 100644 include/cub/test/c2h/generators.cuh
 create mode 100644 include/cub/test/catch2_runner.cu
 create mode 100644 include/cub/test/catch2_test_block_adjacent_difference.cu
 create mode 100644 include/cub/test/catch2_test_block_histogram.cu
 create mode 100644 include/cub/test/catch2_test_block_load.cu
 create mode 100644 include/cub/test/catch2_test_block_merge_sort.cu
 create mode 100644 include/cub/test/catch2_test_block_radix_sort.cu
 create mode 100644 include/cub/test/catch2_test_block_radix_sort.cuh
 create mode 100644 include/cub/test/catch2_test_block_radix_sort_custom.cu
 create mode 100644 include/cub/test/catch2_test_block_reduce.cu
 create mode 100644 include/cub/test/catch2_test_block_run_length_decode.cu
 create mode 100644 include/cub/test/catch2_test_block_scan.cu
 create mode 100644 include/cub/test/catch2_test_block_shuffle.cu
 create mode 100644 include/cub/test/catch2_test_block_store.cu
 create mode 100644 include/cub/test/catch2_test_cdp_helper.h
 create mode 100644 include/cub/test/catch2_test_cdp_wrapper.cu
 create mode 100644 include/cub/test/catch2_test_device_decoupled_look_back.cu
 create mode 100644 include/cub/test/catch2_test_device_radix_sort_custom.cu
 create mode 100644 include/cub/test/catch2_test_helper.h
 create mode 100644 include/cub/test/catch2_test_printing.cu
 create mode 100644 include/cub/test/catch2_test_radix_operations.cu
 create mode 100644 include/cub/test/catch2_test_util_type.cu
 create mode 100644 include/cub/test/catch2_test_warp_exchange.cu
 create mode 100644 include/cub/test/catch2_test_warp_load.cu
 create mode 100644 include/cub/test/catch2_test_warp_mask.cu
 create mode 100644 include/cub/test/catch2_test_warp_merge_sort.cu
 create mode 100644 include/cub/test/catch2_test_warp_reduce.cu
 create mode 100644 include/cub/test/catch2_test_warp_scan.cu
 create mode 100644 include/cub/test/catch2_test_warp_store.cu
 create mode 100644 include/cub/test/cmake/CMakeLists.txt
 create mode 100644 include/cub/test/cmake/check_source_files.cmake
 create mode 100644 include/cub/test/cmake/test_install/CMakeLists.txt
 create mode 100644 include/cub/test/fill_striped.cuh
 create mode 100644 include/cub/test/half.h
 create mode 100644 include/cub/test/link_a.cu
 create mode 100644 include/cub/test/link_b.cu
 create mode 100644 include/cub/test/link_main.cpp
 create mode 100644 include/cub/test/mersenne.h
 create mode 100644 include/cub/test/test_allocator.cu
 create mode 100644 include/cub/test/test_block_radix_rank.cu
 create mode 100644 include/cub/test/test_cdp_variant_state.cu
 create mode 100644 include/cub/test/test_device_adjacent_difference.cu
 create mode 100644 include/cub/test/test_device_batch_copy.cu
 create mode 100644 include/cub/test/test_device_batch_memcpy.cu
 create mode 100644 include/cub/test/test_device_histogram.cu
 create mode 100644 include/cub/test/test_device_merge_sort.cu
 create mode 100644 include/cub/test/test_device_radix_sort.cu
 create mode 100644 include/cub/test/test_device_reduce.cu
 create mode 100644 include/cub/test/test_device_reduce_by_key.cu
 create mode 100644 include/cub/test/test_device_run_length_encode.cu
 create mode 100644 include/cub/test/test_device_scan.cu
 create mode 100644 include/cub/test/test_device_scan_by_key.cu
 create mode 100644 include/cub/test/test_device_segmented_sort.cu
 create mode 100644 include/cub/test/test_device_select_if.cu
 create mode 100644 include/cub/test/test_device_select_unique.cu
 create mode 100644 include/cub/test/test_device_select_unique_by_key.cu
 create mode 100644 include/cub/test/test_device_spmv.cu
 create mode 100644 include/cub/test/test_device_three_way_partition.cu
 create mode 100644 include/cub/test/test_grid_barrier.cu
 create mode 100644 include/cub/test/test_iterator.cu
 create mode 100644 include/cub/test/test_iterator_deprecated.cu
 create mode 100644 include/cub/test/test_namespace_wrapped.cu
 create mode 100644 include/cub/test/test_temporary_storage_layout.cu
 create mode 100644 include/cub/test/test_thread_operators.cu
 create mode 100644 include/cub/test/test_thread_sort.cu
 create mode 100644 include/cub/test/test_util.h
 create mode 100644 include/cub/test/test_util_vec.h
 create mode 100644 results/T4/crystal-fls/crystal_fls_q11_sf10.txt
 create mode 100644 results/T4/crystal-fls/crystal_fls_q21.txt
 create mode 100644 results/T4/crystal-fls/crystal_fls_q21_sf10.txt
 create mode 100644 results/T4/crystal-fls/crystal_fls_q31_sf10.txt
 create mode 100644 results/T4/crystal-fls/crystal_fls_q41_sf10.txt
 create mode 100644 results/T4/crystal-opt/crystal_opt_q21.txt
 create mode 100644 results/T4/crystal-opt/crystal_opt_q21_sf10.txt
 create mode 100644 results/T4/crystal-opt/crystal_opt_q31_sf10.txt
 create mode 100644 results/T4/crystal-opt/crystal_opt_q41_sf10.txt
 create mode 100644 results/T4/crystal/crystal_q21.txt
 create mode 100644 results/T4/crystal/crystal_q31_sf10.txt
 create mode 100644 results/T4/crystal/crystal_q41_sf10.txt
 create mode 100644 results/V100/crystal-fls/crystal_fls_q21_sf10_2.txt
 create mode 100644 results/V100/crystal-fls/crystal_fls_q31_sf10.txt
 create mode 100644 results/V100/crystal-fls/crystal_fls_q41_sf10.txt
 create mode 100644 results/V100/crystal-opt-fls/crystal_opt_fls_q11_sf10.txt
 create mode 100644 results/V100/crystal-opt/crystal_opt_q11_sf10.txt
 create mode 100644 results/V100/crystal-opt/crystal_opt_q21_sf10.txt
 create mode 100644 results/V100/crystal-opt/crystal_opt_q31_sf10.txt
 create mode 100644 results/V100/crystal-opt/crystal_opt_q41_sf10_v100.txt
 create mode 100644 results/V100/crystal/crystal_q21_sf10.txt
 create mode 100644 results/V100/crystal/crystal_q31_sf10.txt
 create mode 100644 results/V100/crystal/crystal_q41_sf10_v100.txt
 create mode 100644 scripts/ssb_on_duckdb.py
 create mode 100644 scripts/ssb_on_duckdb/__init__.py
 create mode 100644 scripts/ssb_on_duckdb/load.py
 create mode 100644 scripts/ssb_on_duckdb/query_11.py
 create mode 100644 scripts/ssb_on_duckdb/query_12.py
 create mode 100644 scripts/ssb_on_duckdb/query_13.py
 create mode 100644 scripts/ssb_on_duckdb/query_21.py
 create mode 100644 tile_based/CMakeLists.txt
 create mode 100644 tile_based/README.md
 create mode 100644 tile_based/src/CMakeLists.txt
 create mode 100644 tile_based/src/config.hpp
 create mode 100644 tile_based/src/include/binpack_kernel.cuh
 create mode 100644 tile_based/src/include/crystal/crystal.cuh
 create mode 100644 tile_based/src/include/crystal/join.cuh
 create mode 100644 tile_based/src/include/crystal/load.cuh
 create mode 100644 tile_based/src/include/crystal/pred.cuh
 create mode 100644 tile_based/src/include/crystal/reduce.cuh
 create mode 100644 tile_based/src/include/crystal/store.cuh
 create mode 100644 tile_based/src/include/deltabinpack_kernel.cuh
 create mode 100644 tile_based/src/include/econfig.h
 create mode 100644 tile_based/src/include/kernel.cuh
 create mode 100644 tile_based/src/include/rlebinpack_kernel.cuh
 create mode 100644 tile_based/src/include/ssb_gpu_utils.h
 create mode 100644 tile_based/src/include/ssb_utils.h
 create mode 100644 tile_based/src/include/utils/gpu_utils.h
 create mode 100644 tile_based/src/rlebinpack.cpp
 create mode 100644 tile_based/src/rlebinpack_kernel.cuh
 create mode 100644 tile_based/src/test_match_rle.cu
 create mode 100644 tile_based/src/test_perf_rle.cu
 create mode 100644 tile_based/src/tile_based.cu
 create mode 100644 tile_based/src/tile_based_bench_bitpack.cu
 create mode 100644 tile_based/src/tile_based_bench_bp_sum.cu
 create mode 100644 tile_based/src/tile_based_bench_delta.cu
 create mode 100644 tile_based/src/tile_based_bench_delta_sum.cu
 create mode 100644 tile_based/src/tile_based_bench_rle.cu
 create mode 100644 tile_based/src/tile_based_bench_rle_all_memory.cu
 create mode 100644 tile_based/src/tile_based_binpack_query_11.cu
 create mode 100644 tile_based/src/tile_based_bitpack_shared_memory.cu
 create mode 100644 tool/CMakeLists.txt
 create mode 100644 tool/device_query.cu
 create mode 100644 toolchains/T4.cmake
 create mode 100644 toolchains/gtx1080.cmake

diff --git a/.github/workflows/CI.yaml b/.github/workflows/CI.yaml
new file mode 100644
index 0000000..aaa7359
--- /dev/null
+++ b/.github/workflows/CI.yaml
@@ -0,0 +1,165 @@
+name: CI
+run-name: ${{ github.actor }} is building
+
+on: push
+
+jobs:
+  # https://developer.nvidia.com/nvidia-development-tools-solutions-err_nvgpuctrperm-permission-issue-performance-counters
+  GPU:
+    if: github.actor == 'azimafroozeh'
+    strategy:
+      fail-fast: true
+      matrix:
+        platform: [ T4, V100 ]
+        BUILD_TYPE: [ Release ]
+        cxx: [ clang++ ]
+    runs-on: ${{ matrix.platform }}
+
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-node@v4
+
+      - name: Make directory build
+        run: mkdir ${{github.workspace}}/build
+
+      - name: Configure CMake
+        run: cmake -DFLS_BUILD_GPU=ON -DCMAKE_TOOLCHAIN_FILE=${{github.workspace}}/toolchains/T4.cmake -S ${{github.workspace}} -B ${{github.workspace}}/build
+        env:
+          CXX: ${{ matrix.cxx }}
+
+      - name: Build
+        run: cmake --build ${{github.workspace}}/build -j 8
+
+      - name: Report compression average bit width
+        run: |
+          echo "NOT Sorted"          
+          ${{github.workspace}}/build/fastlanes/src/compress_ssb
+          echo "Sorted"                    
+          ${{github.workspace}}/build/fastlanes/src/compress_ssb_sorted
+
+      - name: FLS-GPU-opt q21 V4
+        run: |
+          ${{github.workspace}}/build/fastlanes/src/fls_q21_bitpacked_opt_v4
+          echo "SORTED + FOR ON ORDERDATE"
+          ncu ${{github.workspace}}/build/fastlanes/src/fls_q21_bitpacked_opt_v4 | grep Duration
+
+      - name: FLS-GPU-opt q31 V5
+        run: |
+          ${{github.workspace}}/build/fastlanes/src/fls_q31_bitpacked_opt_v5
+          echo "SORTED + FOR ON ORDERDATE"
+          ncu ${{github.workspace}}/build/fastlanes/src/fls_q31_bitpacked_opt_v5 | grep Duration
+
+      - name: FLS-GPU-opt q41 V3 V4
+        run: |
+          ${{github.workspace}}/build/fastlanes/src/fls_q41_bitpacked_opt_v3
+          echo "SORTED + FOR ON ORDERDATE"
+          ncu ${{github.workspace}}/build/fastlanes/src/fls_q41_bitpacked_opt_v3 | grep Duration
+          
+          ${{github.workspace}}/build/fastlanes/src/fls_q41_bitpacked_opt_v4
+          echo "SORTED + FOR ON ORDERDATE and CUSTKEY"
+          ncu ${{github.workspace}}/build/fastlanes/src/fls_q41_bitpacked_opt_v4 | grep Duration
+
+      - name: Test FLS + Crystal
+        run: |
+          ${{github.workspace}}/build/fastlanes/src/fls_q11 1
+          echo "-- fls_q11 version 1 Passed!"
+          ${{github.workspace}}/build/fastlanes/src/fls_q11 2
+          echo "-- fls_q11 version 2 Passed!"
+          
+          ${{github.workspace}}/build/fastlanes/src/fls_q21 1
+          echo "-- fls_q21 version 1 Passed!"
+          ${{github.workspace}}/build/fastlanes/src/fls_q21 2
+          echo "-- fls_q21 version 2 Passed!"
+          ${{github.workspace}}/build/fastlanes/src/fls_q21 3
+          echo "-- fls_q21 version 3 Passed!"
+          
+          ${{github.workspace}}/build/fastlanes/src/fls_q31 1
+          echo "-- fls_q31 version 1 Passed!"
+          ${{github.workspace}}/build/fastlanes/src/fls_q31 2
+          echo "-- fls_q31 version 2 Passed!"
+          ${{github.workspace}}/build/fastlanes/src/fls_q31 3
+          echo "-- fls_q31 version 3 Passed!"
+          ${{github.workspace}}/build/fastlanes/src/fls_q31 4
+          echo "-- fls_q31 version 4 Passed!"
+          
+          ${{github.workspace}}/build/fastlanes/src/fls_q41 1
+          echo "-- fls_q41 version 1 Passed!"
+          ${{github.workspace}}/build/fastlanes/src/fls_q41 2
+          echo "-- fls_q41 version 2 Passed!"
+      #          ${{github.workspace}}/build/fastlanes/src/fls_q31 3
+      #          echo "-- fls_q31 version 3 Passed!"
+      #          ${{github.workspace}}/build/fastlanes/src/fls_q31 4
+      #          echo "-- fls_q31 version 4 Passed!"
+      #          ${{github.workspace}}/build/fastlanes/src/fls_q31 4
+      #          echo "-- fls_q31 version 4 Passed!"
+
+      - name: NCU FLS + Crystal
+        run: |
+          echo "FLS Q11 version 1 : FastLanes-GPU"
+          ncu ${{github.workspace}}/build/fastlanes/src/fls_q11 1 | grep Duration
+          echo "FLS Q11 version 2 : FLS-GPU-opt"
+          ncu ${{github.workspace}}/build/fastlanes/src/fls_q11 2 | grep Duration
+          
+          echo "FLS Q21 version 1 : FastLanes-GPU"
+          ncu ${{github.workspace}}/build/fastlanes/src/fls_q21 1 | grep Duration
+          echo "FLS Q21 version 2 : FLS-GPU-opt"
+          ncu ${{github.workspace}}/build/fastlanes/src/fls_q21 2 | grep Duration
+          echo "FLS Q21 version 3 : FLS-GPU-opt + predicate load on uncompressed data"
+          ncu ${{github.workspace}}/build/fastlanes/src/fls_q21 3 | grep Duration
+          
+          echo "FLS Q31 version 1 : FastLanes-GPU"
+          ncu ${{github.workspace}}/build/fastlanes/src/fls_q31 1 | grep Duration
+          echo "FLS Q31 version 2 : Version 1 >> combination of shared + register"
+          ncu ${{github.workspace}}/build/fastlanes/src/fls_q31 2 | grep Duration
+          echo "FLS Q31 version 3 : 8 value at a time"
+          ncu ${{github.workspace}}/build/fastlanes/src/fls_q31 3 | grep Duration
+          echo "FLS Q31 version 4 : v3 + predicate load on uncompressed data"
+          ncu ${{github.workspace}}/build/fastlanes/src/fls_q31 4 | grep Duration
+          
+          echo "FLS Q41 version 1 : FastLanes-GPU"
+          ncu ${{github.workspace}}/build/fastlanes/src/fls_q41 1 | grep Duration
+          echo "FLS Q41 version 2 : 8 value at a time + predicate load on uncompressed data"
+          ncu ${{github.workspace}}/build/fastlanes/src/fls_q41 2 | grep Duration
+      #          echo "FLS Q31 version 3 : 8 value at a time"
+      #          ncu ${{github.workspace}}/build/fastlanes/src/fls_q31 3 | grep Duration
+      #          echo "FLS Q31 version 4 : v3 + predicate load on uncompressed data"
+      #          ncu ${{github.workspace}}/build/fastlanes/src/fls_q31 4 | grep Duration
+
+      - name: NCU crystal
+        run: |
+          echo "-- crystal q11"
+          ncu ${{github.workspace}}/build/crystal/src/crystal_q11 | grep Duration
+          echo "-- crystal q21"
+          ncu ${{github.workspace}}/build/crystal/src/crystal_q21 | grep Duration
+          echo "-- crystal q31"
+          ncu ${{github.workspace}}/build/crystal/src/crystal_q31 | grep Duration
+          echo "-- crystal q41"
+          ncu ${{github.workspace}}/build/crystal/src/crystal_q41 | grep Duration
+
+      - name: NCU crystal OPT
+        run: |
+          echo "-- crystal-opt q11"
+          ncu ${{github.workspace}}/build/crystal-opt/src/crystal_opt_q11 | grep Duration
+          echo "-- crystal-opt q21"
+          ncu ${{github.workspace}}/build/crystal-opt/src/crystal_opt_q21 | grep Duration
+          echo "-- crystal-opt q31"
+          ncu ${{github.workspace}}/build/crystal-opt/src/crystal_opt_q31 | grep Duration
+          echo "-- crystal-opt q41"
+          ncu ${{github.workspace}}/build/crystal-opt/src/crystal_opt_q41 | grep Duration
+
+      - name: FLS-GPU-opt q11 v2
+        run: echo "Simdized TODO"
+        #          ${{github.workspace}}/build/fastlanes/src/fls_q11_bitpacked_opt_v2
+        #          ncu ${{github.workspace}}/build/fastlanes/src/fls_q11_bitpacked_opt_v2 | grep Duration
+
+      - name: FLS-GPU-opt q11 v3
+        run: |
+          ${{github.workspace}}/build/fastlanes/src/fls_q11_bitpacked_opt_v3
+          echo "v3 : Multiple check"
+          ncu ${{github.workspace}}/build/fastlanes/src/fls_q11_bitpacked_opt_v3 | grep Duration
+
+      - name: FLS-GPU-opt q11 v4
+        run: |
+          ${{github.workspace}}/build/fastlanes/src/fls_q11_bitpacked_opt_v4
+          echo "v1 with 8 value at a time **not complete yet**"
+          ncu ${{github.workspace}}/build/fastlanes/src/fls_q11_bitpacked_opt_v4 | grep Duration
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..b982d23
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,6 @@
+*.i
+*.ii
+*.gpu
+*.ptx
+*.cubin
+*.fatbin
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..1752528
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,112 @@
+cmake_minimum_required(VERSION 3.24) # CMAKE_CUDA_ARCHITECTURES "native" (used below) requires CMake >= 3.24
+project(FastLanesGPU)
+
+set(CMAKE_CXX_STANDARD 20)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS OFF)
+set(CMAKE_POSITION_INDEPENDENT_CODE TRUE)
+
+# Requirements : -------------------------------------------------------------------------------------------------------
+if (NOT "${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
+    message(FATAL_ERROR "Only Clang is supported!")
+endif ()
+if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 13)
+    message(FATAL_ERROR "Only Clang >= 13 is supported!")
+endif ()
+
+# FLAGS : --------------------------------------------------------------------------------------------------------------
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Werror")
+if ("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "x86")
+    #    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")  # FSST
+endif ()
+
+#-----------------------------------------------------------------------------------------------------------------------
+include(FetchContent)
+include(CheckCXXCompilerFlag)
+include(CMakePrintHelpers)
+# https://stackoverflow.com/questions/56089330/cmake-creates-lots-of-targets-i-didnt-specify
+set_property(GLOBAL PROPERTY CTEST_TARGETS_ADDED 1)
+include(CTest)
+
+# GTEST : ------------------------------------------------------------------------------------------------------------
+message("---------------------------------------------------------------------------------------------------------")
+message("- Building GTEST:")
+include(GoogleTest)
+# Gtest: -----------------------------------------------------------------------------------------------------------
+FetchContent_Declare(googletest
+        GIT_REPOSITORY https://github.com/google/googletest.git
+        GIT_TAG e2239ee6043f73722e7aa812a459f54a28552929 # release-1.11.0
+)
+# For Windows: Prevent overriding the parent project's compiler/linker settings
+set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
+FetchContent_MakeAvailable(googletest)
+
+enable_testing()
+
+# Silence clang-tidy warnings from googletest
+set_target_properties(gtest PROPERTIES CXX_CLANG_TIDY "")
+set_target_properties(gtest_main PROPERTIES CXX_CLANG_TIDY "")
+set_target_properties(gmock PROPERTIES CXX_CLANG_TIDY "")
+set_target_properties(gmock_main PROPERTIES CXX_CLANG_TIDY "")
+
+# Definitions: ---------------------------------------------------------------------------------------------------------
+add_compile_definitions(CMAKE_SOURCE_DIR="${CMAKE_SOURCE_DIR}")
+
+
+if (${CMAKE_SYSTEM_NAME} STREQUAL "Darwin")
+    message("There is no CUDA on Darwin")
+    RETURN()
+endif ()
+
+set(CMAKE_CUDA_ARCHITECTURES "native")
+enable_language(CUDA)
+set(CMAKE_CUDA_STANDARD 20)
+set(CMAKE_CUDA_STANDARD_REQUIRED ON)
+
+message("---------------------------------------------------------------------------------------------------------")
+message("-- CUDA:")
+cmake_print_variables(CUDA_INCLUDE_DIRS)
+cmake_print_variables(CUDA_LIBRARIES)
+cmake_print_variables(CUDA_FOUND)
+cmake_print_variables(CMAKE_CUDA_FLAGS)
+cmake_print_variables(CMAKE_CUDA_FLAGS_DEBUG)
+cmake_print_variables(CMAKE_CUDA_FLAGS_RELEASE)
+cmake_print_variables(CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES)
+cmake_print_variables(CMAKE_CUDA_COMPILER)
+cmake_print_variables(CMAKE_CUDA_COMPILER_VERSION)
+cmake_print_variables(CMAKE_CUDA_STANDARD)
+cmake_print_variables(CMAKE_CUDA_STANDARD_REQUIRED)
+cmake_print_variables(CMAKE_CXX_STANDARD)
+cmake_print_variables(CMAKE_CXX_COMPILER)
+cmake_print_variables(CMAKE_CXX_COMPILER_VERSION)
+cmake_print_variables(CMAKE_CXX_COMPILER_ID)
+cmake_print_variables(CMAKE_SOURCE_DIR)
+cmake_print_variables(CMAKE_BUILD_TYPE)
+
+# CUDA : ---------------------------------------------------------------------------------------------------------------
+# http://knottsgroup.groups.et.byu.net/labbook/index.php?n=Main.CompilingLAMMPSForGPU
+#set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS}
+#    -gencode=arch=compute_75,code=sm_75
+#    -gencode=arch=compute_75,code=compute_75"
+#)
+
+# Include : ------------------------------------------------------------------------------------------------------------
+include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
+include_directories(include)
+
+# Tools : --------------------------------------------------------------------------------------------------------------
+add_subdirectory(tool)
+
+# Crystal : ------------------------------------------------------------------------------------------------------------
+add_subdirectory(crystal)
+
+# FastLanes : ----------------------------------------------------------------------------------------------------------
+add_subdirectory(fastlanes)
+
+#TileBased : ----------------------------------------------------------------------------------------------------------
+add_subdirectory(tile_based)
+# try https://github.com/azimafroozeh/gpu-compression
+
+# Crystal-Opt : --------------------------------------------------------------------------------------------------------
+add_subdirectory(crystal-opt)
+
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..2b375e1
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 Azim Afroozeh, Lotte Felius, CWI Database Architectures Group
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..46d0b9f
--- /dev/null
+++ b/README.md
@@ -0,0 +1,40 @@
+# FastLanesGPU: Accelerating GPU Data Processing using FastLanes Compression
+
+FastLanesGPU details can be found in the [publication](https://doi.org/10.1145/3662010.3663450).
+
+## Contents
+
+- [FastLanesGPU in a Nutshell](#fastlanesgpu-in-a-nutshell)
+- [Quickstart](#quickstart)
+- [Building and Running](#building-and-running)
+- [FastLanesGPU Primitives](#fastlanesgpu-primitives)
+- [Replicating Paper Experiments](#replicating-paper-experiments)
+    - [Build](#build)
+    - [Setup Data](#setup-data)
+    - [Speed Tests](#speed-tests)
+
+## FastLanesGPU in a Nutshell
+
+## Quickstart
+
+## Building and Running
+
+Requirements:
+
+1) __Clang++__ 13 or higher
+2) __CMake__ 3.24 or higher
+
+## FastLanesGPU Primitives
+
+## Replicating Paper Experiments
+
+### Build
+
+```shell
+cmake  .
+make
+```
+
+### Setup Data
+
+
diff --git a/crystal-opt/CMakeLists.txt b/crystal-opt/CMakeLists.txt
new file mode 100644
index 0000000..2721300
--- /dev/null
+++ b/crystal-opt/CMakeLists.txt
@@ -0,0 +1,3 @@
+# Source : -------------------------------------------------------------------------------------------------------------
+add_subdirectory(src)
+
diff --git a/crystal-opt/README.md b/crystal-opt/README.md
new file mode 100644
index 0000000..5a6f8ac
--- /dev/null
+++ b/crystal-opt/README.md
@@ -0,0 +1,29 @@
+Crystal-Opt GPU Library
+=================
+
+The Crystal-Opt library modifies the original Crystal library for better performance. The original Crystal library provides a collection of block-wide device functions that can be used to build high-performance implementations of SQL queries on GPUs.
+
+The original Crystal library and its accompanying papers are available [here](https://github.com/anilshanbhag/crystal).
+
+Usage
+----
+
+```
+# Generate the test data and transform into columnar layout
+# Substitute <SF> with appropriate scale factor (eg: 1)
+python util.py ssb <SF> gen
+python util.py ssb <SF> transform
+```
+
+* Configure the benchmark settings
+```
+cd src/ssb/
+# Edit SF and BASE_PATH in ssb_utils.h
+```
+
+* To run a query, for example q11:
+```
+make bin/ssb/q11
+./bin/ssb/q11
+```
+
diff --git a/crystal-opt/src/CMakeLists.txt b/crystal-opt/src/CMakeLists.txt
new file mode 100644
index 0000000..7d05391
--- /dev/null
+++ b/crystal-opt/src/CMakeLists.txt
@@ -0,0 +1,43 @@
+add_library(crystal_opt STATIC ops/join.cu ops/project.cu)
+target_include_directories(crystal_opt PUBLIC ops)
+target_include_directories(crystal_opt PUBLIC ssb)
+target_include_directories(crystal_opt PUBLIC crystal)
+
+add_executable(crystal_opt_q11 ssb/q11.cu)
+target_link_libraries(crystal_opt_q11 crystal_opt)
+
+add_executable(crystal_opt_q12 ssb/q12.cu)
+target_link_libraries(crystal_opt_q12 crystal_opt)
+
+add_executable(crystal_opt_q13 ssb/q13.cu)
+target_link_libraries(crystal_opt_q13 crystal_opt)
+
+add_executable(crystal_opt_q21 ssb/q21.cu)
+target_link_libraries(crystal_opt_q21 crystal_opt)
+
+add_executable(crystal_opt_q22 ssb/q22.cu)
+target_link_libraries(crystal_opt_q22 crystal_opt)
+
+add_executable(crystal_opt_q23 ssb/q23.cu)
+target_link_libraries(crystal_opt_q23 crystal_opt)
+
+add_executable(crystal_opt_q31 ssb/q31.cu)
+target_link_libraries(crystal_opt_q31 crystal_opt)
+
+add_executable(crystal_opt_q32 ssb/q32.cu)
+target_link_libraries(crystal_opt_q32 crystal_opt)
+
+add_executable(crystal_opt_q33 ssb/q33.cu)
+target_link_libraries(crystal_opt_q33 crystal_opt)
+
+add_executable(crystal_opt_q34 ssb/q34.cu)
+target_link_libraries(crystal_opt_q34 crystal_opt)
+
+add_executable(crystal_opt_q41 ssb/q41.cu)
+target_link_libraries(crystal_opt_q41 crystal_opt)
+
+add_executable(crystal_opt_q42 ssb/q42.cu)
+target_link_libraries(crystal_opt_q42 crystal_opt)
+
+add_executable(crystal_opt_q43 ssb/q43.cu)
+target_link_libraries(crystal_opt_q43 crystal_opt)
\ No newline at end of file
diff --git a/crystal-opt/src/crystal/crystal.cuh b/crystal-opt/src/crystal/crystal.cuh
new file mode 100644
index 0000000..8246b3b
--- /dev/null
+++ b/crystal-opt/src/crystal/crystal.cuh
@@ -0,0 +1,32 @@
+// MIT License
+
+// Copyright (c) 2023 Jiashen Cao
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#pragma once
+
+// Block-wide functions
+#include "load.cuh"
+#include "pred.cuh"
+#include "store.cuh"
+#include "reduce.cuh"
+#include "join.cuh"
+#include "term.cuh"
+
diff --git a/crystal-opt/src/crystal/join.cuh b/crystal-opt/src/crystal/join.cuh
new file mode 100644
index 0000000..d3734fa
--- /dev/null
+++ b/crystal-opt/src/crystal/join.cuh
@@ -0,0 +1,333 @@
+// MIT License
+
+// Copyright (c) 2023 Jiashen Cao
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#pragma once
+
+#define HASH(X,Y,Z) (((X) - (Z)) % (Y))  // slot index: (key X - minimum key Z) modulo table length Y
+
+// Full-tile probe of a keys-only perfect hash table: an item that is still
+// selected stays selected only if its slot (HASH of the key) is non-zero.
+template<typename K, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockProbeDirectAndPHT_1(
+    int tid,  // not read by this overload
+    K  (&items)[ITEMS_PER_THREAD],  // probe keys held by this thread
+    int  (&selection_flags)[ITEMS_PER_THREAD],  // in/out: non-zero = still selected
+    K* ht,  // one K entry per slot; 0 is treated as an empty slot
+    int ht_len,  // modulus passed to HASH
+    K keys_min  // offset subtracted from each key by HASH
+    ) {
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+    // Items that were already filtered out never touch the table.
+    if (!selection_flags[ITEM]) {
+      continue;
+    }
+
+    const int slot_idx = HASH(items[ITEM], ht_len, keys_min);
+    const K entry = ht[slot_idx];
+    selection_flags[ITEM] = (entry != 0) ? 1 : 0;
+  }
+}
+
+// Bounded probe of a keys-only perfect hash table for a partial tile: only item
+// positions below num_items are examined; a selected item stays selected only
+// if its slot (HASH of the key) is non-zero.
+template<typename K, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockProbeDirectAndPHT_1(
+    int tid,  // thread index used to compute each item's position in the tile
+    K  (&items)[ITEMS_PER_THREAD],  // probe keys held by this thread
+    int  (&selection_flags)[ITEMS_PER_THREAD],  // in/out: non-zero = still selected
+    K* ht,  // one K entry per slot; 0 is treated as an empty slot
+    int ht_len,  // modulus passed to HASH
+    K keys_min,  // offset subtracted from each key by HASH
+    int num_items  // number of valid items in this tile
+    ) {
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+    const bool in_range = tid + (ITEM * BLOCK_THREADS) < num_items;
+    // Out-of-range or already rejected items are left untouched.
+    if (!in_range || !selection_flags[ITEM]) {
+      continue;
+    }
+
+    const int slot_idx = HASH(items[ITEM], ht_len, keys_min);
+    const K entry = ht[slot_idx];
+    selection_flags[ITEM] = (entry != 0) ? 1 : 0;
+  }
+}
+
+// Block-level entry point for the keys-only probe: uses the unguarded overload
+// when the tile is exactly BLOCK_THREADS * ITEMS_PER_THREAD items, otherwise
+// the overload that bounds-checks each item against num_items.
+template<typename K, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockProbeAndPHT_1(
+    K  (&items)[ITEMS_PER_THREAD],
+    int  (&selection_flags)[ITEMS_PER_THREAD],
+    K* ht, int ht_len, K keys_min, int num_items) {
+  const bool full_tile = (num_items == BLOCK_THREADS * ITEMS_PER_THREAD);
+  if (!full_tile) {
+    BlockProbeDirectAndPHT_1<K, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, items, selection_flags, ht, ht_len, keys_min, num_items);
+    return;
+  }
+
+  BlockProbeDirectAndPHT_1<K, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, items, selection_flags, ht, ht_len, keys_min);
+}
+
+// Convenience overload of the keys-only probe that hashes keys with a zero
+// minimum-key offset.
+template<typename K, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockProbeAndPHT_1(
+    K  (&items)[ITEMS_PER_THREAD],
+    int  (&selection_flags)[ITEMS_PER_THREAD],
+    K* ht, int ht_len, int num_items) {
+  const K no_offset = 0;
+  BlockProbeAndPHT_1<K, BLOCK_THREADS, ITEMS_PER_THREAD>(items, selection_flags, ht, ht_len, no_offset, num_items);
+}
+
+// Full-tile probe of a key/value perfect hash table. A slot is two consecutive
+// K entries read as one 64-bit word (presumably K is 32 bits wide - confirm):
+// a zero word deselects the item, otherwise its upper 32 bits are stored in res.
+template<typename K, typename V, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockProbeDirectAndPHT_2(
+    int tid,  // not read by this overload
+    K  (&keys)[ITEMS_PER_THREAD],
+    V  (&res)[ITEMS_PER_THREAD],
+    int  (&selection_flags)[ITEMS_PER_THREAD],
+    K* ht, int ht_len, K keys_min) {
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+    if (!selection_flags[ITEM]) {
+      continue;
+    }
+    const int slot_idx = HASH(keys[ITEM], ht_len, keys_min);
+    const uint64_t packed = *reinterpret_cast<uint64_t*>(&ht[slot_idx << 1]);
+    if (packed == 0) {
+      selection_flags[ITEM] = 0;
+    } else {
+      res[ITEM] = (packed >> 32);
+    }
+  }
+}
+
+// Bounded probe of a key/value perfect hash table for a partial tile; only item
+// positions below num_items are examined. A slot is two consecutive K entries
+// read as one 64-bit word (presumably K is 32 bits wide - confirm): a zero word
+// deselects the item, otherwise its upper 32 bits are stored in res.
+template<typename K, typename V, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockProbeDirectAndPHT_2(
+    int tid,
+    K  (&items)[ITEMS_PER_THREAD],
+    V  (&res)[ITEMS_PER_THREAD],
+    int  (&selection_flags)[ITEMS_PER_THREAD],
+    K* ht, int ht_len, K keys_min, int num_items) {
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+    const bool in_range = tid + (ITEM * BLOCK_THREADS) < num_items;
+    if (!in_range || !selection_flags[ITEM]) {
+      continue;
+    }
+
+    const int slot_idx = HASH(items[ITEM], ht_len, keys_min);
+    const uint64_t packed = *reinterpret_cast<uint64_t*>(&ht[slot_idx << 1]);
+    if (packed == 0) {
+      selection_flags[ITEM] = 0;
+    } else {
+      res[ITEM] = (packed >> 32);
+    }
+  }
+}
+
+// Block-level entry point for the key/value probe: uses the unguarded overload
+// when the tile is exactly BLOCK_THREADS * ITEMS_PER_THREAD items, otherwise
+// the overload that bounds-checks each item against num_items.
+template<typename K, typename V, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockProbeAndPHT_2(
+    K  (&keys)[ITEMS_PER_THREAD],
+    V  (&res)[ITEMS_PER_THREAD],
+    int  (&selection_flags)[ITEMS_PER_THREAD],
+    K* ht, int ht_len, K keys_min, int num_items) {
+  const bool full_tile = (num_items == BLOCK_THREADS * ITEMS_PER_THREAD);
+  if (!full_tile) {
+    BlockProbeDirectAndPHT_2<K, V, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, keys, res, selection_flags, ht, ht_len, keys_min, num_items);
+    return;
+  }
+
+  BlockProbeDirectAndPHT_2<K, V, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, keys, res, selection_flags, ht, ht_len, keys_min);
+}
+
+// Convenience overload of the key/value probe that hashes keys with a zero
+// minimum-key offset.
+template<typename K, typename V, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockProbeAndPHT_2(
+    K  (&keys)[ITEMS_PER_THREAD],
+    V  (&res)[ITEMS_PER_THREAD],
+    int  (&selection_flags)[ITEMS_PER_THREAD],
+    K* ht, int ht_len, int num_items) {
+  const K no_offset = 0;
+  BlockProbeAndPHT_2<K, V, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, res, selection_flags, ht, ht_len, no_offset, num_items);
+}
+
+// Full-tile build of a keys-only perfect hash table: each selected key is
+// written to its slot (HASH of the key) with atomicCAS, so only an empty (0)
+// slot is ever overwritten and an occupied slot keeps its existing entry.
+template<typename K, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockBuildDirectSelectivePHT_1(
+    int tid,  // not read by this overload
+    K  (&keys)[ITEMS_PER_THREAD],  // build keys held by this thread
+    int  (&selection_flags)[ITEMS_PER_THREAD],  // non-zero = insert this key
+    K* ht, int ht_len, K keys_min) {
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+    if (selection_flags[ITEM]) {
+      int hash = HASH(keys[ITEM], ht_len, keys_min);
+      // The previous slot value is not needed, so the CAS result is discarded.
+      atomicCAS(&ht[hash], 0, keys[ITEM]);
+    }
+  }
+}
+
+// Bounded build of a keys-only perfect hash table for a partial tile: only item
+// positions below num_items are considered. Each selected key is written to its
+// slot (HASH of the key) with atomicCAS, so only an empty (0) slot is overwritten.
+template<typename K, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockBuildDirectSelectivePHT_1(
+    int tid,  // thread index used to compute each item's position in the tile
+    K  (&items)[ITEMS_PER_THREAD],  // build keys held by this thread
+    int  (&selection_flags)[ITEMS_PER_THREAD],  // non-zero = insert this key
+    K* ht, int ht_len, K keys_min, int num_items) {
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+    if (tid + (ITEM * BLOCK_THREADS) < num_items) {
+      if (selection_flags[ITEM]) {
+        int hash = HASH(items[ITEM], ht_len, keys_min);
+
+        // The previous slot value is not needed, so the CAS result is discarded.
+        atomicCAS(&ht[hash], 0, items[ITEM]);
+      }
+    }
+  }
+}
+
+// Block-level entry point for the keys-only build: uses the unguarded overload
+// when the tile is exactly BLOCK_THREADS * ITEMS_PER_THREAD items, otherwise
+// the overload that bounds-checks each item against num_items.
+template<typename K, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockBuildSelectivePHT_1(
+    K  (&keys)[ITEMS_PER_THREAD],
+    int  (&selection_flags)[ITEMS_PER_THREAD],
+    K* ht, int ht_len, K keys_min, int num_items) {
+  const bool full_tile = (num_items == BLOCK_THREADS * ITEMS_PER_THREAD);
+  if (!full_tile) {
+    BlockBuildDirectSelectivePHT_1<K, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, keys, selection_flags, ht, ht_len, keys_min, num_items);
+    return;
+  }
+
+  BlockBuildDirectSelectivePHT_1<K, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, keys, selection_flags, ht, ht_len, keys_min);
+}
+
+// Convenience overload of the keys-only build that hashes keys with a zero
+// minimum-key offset.
+template<typename K, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockBuildSelectivePHT_1(
+    K  (&keys)[ITEMS_PER_THREAD],
+    int  (&selection_flags)[ITEMS_PER_THREAD],
+    K* ht, int ht_len, int num_items) {
+  const K no_offset = 0;
+  BlockBuildSelectivePHT_1<K, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, selection_flags, ht, ht_len, no_offset, num_items);
+}
+
+// Full-tile build of a key/value perfect hash table. Slot h occupies entries
+// ht[2h] (key, claimed with atomicCAS against an empty 0) and ht[2h + 1] (value).
+// NOTE(review): the value is stored whether or not the CAS claimed the slot, so
+// this appears to assume at most one build row per slot - confirm with callers.
+template<typename K, typename V, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockBuildDirectSelectivePHT_2(
+    int tid,  // not read by this overload
+    K  (&keys)[ITEMS_PER_THREAD],
+    V  (&res)[ITEMS_PER_THREAD],
+    int  (&selection_flags)[ITEMS_PER_THREAD],
+    K* ht, int ht_len, K keys_min) {
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+    if (selection_flags[ITEM]) {
+      int hash = HASH(keys[ITEM], ht_len, keys_min);
+      atomicCAS(&ht[hash << 1], 0, keys[ITEM]);  // previous slot value is not needed
+      ht[(hash << 1) + 1] = res[ITEM];
+    }
+  }
+}
+
+// Bounded build of a key/value perfect hash table for a partial tile; only item
+// positions below num_items are considered. Slot h occupies entries ht[2h] (key,
+// claimed with atomicCAS against an empty 0) and ht[2h + 1] (value).
+// NOTE(review): the value is stored whether or not the CAS claimed the slot - confirm.
+template<typename K, typename V, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockBuildDirectSelectivePHT_2(
+    int tid,  // thread index used to compute each item's position in the tile
+    K  (&keys)[ITEMS_PER_THREAD],
+    V  (&res)[ITEMS_PER_THREAD],
+    int  (&selection_flags)[ITEMS_PER_THREAD],
+    K* ht, int ht_len, K keys_min, int num_items) {
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+    if (tid + (ITEM * BLOCK_THREADS) < num_items) {
+      if (selection_flags[ITEM]) {
+        int hash = HASH(keys[ITEM], ht_len, keys_min);
+
+        atomicCAS(&ht[hash << 1], 0, keys[ITEM]);  // previous slot value is not needed
+        ht[(hash << 1) + 1] = res[ITEM];
+      }
+    }
+  }
+}
+
+// Block-level entry point for the key/value build: uses the unguarded overload
+// when the tile is exactly BLOCK_THREADS * ITEMS_PER_THREAD items, otherwise
+// the overload that bounds-checks each item against num_items.
+template<typename K, typename V, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockBuildSelectivePHT_2(
+    K  (&keys)[ITEMS_PER_THREAD],
+    V  (&res)[ITEMS_PER_THREAD],
+    int  (&selection_flags)[ITEMS_PER_THREAD],
+    K* ht, int ht_len, K keys_min, int num_items) {
+  const bool full_tile = (num_items == BLOCK_THREADS * ITEMS_PER_THREAD);
+  if (!full_tile) {
+    BlockBuildDirectSelectivePHT_2<K, V, BLOCK_THREADS, ITEMS_PER_THREAD>(
+        threadIdx.x, keys, res, selection_flags, ht, ht_len, keys_min, num_items);
+    return;
+  }
+
+  BlockBuildDirectSelectivePHT_2<K, V, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      threadIdx.x, keys, res, selection_flags, ht, ht_len, keys_min);
+}
+
+// Convenience overload of the key/value build that hashes keys with a zero
+// minimum-key offset.
+template<typename K, typename V, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockBuildSelectivePHT_2(
+    K  (&keys)[ITEMS_PER_THREAD],
+    V  (&res)[ITEMS_PER_THREAD],
+    int  (&selection_flags)[ITEMS_PER_THREAD],
+    K* ht, int ht_len, int num_items) {
+  const K no_offset = 0;
+  BlockBuildSelectivePHT_2<K, V, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, res, selection_flags, ht, ht_len, no_offset, num_items);
+}
diff --git a/crystal-opt/src/crystal/load.cuh b/crystal-opt/src/crystal/load.cuh
new file mode 100644
index 0000000..54b903c
--- /dev/null
+++ b/crystal-opt/src/crystal/load.cuh
@@ -0,0 +1,166 @@
+// MIT License
+
+// Copyright (c) 2023 Jiashen Cao
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#pragma once
+
+// Full-tile predicated load: items whose selection flag is non-zero are read
+// from block_itr[tid + ITEM * BLOCK_THREADS]; all other items are left untouched.
+template <typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void
+BlockPredLoadDirect(const unsigned int tid, T *block_itr,
+                    T (&items)[ITEMS_PER_THREAD],
+                    int (&selection_flags)[ITEMS_PER_THREAD]) {
+#pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+    if (selection_flags[ITEM]) {
+      items[ITEM] = block_itr[tid + ITEM * BLOCK_THREADS];
+    }
+  }
+}
+
+// Bounded predicated load for a partial tile: an item is read from
+// block_itr[tid + ITEM * BLOCK_THREADS] only when its selection flag is non-zero
+// and its position lies below num_items; all other items are left untouched.
+template <typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void
+BlockPredLoadDirect(const unsigned int tid, T *block_itr,
+                    T (&items)[ITEMS_PER_THREAD], int num_items,
+                    int (&selection_flags)[ITEMS_PER_THREAD]) {
+#pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+    const bool in_range = tid + (ITEM * BLOCK_THREADS) < num_items;
+    if (selection_flags[ITEM] && in_range) {
+      items[ITEM] = block_itr[tid + ITEM * BLOCK_THREADS];
+    }
+  }
+}
+
+// Block-level predicated load: uses the unguarded overload for a full tile
+// (num_items == BLOCK_THREADS * ITEMS_PER_THREAD), the bounds-checked one otherwise.
+template <typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void
+BlockPredLoad(T *inp, T (&items)[ITEMS_PER_THREAD], int num_items,
+              int (&selection_flags)[ITEMS_PER_THREAD]) {
+  const bool full_tile = (num_items == BLOCK_THREADS * ITEMS_PER_THREAD);
+  if (!full_tile) {
+    BlockPredLoadDirect<T, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, inp, items, num_items, selection_flags);
+    return;
+  }
+
+  BlockPredLoadDirect<T, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, inp, items, selection_flags);
+}
+
+// Full-tile load: thread tid reads element tid + ITEM * BLOCK_THREADS of
+// block_itr into items[ITEM] for every ITEM, with no bounds check.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockLoadDirect(
+    const unsigned int tid,
+    T* block_itr,
+    T  (&items)[ITEMS_PER_THREAD]) {
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+    const unsigned int offset = tid + ITEM * BLOCK_THREADS;
+    items[ITEM] = block_itr[offset];
+  }
+}
+
+// Bounded load for a partial tile: thread tid reads element
+// tid + ITEM * BLOCK_THREADS of block_itr into items[ITEM] only when that
+// position lies below num_items; other items are left untouched.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockLoadDirect(
+    const unsigned int tid,
+    T* block_itr,
+    T  (&items)[ITEMS_PER_THREAD], int num_items) {
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+    const unsigned int offset = tid + (ITEM * BLOCK_THREADS);
+    if (offset < num_items) {
+      items[ITEM] = block_itr[offset];
+    }
+  }
+}
+
+// Block-level load: uses the unguarded overload for a full tile
+// (num_items == BLOCK_THREADS * ITEMS_PER_THREAD), the bounds-checked one otherwise.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockLoad(
+    T* inp,
+    T  (&items)[ITEMS_PER_THREAD],
+    int num_items) {
+  const bool full_tile = (num_items == BLOCK_THREADS * ITEMS_PER_THREAD);
+  if (!full_tile) {
+    BlockLoadDirect<T, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, inp, items, num_items);
+    return;
+  }
+  BlockLoadDirect<T, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, inp, items);
+}
+
+#if 0  // Disabled: everything up to the matching #endif is excluded from compilation.
+// NOTE(review): two parameter lists below lack a comma before `int num_items`.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockLoadDirect(
+    int tid,
+    T* block_itr,
+    T  (&items)[ITEMS_PER_THREAD]
+    ) {
+  T* thread_itr = block_itr + tid;
+
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+    items[ITEM] = thread_itr[ITEM * BLOCK_THREADS];
+  }
+}
+
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockLoadDirect(
+    int tid,
+    T* block_itr,
+    T  (&items)[ITEMS_PER_THREAD]
+    int num_items
+    ) {
+  T* thread_itr = block_itr + tid;
+
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+    if (tid + (ITEM * BLOCK_THREADS) < num_items) {
+      items[ITEM] = thread_itr[ITEM * BLOCK_THREADS];
+    }
+  }
+}
+
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockLoad(
+    T* inp,
+    T  (&items)[ITEMS_PER_THREAD]
+    int num_items
+    ) {
+  T* block_itr = inp + blockIdx.x * blockDim.x;
+
+  if ((BLOCK_THREADS * ITEMS_PER_THREAD) == num_items) {
+    BlockLoadDirect(threadIdx.x, block_itr, items);
+  } else {
+    BlockLoadDirect(threadIdx.x, block_itr, items, num_items);
+  }
+}
+
+#endif
diff --git a/crystal-opt/src/crystal/pred.cuh b/crystal-opt/src/crystal/pred.cuh
new file mode 100644
index 0000000..7d38325
--- /dev/null
+++ b/crystal-opt/src/crystal/pred.cuh
@@ -0,0 +1,357 @@
+// MIT License
+
+// Copyright (c) 2023 Jiashen Cao
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#pragma once
+
+// Marks every item of this thread as selected by setting all flags to 1.
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void InitFlags(
+    int  (&selection_flags)[ITEMS_PER_THREAD]) {
+  #pragma unroll
+  for (int idx = 0; idx < ITEMS_PER_THREAD; ++idx) {
+    selection_flags[idx] = 1;
+  }
+}
+
+// Full-tile predicate: overwrites every selection flag with the result of
+// select_op applied to the corresponding item (tid is not read).
+template<typename T, typename SelectOp, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredDirect(
+    int tid,
+    T  (&items)[ITEMS_PER_THREAD],
+    SelectOp select_op, int  (&selection_flags)[ITEMS_PER_THREAD]) {
+  #pragma unroll
+  for (int idx = 0; idx < ITEMS_PER_THREAD; ++idx) {
+    selection_flags[idx] = select_op(items[idx]);
+  }
+}
+
+// Bounded predicate for a partial tile: flags of items positioned below
+// num_items are overwritten with select_op(item); other flags are untouched.
+template<typename T, typename SelectOp, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredDirect(
+    int tid,
+    T  (&items)[ITEMS_PER_THREAD],
+    SelectOp select_op, int  (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+    const bool in_range = tid + (ITEM * BLOCK_THREADS) < num_items;
+    if (in_range) {
+      selection_flags[ITEM] = select_op(items[ITEM]);
+    }
+  }
+}
+
+// Block-level predicate: uses the unguarded overload for a full tile
+// (num_items == BLOCK_THREADS * ITEMS_PER_THREAD), the bounds-checked one otherwise.
+template<typename T, typename SelectOp, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPred(
+    T  (&items)[ITEMS_PER_THREAD],
+    SelectOp select_op,
+    int  (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+  const bool full_tile = (num_items == BLOCK_THREADS * ITEMS_PER_THREAD);
+  if (!full_tile) {
+    BlockPredDirect<T, SelectOp, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, items, select_op, selection_flags, num_items);
+    return;
+  }
+  BlockPredDirect<T, SelectOp, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, items, select_op, selection_flags);
+}
+
+// Full-tile AND-predicate: a flag ends up 1 only if it was already non-zero and
+// select_op accepts the item; select_op is not evaluated for cleared flags.
+template<typename T, typename SelectOp, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredAndDirect(
+    int tid,
+    T  (&items)[ITEMS_PER_THREAD],
+    SelectOp select_op, int  (&selection_flags)[ITEMS_PER_THREAD]) {
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+    selection_flags[ITEM] = (selection_flags[ITEM] && select_op(items[ITEM])) ? 1 : 0;
+  }
+}
+
+// Bounded AND-predicate for a partial tile: for items positioned below num_items
+// a flag ends up 1 only if it was non-zero and select_op accepts the item.
+template<typename T, typename SelectOp, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredAndDirect(
+    int tid,
+    T  (&items)[ITEMS_PER_THREAD],
+    SelectOp select_op,
+    int  (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+    if (tid + (ITEM * BLOCK_THREADS) < num_items) {
+      selection_flags[ITEM] = (selection_flags[ITEM] && select_op(items[ITEM])) ? 1 : 0;
+    }
+  }
+}
+
+// Block-level AND-predicate: uses the unguarded overload for a full tile
+// (num_items == BLOCK_THREADS * ITEMS_PER_THREAD), the bounds-checked one otherwise.
+template<typename T, typename SelectOp, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredAnd(
+    T  (&items)[ITEMS_PER_THREAD],
+    SelectOp select_op,
+    int  (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+  const bool full_tile = (num_items == BLOCK_THREADS * ITEMS_PER_THREAD);
+  if (!full_tile) {
+    BlockPredAndDirect<T, SelectOp, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, items, select_op, selection_flags, num_items);
+    return;
+  }
+  BlockPredAndDirect<T, SelectOp, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, items, select_op, selection_flags);
+}
+
+// Full-tile OR-predicate: a flag ends up 1 if it was already non-zero or
+// select_op accepts the item; select_op is not evaluated for set flags.
+template<typename T, typename SelectOp, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredOrDirect(
+    int tid,
+    T  (&items)[ITEMS_PER_THREAD],
+    SelectOp select_op, int  (&selection_flags)[ITEMS_PER_THREAD]) {
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+    selection_flags[ITEM] = (selection_flags[ITEM] || select_op(items[ITEM])) ? 1 : 0;
+  }
+}
+
+// Bounded OR-predicate for a partial tile: for items positioned below num_items
+// a flag ends up 1 if it was non-zero or select_op accepts the item.
+template<typename T, typename SelectOp, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredOrDirect(
+    int tid,
+    T  (&items)[ITEMS_PER_THREAD],
+    SelectOp select_op,
+    int  (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+    if (tid + (ITEM * BLOCK_THREADS) < num_items) {
+      selection_flags[ITEM] = (selection_flags[ITEM] || select_op(items[ITEM])) ? 1 : 0;
+    }
+  }
+}
+
+// Block-level OR-predicate: uses the unguarded overload for a full tile
+// (num_items == BLOCK_THREADS * ITEMS_PER_THREAD), the bounds-checked one otherwise.
+template<typename T, typename SelectOp, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredOr(
+    T  (&items)[ITEMS_PER_THREAD],
+    SelectOp select_op,
+    int  (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+  const bool full_tile = (num_items == BLOCK_THREADS * ITEMS_PER_THREAD);
+  if (!full_tile) {
+    BlockPredOrDirect<T, SelectOp, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, items, select_op, selection_flags, num_items);
+    return;
+  }
+  BlockPredOrDirect<T, SelectOp, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, items, select_op, selection_flags);
+}
+
+// Unary predicate functor: operator() returns true when its argument is
+// strictly less than the stored reference value `compare`.
+template<typename T>
+struct LessThan {
+  T compare;  // reference value supplied at construction
+
+  __device__ __forceinline__ LessThan(T bound) : compare(bound) {}
+
+  // Evaluates value < compare.
+  __device__ __forceinline__ bool operator()(const T &value) const {
+    return value < compare;
+  }
+};
+
+// Unary predicate functor: operator() returns true when its argument is
+// strictly greater than the stored reference value `compare`.
+template<typename T>
+struct GreaterThan {
+  T compare;  // reference value supplied at construction
+
+  __device__ __forceinline__ GreaterThan(T bound) : compare(bound) {}
+
+  // Evaluates value > compare.
+  __device__ __forceinline__ bool operator()(const T &value) const {
+    return value > compare;
+  }
+};
+
+// Unary predicate functor: operator() returns true when its argument is
+// less than or equal to the stored reference value `compare`.
+template<typename T>
+struct LessThanEq {
+  T compare;  // reference value supplied at construction
+
+  __device__ __forceinline__ LessThanEq(T bound) : compare(bound) {}
+
+  // Evaluates value <= compare.
+  __device__ __forceinline__ bool operator()(const T &value) const {
+    return value <= compare;
+  }
+};
+
+// Unary predicate functor: operator() returns true when its argument is
+// greater than or equal to the stored reference value `compare`.
+template<typename T>
+struct GreaterThanEq {
+  T compare;  // reference value supplied at construction
+
+  __device__ __forceinline__ GreaterThanEq(T bound) : compare(bound) {}
+
+  // Evaluates value >= compare.
+  __device__ __forceinline__ bool operator()(const T &value) const {
+    return value >= compare;
+  }
+};
+
+// Unary predicate functor: operator() returns true when its argument
+// compares equal to the stored reference value `compare`.
+template<typename T>
+struct Eq {
+  T compare;  // reference value supplied at construction
+
+  __device__ __forceinline__ Eq(T bound) : compare(bound) {}
+
+  // Evaluates value == compare.
+  __device__ __forceinline__ bool operator()(const T &value) const {
+    return value == compare;
+  }
+};
+
+// Overwrites the selection flags with (item < compare) by calling BlockPred
+// with a LessThan functor; num_items is forwarded unchanged.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredLT(
+    T  (&items)[ITEMS_PER_THREAD],
+    T compare,
+    int  (&selection_flags)[ITEMS_PER_THREAD],
+    int num_items) {
+  BlockPred<T, LessThan<T>, BLOCK_THREADS, ITEMS_PER_THREAD>(items, LessThan<T>(compare), selection_flags, num_items);
+}
+
+// ANDs the selection flags with (item < compare) by calling BlockPredAnd
+// with a LessThan functor; num_items is forwarded unchanged.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredAndLT(
+    T  (&items)[ITEMS_PER_THREAD],
+    T compare,
+    int  (&selection_flags)[ITEMS_PER_THREAD],
+    int num_items) {
+  BlockPredAnd<T, LessThan<T>, BLOCK_THREADS, ITEMS_PER_THREAD>(items, LessThan<T>(compare), selection_flags, num_items);
+}
+
+// Overwrites the selection flags with (item > compare) by calling BlockPred
+// with a GreaterThan functor; num_items is forwarded unchanged.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredGT(
+    T  (&items)[ITEMS_PER_THREAD],
+    T compare,
+    int  (&selection_flags)[ITEMS_PER_THREAD],
+    int num_items) {
+  BlockPred<T, GreaterThan<T>, BLOCK_THREADS, ITEMS_PER_THREAD>(items, GreaterThan<T>(compare), selection_flags, num_items);
+}
+
+// ANDs the selection flags with (item > compare) by calling BlockPredAnd
+// with a GreaterThan functor; num_items is forwarded unchanged.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredAndGT(
+    T  (&items)[ITEMS_PER_THREAD],
+    T compare,
+    int  (&selection_flags)[ITEMS_PER_THREAD],
+    int num_items) {
+  BlockPredAnd<T, GreaterThan<T>, BLOCK_THREADS, ITEMS_PER_THREAD>(items, GreaterThan<T>(compare), selection_flags, num_items);
+}
+
+// Overwrites the selection flags with (item <= compare) by calling BlockPred
+// with a LessThanEq functor; num_items is forwarded unchanged.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredLTE(
+    T  (&items)[ITEMS_PER_THREAD],
+    T compare,
+    int  (&selection_flags)[ITEMS_PER_THREAD],
+    int num_items) {
+  BlockPred<T, LessThanEq<T>, BLOCK_THREADS, ITEMS_PER_THREAD>(items, LessThanEq<T>(compare), selection_flags, num_items);
+}
+
+// ANDs the selection flags with (item <= compare) by calling BlockPredAnd
+// with a LessThanEq functor; num_items is forwarded unchanged.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredAndLTE(
+    T  (&items)[ITEMS_PER_THREAD],
+    T compare,
+    int  (&selection_flags)[ITEMS_PER_THREAD],
+    int num_items) {
+  BlockPredAnd<T, LessThanEq<T>, BLOCK_THREADS, ITEMS_PER_THREAD>(items, LessThanEq<T>(compare), selection_flags, num_items);
+}
+
+// Overwrites the selection flags with (item >= compare) by calling BlockPred
+// with a GreaterThanEq functor; num_items is forwarded unchanged.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredGTE(
+    T  (&items)[ITEMS_PER_THREAD],
+    T compare,
+    int  (&selection_flags)[ITEMS_PER_THREAD],
+    int num_items) {
+  BlockPred<T, GreaterThanEq<T>, BLOCK_THREADS, ITEMS_PER_THREAD>(items, GreaterThanEq<T>(compare), selection_flags, num_items);
+}
+
+// ANDs the selection flags with (item >= compare) by calling BlockPredAnd
+// with a GreaterThanEq functor; num_items is forwarded unchanged.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredAndGTE(
+    T  (&items)[ITEMS_PER_THREAD],
+    T compare,
+    int  (&selection_flags)[ITEMS_PER_THREAD],
+    int num_items) {
+  BlockPredAnd<T, GreaterThanEq<T>, BLOCK_THREADS, ITEMS_PER_THREAD>(items, GreaterThanEq<T>(compare), selection_flags, num_items);
+}
+
+// Overwrites the selection flags with (item == compare) by calling BlockPred
+// with an Eq functor; num_items is forwarded unchanged.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredEQ(
+    T  (&items)[ITEMS_PER_THREAD],
+    T compare,
+    int  (&selection_flags)[ITEMS_PER_THREAD],
+    int num_items) {
+  BlockPred<T, Eq<T>, BLOCK_THREADS, ITEMS_PER_THREAD>(items, Eq<T>(compare), selection_flags, num_items);
+}
+
+// ANDs the selection flags with (item == compare) by calling BlockPredAnd
+// with an Eq functor; num_items is forwarded unchanged.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredAndEQ(
+    T  (&items)[ITEMS_PER_THREAD],
+    T compare,
+    int  (&selection_flags)[ITEMS_PER_THREAD],
+    int num_items) {
+  BlockPredAnd<T, Eq<T>, BLOCK_THREADS, ITEMS_PER_THREAD>(items, Eq<T>(compare), selection_flags, num_items);
+}
+
+// ORs the selection flags with (item == compare) by calling BlockPredOr
+// with an Eq functor; num_items is forwarded unchanged.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredOrEQ(
+    T  (&items)[ITEMS_PER_THREAD],
+    T compare,
+    int  (&selection_flags)[ITEMS_PER_THREAD],
+    int num_items) {
+  BlockPredOr<T, Eq<T>, BLOCK_THREADS, ITEMS_PER_THREAD>(items, Eq<T>(compare), selection_flags, num_items);
+}
+
diff --git a/crystal-opt/src/crystal/reduce.cuh b/crystal-opt/src/crystal/reduce.cuh
new file mode 100644
index 0000000..1f08282
--- /dev/null
+++ b/crystal-opt/src/crystal/reduce.cuh
@@ -0,0 +1,75 @@
+// MIT License
+
+// Copyright (c) 2023 Jiashen Cao
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#pragma once
+
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ T BlockSum(  // block-wide sum of one value per thread
+    T  item,  // this thread's contribution
+    T* shared  // scratch buffer indexed by warp id (threadIdx.x / 32); presumably block-shared memory - confirm
+    ) {
+  __syncthreads();  // barrier before `shared` is written below
+  // lane = position inside the 32-thread warp, wid = index of the warp in the block.
+  T val = item;
+  const int warp_size = 32;
+  int lane = threadIdx.x % warp_size;
+  int wid = threadIdx.x / warp_size;
+
+  // Reduce within each warp with shuffles; lane 0 ends up with the warp's sum.
+  for (int offset = 16; offset > 0; offset /= 2) {
+    val += __shfl_down_sync(0xffffffff, val, offset);
+  }
+
+  // Lane 0 of every warp publishes its warp sum to shared[wid].
+  if (lane == 0) {
+    shared[wid] = val;
+  }
+
+  __syncthreads();  // all warp sums must be written before they are read
+
+  // Threads whose index is below the warp count pick up one warp sum each; all others use 0.
+  val = (threadIdx.x < blockDim.x / warp_size) ? shared[lane] : 0;
+
+  // Warp 0 reduces the warp sums; the block total ends up in thread 0.
+  if (wid == 0) {
+    for (int offset = 16; offset > 0; offset /= 2) {
+      val += __shfl_down_sync(0xffffffff, val, offset);
+    }
+  }
+  // Only thread 0's return value is the complete block sum.
+  return val;
+}
+
+// Block-wide sum over ITEMS_PER_THREAD values per thread: adds up this thread's
+// items, then combines the per-thread totals with the scalar BlockSum overload.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ T BlockSum(
+    T (&items)[ITEMS_PER_THREAD],
+    T* shared) {
+  T thread_sum = 0;
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+    thread_sum += items[ITEM];
+  }
+  // Explicit arguments: BLOCK_THREADS / ITEMS_PER_THREAD are not deducible from (T, T*).
+  return BlockSum<T, BLOCK_THREADS, ITEMS_PER_THREAD>(thread_sum, shared);
+}
diff --git a/crystal-opt/src/crystal/store.cuh b/crystal-opt/src/crystal/store.cuh
new file mode 100644
index 0000000..a5de94f
--- /dev/null
+++ b/crystal-opt/src/crystal/store.cuh
@@ -0,0 +1,120 @@
+// MIT License
+
+// Copyright (c) 2023 Jiashen Cao
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#pragma once
+
+// Unguarded striped store: item i of thread tid is written to block_itr[tid + i * BLOCK_THREADS].
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockStoreDirect(
+    int tid,
+    T* block_itr,
+    T  (&items)[ITEMS_PER_THREAD]
+    ) {
+  #pragma unroll
+  for (int i = 0; i < ITEMS_PER_THREAD; i++) {
+    const int dst = tid + i * BLOCK_THREADS;
+    block_itr[dst] = items[i];
+  }
+}
+
+// Guarded striped store: like the unguarded overload, but skips every destination index >= num_items.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockStoreDirect(
+    int tid,
+    T* block_itr,
+    T  (&items)[ITEMS_PER_THREAD],
+    int num_items
+    ) {
+  #pragma unroll
+  for (int i = 0; i < ITEMS_PER_THREAD; i++) {
+    const int dst = tid + i * BLOCK_THREADS;
+    if (dst < num_items) {
+      block_itr[dst] = items[i];
+    }
+  }
+}
+
+// Stores one tile to `out`: a full tile takes the unguarded path, any other num_items the bounds-checked one.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockStore(
+    T* out,
+    T  (&items)[ITEMS_PER_THREAD],
+    int num_items
+    ) {
+  const bool full_tile = (num_items == (BLOCK_THREADS * ITEMS_PER_THREAD));
+  if (!full_tile) {
+    BlockStoreDirect<T, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, out, items, num_items);
+  } else {
+    BlockStoreDirect<T, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, out, items);
+  }
+}
+
+#if 0
+// Disabled code: the bodies below read from memory into `items` (load direction), and two parameter lists lack a comma.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockStoreDirect(
+    int tid,
+    T* block_itr,
+    T  (&items)[ITEMS_PER_THREAD]
+    ) {
+  T* thread_itr = block_itr + tid;
+
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+    items[ITEM] = thread_itr[ITEM * BLOCK_THREADS];
+  }
+}
+
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockStoreDirect(
+    int tid,
+    T* block_itr,
+    T  (&items)[ITEMS_PER_THREAD]
+    int num_items
+    ) {
+  T* thread_itr = block_itr + tid;
+
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+    if (tid + (ITEM * BLOCK_THREADS) < num_items) {
+      items[ITEM] = thread_itr[ITEM * BLOCK_THREADS];
+    }
+  }
+}
+
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockStore(
+    T* inp,
+    T  (&items)[ITEMS_PER_THREAD]
+    int num_items
+    ) {
+  T* block_itr = inp + blockIdx.x * blockDim.x;
+
+  if ((BLOCK_THREADS * ITEMS_PER_THREAD) == num_items) {
+    BlockStoreDirect(threadIdx.x, block_itr, items);
+  } else {
+    BlockStoreDirect(threadIdx.x, block_itr, items, num_items);
+  }
+}
+
+#endif
+
diff --git a/crystal-opt/src/crystal/term.cuh b/crystal-opt/src/crystal/term.cuh
new file mode 100644
index 0000000..1e3a5fc
--- /dev/null
+++ b/crystal-opt/src/crystal/term.cuh
@@ -0,0 +1,33 @@
+// MIT License
+
+// Copyright (c) 2023 Jiashen Cao
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#pragma once
+
+// Returns true when this thread's selection flags sum to zero (for 0/1 flags: none of its items is selected).
+template <typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ bool
+IsTerm(int (&selection_flags)[ITEMS_PER_THREAD]) {
+    int total = 0;
+    for (int idx = 0; idx < ITEMS_PER_THREAD; ++idx) { total += selection_flags[idx]; }
+    const bool nothing_selected = (total == 0);
+    return nothing_selected;
+}
diff --git a/crystal-opt/src/ops/join.cu b/crystal-opt/src/ops/join.cu
new file mode 100644
index 0000000..e7da88f
--- /dev/null
+++ b/crystal-opt/src/ops/join.cu
@@ -0,0 +1,242 @@
+// MIT License
+
+// Copyright (c) 2023 Jiashen Cao
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <iostream>
+#include <stdio.h>
+#include <curand.h>
+
+#include <cuda.h>
+#include <cub/util_allocator.cuh>
+#include "cub/test/test_util.h"
+
+#include "crystal.cuh"
+
+#include "utils/generator.h"
+#include "utils/gpu_utils.h"
+
+using namespace std;
+
+#define DEBUG 1
+
+// Build kernel: each block loads one tile of (key, value) pairs and hands it to BlockBuildSelectivePHT_2 with hash_table.
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_kernel(int *dim_key, int *dim_val, int num_tuples, int *hash_table, int num_slots) {
+  int keys[ITEMS_PER_THREAD];
+  int payloads[ITEMS_PER_THREAD];
+  int selection_flags[ITEMS_PER_THREAD];
+
+  // Only the final tile can be short; every other tile holds exactly TILE_SIZE tuples.
+  int first = blockIdx.x * TILE_SIZE;
+  int last_tile = (num_tuples + TILE_SIZE - 1) / TILE_SIZE - 1;
+  int count = (blockIdx.x == last_tile) ? (num_tuples - first) : TILE_SIZE;
+
+  InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags);
+  int *key_tile = dim_key + first;
+  int *val_tile = dim_val + first;
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(key_tile, keys, count);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(val_tile, payloads, count);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, payloads, selection_flags, hash_table, num_slots, count);
+}
+
+// Probe kernel: probes hash_table with one tile of fact rows and adds the sum of fact_val * join value over selected rows to *res.
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void probe_kernel(int *fact_fkey, int *fact_val, int num_tuples,
+    int *hash_table, int num_slots, unsigned long long *res) {
+  // Per-thread slices of the tile, striped across threads
+  int selection_flags[ITEMS_PER_THREAD];
+  int keys[ITEMS_PER_THREAD];
+  int vals[ITEMS_PER_THREAD];
+  int join_vals[ITEMS_PER_THREAD];
+
+  unsigned long long sum = 0;
+
+  int tile_offset = blockIdx.x * TILE_SIZE;
+  int num_tiles = (num_tuples+ TILE_SIZE - 1) / TILE_SIZE;
+  int num_tile_items = TILE_SIZE;
+
+  if (blockIdx.x == num_tiles - 1) {
+    num_tile_items = num_tuples - tile_offset;
+  }
+
+  InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(fact_fkey + tile_offset, keys, num_tile_items);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(fact_val + tile_offset, vals, num_tile_items);
+
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, join_vals, selection_flags,
+      hash_table, num_slots, num_tile_items);
+
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
+    if ((threadIdx.x + (BLOCK_THREADS * ITEM) < num_tile_items) && selection_flags[ITEM]) {
+      // Widen before multiplying: the product of two ints can exceed 32 bits (signed overflow).
+      sum += static_cast<long long>(vals[ITEM]) * join_vals[ITEM];
+    }
+  }
+
+  __syncthreads();
+
+  static __shared__ long long buffer[32];
+  unsigned long long aggregate = BlockSum<long long, BLOCK_THREADS, ITEMS_PER_THREAD>(sum, (long long*)buffer);
+  __syncthreads();
+
+  if (threadIdx.x == 0) {  // BlockSum leaves the block total in thread 0
+    atomicAdd(res, aggregate);
+  }
+}
+// Timings returned by hashJoin; each value comes from cudaEventElapsedTime via TIME_FUNC.
+struct TimeKeeper {
+  float time_build;  // hashJoin: elapsed time of the build_kernel launch
+  float time_probe;  // hashJoin: elapsed time of the probe_kernel launch
+  float time_extra;  // hashJoin: elapsed time of the hash-table cudaMemset
+  float time_total;  // hashJoin: time_build + time_probe + hash-table memset time
+};
+
+// Builds a hash table over the dimension columns, probes it with the fact columns, and returns the per-phase timings.
+TimeKeeper hashJoin(int* d_dim_key, int* d_dim_val, int* d_fact_fkey, int* d_fact_val, int num_dim, int num_fact, cub::CachingDeviceAllocator&  g_allocator) {
+  SETUP_TIMING();
+  int* hash_table = NULL;
+  unsigned long long* res = NULL;
+  int num_slots = num_dim;
+  float time_build, time_probe, time_memset, time_memset2;
+
+  ALLOCATE(hash_table, sizeof(int) * 2 * num_dim);  // two ints per slot
+  ALLOCATE(res, sizeof(long long));
+
+  TIME_FUNC(cudaMemset(hash_table, 0, num_slots * sizeof(int) * 2), time_memset);
+  TIME_FUNC(cudaMemset(res, 0, sizeof(long long)), time_memset2);
+
+  int tile_items = 128*4;  // BLOCK_THREADS * ITEMS_PER_THREAD of the launches below
+  TIME_FUNC((build_kernel<128, 4><<<(num_dim + tile_items - 1)/tile_items, 128>>>(d_dim_key, d_dim_val, num_dim, hash_table, num_slots)), time_build);
+  TIME_FUNC((probe_kernel<128, 4><<<(num_fact + tile_items - 1)/tile_items, 128>>>(d_fact_fkey, d_fact_val, num_fact, hash_table, num_slots, res)), time_probe);
+
+#if DEBUG
+  cout << "{" << "\"time_memset\":" << time_memset
+      << ",\"time_build\":" << time_build
+      << ",\"time_probe\":" << time_probe << "}" << endl;
+#endif
+
+  CLEANUP(hash_table);
+  CLEANUP(res);
+  cudaEventDestroy(start);  // the events created by SETUP_TIMING would otherwise leak on every call
+  cudaEventDestroy(stop);
+  TimeKeeper t = {time_build, time_probe, time_memset, time_build + time_probe + time_memset};  // time_memset2 is not reported
+  return t;
+}
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+bool                    g_verbose = false;  // Whether to display input/output to console
+cub::CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory
+
+// Same replacement text as CLEANUP in utils/gpu_utils.h: frees vec through g_allocator when vec is non-null.
+#define CLEANUP(vec) if(vec)CubDebugExit(g_allocator.DeviceFree(vec))
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+int main(int argc, char** argv)
+{
+  int num_fact           = 256 << 20;  // default fact-table size (2^28 tuples)
+  int num_dim            = 16 << 20;   // default dimension-table size (2^24 tuples)
+  int num_trials         = 3;
+
+  // Initialize command line
+  CommandLineArgs args(argc, argv);
+  args.GetCmdLineArgument("n", num_fact);
+  args.GetCmdLineArgument("d", num_dim);
+  args.GetCmdLineArgument("t", num_trials);
+
+  // Print usage
+  if (args.CheckCmdLineFlag("help")) {
+    printf("%s "
+        "[--n=<num fact>] "
+        "[--d=<num dim>] "
+        "[--t=<num trials>] "
+        "[--device=<device-id>] "
+        "[--v] "
+        "\n", argv[0]);
+    exit(0);
+  }
+
+  // create_relation_fk divides by num_dim, and empty tables leave nothing to join.
+  if (num_fact <= 0 || num_dim <= 0) {
+    fprintf(stderr, "num_fact and num_dim must be positive\n");
+    exit(1);
+  }
+
+  // Initialize device
+  CubDebugExit(args.DeviceInit());
+
+  // Allocate problem device arrays
+  int *d_dim_key = NULL, *d_dim_val = NULL;
+  int *d_fact_fkey = NULL, *d_fact_val = NULL;
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&d_dim_key, sizeof(int) * num_dim));
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&d_dim_val, sizeof(int) * num_dim));
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&d_fact_fkey, sizeof(int) * num_fact));
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&d_fact_val, sizeof(int) * num_fact));
+
+  int *h_dim_key = NULL, *h_dim_val = NULL;
+  int *h_fact_fkey = NULL, *h_fact_val = NULL;
+  if (create_relation_pk(h_dim_key, h_dim_val, num_dim) != 0 ||
+      create_relation_fk(h_fact_fkey, h_fact_val, num_fact, num_dim) != 0) {
+    exit(1);  // the generators return non-zero when host allocation fails
+  }
+
+  CubDebugExit(cudaMemcpy(d_dim_key, h_dim_key, sizeof(int) * num_dim, cudaMemcpyHostToDevice));
+  CubDebugExit(cudaMemcpy(d_dim_val, h_dim_val, sizeof(int) * num_dim, cudaMemcpyHostToDevice));
+  CubDebugExit(cudaMemcpy(d_fact_fkey, h_fact_fkey, sizeof(int) * num_fact, cudaMemcpyHostToDevice));
+  CubDebugExit(cudaMemcpy(d_fact_val, h_fact_val, sizeof(int) * num_fact, cudaMemcpyHostToDevice));
+  // The host copies came from _mm_malloc and are not needed once the data is on the device.
+  _mm_free(h_dim_key);
+  _mm_free(h_dim_val);
+  _mm_free(h_fact_fkey);
+  _mm_free(h_fact_val);
+
+  for (int j = 0; j < num_trials; j++) {
+    TimeKeeper t = hashJoin(d_dim_key, d_dim_val, d_fact_fkey, d_fact_val, num_dim, num_fact, g_allocator);
+    cout<< "{"
+        << "\"num_dim\":" << num_dim
+        << ",\"num_fact\":" << num_fact
+        << ",\"radix\":" << 0
+        << ",\"time_partition_build\":" << 0
+        << ",\"time_partition_probe\":" << 0
+        << ",\"time_partition_total\":" << 0
+        << ",\"time_build\":" << t.time_build
+        << ",\"time_probe\":" << t.time_probe
+        << ",\"time_extra\":" << t.time_extra
+        << ",\"time_join_total\":" << t.time_total
+        << "}" << endl;
+  }
+
+  CLEANUP(d_dim_key);
+  CLEANUP(d_dim_val);
+  CLEANUP(d_fact_fkey);
+  CLEANUP(d_fact_val);
+
+  return 0;
+}
+
diff --git a/crystal-opt/src/ops/project.cu b/crystal-opt/src/ops/project.cu
new file mode 100644
index 0000000..9e44bcd
--- /dev/null
+++ b/crystal-opt/src/ops/project.cu
@@ -0,0 +1,198 @@
+// MIT License
+
+// Copyright (c) 2023 Jiashen Cao
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <iostream>
+#include <stdio.h>
+#include <curand.h>
+#include <cmath>
+
+#include <cuda.h>
+#include <cub/util_allocator.cuh>
+#include "cub/test/test_util.h"
+
+#include "crystal.cuh"
+
+#include "utils/gpu_utils.h"
+
+using namespace std;
+
+
+//---------------------------------------------------------------------
+// Implements Projection Operator
+// There are two variants: dot-product and sigmoid
+//---------------------------------------------------------------------
+
+bool                    g_verbose = false;  // Whether to display input/output to console
+cub::CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory
+
+// Computes out[i] = 2*in1[i] + 3*in2[i]; each thread block handles one tile of TILE_SIZE elements.
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void project(float* in1, float* in2, float* out, int num_items)
+{
+  float lhs[ITEMS_PER_THREAD];
+  float rhs[ITEMS_PER_THREAD];
+  float res[ITEMS_PER_THREAD];
+
+  // Only the final tile can be short; every other tile holds exactly TILE_SIZE items.
+  int tile_offset = blockIdx.x * TILE_SIZE;
+  int last_tile = (num_items + TILE_SIZE - 1) / TILE_SIZE - 1;
+  int num_tile_items = (blockIdx.x == last_tile) ? (num_items - tile_offset) : TILE_SIZE;
+
+  BlockLoad<float, BLOCK_THREADS, ITEMS_PER_THREAD>(in1 + tile_offset, lhs, num_tile_items);
+  BlockLoad<float, BLOCK_THREADS, ITEMS_PER_THREAD>(in2 + tile_offset, rhs, num_tile_items);
+
+  // Slots past the end of a short tile stay unset; BlockStore receives the same bound.
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+    const bool in_range = threadIdx.x + (ITEM * BLOCK_THREADS) < num_tile_items;
+    if (in_range) {
+      res[ITEM] = 2*lhs[ITEM] + 3*rhs[ITEM];
+    }
+  }
+
+  BlockStore<float, BLOCK_THREADS, ITEMS_PER_THREAD>(out + tile_offset, res, num_tile_items);
+}
+
+// Computes out[i] = 1 / (1 + expf(-2*in1[i] - 3*in2[i])); each thread block handles one tile of TILE_SIZE elements.
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void projectSigmoid(float* in1, float* in2, float* out, int num_items)
+{
+  float lhs[ITEMS_PER_THREAD];
+  float rhs[ITEMS_PER_THREAD];
+  float res[ITEMS_PER_THREAD];
+
+  // Only the final tile can be short; every other tile holds exactly TILE_SIZE items.
+  int tile_offset = blockIdx.x * TILE_SIZE;
+  int last_tile = (num_items + TILE_SIZE - 1) / TILE_SIZE - 1;
+  int num_tile_items = (blockIdx.x == last_tile) ? (num_items - tile_offset) : TILE_SIZE;
+
+  BlockLoad<float, BLOCK_THREADS, ITEMS_PER_THREAD>(in1 + tile_offset, lhs, num_tile_items);
+  BlockLoad<float, BLOCK_THREADS, ITEMS_PER_THREAD>(in2 + tile_offset, rhs, num_tile_items);
+
+  // Slots past the end of a short tile stay unset; BlockStore receives the same bound.
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+    const bool in_range = threadIdx.x + (ITEM * BLOCK_THREADS) < num_tile_items;
+    if (in_range) {
+      res[ITEM] = 1.0f / (1.0f + expf(-2*lhs[ITEM] -3*rhs[ITEM]));
+    }
+  }
+
+  BlockStore<float, BLOCK_THREADS, ITEMS_PER_THREAD>(out + tile_offset, res, num_tile_items);
+}
+
+// Launches project<128,4> over num_items elements and returns the elapsed time measured by TIME_FUNC.
+float projectGPU(float* in1, float* in2, float* out, int num_items) {
+  SETUP_TIMING();
+  float time_proj;
+  int tile_items = 128*4;  // BLOCK_THREADS * ITEMS_PER_THREAD of the launch below
+  int num_blocks = (num_items + tile_items - 1)/tile_items;
+  TIME_FUNC((project<128,4><<<num_blocks, 128>>>(in1, in2, out, num_items)), time_proj);
+  cudaEventDestroy(start);  // the events created by SETUP_TIMING would otherwise leak on every call
+  cudaEventDestroy(stop);
+  return time_proj;
+}
+// Launches projectSigmoid<128,4> over num_items elements and returns the elapsed time measured by TIME_FUNC.
+float projectSigmoidGPU(float* in1, float* in2, float* out, int num_items) {
+  SETUP_TIMING();
+  float time_proj;
+  int tile_items = 128*4;  // BLOCK_THREADS * ITEMS_PER_THREAD of the launch below
+  int num_blocks = (num_items + tile_items - 1)/tile_items;
+  TIME_FUNC((projectSigmoid<128,4><<<num_blocks, 128>>>(in1, in2, out, num_items)), time_proj);
+  cudaEventDestroy(start);  // the events created by SETUP_TIMING would otherwise leak on every call
+  cudaEventDestroy(stop);
+  return time_proj;
+}
+
+/**
+ * Main
+ */
+int main(int argc, char** argv)
+{
+  int num_items           = 1<<28;
+  int num_trials          = 1;
+
+  // Initialize command line
+  CommandLineArgs args(argc, argv);
+  args.GetCmdLineArgument("n", num_items);
+  args.GetCmdLineArgument("t", num_trials);
+
+  // Print usage
+  if (args.CheckCmdLineFlag("help")) {
+      printf("%s "
+          "[--n=<input items>] "
+          "[--t=<num trials>] "
+          "[--device=<device-id>] "
+          "[--v] "
+          "\n", argv[0]);
+      exit(0);
+  }
+
+  // Initialize device
+  CubDebugExit(args.DeviceInit());
+
+  // Allocate problem device arrays
+  float *d_in1 = NULL;
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in1, sizeof(float) * num_items));
+
+  float *d_in2 = NULL;
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in2, sizeof(float) * num_items));
+
+  float  *d_out = NULL;
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(float) * num_items));
+
+  // Fill both inputs with uniform random floats; stop if any cuRAND call reports an error.
+  curandGenerator_t generator;
+  int seed = 0;
+  if (curandCreateGenerator(&generator, CURAND_RNG_PSEUDO_DEFAULT) != CURAND_STATUS_SUCCESS ||
+      curandSetPseudoRandomGeneratorSeed(generator,seed) != CURAND_STATUS_SUCCESS ||
+      curandGenerateUniform(generator, d_in1, num_items) != CURAND_STATUS_SUCCESS ||
+      curandGenerateUniform(generator, d_in2, num_items) != CURAND_STATUS_SUCCESS) {
+    fprintf(stderr, "cuRAND input generation failed\n");
+    exit(1);
+  }
+  curandDestroyGenerator(generator);  // the generator is not used after the inputs are filled
+
+  float time_proj_gpu;
+  float time_proj_sigmoid_gpu;
+
+  for (int t = 0; t < num_trials; t++) {
+    time_proj_gpu = projectGPU(d_in1, d_in2, d_out, num_items);
+    time_proj_sigmoid_gpu = projectSigmoidGPU(d_in1, d_in2, d_out, num_items);
+
+    cout<< "{"
+        << "\"time_proj_gpu\":" << time_proj_gpu
+        << ",\"time_proj_sigmoid_gpu\":" << time_proj_sigmoid_gpu
+        << "}" << endl;
+  }
+
+  // Cleanup
+  if (d_in1) CubDebugExit(g_allocator.DeviceFree(d_in1));
+  if (d_in2) CubDebugExit(g_allocator.DeviceFree(d_in2));
+  if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
+
+  return 0;
+}
+
diff --git a/crystal-opt/src/ops/utils/generator.h b/crystal-opt/src/ops/utils/generator.h
new file mode 100644
index 0000000..c69c141
--- /dev/null
+++ b/crystal-opt/src/ops/utils/generator.h
@@ -0,0 +1,399 @@
+// MIT License
+
+// Copyright (c) 2023 Jiashen Cao
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#pragma once
+
+#include <iostream>
+#include <cstdio>              /* perror */
+#include <cstdlib>             /* posix_memalign */
+#include <immintrin.h>
+#include <thread>
+using namespace std;
+
+#define RAND_RANGE(N) ((double)rand() / ((double)RAND_MAX + 1) * (N))
+#define RANDR_RANGE(N) ((double)rand_r(&seed) / ((double)RAND_MAX + 1) * (N))
+static int seeded = 0;
+
+/** Seed rand() with the fixed value 0 on the first call; later calls do nothing. */
+static void
+check_seed()
+{
+    if(!seeded) {
+        srand(0);
+        seeded = 1;
+    }
+}
+
+/**
+ * Shuffle arr[0..num_tuples) in place with the Knuth (Fisher-Yates) shuffle.
+ *
+ * Randomness comes from rand(), so the result depends on the current seed.
+ */
+void
+knuth_shuffle(int* arr, int num_tuples)
+{
+    for (int i = num_tuples - 1; i > 0; i--) {
+        /* j must cover [0, i] inclusive; excluding i yields only single-cycle permutations. */
+        int j   = RAND_RANGE(i + 1);
+        int tmp = arr[i];
+        arr[i] = arr[j];
+        arr[j] = tmp;
+    }
+}
+
+
+/**
+ * Fill arr with the ids 1..num_tuples and then shuffle them with knuth_shuffle.
+ * arr must already point to storage for num_tuples ints.
+ */
+void
+random_unique_gen(int*& arr, int num_tuples)
+{
+  int idx = 0;
+  while (idx < num_tuples) {
+    arr[idx] = idx + 1;
+    ++idx;
+  }
+
+  /* permute the ids in place */
+  knuth_shuffle(arr, num_tuples);
+}
+
+// Fill arr with the identity sequence: arr[k] = k for every k in [0, num_tuples).
+void
+dummy_initialize(int*& arr, int num_tuples) {
+    int k = 0;
+    while (k < num_tuples) { arr[k] = k; ++k; }
+}
+
+// Creates a primary-key relation: keys is a shuffled sequence of 1..num_tuples and vals[i] = i.
+// Returns 0 on success (the caller releases keys/vals with _mm_free) or -1 if allocation fails.
+int create_relation_pk(int*& keys, int*& vals, int num_tuples)
+{
+  check_seed();
+  keys = (int*)_mm_malloc(num_tuples * sizeof(int), 256);
+  vals = (int*)_mm_malloc(num_tuples * sizeof(int), 256);
+  if (!keys || !vals) {
+    perror("out of memory");
+    _mm_free(keys);  // release whichever buffer did get allocated
+    _mm_free(vals);
+    keys = vals = NULL;
+    return -1;
+  }
+  random_unique_gen(keys, num_tuples);
+  dummy_initialize(vals, num_tuples);
+  return 0;
+}
+
+// Creates a foreign-key relation: keys holds values in 1..maxid, vals[i] = i. Returns 0, or -1 on bad maxid / failed allocation.
+int create_relation_fk(int*& keys, int*& vals, int num_tuples, const int maxid) {
+  if (maxid <= 0) {  // the chunking below divides by maxid
+    fprintf(stderr, "create_relation_fk: maxid must be positive\n");
+    return -1;
+  }
+  check_seed();
+  keys = (int*)_mm_malloc(num_tuples * sizeof(int), 256);
+  vals = (int*)_mm_malloc(num_tuples * sizeof(int), 256);
+  if (!keys || !vals) {
+    perror("out of memory");
+    _mm_free(keys);  // release whichever buffer did get allocated
+    _mm_free(vals);
+    keys = vals = NULL;
+    return -1;
+  }
+  // Fill keys chunk by chunk: each full chunk is a shuffle of 1..maxid, a trailing partial chunk a shuffle of 1..remainder.
+  const int iters = num_tuples / maxid;
+  for (int i = 0; i < iters; i++) {
+    int* tuples = keys + maxid * i;
+    random_unique_gen(tuples, maxid);
+  }
+  const int remainder = num_tuples % maxid;
+  if (remainder > 0) {
+    int* tuples = keys + maxid * iters;
+    random_unique_gen(tuples, remainder);
+  }
+  dummy_initialize(vals, num_tuples);
+  return 0;
+}
+
+/*
+typedef struct rand_state_64 {
+  uint64_t num[313];
+  size_t index;
+} rand64_t;
+
+rand64_t *rand64_init(uint64_t seed)
+{
+  rand64_t *state = malloc(sizeof(rand64_t));
+  uint64_t *n = state->num;
+  size_t i;
+  n[0] = seed;
+  for (i = 0 ; i != 311 ; ++i)
+    n[i + 1] = 6364136223846793005ull *
+               (n[i] ^ (n[i] >> 62)) + i + 1;
+  state->index = 312;
+  return state;
+}
+
+uint64_t rand64_next(rand64_t *state)
+{
+  uint64_t x, *n = state->num;
+  if (state->index == 312) {
+    size_t i = 0;
+    do {
+      x = n[i] & 0xffffffff80000000ull;
+      x |= n[i + 1] & 0x7fffffffull;
+      n[i] = n[i + 156] ^ (x >> 1);
+      n[i] ^= 0xb5026f5aa96619e9ull & -(x & 1);
+    } while (++i != 156);
+    n[312] = n[0];
+    do {
+      x = n[i] & 0xffffffff80000000ull;
+      x |= n[i + 1] & 0x7fffffffull;
+      n[i] = n[i - 156] ^ (x >> 1);
+      n[i] ^= 0xb5026f5aa96619e9ull & -(x & 1);
+    } while (++i != 312);
+    state->index = 0;
+  }
+  x = n[state->index++];
+  x ^= (x >> 29) & 0x5555555555555555ull;
+  x ^= (x << 17) & 0x71d67fffeda60000ull;
+  x ^= (x << 37) & 0xfff7eee000000000ull;
+  x ^= (x >> 43);
+  return x;
+}
+
+typedef struct rand_state_32 {
+  uint32_t num[625];
+  size_t index;
+} rand32_t;
+
+rand32_t *rand32_init(uint32_t seed)
+{
+  rand32_t *state = malloc(sizeof(rand32_t));
+  uint32_t *n = state->num;
+  size_t i;
+  n[0] = seed;
+  for (i = 0 ; i != 623 ; ++i)
+    n[i + 1] = 0x6c078965 * (n[i] ^ (n[i] >> 30));
+  state->index = 624;
+  return state;
+}
+
+uint32_t rand32_next(rand32_t *state)
+{
+  uint32_t y, *n = state->num;
+  if (state->index == 624) {
+    size_t i = 0;
+    do {
+      y = n[i] & 0x80000000;
+      y += n[i + 1] & 0x7fffffff;
+      n[i] = n[i + 397] ^ (y >> 1);
+      n[i] ^= 0x9908b0df & -(y & 1);
+    } while (++i != 227);
+    n[624] = n[0];
+    do {
+      y = n[i] & 0x80000000;
+      y += n[i + 1] & 0x7fffffff;
+      n[i] = n[i - 227] ^ (y >> 1);
+      n[i] ^= 0x9908b0df & -(y & 1);
+    } while (++i != 624);
+    state->index = 0;
+  }
+  y = n[state->index++];
+  y ^= (y >> 11);
+  y ^= (y << 7) & 0x9d2c5680;
+  y ^= (y << 15) & 0xefc60000;
+  y ^= (y >> 18);
+  return y;
+}
+
+static int hardware_threads(void)
+{
+  char name[64];
+  struct stat st;
+  int threads = -1;
+  do {
+    sprintf(name, "/sys/devices/system/cpu/cpu%d", ++threads);
+  } while (stat(name, &st) == 0);
+  return threads;
+}
+
+static void *mamalloc(size_t size)
+{
+  void *p = NULL;
+  return posix_memalign(&p, 64, size) ? NULL : p;
+}
+
+typedef struct {
+  pthread_t id;
+  int seed;
+  int thread;
+  int threads;
+  uint32_t hash_factor;
+  uint32_t invalid_key;
+  uint32_t *inner;
+  uint32_t *outer;
+  volatile uint32_t *table;
+  size_t inner_size;
+  size_t outer_size;
+  size_t table_size;
+  size_t join_size;
+  double selectivity;
+  pthread_barrier_t *barrier;
+} info_t;
+
+static void *run(void *arg)
+{
+  info_t *d = (info_t*) arg;
+  assert(pthread_equal(pthread_self(), d->id));
+  int thread = d->thread;
+  int threads = d->threads;
+  uint32_t hash_factor = d->hash_factor;
+  uint32_t invalid_key = d->invalid_key;
+  uint32_t *inner = d->inner;
+  uint32_t *outer = d->outer;
+  volatile uint32_t *table = d->table;
+  size_t i, o, t, h;
+  size_t inner_size = d->inner_size;
+  size_t outer_size = d->outer_size;
+  size_t table_size = d->table_size;
+  size_t inner_beg = (inner_size / threads) *  thread;
+  size_t inner_end = (inner_size / threads) * (thread + 1);
+  size_t outer_beg = (outer_size / threads) *  thread;
+  size_t outer_end = (outer_size / threads) * (thread + 1);
+  size_t table_beg = (table_size / threads) *  thread;
+  size_t table_end = (table_size / threads) * (thread + 1);
+  if (thread + 1 == threads) {
+    inner_end = inner_size;
+    outer_end = outer_size;
+    table_end = table_size;
+  }
+  for (t = table_beg ; t != table_end ; ++t)
+    table[t] = invalid_key;
+  pthread_barrier_wait(&d->barrier[0]);
+  rand32_t *gen = rand32_init(d->seed);
+  for (i = inner_beg ; i != inner_end ; ++i) {
+    int new_key_inserted = 0;
+    uint32_t key;
+    do {
+      do {
+        key = rand32_next(gen);
+      } while (key == invalid_key);
+      h = (uint32_t) (key * hash_factor);
+      h = (h * table_size) >> 32;
+      for (;;) {
+        if (table[h] == invalid_key &&
+            __sync_bool_compare_and_swap(&table[h], invalid_key, key)) {
+            new_key_inserted = 1;
+          break;
+        }
+        if (table[h] == key) break;
+        if (++h == table_size) h = 0;
+      }
+    } while (new_key_inserted == 0);
+    inner[i] = key;
+  }
+  pthread_barrier_wait(&d->barrier[1]);
+  size_t join_size = 0;
+  uint32_t limit = ~0;
+  limit *= d->selectivity;
+  for (o = outer_beg ; o != outer_end ; ++o) {
+    uint32_t key;
+    if (rand32_next(gen) <= limit) {
+      i = rand32_next(gen);
+      i = (i * inner_size) >> 32;
+      key = inner[i];
+      join_size++;
+    } else do {
+      do {
+        key = rand32_next(gen);
+      } while (key == invalid_key);
+      h = (uint32_t) (key * hash_factor);
+      h = (h * table_size) >> 32;
+      while (table[h] != invalid_key) {
+        if (table[h] == key) break;
+        if (++h == table_size) h = 0;
+      }
+    } while (table[h] == key);
+    outer[o] = key;
+  }
+  free(gen);
+  d->join_size = join_size;
+  pthread_exit(NULL);
+}
+
+size_t inner_outer(size_t inner_size, size_t outer_size, double selectivity,
+                   uint32_t **inner_p, uint32_t **outer_p)
+{
+  srand(time(NULL));
+  int t, threads = hardware_threads();
+  // input arguments
+  assert(inner_size <= 1000 * 1000 * 1000);
+  assert(selectivity >= 0.0 && selectivity <= 1.0);
+  // tables
+  uint32_t *inner = mamalloc((inner_size + 1) * sizeof(uint32_t));
+  uint32_t *outer = mamalloc(outer_size * sizeof(uint32_t));
+  size_t table_size = inner_size / 0.7;
+  uint32_t *table = malloc(table_size * sizeof(uint32_t));
+  // constants
+  uint32_t hash_factor = (rand() << 1) | 1;
+  uint32_t invalid_key = rand() * rand();
+  // barriers
+  int b, barriers = 2;
+  pthread_barrier_t barrier[barriers];
+  for (b = 0 ; b != barriers ; ++b)
+    pthread_barrier_init(&barrier[b], NULL, threads);
+  // run threads
+  info_t info[threads];
+  for (t = 0 ; t != threads ; ++t) {
+    info[t].seed = rand();
+    info[t].thread = t;
+    info[t].threads = threads;
+    info[t].hash_factor = hash_factor;
+    info[t].invalid_key = invalid_key;
+    info[t].selectivity = selectivity;
+    info[t].inner = inner;
+    info[t].outer = outer;
+    info[t].table = table;
+    info[t].inner_size = inner_size;
+    info[t].outer_size = outer_size;
+    info[t].table_size = table_size;
+    info[t].barrier = barrier;
+    pthread_create(&info[t].id, NULL, run, (void*) &info[t]);
+  }
+  size_t join_size = 0;
+  for (t = 0 ; t != threads ; ++t) {
+    pthread_join(info[t].id, NULL);
+    join_size += info[t].join_size;
+  }
+  // cleanup
+  for (b = 0 ; b != barriers ; ++b)
+    pthread_barrier_destroy(&barrier[b]);
+  free(table);
+  // pass output
+  inner[inner_size] = invalid_key;
+  *inner_p = inner;
+  *outer_p = outer;
+  return join_size;
+}
+*/
diff --git a/crystal-opt/src/ops/utils/gpu_utils.h b/crystal-opt/src/ops/utils/gpu_utils.h
new file mode 100644
index 0000000..1af7526
--- /dev/null
+++ b/crystal-opt/src/ops/utils/gpu_utils.h
@@ -0,0 +1,57 @@
+// MIT License
+
+// Copyright (c) 2023 Jiashen Cao
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#pragma once
+// SETUP_TIMING declares the cudaEvent_t variables `start` and `stop` in the calling scope and creates both events.
+#define SETUP_TIMING() cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop);
+// TIME_FUNC records `start`, runs statement f, records `stop` (both on stream 0), waits for `stop`, and stores the elapsed time in t (milliseconds per the cudaEventElapsedTime API); it relies on the names SETUP_TIMING declares.
+#define TIME_FUNC(f,t) { \
+    cudaEventRecord(start, 0); \
+    f; \
+    cudaEventRecord(stop, 0); \
+    cudaEventSynchronize(stop); \
+    cudaEventElapsedTime(&t, start,stop); \
+}
+// CLEANUP frees vec through the in-scope g_allocator when vec is non-null; the do/while(0) wrapper makes it a single statement so a following `else` cannot bind to its internal `if`.
+#define CLEANUP(vec) do { if (vec) { CubDebugExit(g_allocator.DeviceFree(vec)); } } while (0)
+// ALLOCATE asks the in-scope g_allocator for `size` bytes and stores the pointer in vec; both arguments are parenthesized so expression arguments expand as written.
+#define ALLOCATE(vec,size) CubDebugExit(g_allocator.DeviceAllocate((void**)&(vec), (size)))
+// Allocates room for numEntries elements of T from g_allocator, copies them from host pointer src, and returns the device pointer (freeing it is left to the caller).
+template<typename T>
+T* loadToGPU(T* src, int numEntries, cub::CachingDeviceAllocator& g_allocator) {
+  T* device_copy;
+  const size_t num_bytes = sizeof(T) * numEntries;  // same byte count for the allocation and the copy
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&device_copy, num_bytes));
+  CubDebugExit(cudaMemcpy(device_copy, src, num_bytes, cudaMemcpyHostToDevice));
+  return device_copy;
+}
+#define TILE_SIZE (BLOCK_THREADS * ITEMS_PER_THREAD) // expands where used, so BLOCK_THREADS and ITEMS_PER_THREAD must be in scope there
+// CHECK_ERROR waits for the device, then prints the last CUDA error and exits with -1 if one is pending.
+#define CHECK_ERROR() { \
+  cudaDeviceSynchronize(); \
+  cudaError_t error = cudaGetLastError(); \
+  if(error != cudaSuccess) \
+  { \
+    printf("CUDA error: %s\n", cudaGetErrorString(error)); \
+    exit(-1); \
+  } \
+}
diff --git a/crystal-opt/src/ssb/all.cu b/crystal-opt/src/ssb/all.cu
new file mode 100644
index 0000000..ab61e05
--- /dev/null
+++ b/crystal-opt/src/ssb/all.cu
@@ -0,0 +1,2734 @@
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <algorithm>
+#include <curand.h>
+#include <iostream>
+#include <random>
+#include <stdio.h>
+#include <vector>
+
+#include "cub/test/test_util.h"
+#include <cub/util_allocator.cuh>
+#include <cuda.h>
+
+#include "crystal/crystal.cuh"
+
+#include "gpu_utils.h"
+#include "ssb_utils.h"
+
+using namespace std;
+
+/**
+ * Globals, constants and typedefs (the run_* functions below take their own allocator reference named g_allocator, which shadows this global inside them)
+ */
+bool g_verbose = false; // Whether to display input/output to console
+cub::CachingDeviceAllocator
+    g_allocator(true); // Caching allocator for device memory; `true` is the first constructor argument (skip_cleanup in CUB's API - confirm against the bundled CUB version)
+// Q1.1 kernel: each block filters one tile of lineorder rows on orderdate, quantity and discount, then atomically adds the tile's sum of lo_discount * lo_extendedprice to *revenue.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void q11_kernel(int *lo_orderdate, int *lo_discount,
+                           int *lo_quantity, int *lo_extendedprice,
+                           int lo_num_entries, unsigned long long *revenue)
+{
+  // Per-thread slices of a tile striped across threads: slot ITEM of thread t is tile row t + BLOCK_THREADS * ITEM
+  int items[ITEMS_PER_THREAD];
+  int selection_flags[ITEMS_PER_THREAD];
+  int items2[ITEMS_PER_THREAD];
+
+  long long sum = 0;
+  // One tile per block; only the last tile can hold fewer than TILE_SIZE rows.
+  int tile_offset = blockIdx.x * TILE_SIZE;
+  int num_tiles = (lo_num_entries + TILE_SIZE - 1) / TILE_SIZE;
+  int num_tile_items = TILE_SIZE;
+
+  if (blockIdx.x == num_tiles - 1)
+  {
+    num_tile_items = lo_num_entries - tile_offset;
+  }
+  // Order date bounds 19930000 / 19940000 (crystal predicate helpers; comparison semantics assumed from their names - confirm in crystal/pred.cuh).
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_orderdate + tile_offset,
+                                                  items, num_tile_items);
+  BlockPredGT<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, 19930000, selection_flags, num_tile_items);
+  BlockPredAndLT<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, 19940000, selection_flags, num_tile_items);
+  // Quantity bound 25; items is reused for each filter column in turn.
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_quantity + tile_offset, items, num_tile_items, selection_flags);
+  BlockPredAndLT<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, 25, selection_flags, num_tile_items);
+  // Discount bounds 1 and 3; after this step items holds lo_discount.
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_discount + tile_offset, items, num_tile_items, selection_flags);
+  BlockPredAndGTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, 1, selection_flags, num_tile_items);
+  BlockPredAndLTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, 3, selection_flags, num_tile_items);
+  // Extended price goes into items2 so the discount in items stays available.
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_extendedprice + tile_offset, items2, num_tile_items, selection_flags);
+  // Widen to 64 bits before multiplying so discount * extendedprice cannot overflow int.
+#pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+  {
+    if ((threadIdx.x + (BLOCK_THREADS * ITEM) < num_tile_items))
+      if (selection_flags[ITEM])
+        sum += static_cast<long long>(items[ITEM]) * items2[ITEM];
+  }
+
+  __syncthreads();
+  // Shared scratch handed to BlockSum (presumably one partial per warp - confirm in crystal/reduce.cuh).
+  static __shared__ long long buffer[32];
+  unsigned long long aggregate =
+      BlockSum<long long, BLOCK_THREADS, ITEMS_PER_THREAD>(sum,
+                                                           (long long *)buffer);
+  __syncthreads();
+
+  if (threadIdx.x == 0)
+  {
+    atomicAdd(revenue, aggregate);
+  }
+}
+// Runs Q1.1: zeroes a device accumulator, launches q11_kernel over all lineorder rows, and copies the total back to the host (the host copy is not returned or printed here).
+void run_q11(int *lo_orderdate, int *lo_discount, int *lo_quantity,
+             int *lo_extendedprice, int lo_num_entries,
+             cub::CachingDeviceAllocator &g_allocator)
+{
+  // Setup: one 64-bit device accumulator; the memset result is checked like the allocation
+  unsigned long long *d_sum = NULL;
+  CubDebugExit(g_allocator.DeviceAllocate((void **)&d_sum, sizeof(long long)));
+  CubDebugExit(cudaMemset(d_sum, 0, sizeof(long long)));
+
+  // Run: 128 threads x 4 items per tile, one block per tile (must match the kernel's template arguments)
+  int tile_items = 128 * 4;
+  int num_blocks = (lo_num_entries + tile_items - 1) / tile_items;
+  q11_kernel<128, 4><<<num_blocks, 128>>>(lo_orderdate, lo_discount,
+                                          lo_quantity, lo_extendedprice,
+                                          lo_num_entries, d_sum);
+
+  // Finalize results: read the sum back, then release the accumulator (CLEANUP uses the g_allocator parameter)
+  unsigned long long revenue;
+  CubDebugExit(
+      cudaMemcpy(&revenue, d_sum, sizeof(long long), cudaMemcpyDeviceToHost));
+  CLEANUP(d_sum);
+}
+// Q1.2 kernel: each block filters one tile of lineorder rows on orderdate, quantity and discount, then atomically adds the tile's sum of lo_discount * lo_extendedprice to *revenue.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void q12_kernel(int *lo_orderdate, int *lo_discount,
+                           int *lo_quantity, int *lo_extendedprice,
+                           int lo_num_entries, unsigned long long *revenue)
+{
+  // Per-thread slices of a tile striped across threads: slot ITEM of thread t is tile row t + BLOCK_THREADS * ITEM
+  int items[ITEMS_PER_THREAD];
+  int selection_flags[ITEMS_PER_THREAD];
+  int items2[ITEMS_PER_THREAD];
+
+  long long sum = 0;
+  // One tile per block; only the last tile can hold fewer than TILE_SIZE rows.
+  int tile_offset = blockIdx.x * TILE_SIZE;
+  int num_tiles = (lo_num_entries + TILE_SIZE - 1) / TILE_SIZE;
+  int num_tile_items = TILE_SIZE;
+
+  if (blockIdx.x == num_tiles - 1)
+  {
+    num_tile_items = lo_num_entries - tile_offset;
+  }
+  // Order date bounds 19940101 / 19940131 (crystal predicate helpers; comparison semantics assumed from their names - confirm in crystal/pred.cuh).
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_orderdate + tile_offset,
+                                                  items, num_tile_items);
+  BlockPredGTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, 19940101, selection_flags, num_tile_items);
+  BlockPredAndLTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, 19940131, selection_flags, num_tile_items);
+  // Quantity bounds 26 and 35; items is reused for each filter column in turn.
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_quantity + tile_offset, items, num_tile_items, selection_flags);
+  BlockPredAndGTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, 26, selection_flags, num_tile_items);
+  BlockPredAndLTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, 35, selection_flags, num_tile_items);
+  // Discount bounds 4 and 6; after this step items holds lo_discount.
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_discount + tile_offset, items, num_tile_items, selection_flags);
+  BlockPredAndGTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, 4, selection_flags, num_tile_items);
+  BlockPredAndLTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, 6, selection_flags, num_tile_items);
+  // Extended price goes into items2 so the discount in items stays available.
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_extendedprice + tile_offset, items2, num_tile_items, selection_flags);
+  // Widen to 64 bits before multiplying so discount * extendedprice cannot overflow int.
+#pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+  {
+    if (threadIdx.x + (BLOCK_THREADS * ITEM) < num_tile_items)
+      if (selection_flags[ITEM])
+        sum += static_cast<long long>(items[ITEM]) * items2[ITEM];
+  }
+
+  __syncthreads();
+  // Shared scratch handed to BlockSum (presumably one partial per warp - confirm in crystal/reduce.cuh).
+  static __shared__ long long buffer[32];
+  unsigned long long aggregate =
+      BlockSum<long long, BLOCK_THREADS, ITEMS_PER_THREAD>(sum,
+                                                           (long long *)buffer);
+  __syncthreads();
+
+  if (threadIdx.x == 0)
+  {
+    atomicAdd(revenue, aggregate);
+  }
+}
+// Runs Q1.2: zeroes a device accumulator, launches q12_kernel over all lineorder rows, and copies the total back to the host (the host copy is not returned or printed here).
+void run_q12(int *lo_orderdate, int *lo_discount, int *lo_quantity,
+             int *lo_extendedprice, int lo_num_entries,
+             cub::CachingDeviceAllocator &g_allocator)
+{
+  // Setup: one 64-bit device accumulator; the memset result is checked like the allocation
+  unsigned long long *d_sum = NULL;
+  CubDebugExit(g_allocator.DeviceAllocate((void **)&d_sum, sizeof(long long)));
+  CubDebugExit(cudaMemset(d_sum, 0, sizeof(long long)));
+
+  // Run: 128 threads x 4 items per tile, one block per tile (must match the kernel's template arguments)
+  int tile_items = 128 * 4;
+  q12_kernel<128, 4><<<(lo_num_entries + tile_items - 1) / tile_items, 128>>>(
+      lo_orderdate, lo_discount, lo_quantity, lo_extendedprice, lo_num_entries,
+      d_sum);
+
+  // Finalize results: read the sum back, then release the accumulator (CLEANUP uses the g_allocator parameter)
+  unsigned long long revenue;
+  CubDebugExit(
+      cudaMemcpy(&revenue, d_sum, sizeof(long long), cudaMemcpyDeviceToHost));
+  CLEANUP(d_sum);
+}
+// Q1.3 kernel: each block filters one tile of lineorder rows on orderdate, quantity and discount, then atomically adds the tile's sum of lo_discount * lo_extendedprice to *revenue.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void q13_kernel(int *lo_orderdate, int *lo_discount,
+                           int *lo_quantity, int *lo_extendedprice,
+                           int lo_num_entries, unsigned long long *revenue)
+{
+  // Per-thread slices of a tile striped across threads: slot ITEM of thread t is tile row t + BLOCK_THREADS * ITEM
+  int items[ITEMS_PER_THREAD];
+  int selection_flags[ITEMS_PER_THREAD];
+  int items2[ITEMS_PER_THREAD];
+
+  long long sum = 0;
+  // One tile per block; only the last tile can hold fewer than TILE_SIZE rows.
+  int tile_offset = blockIdx.x * TILE_SIZE;
+  int num_tiles = (lo_num_entries + TILE_SIZE - 1) / TILE_SIZE;
+  int num_tile_items = TILE_SIZE;
+
+  if (blockIdx.x == num_tiles - 1)
+  {
+    num_tile_items = lo_num_entries - tile_offset;
+  }
+  // Order date bounds 19940204 / 19940210 (crystal predicate helpers; comparison semantics assumed from their names - confirm in crystal/pred.cuh).
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_orderdate + tile_offset,
+                                                  items, num_tile_items);
+  BlockPredGTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, 19940204, selection_flags, num_tile_items);
+  BlockPredAndLTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, 19940210, selection_flags, num_tile_items);
+  // Quantity bounds 26 and 35; items is reused for each filter column in turn.
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_quantity + tile_offset, items, num_tile_items, selection_flags);
+  BlockPredAndGTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, 26, selection_flags, num_tile_items);
+  BlockPredAndLTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, 35, selection_flags, num_tile_items);
+  // Discount bounds 5 and 7; after this step items holds lo_discount.
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_discount + tile_offset, items, num_tile_items, selection_flags);
+  BlockPredAndGTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, 5, selection_flags, num_tile_items);
+  BlockPredAndLTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, 7, selection_flags, num_tile_items);
+  // Extended price goes into items2 so the discount in items stays available.
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_extendedprice + tile_offset, items2, num_tile_items, selection_flags);
+  // Widen to 64 bits before multiplying so discount * extendedprice cannot overflow int.
+#pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+  {
+    if ((threadIdx.x + (BLOCK_THREADS * ITEM) < num_tile_items))
+      if (selection_flags[ITEM])
+        sum += static_cast<long long>(items[ITEM]) * items2[ITEM];
+  }
+
+  __syncthreads();
+  // Shared scratch handed to BlockSum (presumably one partial per warp - confirm in crystal/reduce.cuh).
+  static __shared__ long long buffer[32];
+  unsigned long long aggregate =
+      BlockSum<long long, BLOCK_THREADS, ITEMS_PER_THREAD>(sum,
+                                                           (long long *)buffer);
+  __syncthreads();
+
+  if (threadIdx.x == 0)
+  {
+    atomicAdd(revenue, aggregate);
+  }
+}
+// Runs Q1.3: zeroes a device accumulator, launches q13_kernel over all lineorder rows, and copies the total back to the host (the host copy is not returned or printed here).
+void run_q13(int *lo_orderdate, int *lo_discount, int *lo_quantity,
+             int *lo_extendedprice, int lo_num_entries,
+             cub::CachingDeviceAllocator &g_allocator)
+{
+  // Setup: one 64-bit device accumulator; the memset result is checked like the allocation
+  unsigned long long *d_sum = NULL;
+  CubDebugExit(g_allocator.DeviceAllocate((void **)&d_sum, sizeof(long long)));
+  CubDebugExit(cudaMemset(d_sum, 0, sizeof(long long)));
+
+  // Run: 128 threads x 4 items per tile, one block per tile (must match the kernel's template arguments)
+  int tile_items = 128 * 4;
+  q13_kernel<128, 4><<<(lo_num_entries + tile_items - 1) / tile_items, 128>>>(
+      lo_orderdate, lo_discount, lo_quantity, lo_extendedprice, lo_num_entries,
+      d_sum);
+
+  // Finalize results: read the sum back, then release the accumulator (CLEANUP uses the g_allocator parameter)
+  unsigned long long revenue;
+  CubDebugExit(
+      cudaMemcpy(&revenue, d_sum, sizeof(long long), cudaMemcpyDeviceToHost));
+  CLEANUP(d_sum);
+}
+// Q2.1 probe kernel: each block takes one tile of lineorder rows, probes the supplier, part and date hash tables in that order, and adds lo_revenue of surviving rows into res grouped by (year, brand).
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void q21_kernel4(int *lo_orderdate, int *lo_partkey, int *lo_suppkey,
+                            int *lo_revenue, int lo_len, int *ht_s, int s_len,
+                            int *ht_p, int p_len, int *ht_d, int d_len,
+                            int *res)
+{
+  // Per-thread slices of a tile striped across threads (slot ITEM of thread t is tile row t + BLOCK_THREADS * ITEM)
+  int items[ITEMS_PER_THREAD];
+  int selection_flags[ITEMS_PER_THREAD];
+  int brand[ITEMS_PER_THREAD];
+  int year[ITEMS_PER_THREAD];
+  int revenue[ITEMS_PER_THREAD];
+
+  int tile_offset = blockIdx.x * TILE_SIZE;
+  int num_tiles = (lo_len + TILE_SIZE - 1) / TILE_SIZE;
+  int num_tile_items = TILE_SIZE;
+  // Only the last tile can hold fewer than TILE_SIZE rows.
+  if (blockIdx.x == num_tiles - 1)
+  {
+    num_tile_items = lo_len - tile_offset;
+  }
+
+  InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags);
+  // Supplier probe updates the flags only; IsTerm then lets the block return early (presumably when no row in the block is still flagged - confirm in crystal/term.cuh).
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_suppkey + tile_offset,
+                                                  items, num_tile_items);
+  BlockProbeAndPHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, selection_flags, ht_s, s_len, num_tile_items);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags))
+  {
+    return;
+  }
+  // Part probe: the helper also receives brand[] as its output array.
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_partkey + tile_offset, items, num_tile_items, selection_flags);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, brand, selection_flags, ht_p, p_len, num_tile_items);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags))
+  {
+    return;
+  }
+  // Date probe: receives year[] as output; 19920101 is the same value run_q21 passes as val_min when building ht_d.
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_orderdate + tile_offset, items, num_tile_items, selection_flags);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, year, selection_flags, ht_d, d_len, 19920101, num_tile_items);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags))
+  {
+    return;
+  }
+  // NOTE(review): the IsTerm check after this load has no counterpart in q22_kernel4/q23_kernel4 - confirm whether it is needed.
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_revenue + tile_offset, revenue, num_tile_items, selection_flags);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags))
+  {
+    return;
+  }
+  // res holds 4 ints per group: [0] year, [1] brand (plain stores), [2..3] viewed as one 64-bit sum updated atomically.
+#pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+  {
+    if ((threadIdx.x + (BLOCK_THREADS * ITEM)) < num_tile_items)
+    {
+      if (selection_flags[ITEM])
+      {
+        int hash = (brand[ITEM] * 7 + (year[ITEM] - 1992)) %
+                   ((1998 - 1992 + 1) * (5 * 5 * 40));
+        res[hash * 4] = year[ITEM];
+        res[hash * 4 + 1] = brand[ITEM];
+        atomicAdd(reinterpret_cast<unsigned long long *>(&res[hash * 4 + 2]),
+                  (long long)(revenue[ITEM]));
+      }
+    }
+  }
+}
+// Q2.1 build kernel (run_q21 uses it for the supplier table): tests filter_col against 1 with BlockPredEQ and hands dim_key plus the resulting flags to BlockBuildSelectivePHT_1.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void q21_kernel1(int *filter_col, int *dim_key, int num_tuples,
+                            int *hash_table, int num_slots)
+{
+  // One tile of TILE_SIZE rows per block; only the final tile may be short.
+  const int first_row = blockIdx.x * TILE_SIZE;
+  const int tile_count = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  const bool is_last_tile = (blockIdx.x == tile_count - 1);
+  int rows_in_tile = is_last_tile ? num_tuples - first_row : TILE_SIZE;
+
+  int vals[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+
+  // Evaluate the filter first; keep[] receives the per-row result.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + first_row, vals,
+                                                  rows_in_tile);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(vals, 1, keep,
+                                                    rows_in_tile);
+
+  // vals is reused for the keys; the build helper gets the flags as well
+  // (presumably to skip unflagged rows - confirm in crystal/join.cuh).
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + first_row, vals,
+                                                  rows_in_tile);
+  BlockBuildSelectivePHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      vals, keep, hash_table, num_slots, rows_in_tile);
+}
+// Q2.1 build kernel (run_q21 uses it for the part table): tests filter_col against 1 with BlockPredEQ and hands dim_key, dim_val and the flags to BlockBuildSelectivePHT_2.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void q21_kernel2(int *filter_col, int *dim_key, int *dim_val,
+                            int num_tuples, int *hash_table, int num_slots)
+{
+  // One tile of TILE_SIZE rows per block; only the final tile may be short.
+  const int first_row = blockIdx.x * TILE_SIZE;
+  const int tile_count = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  const bool is_last_tile = (blockIdx.x == tile_count - 1);
+  int rows_in_tile = is_last_tile ? num_tuples - first_row : TILE_SIZE;
+
+  int keys[ITEMS_PER_THREAD];
+  int vals[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+
+  // keys temporarily holds the filter column while the flags are computed.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + first_row, keys,
+                                                  rows_in_tile);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, 1, keep,
+                                                    rows_in_tile);
+
+  // Now load the real key/value pair and pass both, with the flags, to the
+  // build helper (presumably it skips unflagged rows - confirm in crystal/join.cuh).
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + first_row, keys,
+                                                  rows_in_tile);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + first_row, vals,
+                                                  rows_in_tile);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, vals, keep, hash_table, num_slots, rows_in_tile);
+}
+// Q2.1 build kernel (run_q21 uses it for the date table): no filter column; hands dim_key, dim_val and freshly initialised flags to BlockBuildSelectivePHT_2 together with val_min.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void q21_kernel3(int *dim_key, int *dim_val, int num_tuples,
+                            int *hash_table, int num_slots, int val_min)
+{
+  // One tile of TILE_SIZE rows per block; only the final tile may be short.
+  const int first_row = blockIdx.x * TILE_SIZE;
+  const int tile_count = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  const bool is_last_tile = (blockIdx.x == tile_count - 1);
+  int rows_in_tile = is_last_tile ? num_tuples - first_row : TILE_SIZE;
+
+  int keys[ITEMS_PER_THREAD];
+  int vals[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+
+  // InitFlags prepares keep[] (presumably marking every row selected - confirm in crystal).
+  InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(keep);
+
+  // val_min goes to the build helper as an extra argument
+  // (presumably the smallest key, used to offset slots - confirm in crystal/join.cuh).
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + first_row, keys,
+                                                  rows_in_tile);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + first_row, vals,
+                                                  rows_in_tile);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, vals, keep, hash_table, num_slots, val_min,
+      rows_in_tile);
+}
+// Runs Q2.1: builds the supplier, part and date hash tables, probes them with the lineorder columns, and copies the grouped result array back to a temporary host buffer that is then discarded.
+void run_q21(int *lo_orderdate, int *lo_partkey, int *lo_suppkey,
+             int *lo_revenue, int lo_len, int *p_partkey, int *p_brand1,
+             int *p_category, int p_len, int *d_datekey, int *d_year, int d_len,
+             int *s_suppkey, int *s_region, int s_len,
+             cub::CachingDeviceAllocator &g_allocator)
+{
+  // Date table spans the datekey values from d_val_min up to 19981230.
+  const int d_val_min = 19920101;
+  const int d_val_len = 19981230 - d_val_min + 1;
+  // Result array: 4 ints for each of the (1998 - 1992 + 1) * (5 * 5 * 40) groups.
+  const int res_array_size = (1998 - 1992 + 1) * (5 * 5 * 40) * 4;
+  const size_t ht_d_bytes = 2 * d_val_len * sizeof(int);
+  const size_t ht_p_bytes = 2 * p_len * sizeof(int);
+  const size_t ht_s_bytes = 2 * s_len * sizeof(int);
+  const size_t res_bytes = res_array_size * sizeof(int);
+
+  int *ht_d, *ht_p, *ht_s, *res;
+  CubDebugExit(g_allocator.DeviceAllocate((void **)&ht_d, ht_d_bytes));
+  CubDebugExit(g_allocator.DeviceAllocate((void **)&ht_p, ht_p_bytes));
+  CubDebugExit(g_allocator.DeviceAllocate((void **)&ht_s, ht_s_bytes));
+  CubDebugExit(cudaMemset(ht_d, 0, ht_d_bytes));
+  CubDebugExit(cudaMemset(ht_p, 0, ht_p_bytes));
+  CubDebugExit(cudaMemset(ht_s, 0, ht_s_bytes));
+  CubDebugExit(g_allocator.DeviceAllocate((void **)&res, res_bytes));
+  CubDebugExit(cudaMemset(res, 0, res_bytes));
+
+  // Three build kernels followed by the probe kernel, all launched with
+  // 128 threads per block and one block per 128 * 4 row tile.
+  const int tile_items = 128 * 4;
+  q21_kernel1<128, 4><<<(s_len + tile_items - 1) / tile_items, 128>>>(
+      s_region, s_suppkey, s_len, ht_s, s_len);
+  q21_kernel2<128, 4><<<(p_len + tile_items - 1) / tile_items, 128>>>(
+      p_category, p_partkey, p_brand1, p_len, ht_p, p_len);
+  q21_kernel3<128, 4><<<(d_len + tile_items - 1) / tile_items, 128>>>(
+      d_datekey, d_year, d_len, ht_d, d_val_len, d_val_min);
+  q21_kernel4<128, 4><<<(lo_len + tile_items - 1) / tile_items, 128>>>(
+      lo_orderdate, lo_partkey, lo_suppkey, lo_revenue, lo_len, ht_s, s_len,
+      ht_p, p_len, ht_d, d_val_len, res);
+
+  // Copy the result to the host, then release every buffer.
+  int *h_res = new int[res_array_size];
+  CubDebugExit(cudaMemcpy(h_res, res, res_bytes, cudaMemcpyDeviceToHost));
+  delete[] h_res;
+  CLEANUP(res);
+  CLEANUP(ht_d);
+  CLEANUP(ht_p);
+  CLEANUP(ht_s);
+}
+// Q2.2 probe kernel: each block takes one tile of lineorder rows, probes the supplier, part and date hash tables in that order, and adds lo_revenue of surviving rows into res grouped by (year, brand).
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void q22_kernel4(int *lo_orderdate, int *lo_partkey, int *lo_suppkey,
+                            int *lo_revenue, int lo_len, int *ht_s, int s_len,
+                            int *ht_p, int p_len, int *ht_d, int d_len,
+                            int *res)
+{
+  // Per-thread slices of a tile striped across threads (slot ITEM of thread t is tile row t + BLOCK_THREADS * ITEM)
+  int items[ITEMS_PER_THREAD];
+  int selection_flags[ITEMS_PER_THREAD];
+  int brand[ITEMS_PER_THREAD];
+  int year[ITEMS_PER_THREAD];
+  int revenue[ITEMS_PER_THREAD];
+
+  int tile_offset = blockIdx.x * TILE_SIZE;
+  int num_tiles = (lo_len + TILE_SIZE - 1) / TILE_SIZE;
+  int num_tile_items = TILE_SIZE;
+  // Only the last tile can hold fewer than TILE_SIZE rows.
+  if (blockIdx.x == num_tiles - 1)
+  {
+    num_tile_items = lo_len - tile_offset;
+  }
+
+  InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags);
+  // Supplier probe updates the flags only; IsTerm then lets the block return early (presumably when no row in the block is still flagged - confirm in crystal/term.cuh).
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_suppkey + tile_offset,
+                                                  items, num_tile_items);
+  BlockProbeAndPHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, selection_flags, ht_s, s_len, num_tile_items);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags))
+  {
+    return;
+  }
+  // Part probe: the helper also receives brand[] as its output array.
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_partkey + tile_offset, items, num_tile_items, selection_flags);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, brand, selection_flags, ht_p, p_len, num_tile_items);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags))
+  {
+    return;
+  }
+  // Date probe: receives year[] as output; 19920101 is the same value run_q22 passes as val_min when building ht_d.
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_orderdate + tile_offset, items, num_tile_items, selection_flags);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, year, selection_flags, ht_d, d_len, 19920101, num_tile_items);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags))
+  {
+    return;
+  }
+  // Revenue is loaded last, after all three probes have updated the flags.
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_revenue + tile_offset, revenue, num_tile_items, selection_flags);
+  // res holds 4 ints per group: [0] year, [1] brand (plain stores), [2..3] viewed as one 64-bit sum updated atomically.
+#pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+  {
+    if ((threadIdx.x + (BLOCK_THREADS * ITEM)) < num_tile_items)
+    {
+      if (selection_flags[ITEM])
+      {
+        int hash = (brand[ITEM] * 7 + (year[ITEM] - 1992)) %
+                   ((1998 - 1992 + 1) * (5 * 5 * 40));
+        res[hash * 4] = year[ITEM];
+        res[hash * 4 + 1] = brand[ITEM];
+        atomicAdd(reinterpret_cast<unsigned long long *>(&res[hash * 4 + 2]),
+                  (long long)(revenue[ITEM]));
+      }
+    }
+  }
+}
+// Q2.2 build kernel (run_q22 uses it for the supplier table): tests filter_col against 2 with BlockPredEQ and hands dim_key plus the resulting flags to BlockBuildSelectivePHT_1.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void q22_kernel1(int *filter_col, int *dim_key, int num_tuples,
+                            int *hash_table, int num_slots)
+{
+  // One tile of TILE_SIZE rows per block; only the final tile may be short.
+  const int first_row = blockIdx.x * TILE_SIZE;
+  const int tile_count = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  const bool is_last_tile = (blockIdx.x == tile_count - 1);
+  int rows_in_tile = is_last_tile ? num_tuples - first_row : TILE_SIZE;
+
+  int vals[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+
+  // Evaluate the filter first; keep[] receives the per-row result.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + first_row, vals,
+                                                  rows_in_tile);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(vals, 2, keep,
+                                                    rows_in_tile);
+
+  // vals is reused for the keys; the build helper gets the flags as well
+  // (presumably to skip unflagged rows - confirm in crystal/join.cuh).
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + first_row, vals,
+                                                  rows_in_tile);
+  BlockBuildSelectivePHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      vals, keep, hash_table, num_slots, rows_in_tile);
+}
+// Q2.2 build kernel (run_q22 uses it for the part table): filters dim_val with the bounds 260 and 267, then hands dim_key, dim_val and the flags to BlockBuildSelectivePHT_2.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void q22_kernel2(int *dim_key, int *dim_val, int num_tuples,
+                            int *hash_table, int num_slots)
+{
+  // One tile of TILE_SIZE rows per block; only the final tile may be short.
+  const int first_row = blockIdx.x * TILE_SIZE;
+  const int tile_count = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  const bool is_last_tile = (blockIdx.x == tile_count - 1);
+  int rows_in_tile = is_last_tile ? num_tuples - first_row : TILE_SIZE;
+
+  int vals[ITEMS_PER_THREAD];
+  int keys[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+
+  // The value column doubles as the filter column, so it is loaded first.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + first_row, vals,
+                                                  rows_in_tile);
+  BlockPredGTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      vals, 260, keep, rows_in_tile);
+  BlockPredAndLTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      vals, 267, keep, rows_in_tile);
+
+  // Keys are loaded afterwards; the build helper takes keys first, then values
+  // (presumably skipping unflagged rows - confirm in crystal/join.cuh).
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + first_row, keys,
+                                                  rows_in_tile);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, vals, keep, hash_table, num_slots, rows_in_tile);
+}
+// Q2.2 build kernel (run_q22 uses it for the date table): no filter column; hands dim_key, dim_val and freshly initialised flags to BlockBuildSelectivePHT_2 together with val_min.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void q22_kernel3(int *dim_key, int *dim_val, int num_tuples,
+                            int *hash_table, int num_slots, int val_min)
+{
+  // One tile of TILE_SIZE rows per block; only the final tile may be short.
+  const int first_row = blockIdx.x * TILE_SIZE;
+  const int tile_count = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  const bool is_last_tile = (blockIdx.x == tile_count - 1);
+  int rows_in_tile = is_last_tile ? num_tuples - first_row : TILE_SIZE;
+
+  int keys[ITEMS_PER_THREAD];
+  int vals[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+
+  // InitFlags prepares keep[] (presumably marking every row selected - confirm in crystal).
+  InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(keep);
+
+  // val_min goes to the build helper as an extra argument
+  // (presumably the smallest key, used to offset slots - confirm in crystal/join.cuh).
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + first_row, keys,
+                                                  rows_in_tile);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + first_row, vals,
+                                                  rows_in_tile);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, vals, keep, hash_table, num_slots, val_min,
+      rows_in_tile);
+}
+// Runs Q2.2: builds the supplier, part and date hash tables, probes them with the lineorder columns, and copies the grouped result array back to a temporary host buffer that is then discarded.
+void run_q22(int *lo_orderdate, int *lo_partkey, int *lo_suppkey,
+             int *lo_revenue, int lo_len, int *p_partkey, int *p_brand1,
+             int p_len, int *d_datekey, int *d_year, int d_len, int *s_suppkey,
+             int *s_region, int s_len,
+             cub::CachingDeviceAllocator &g_allocator)
+{
+  // Date table spans the datekey values from d_val_min up to 19981230.
+  const int d_val_min = 19920101;
+  const int d_val_len = 19981230 - d_val_min + 1;
+  // Result array: 4 ints for each of the (1998 - 1992 + 1) * 1000 groups.
+  const int res_array_size = (1998 - 1992 + 1) * 1000 * 4;
+  const size_t res_bytes = res_array_size * sizeof(int);
+  const size_t ht_d_bytes = 2 * d_val_len * sizeof(int);
+  const size_t ht_p_bytes = 2 * p_len * sizeof(int);
+  const size_t ht_s_bytes = 2 * s_len * sizeof(int);
+
+  int *ht_d, *ht_p, *ht_s, *res;
+  CubDebugExit(g_allocator.DeviceAllocate((void **)&res, res_bytes));
+  CubDebugExit(cudaMemset(res, 0, res_bytes));
+  CubDebugExit(g_allocator.DeviceAllocate((void **)&ht_d, ht_d_bytes));
+  CubDebugExit(g_allocator.DeviceAllocate((void **)&ht_p, ht_p_bytes));
+  CubDebugExit(g_allocator.DeviceAllocate((void **)&ht_s, ht_s_bytes));
+  CubDebugExit(cudaMemset(ht_d, 0, ht_d_bytes));
+  CubDebugExit(cudaMemset(ht_p, 0, ht_p_bytes));
+  CubDebugExit(cudaMemset(ht_s, 0, ht_s_bytes));
+
+  // Three build kernels, then the probe kernel; 128 threads per block, one block per 128 * 4 row tile.
+  const int tile_items = 128 * 4;
+  q22_kernel1<128, 4><<<(s_len + tile_items - 1) / tile_items, 128>>>(
+      s_region, s_suppkey, s_len, ht_s, s_len);
+  q22_kernel2<128, 4><<<(p_len + tile_items - 1) / tile_items, 128>>>(
+      p_partkey, p_brand1, p_len, ht_p, p_len);
+  q22_kernel3<128, 4><<<(d_len + tile_items - 1) / tile_items, 128>>>(
+      d_datekey, d_year, d_len, ht_d, d_val_len, d_val_min);
+  q22_kernel4<128, 4><<<(lo_len + tile_items - 1) / tile_items, 128>>>(
+      lo_orderdate, lo_partkey, lo_suppkey, lo_revenue, lo_len, ht_s, s_len,
+      ht_p, p_len, ht_d, d_val_len, res);
+
+  // Copy the result to the host, then release every buffer.
+  int *h_res = new int[res_array_size];
+  CubDebugExit(cudaMemcpy(h_res, res, res_bytes, cudaMemcpyDeviceToHost));
+  delete[] h_res;
+  CLEANUP(res);
+  CLEANUP(ht_d);
+  CLEANUP(ht_p);
+  CLEANUP(ht_s);
+}
+// Q2.3 probe kernel: each block takes one tile of lineorder rows, probes the supplier, part and date hash tables in that order, and adds lo_revenue of surviving rows into res grouped by (year, brand).
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void q23_kernel4(int *lo_orderdate, int *lo_partkey, int *lo_suppkey,
+                            int *lo_revenue, int lo_len, int *ht_s, int s_len,
+                            int *ht_p, int p_len, int *ht_d, int d_len,
+                            int *res)
+{
+  // Per-thread slices of a tile striped across threads (slot ITEM of thread t is tile row t + BLOCK_THREADS * ITEM)
+  int items[ITEMS_PER_THREAD];
+  int selection_flags[ITEMS_PER_THREAD];
+  int brand[ITEMS_PER_THREAD];
+  int year[ITEMS_PER_THREAD];
+  int revenue[ITEMS_PER_THREAD];
+
+  int tile_offset = blockIdx.x * TILE_SIZE;
+  int num_tiles = (lo_len + TILE_SIZE - 1) / TILE_SIZE;
+  int num_tile_items = TILE_SIZE;
+  // Only the last tile can hold fewer than TILE_SIZE rows.
+  if (blockIdx.x == num_tiles - 1)
+  {
+    num_tile_items = lo_len - tile_offset;
+  }
+
+  InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags);
+  // Supplier probe updates the flags only; IsTerm then lets the block return early (presumably when no row in the block is still flagged - confirm in crystal/term.cuh).
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_suppkey + tile_offset,
+                                                  items, num_tile_items);
+  BlockProbeAndPHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, selection_flags, ht_s, s_len, num_tile_items);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags))
+  {
+    return;
+  }
+  // Part probe: the helper also receives brand[] as its output array.
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_partkey + tile_offset, items, num_tile_items, selection_flags);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, brand, selection_flags, ht_p, p_len, num_tile_items);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags))
+  {
+    return;
+  }
+  // Date probe: receives year[] as output, with 19920101 passed as an extra argument (presumably the minimum date key - confirm in crystal/join.cuh).
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_orderdate + tile_offset, items, num_tile_items, selection_flags);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, year, selection_flags, ht_d, d_len, 19920101, num_tile_items);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags))
+  {
+    return;
+  }
+  // Revenue is loaded last, after all three probes have updated the flags.
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_revenue + tile_offset, revenue, num_tile_items, selection_flags);
+  // res holds 4 ints per group: [0] year, [1] brand (plain stores), [2..3] viewed as one 64-bit sum updated atomically.
+#pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+  {
+    if ((threadIdx.x + (BLOCK_THREADS * ITEM)) < num_tile_items)
+    {
+      if (selection_flags[ITEM])
+      {
+        int hash = (brand[ITEM] * 7 + (year[ITEM] - 1992)) %
+                   ((1998 - 1992 + 1) * (5 * 5 * 40));
+        res[hash * 4] = year[ITEM];
+        res[hash * 4 + 1] = brand[ITEM];
+        atomicAdd(reinterpret_cast<unsigned long long *>(&res[hash * 4 + 2]),
+                  (long long)(revenue[ITEM]));
+      }
+    }
+  }
+}
+// Q2.3 build kernel: tests filter_col against 3 with BlockPredEQ and hands dim_key plus the resulting flags to BlockBuildSelectivePHT_1.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void q23_kernel1(int *filter_col, int *dim_key, int num_tuples,
+                            int *hash_table, int num_slots)
+{
+  // One tile of TILE_SIZE rows per block; only the final tile may be short.
+  const int first_row = blockIdx.x * TILE_SIZE;
+  const int tile_count = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  const bool is_last_tile = (blockIdx.x == tile_count - 1);
+  int rows_in_tile = is_last_tile ? num_tuples - first_row : TILE_SIZE;
+
+  int vals[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+
+  // Evaluate the filter first; keep[] receives the per-row result.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + first_row, vals,
+                                                  rows_in_tile);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(vals, 3, keep,
+                                                    rows_in_tile);
+
+  // vals is reused for the keys; the build helper gets the flags as well
+  // (presumably to skip unflagged rows - confirm in crystal/join.cuh).
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + first_row, vals,
+                                                  rows_in_tile);
+  BlockBuildSelectivePHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      vals, keep, hash_table, num_slots, rows_in_tile);
+}
+
+// Q2.3 build kernel (run_q23 launches it over the part table): evaluates
+// BlockPredEQ against 260 on dim_val, then passes (dim_key, dim_val) and the
+// resulting flags to BlockBuildSelectivePHT_2 along with hash_table.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void q23_kernel2(int *dim_key, int *dim_val, int num_tuples,
+                            int *hash_table, int num_slots)
+{
+  int vals[ITEMS_PER_THREAD];
+  int keys[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+
+  // One tile per block; only the last tile can be partially filled.
+  const int tile_offset = blockIdx.x * TILE_SIZE;
+  const int last_tile = (num_tuples + TILE_SIZE - 1) / TILE_SIZE - 1;
+  const int num_tile_items =
+      (blockIdx.x == last_tile) ? (num_tuples - tile_offset) : TILE_SIZE;
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      dim_val + tile_offset, vals, num_tile_items);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      vals, 260, keep, num_tile_items);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      dim_key + tile_offset, keys, num_tile_items);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, vals, keep, hash_table, num_slots, num_tile_items);
+}
+
+// Q2.3 build kernel (run_q23 launches it over the date table): no predicate
+// is evaluated; the flags come straight from InitFlags and are passed with
+// (dim_key, dim_val), hash_table and val_min to BlockBuildSelectivePHT_2.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void q23_kernel3(int *dim_key, int *dim_val, int num_tuples,
+                            int *hash_table, int num_slots, int val_min)
+{
+  int keys[ITEMS_PER_THREAD];
+  int vals[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+
+  // One tile per block; only the last tile can be partially filled.
+  const int tile_offset = blockIdx.x * TILE_SIZE;
+  const int last_tile = (num_tuples + TILE_SIZE - 1) / TILE_SIZE - 1;
+  const int num_tile_items =
+      (blockIdx.x == last_tile) ? (num_tuples - tile_offset) : TILE_SIZE;
+
+  // Presumably marks every item as selected - confirm against InitFlags.
+  InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(keep);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      dim_key + tile_offset, keys, num_tile_items);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      dim_val + tile_offset, vals, num_tile_items);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, vals, keep, hash_table, num_slots, val_min, num_tile_items);
+}
+
+// Runs SSB Q2.3: builds the supplier, part and date hash tables, probes them
+// with the lineorder columns and copies res into a temporary host buffer.
+void run_q23(int *lo_orderdate, int *lo_partkey, int *lo_suppkey,
+             int *lo_revenue, int lo_len, int *p_partkey, int *p_brand1,
+             int p_len, int *d_datekey, int *d_year, int d_len, int *s_suppkey,
+             int *s_region, int s_len,
+             cub::CachingDeviceAllocator &g_allocator)
+{
+  int *ht_d, *ht_p, *ht_s, *res;
+  const int d_val_min = 19920101;
+  const int d_val_len = 19981230 - 19920101 + 1;
+  const int res_array_size = ((1998 - 1992 + 1) * 1000) * 4;
+  const int tile_items = 128 * 4;
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&res, res_array_size * sizeof(int)));
+  CubDebugExit(cudaMemset(res, 0, res_array_size * sizeof(int)));
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&ht_d, 2 * d_val_len * sizeof(int)));
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&ht_p, 2 * p_len * sizeof(int)));
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&ht_s, 2 * s_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_d, 0, 2 * d_val_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_p, 0, 2 * p_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_s, 0, 2 * s_len * sizeof(int)));
+  // Build the three dimension hash tables, then probe with the fact table.
+  q23_kernel1<128, 4><<<(s_len + tile_items - 1) / tile_items, 128>>>(
+      s_region, s_suppkey, s_len, ht_s, s_len);
+  q23_kernel2<128, 4><<<(p_len + tile_items - 1) / tile_items, 128>>>(
+      p_partkey, p_brand1, p_len, ht_p, p_len);
+  q23_kernel3<128, 4><<<(d_len + tile_items - 1) / tile_items, 128>>>(
+      d_datekey, d_year, d_len, ht_d, d_val_len, d_val_min);
+  q23_kernel4<128, 4><<<(lo_len + tile_items - 1) / tile_items, 128>>>(
+      lo_orderdate, lo_partkey, lo_suppkey, lo_revenue, lo_len, ht_s, s_len,
+      ht_p, p_len, ht_d, d_val_len, res);
+  // A failed launch shows up only in the runtime's last-error state.
+  CubDebugExit(cudaGetLastError());
+
+  int *h_res = new int[res_array_size];
+  CubDebugExit(cudaMemcpy(h_res, res, res_array_size * sizeof(int),
+                          cudaMemcpyDeviceToHost));
+  delete[] h_res;
+  CLEANUP(res);
+  CLEANUP(ht_d);
+  CLEANUP(ht_p);
+  CLEANUP(ht_s);
+}
+
+// Q3.1 probe kernel. Each block takes one tile of lineorder rows, probes the
+// supplier, customer and date hash tables in that order, and adds the revenue
+// of every row that is still flagged to its group in res.
+//
+// A group occupies 6 ints of res: [0] year, [1] c_nation, [2] s_nation and
+// [4..5] a 64-bit revenue sum; index [3] is not written by this kernel.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void q31_kernel4(int *lo_orderdate, int *lo_custkey, int *lo_suppkey,
+                            int *lo_revenue, int lo_len, int *ht_s, int s_len,
+                            int *ht_c, int c_len, int *ht_d, int d_len,
+                            int *res)
+{
+  // One tile per block; only the last tile can be partially filled.
+  const int tile_offset = blockIdx.x * TILE_SIZE;
+  const int last_tile = (lo_len + TILE_SIZE - 1) / TILE_SIZE - 1;
+  const int num_tile_items =
+      (blockIdx.x == last_tile) ? (lo_len - tile_offset) : TILE_SIZE;
+
+  int keys[ITEMS_PER_THREAD];  // join keys of the column currently probed
+  int keep[ITEMS_PER_THREAD];  // per-item selection flags
+  int supp_nation[ITEMS_PER_THREAD];
+  int cust_nation[ITEMS_PER_THREAD];
+  int order_year[ITEMS_PER_THREAD];
+  int rev[ITEMS_PER_THREAD];
+
+  InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(keep);
+
+  // Supplier probe. After each probe the kernel returns as soon as IsTerm
+  // reports true (presumably: nothing in the tile is selected any more).
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_suppkey + tile_offset, keys, num_tile_items);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, supp_nation, keep, ht_s, s_len, num_tile_items);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keep)) { return; }
+
+  // Customer probe.
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_custkey + tile_offset, keys, num_tile_items, keep);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, cust_nation, keep, ht_c, c_len, num_tile_items);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keep)) { return; }
+
+  // Date probe; 19920101 is handed to the probe helper as an extra argument.
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_orderdate + tile_offset, keys, num_tile_items, keep);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, order_year, keep, ht_d, d_len, 19920101, num_tile_items);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keep)) { return; }
+
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_revenue + tile_offset, rev, num_tile_items, keep);
+
+  const int num_groups = (1998 - 1992 + 1) * 25 * 25;
+
+#pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+  {
+    const bool in_tile =
+        (threadIdx.x + (BLOCK_THREADS * ITEM)) < num_tile_items;
+    if (!in_tile || !keep[ITEM])
+    {
+      continue;
+    }
+
+    const int slot = (supp_nation[ITEM] * 25 * 7 + cust_nation[ITEM] * 7 +
+                      (order_year[ITEM] - 1992)) %
+                     num_groups;
+    int *group = res + slot * 6;
+    group[0] = order_year[ITEM];
+    group[1] = cust_nation[ITEM];
+    group[2] = supp_nation[ITEM];
+    atomicAdd(reinterpret_cast<unsigned long long *>(group + 4),
+              (long long)(rev[ITEM]));
+  }
+}
+
+// Q3.1 build kernel (run_q31 launches it over the supplier table): evaluates
+// BlockPredEQ against 2 on filter_col, then passes (dim_key, dim_val) and the
+// resulting flags to BlockBuildSelectivePHT_2 along with hash_table.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void q31_kernel1(int *filter_col, int *dim_key, int *dim_val,
+                            int num_tuples, int *hash_table, int num_slots)
+{
+  int keys[ITEMS_PER_THREAD];  // also scratch space for the filter column
+  int vals[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+
+  // One tile per block; only the last tile can be partially filled.
+  const int tile_offset = blockIdx.x * TILE_SIZE;
+  const int last_tile = (num_tuples + TILE_SIZE - 1) / TILE_SIZE - 1;
+  const int num_tile_items =
+      (blockIdx.x == last_tile) ? (num_tuples - tile_offset) : TILE_SIZE;
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      filter_col + tile_offset, keys, num_tile_items);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, 2, keep, num_tile_items);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      dim_key + tile_offset, keys, num_tile_items);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      dim_val + tile_offset, vals, num_tile_items);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, vals, keep, hash_table, num_slots, num_tile_items);
+}
+
+// Q3.1 build kernel (run_q31 launches it over the customer table): evaluates
+// BlockPredEQ against 2 on filter_col, then passes (dim_key, dim_val) and the
+// resulting flags to BlockBuildSelectivePHT_2 along with hash_table.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void q31_kernel2(int *filter_col, int *dim_key, int *dim_val,
+                            int num_tuples, int *hash_table, int num_slots)
+{
+  int keys[ITEMS_PER_THREAD];  // also scratch space for the filter column
+  int vals[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+
+  // One tile per block; only the last tile can be partially filled.
+  const int tile_offset = blockIdx.x * TILE_SIZE;
+  const int last_tile = (num_tuples + TILE_SIZE - 1) / TILE_SIZE - 1;
+  const int num_tile_items =
+      (blockIdx.x == last_tile) ? (num_tuples - tile_offset) : TILE_SIZE;
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      filter_col + tile_offset, keys, num_tile_items);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, 2, keep, num_tile_items);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      dim_key + tile_offset, keys, num_tile_items);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      dim_val + tile_offset, vals, num_tile_items);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, vals, keep, hash_table, num_slots, num_tile_items);
+}
+
+// Q3.1 build kernel (run_q31 launches it over the date table): flags dim_val
+// with BlockPredGTE(1992) and BlockPredAndLTE(1997), then passes (dim_key,
+// dim_val), the flags, hash_table and val_min to BlockBuildSelectivePHT_2.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void q31_kernel3(int *dim_key, int *dim_val, int num_tuples,
+                            int *hash_table, int num_slots, int val_min)
+{
+  int vals[ITEMS_PER_THREAD];
+  int keys[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+
+  const int tile_offset = blockIdx.x * TILE_SIZE;
+  const int last_tile = (num_tuples + TILE_SIZE - 1) / TILE_SIZE - 1;
+  const int num_tile_items =
+      (blockIdx.x == last_tile) ? (num_tuples - tile_offset) : TILE_SIZE;
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      dim_val + tile_offset, vals, num_tile_items);
+  BlockPredGTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      vals, 1992, keep, num_tile_items);
+  // And-variant (as in q32_kernel3): combine with the >= 1992 flags instead
+  // of letting BlockPredLTE replace them (presumed overwrite - confirm).
+  BlockPredAndLTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      vals, 1997, keep, num_tile_items);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      dim_key + tile_offset, keys, num_tile_items);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, vals, keep, hash_table, num_slots, val_min, num_tile_items);
+}
+
+// Runs SSB Q3.1: builds the supplier, customer and date hash tables, probes
+// them with the lineorder columns and copies res into a temporary host buffer.
+void run_q31(int *lo_orderdate, int *lo_custkey, int *lo_suppkey,
+             int *lo_revenue, int lo_len, int *d_datekey, int *d_year,
+             int d_len, int *s_suppkey, int *s_region, int *s_nation, int s_len,
+             int *c_custkey, int *c_region, int *c_nation, int c_len,
+             cub::CachingDeviceAllocator &g_allocator)
+{
+  int *ht_d, *ht_c, *ht_s, *res;
+  const int d_val_min = 19920101;
+  const int d_val_len = 19981230 - 19920101 + 1;
+  const int res_array_size = ((1998 - 1992 + 1) * 25 * 25) * 6;
+  const int tile_items = 128 * 4;
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&res, res_array_size * sizeof(int)));
+  CubDebugExit(cudaMemset(res, 0, res_array_size * sizeof(int)));
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&ht_d, 2 * d_val_len * sizeof(int)));
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&ht_s, 2 * s_len * sizeof(int)));
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&ht_c, 2 * c_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_d, 0, 2 * d_val_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_s, 0, 2 * s_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_c, 0, 2 * c_len * sizeof(int)));
+
+  // Build the three dimension hash tables, then probe with the fact table.
+  q31_kernel1<128, 4><<<(s_len + tile_items - 1) / tile_items, 128>>>(
+      s_region, s_suppkey, s_nation, s_len, ht_s, s_len);
+  q31_kernel2<128, 4><<<(c_len + tile_items - 1) / tile_items, 128>>>(
+      c_region, c_custkey, c_nation, c_len, ht_c, c_len);
+  q31_kernel3<128, 4><<<(d_len + tile_items - 1) / tile_items, 128>>>(
+      d_datekey, d_year, d_len, ht_d, d_val_len, d_val_min);
+  q31_kernel4<128, 4><<<(lo_len + tile_items - 1) / tile_items, 128>>>(
+      lo_orderdate, lo_custkey, lo_suppkey, lo_revenue, lo_len, ht_s, s_len,
+      ht_c, c_len, ht_d, d_val_len, res);
+  // A failed launch shows up only in the runtime's last-error state.
+  CubDebugExit(cudaGetLastError());
+
+  int *h_res = new int[res_array_size];
+  CubDebugExit(cudaMemcpy(h_res, res, res_array_size * sizeof(int),
+                          cudaMemcpyDeviceToHost));
+  delete[] h_res;
+  CLEANUP(ht_d);
+  CLEANUP(ht_s);
+  CLEANUP(ht_c);
+  CLEANUP(res);
+}
+
+// Q3.2 probe kernel. Each block takes one tile of lineorder rows, probes the
+// supplier, customer and date hash tables in that order, and adds the revenue
+// of every row that is still flagged to its group in res (4 ints per group:
+// [0] year, [1] customer value, [2] supplier value, [3] revenue sum).
+// NOTE(review): the sum is a 32-bit int here, unlike q31_kernel4 - confirm.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void q32_kernel4(int *lo_orderdate, int *lo_custkey, int *lo_suppkey,
+                            int *lo_revenue, int lo_len, int *ht_s, int s_len,
+                            int *ht_c, int c_len, int *ht_d, int d_len,
+                            int *res)
+{
+  // One tile per block; only the last tile can be partially filled.
+  const int tile_offset = blockIdx.x * TILE_SIZE;
+  const int last_tile = (lo_len + TILE_SIZE - 1) / TILE_SIZE - 1;
+  const int num_tile_items =
+      (blockIdx.x == last_tile) ? (lo_len - tile_offset) : TILE_SIZE;
+
+  int keys[ITEMS_PER_THREAD];  // join keys of the column currently probed
+  int keep[ITEMS_PER_THREAD];  // per-item selection flags
+  int supp_val[ITEMS_PER_THREAD];
+  int cust_val[ITEMS_PER_THREAD];
+  int order_year[ITEMS_PER_THREAD];
+  int rev[ITEMS_PER_THREAD];
+
+  InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(keep);
+
+  // Supplier probe. After each probe the kernel returns as soon as IsTerm
+  // reports true (presumably: nothing in the tile is selected any more).
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_suppkey + tile_offset, keys, num_tile_items);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, supp_val, keep, ht_s, s_len, num_tile_items);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keep)) { return; }
+
+  // Customer probe.
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_custkey + tile_offset, keys, num_tile_items, keep);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, cust_val, keep, ht_c, c_len, num_tile_items);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keep)) { return; }
+
+  // Date probe; 19920101 is handed to the probe helper as an extra argument.
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_orderdate + tile_offset, keys, num_tile_items, keep);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, order_year, keep, ht_d, d_len, 19920101, num_tile_items);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keep)) { return; }
+
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_revenue + tile_offset, rev, num_tile_items, keep);
+
+  const int num_groups = (1998 - 1992 + 1) * 250 * 250;
+
+#pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+  {
+    const bool in_tile =
+        (threadIdx.x + (BLOCK_THREADS * ITEM)) < num_tile_items;
+    if (!in_tile || !keep[ITEM])
+    {
+      continue;
+    }
+
+    const int slot = (supp_val[ITEM] * 250 * 7 + cust_val[ITEM] * 7 +
+                      (order_year[ITEM] - 1992)) %
+                     num_groups;
+    int *group = res + slot * 4;
+    group[0] = order_year[ITEM];
+    group[1] = cust_val[ITEM];
+    group[2] = supp_val[ITEM];
+    atomicAdd(group + 3, rev[ITEM]);
+  }
+}
+
+// Q3.2 build kernel (run_q32 launches it over the supplier table): evaluates
+// BlockPredEQ against 24 on filter_col, then passes (dim_key, dim_val) and the
+// resulting flags to BlockBuildSelectivePHT_2 along with hash_table.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void q32_kernel1(int *filter_col, int *dim_key, int *dim_val,
+                            int num_tuples, int *hash_table, int num_slots)
+{
+  int keys[ITEMS_PER_THREAD];  // also scratch space for the filter column
+  int vals[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+
+  // One tile per block; only the last tile can be partially filled.
+  const int tile_offset = blockIdx.x * TILE_SIZE;
+  const int last_tile = (num_tuples + TILE_SIZE - 1) / TILE_SIZE - 1;
+  const int num_tile_items =
+      (blockIdx.x == last_tile) ? (num_tuples - tile_offset) : TILE_SIZE;
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      filter_col + tile_offset, keys, num_tile_items);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, 24, keep, num_tile_items);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      dim_key + tile_offset, keys, num_tile_items);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      dim_val + tile_offset, vals, num_tile_items);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, vals, keep, hash_table, num_slots, num_tile_items);
+}
+
+// Q3.2 build kernel (run_q32 launches it over the customer table): evaluates
+// BlockPredEQ against 24 on filter_col, then passes (dim_key, dim_val) and the
+// resulting flags to BlockBuildSelectivePHT_2 along with hash_table.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void q32_kernel2(int *filter_col, int *dim_key, int *dim_val,
+                            int num_tuples, int *hash_table, int num_slots)
+{
+  int keys[ITEMS_PER_THREAD];  // also scratch space for the filter column
+  int vals[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+
+  // One tile per block; only the last tile can be partially filled.
+  const int tile_offset = blockIdx.x * TILE_SIZE;
+  const int last_tile = (num_tuples + TILE_SIZE - 1) / TILE_SIZE - 1;
+  const int num_tile_items =
+      (blockIdx.x == last_tile) ? (num_tuples - tile_offset) : TILE_SIZE;
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      filter_col + tile_offset, keys, num_tile_items);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, 24, keep, num_tile_items);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      dim_key + tile_offset, keys, num_tile_items);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      dim_val + tile_offset, vals, num_tile_items);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, vals, keep, hash_table, num_slots, num_tile_items);
+}
+
+// Q3.2 build kernel (run_q32 launches it over the date table): flags dim_val
+// with BlockPredGTE(1992) and BlockPredAndLTE(1997), then passes (dim_key,
+// dim_val), the flags, hash_table and val_min to BlockBuildSelectivePHT_2.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void q32_kernel3(int *dim_key, int *dim_val, int num_tuples,
+                            int *hash_table, int num_slots, int val_min)
+{
+  int vals[ITEMS_PER_THREAD];
+  int keys[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+
+  // One tile per block; only the last tile can be partially filled.
+  const int tile_offset = blockIdx.x * TILE_SIZE;
+  const int last_tile = (num_tuples + TILE_SIZE - 1) / TILE_SIZE - 1;
+  const int num_tile_items =
+      (blockIdx.x == last_tile) ? (num_tuples - tile_offset) : TILE_SIZE;
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      dim_val + tile_offset, vals, num_tile_items);
+  BlockPredGTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      vals, 1992, keep, num_tile_items);
+  BlockPredAndLTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      vals, 1997, keep, num_tile_items);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      dim_key + tile_offset, keys, num_tile_items);
+  // val_min comes from the caller instead of a duplicated 19920101 literal.
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, vals, keep, hash_table, num_slots, val_min, num_tile_items);
+}
+
+// Runs SSB Q3.2: builds the supplier, customer and date hash tables, probes
+// them with the lineorder columns and copies res into a temporary host buffer.
+void run_q32(int *lo_orderdate, int *lo_custkey, int *lo_suppkey,
+             int *lo_revenue, int lo_len, int *d_datekey, int *d_year,
+             int d_len, int *s_suppkey, int *s_nation, int *s_city, int s_len,
+             int *c_custkey, int *c_nation, int *c_city, int c_len,
+             cub::CachingDeviceAllocator &g_allocator)
+{
+  int *ht_d, *ht_c, *ht_s, *res;
+  const int d_val_min = 19920101;
+  const int d_val_len = 19981230 - 19920101 + 1;
+  const int res_array_size = ((1998 - 1992 + 1) * 250 * 250) * 4;
+  const int tile_items = 128 * 4;
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&res, res_array_size * sizeof(int)));
+  CubDebugExit(cudaMemset(res, 0, res_array_size * sizeof(int)));
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&ht_d, 2 * d_val_len * sizeof(int)));
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&ht_s, 2 * s_len * sizeof(int)));
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&ht_c, 2 * c_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_d, 0, 2 * d_val_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_s, 0, 2 * s_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_c, 0, 2 * c_len * sizeof(int)));
+
+  // Build the three dimension hash tables, then probe with the fact table.
+  q32_kernel1<128, 4><<<(s_len + tile_items - 1) / tile_items, 128>>>(
+      s_nation, s_suppkey, s_city, s_len, ht_s, s_len);
+  q32_kernel2<128, 4><<<(c_len + tile_items - 1) / tile_items, 128>>>(
+      c_nation, c_custkey, c_city, c_len, ht_c, c_len);
+  q32_kernel3<128, 4><<<(d_len + tile_items - 1) / tile_items, 128>>>(
+      d_datekey, d_year, d_len, ht_d, d_val_len, d_val_min);
+  q32_kernel4<128, 4><<<(lo_len + tile_items - 1) / tile_items, 128>>>(
+      lo_orderdate, lo_custkey, lo_suppkey, lo_revenue, lo_len, ht_s, s_len,
+      ht_c, c_len, ht_d, d_val_len, res);
+  // A failed launch shows up only in the runtime's last-error state.
+  CubDebugExit(cudaGetLastError());
+
+  int *h_res = new int[res_array_size];
+  CubDebugExit(cudaMemcpy(h_res, res, res_array_size * sizeof(int),
+                          cudaMemcpyDeviceToHost));
+  delete[] h_res;
+  CLEANUP(ht_d);
+  CLEANUP(ht_s);
+  CLEANUP(ht_c);
+  CLEANUP(res);
+}
+
+// Q3.3 probe kernel. Each block takes one tile of lineorder rows, probes the
+// supplier, customer and date hash tables in that order, and adds the revenue
+// of every row that is still flagged to its group in res (4 ints per group:
+// [0] year, [1] customer value, [2] supplier value, [3] revenue sum).
+// NOTE(review): the sum is a 32-bit int here, unlike q31_kernel4 - confirm.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void q33_kernel4(int *lo_orderdate, int *lo_custkey, int *lo_suppkey,
+                            int *lo_revenue, int lo_len, int *ht_s, int s_len,
+                            int *ht_c, int c_len, int *ht_d, int d_len,
+                            int *res)
+{
+  // One tile per block; only the last tile can be partially filled.
+  const int tile_offset = blockIdx.x * TILE_SIZE;
+  const int last_tile = (lo_len + TILE_SIZE - 1) / TILE_SIZE - 1;
+  const int num_tile_items =
+      (blockIdx.x == last_tile) ? (lo_len - tile_offset) : TILE_SIZE;
+
+  int keys[ITEMS_PER_THREAD];  // join keys of the column currently probed
+  int keep[ITEMS_PER_THREAD];  // per-item selection flags
+  int supp_val[ITEMS_PER_THREAD];
+  int cust_val[ITEMS_PER_THREAD];
+  int order_year[ITEMS_PER_THREAD];
+  int rev[ITEMS_PER_THREAD];
+
+  InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(keep);
+
+  // Supplier probe. After each probe the kernel returns as soon as IsTerm
+  // reports true (presumably: nothing in the tile is selected any more).
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_suppkey + tile_offset, keys, num_tile_items);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, supp_val, keep, ht_s, s_len, num_tile_items);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keep)) { return; }
+
+  // Customer probe.
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_custkey + tile_offset, keys, num_tile_items, keep);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, cust_val, keep, ht_c, c_len, num_tile_items);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keep)) { return; }
+
+  // Date probe; 19920101 is handed to the probe helper as an extra argument.
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_orderdate + tile_offset, keys, num_tile_items, keep);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, order_year, keep, ht_d, d_len, 19920101, num_tile_items);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keep)) { return; }
+
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_revenue + tile_offset, rev, num_tile_items, keep);
+
+  const int num_groups = (1998 - 1992 + 1) * 250 * 250;
+
+#pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+  {
+    const bool in_tile =
+        (threadIdx.x + (BLOCK_THREADS * ITEM)) < num_tile_items;
+    if (!in_tile || !keep[ITEM])
+    {
+      continue;
+    }
+
+    const int slot = (supp_val[ITEM] * 250 * 7 + cust_val[ITEM] * 7 +
+                      (order_year[ITEM] - 1992)) %
+                     num_groups;
+    int *group = res + slot * 4;
+    group[0] = order_year[ITEM];
+    group[1] = cust_val[ITEM];
+    group[2] = supp_val[ITEM];
+    atomicAdd(group + 3, rev[ITEM]);
+  }
+}
+
+// Q3.3 build kernel (run_q33 launches it over the supplier table): flags
+// dim_val with BlockPredEQ(231) followed by BlockPredOrEQ(235), then passes
+// (dim_key, dim_val) and the flags to BlockBuildSelectivePHT_2.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void q33_kernel1(int *dim_key, int *dim_val, int num_tuples,
+                            int *hash_table, int num_slots)
+{
+  int vals[ITEMS_PER_THREAD];
+  int keys[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+
+  // One tile per block; only the last tile can be partially filled.
+  const int tile_offset = blockIdx.x * TILE_SIZE;
+  const int last_tile = (num_tuples + TILE_SIZE - 1) / TILE_SIZE - 1;
+  const int num_tile_items =
+      (blockIdx.x == last_tile) ? (num_tuples - tile_offset) : TILE_SIZE;
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      dim_val + tile_offset, vals, num_tile_items);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      vals, 231, keep, num_tile_items);
+  BlockPredOrEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      vals, 235, keep, num_tile_items);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      dim_key + tile_offset, keys, num_tile_items);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, vals, keep, hash_table, num_slots, num_tile_items);
+}
+
+// Q3.3 build kernel (run_q33 launches it over the customer table): flags
+// dim_val with BlockPredEQ(231) followed by BlockPredOrEQ(235), then passes
+// (dim_key, dim_val) and the flags to BlockBuildSelectivePHT_2.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void q33_kernel2(int *dim_key, int *dim_val, int num_tuples,
+                            int *hash_table, int num_slots)
+{
+  int vals[ITEMS_PER_THREAD];
+  int keys[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+
+  // One tile per block; only the last tile can be partially filled.
+  const int tile_offset = blockIdx.x * TILE_SIZE;
+  const int last_tile = (num_tuples + TILE_SIZE - 1) / TILE_SIZE - 1;
+  const int num_tile_items =
+      (blockIdx.x == last_tile) ? (num_tuples - tile_offset) : TILE_SIZE;
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      dim_val + tile_offset, vals, num_tile_items);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      vals, 231, keep, num_tile_items);
+  BlockPredOrEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      vals, 235, keep, num_tile_items);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      dim_key + tile_offset, keys, num_tile_items);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, vals, keep, hash_table, num_slots, num_tile_items);
+}
+
+// Q3.3 build kernel (run_q33 launches it over the date table): flags dim_val
+// with BlockPredGTE(1992) and BlockPredAndLTE(1997), then passes (dim_key,
+// dim_val), the flags, hash_table and val_min to BlockBuildSelectivePHT_2.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void q33_kernel3(int *dim_key, int *dim_val, int num_tuples,
+                            int *hash_table, int num_slots, int val_min)
+{
+  int vals[ITEMS_PER_THREAD];
+  int keys[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+
+  const int tile_offset = blockIdx.x * TILE_SIZE;
+  const int last_tile = (num_tuples + TILE_SIZE - 1) / TILE_SIZE - 1;
+  const int num_tile_items =
+      (blockIdx.x == last_tile) ? (num_tuples - tile_offset) : TILE_SIZE;
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      dim_val + tile_offset, vals, num_tile_items);
+  BlockPredGTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      vals, 1992, keep, num_tile_items);
+  // And-variant (as in q32_kernel3): combine with the >= 1992 flags instead
+  // of letting BlockPredLTE replace them (presumed overwrite - confirm).
+  BlockPredAndLTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      vals, 1997, keep, num_tile_items);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      dim_key + tile_offset, keys, num_tile_items);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, vals, keep, hash_table, num_slots, val_min, num_tile_items);
+}
+
+// Runs SSB Q3.3: builds the supplier, customer and date hash tables, probes
+// them with the lineorder columns and copies res into a temporary host buffer.
+void run_q33(int *lo_orderdate, int *lo_custkey, int *lo_suppkey,
+             int *lo_revenue, int lo_len, int *d_datekey, int *d_year,
+             int d_len, int *s_suppkey, int *s_city, int s_len, int *c_custkey,
+             int *c_city, int c_len, cub::CachingDeviceAllocator &g_allocator)
+{
+  int *ht_d, *ht_c, *ht_s, *res;
+  const int d_val_min = 19920101;
+  const int d_val_len = 19981230 - 19920101 + 1;
+  const int res_array_size = ((1998 - 1992 + 1) * 250 * 250) * 4;
+  const int tile_items = 128 * 4;
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&res, res_array_size * sizeof(int)));
+  CubDebugExit(cudaMemset(res, 0, res_array_size * sizeof(int)));
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&ht_d, 2 * d_val_len * sizeof(int)));
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&ht_s, 2 * s_len * sizeof(int)));
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&ht_c, 2 * c_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_d, 0, 2 * d_val_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_s, 0, 2 * s_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_c, 0, 2 * c_len * sizeof(int)));
+
+  // Build the three dimension hash tables, then probe with the fact table.
+  q33_kernel1<128, 4><<<(s_len + tile_items - 1) / tile_items, 128>>>(
+      s_suppkey, s_city, s_len, ht_s, s_len);
+  q33_kernel2<128, 4><<<(c_len + tile_items - 1) / tile_items, 128>>>(
+      c_custkey, c_city, c_len, ht_c, c_len);
+  q33_kernel3<128, 4><<<(d_len + tile_items - 1) / tile_items, 128>>>(
+      d_datekey, d_year, d_len, ht_d, d_val_len, d_val_min);
+  q33_kernel4<128, 4><<<(lo_len + tile_items - 1) / tile_items, 128>>>(
+      lo_orderdate, lo_custkey, lo_suppkey, lo_revenue, lo_len, ht_s, s_len,
+      ht_c, c_len, ht_d, d_val_len, res);
+  // A failed launch shows up only in the runtime's last-error state.
+  CubDebugExit(cudaGetLastError());
+
+  int *h_res = new int[res_array_size];
+  CubDebugExit(cudaMemcpy(h_res, res, res_array_size * sizeof(int),
+                          cudaMemcpyDeviceToHost));
+  delete[] h_res;
+  CLEANUP(ht_d);
+  CLEANUP(ht_s);
+  CLEANUP(ht_c);
+  CLEANUP(res);
+}
+
+// Q3.4 probe kernel over lineorder. One thread block processes one tile of
+// TILE_SIZE rows (TILE_SIZE is defined outside this kernel).
+//
+// Per tile the kernel
+//   1. probes ht_s with lo_suppkey and ht_c with lo_custkey, fetching one
+//      payload value per matching row (run_q34 builds those tables from
+//      s_city and c_city),
+//   2. probes ht_d with lo_orderdate (key minimum 19920101) to fetch the year,
+//   3. adds lo_revenue of every still-selected row into its group in `res`.
+// After each probe the block returns early when IsTerm reports true for the
+// selection flags.
+//
+// res layout: 4 ints per group -> {year, ht_c payload, ht_s payload, sum}.
+// Only the sum is updated atomically; the three key columns are plain stores.
+// NOTE(review): the plain stores are only safe if every row that maps to a
+// slot carries the same keys -- confirm the slot formula is collision free
+// for the payload ranges actually stored in ht_s / ht_c.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void q34_kernel4(int *lo_orderdate, int *lo_custkey, int *lo_suppkey,
+                            int *lo_revenue, int lo_len, int *ht_s, int s_len,
+                            int *ht_c, int c_len, int *ht_d, int d_len,
+                            int *res)
+{
+  // Per-thread working set.
+  int probe_keys[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+  int cust_val[ITEMS_PER_THREAD];
+  int supp_val[ITEMS_PER_THREAD];
+  int year[ITEMS_PER_THREAD];
+  int revenue[ITEMS_PER_THREAD];
+
+  // Every tile is full except possibly the last one.
+  const int tile_start = blockIdx.x * TILE_SIZE;
+  const int tile_count = (lo_len + TILE_SIZE - 1) / TILE_SIZE;
+  int tile_rows = TILE_SIZE;
+  if (blockIdx.x == tile_count - 1)
+  {
+    tile_rows = lo_len - tile_start;
+  }
+
+  InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(keep);
+
+  // Supplier probe.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_suppkey + tile_start,
+                                                  probe_keys, tile_rows);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      probe_keys, supp_val, keep, ht_s, s_len, tile_rows);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keep))
+  {
+    return;
+  }
+
+  // Customer probe.
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_custkey + tile_start, probe_keys, tile_rows, keep);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      probe_keys, cust_val, keep, ht_c, c_len, tile_rows);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keep))
+  {
+    return;
+  }
+
+  // Date probe; 19920101 is passed as the key minimum of ht_d.
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_orderdate + tile_start, probe_keys, tile_rows, keep);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      probe_keys, year, keep, ht_d, d_len, 19920101, tile_rows);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keep))
+  {
+    return;
+  }
+
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_revenue + tile_start, revenue, tile_rows, keep);
+
+  // Number of slots in res: 7 years x 250 x 250.
+  const int group_count = (1998 - 1992 + 1) * 250 * 250;
+
+#pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+  {
+    const bool in_tile = (threadIdx.x + (BLOCK_THREADS * ITEM)) < tile_rows;
+    if (in_tile && keep[ITEM])
+    {
+      const int slot = (supp_val[ITEM] * 250 * 7 + cust_val[ITEM] * 7 +
+                        (year[ITEM] - 1992)) %
+                       group_count;
+      int *group = res + slot * 4;
+      group[0] = year[ITEM];
+      group[1] = cust_val[ITEM];
+      group[2] = supp_val[ITEM];
+      atomicAdd(&group[3], revenue[ITEM]);
+    }
+  }
+}
+
+// Q3.4 build kernel (run_q34 launches it for the supplier table with
+// dim_key = s_suppkey and dim_val = s_city).
+//
+// For its tile the block loads dim_val, applies BlockPredEQ(231) followed by
+// BlockPredOrEQ(235) -- presumably keeping rows whose value equals either
+// constant -- then loads dim_key and passes the flagged (key, value) pairs to
+// BlockBuildSelectivePHT_2 for a table of num_slots slots.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void q34_kernel1(int *dim_key, int *dim_val, int num_tuples,
+                            int *hash_table, int num_slots)
+{
+  int vals[ITEMS_PER_THREAD];
+  int keys[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+
+  // Every tile is full except possibly the last one.
+  const int tile_start = blockIdx.x * TILE_SIZE;
+  const int tile_count = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  int tile_rows = TILE_SIZE;
+  if (blockIdx.x == tile_count - 1)
+  {
+    tile_rows = num_tuples - tile_start;
+  }
+
+  // Filter on the value column.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_start, vals,
+                                                  tile_rows);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(vals, 231, keep, tile_rows);
+  BlockPredOrEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(vals, 235, keep,
+                                                      tile_rows);
+
+  // Insert the flagged rows.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_start, keys,
+                                                  tile_rows);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, vals, keep, hash_table, num_slots, tile_rows);
+}
+
+// Q3.4 build kernel (run_q34 launches it for the customer table with
+// dim_key = c_custkey and dim_val = c_city).
+//
+// Same steps as q34_kernel1: load dim_val, apply BlockPredEQ(231) then
+// BlockPredOrEQ(235) -- presumably keeping rows whose value equals either
+// constant -- load dim_key, and pass the flagged (key, value) pairs to
+// BlockBuildSelectivePHT_2 for a table of num_slots slots.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void q34_kernel2(int *dim_key, int *dim_val, int num_tuples,
+                            int *hash_table, int num_slots)
+{
+  int vals[ITEMS_PER_THREAD];
+  int keys[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+
+  // Every tile is full except possibly the last one.
+  const int tile_start = blockIdx.x * TILE_SIZE;
+  const int tile_count = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  int tile_rows = TILE_SIZE;
+  if (blockIdx.x == tile_count - 1)
+  {
+    tile_rows = num_tuples - tile_start;
+  }
+
+  // Filter on the value column.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_start, vals,
+                                                  tile_rows);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(vals, 231, keep, tile_rows);
+  BlockPredOrEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(vals, 235, keep,
+                                                      tile_rows);
+
+  // Insert the flagged rows.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_start, keys,
+                                                  tile_rows);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, vals, keep, hash_table, num_slots, tile_rows);
+}
+
+// Q3.4 build kernel for the date hash table (run_q34 passes
+// filter_col = d_yearmonthnum, dim_key = d_datekey, dim_val = d_year).
+//
+// For its tile the block applies BlockPredEQ(199712) to filter_col, then
+// passes the flagged (dim_key, dim_val) pairs to BlockBuildSelectivePHT_2.
+//
+// Parameters:
+//   num_tuples - number of rows in the dimension columns
+//   hash_table - destination table with num_slots slots
+//   val_min    - key minimum forwarded to the build. It used to be ignored in
+//                favour of a hard-coded 19920101; it is now honoured, as in
+//                q41_kernel4 / q42_kernel4. run_q34 passes 19920101, so its
+//                behaviour is unchanged. The probe side (q34_kernel4) still
+//                hard-codes 19920101, so callers must keep the two in sync.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void q34_kernel3(int *filter_col, int *dim_key, int *dim_val,
+                            int num_tuples, int *hash_table, int num_slots,
+                            int val_min)
+{
+  int items[ITEMS_PER_THREAD];
+  int items2[ITEMS_PER_THREAD];
+  int selection_flags[ITEMS_PER_THREAD];
+
+  int tile_offset = blockIdx.x * TILE_SIZE;
+  int num_tiles = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  int num_tile_items = TILE_SIZE;
+
+  // Every tile is full except possibly the last one.
+  if (blockIdx.x == num_tiles - 1)
+  {
+    num_tile_items = num_tuples - tile_offset;
+  }
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + tile_offset,
+                                                  items, num_tile_items);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, 199712, selection_flags, num_tile_items);
+
+  // `items` is reused: the filter column is no longer needed once the flags
+  // have been computed.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_offset, items,
+                                                  num_tile_items);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_offset, items2,
+                                                  num_tile_items);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, items2, selection_flags, hash_table, num_slots, val_min,
+      num_tile_items);
+}
+
+// Host driver for Q3.4.
+//
+// Allocates and zeroes the result buffer (7 * 250 * 250 groups, 4 ints each)
+// and the three hash tables, launches q34_kernel1..3 to build the supplier,
+// customer and date tables and q34_kernel4 to probe lineorder, copies the
+// result buffer to a temporary host array, and releases all device buffers.
+// The host copy is freed immediately and not returned to the caller.
+//
+// All column pointers are handed straight to the kernels, so they must be
+// device-accessible; the *_len arguments are the row counts of the tables.
+void run_q34(int *lo_orderdate, int *lo_custkey, int *lo_suppkey,
+             int *lo_revenue, int lo_len, int *d_datekey, int *d_year,
+             int *d_yearmonthnum, int d_len, int *s_suppkey, int *s_city,
+             int s_len, int *c_custkey, int *c_city, int c_len,
+             cub::CachingDeviceAllocator &g_allocator)
+{
+  // Sizes
+  const int d_val_len = 19981230 - 19920101 + 1;
+  const int res_size = ((1998 - 1992 + 1) * 250 * 250);
+  const int res_array_size = res_size * 4;
+
+  const size_t res_bytes = res_array_size * sizeof(int);
+  const size_t ht_d_bytes = 2 * d_val_len * sizeof(int);
+  const size_t ht_s_bytes = 2 * s_len * sizeof(int);
+  const size_t ht_c_bytes = 2 * c_len * sizeof(int);
+
+  // Device buffers
+  int *res;
+  int *ht_d, *ht_c, *ht_s;
+  CubDebugExit(g_allocator.DeviceAllocate((void **)&res, res_bytes));
+  CubDebugExit(cudaMemset(res, 0, res_bytes));
+  CubDebugExit(g_allocator.DeviceAllocate((void **)&ht_d, ht_d_bytes));
+  CubDebugExit(g_allocator.DeviceAllocate((void **)&ht_s, ht_s_bytes));
+  CubDebugExit(g_allocator.DeviceAllocate((void **)&ht_c, ht_c_bytes));
+  CubDebugExit(cudaMemset(ht_d, 0, ht_d_bytes));
+  CubDebugExit(cudaMemset(ht_s, 0, ht_s_bytes));
+  CubDebugExit(cudaMemset(ht_c, 0, ht_c_bytes));
+
+  // Launch geometry: 128 threads x 4 items per thread = one tile per block.
+  const int tile_items = 128 * 4;
+  const int d_val_min = 19920101;
+  const int s_blocks = (s_len + tile_items - 1) / tile_items;
+  const int c_blocks = (c_len + tile_items - 1) / tile_items;
+  const int d_blocks = (d_len + tile_items - 1) / tile_items;
+  const int lo_blocks = (lo_len + tile_items - 1) / tile_items;
+
+  // Build the three dimension tables, then probe.
+  q34_kernel1<128, 4>
+      <<<s_blocks, 128>>>(s_suppkey, s_city, s_len, ht_s, s_len);
+  q34_kernel2<128, 4>
+      <<<c_blocks, 128>>>(c_custkey, c_city, c_len, ht_c, c_len);
+  q34_kernel3<128, 4><<<d_blocks, 128>>>(d_yearmonthnum, d_datekey, d_year,
+                                         d_len, ht_d, d_val_len, d_val_min);
+  q34_kernel4<128, 4><<<lo_blocks, 128>>>(lo_orderdate, lo_custkey, lo_suppkey,
+                                          lo_revenue, lo_len, ht_s, s_len, ht_c,
+                                          c_len, ht_d, d_val_len, res);
+
+  // Fetch the result, then drop it again.
+  int *h_res = new int[res_array_size];
+  CubDebugExit(cudaMemcpy(h_res, res, res_bytes, cudaMemcpyDeviceToHost));
+  delete[] h_res;
+
+  CLEANUP(ht_d);
+  CLEANUP(ht_c);
+  CLEANUP(ht_s);
+  CLEANUP(res);
+}
+
+// Q4.1 probe kernel over lineorder. One thread block processes one tile of
+// TILE_SIZE rows (TILE_SIZE is defined outside this kernel).
+//
+// Probe order: supplier (ht_s, existence only), customer (ht_c, fetches the
+// payload stored by the build -- run_q41 builds it from c_nation), part
+// (ht_p, existence only), date (ht_d with key minimum 19920101, fetches the
+// year). After each probe the block returns early when IsTerm reports true
+// for the selection flags.
+//
+// res layout: 4 ints per group -> {year, c_nation, 64-bit sum in ints 2..3}.
+// The sum accumulates lo_revenue - lo_supplycost and is updated with a 64-bit
+// atomicAdd, so `res` must be 8-byte aligned (slot * 4 + 2 is always even).
+//
+// Changes from the previous version: the unused local array `s_nation` and
+// the commented-out atomicAdd variants were removed; behaviour is unchanged.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void q41_kernel5(int *lo_orderdate, int *lo_partkey, int *lo_custkey,
+                            int *lo_suppkey, int *lo_revenue,
+                            int *lo_supplycost, int lo_len, int *ht_p,
+                            int p_len, int *ht_s, int s_len, int *ht_c,
+                            int c_len, int *ht_d, int d_len, int *res)
+{
+  // Load a segment of consecutive items that are blocked across threads
+  int items[ITEMS_PER_THREAD];
+  int selection_flags[ITEMS_PER_THREAD];
+  int c_nation[ITEMS_PER_THREAD];
+  int year[ITEMS_PER_THREAD];
+  int revenue[ITEMS_PER_THREAD];
+
+  int tile_offset = blockIdx.x * TILE_SIZE;
+  int num_tiles = (lo_len + TILE_SIZE - 1) / TILE_SIZE;
+  int num_tile_items = TILE_SIZE;
+
+  // Every tile is full except possibly the last one.
+  if (blockIdx.x == num_tiles - 1)
+  {
+    num_tile_items = lo_len - tile_offset;
+  }
+
+  InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags);
+
+  // Supplier: existence probe only.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_suppkey + tile_offset,
+                                                  items, num_tile_items);
+  BlockProbeAndPHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, selection_flags, ht_s, s_len, num_tile_items);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags))
+  {
+    return;
+  }
+
+  // Customer: fetches the grouping value.
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_custkey + tile_offset, items, num_tile_items, selection_flags);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, c_nation, selection_flags, ht_c, c_len, num_tile_items);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags))
+  {
+    return;
+  }
+
+  // Part: existence probe only.
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_partkey + tile_offset, items, num_tile_items, selection_flags);
+  BlockProbeAndPHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, selection_flags, ht_p, p_len, num_tile_items);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags))
+  {
+    return;
+  }
+
+  // Date: fetches the year; 19920101 is the key minimum of ht_d.
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_orderdate + tile_offset, items, num_tile_items, selection_flags);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, year, selection_flags, ht_d, d_len, 19920101, num_tile_items);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags))
+  {
+    return;
+  }
+
+  // From here on `items` holds lo_supplycost.
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_revenue + tile_offset, revenue, num_tile_items, selection_flags);
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_supplycost + tile_offset, items, num_tile_items, selection_flags);
+
+#pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+  {
+    if (threadIdx.x + (BLOCK_THREADS * ITEM) < num_tile_items)
+    {
+      if (selection_flags[ITEM])
+      {
+        int hash = (c_nation[ITEM] * 7 + (year[ITEM] - 1992)) %
+                   ((1998 - 1992 + 1) * 25);
+        res[hash * 4] = year[ITEM];
+        res[hash * 4 + 1] = c_nation[ITEM];
+        // A negative profit is converted to unsigned and added modulo 2^64,
+        // which yields the correct two's-complement 64-bit sum.
+        atomicAdd(reinterpret_cast<unsigned long long *>(&res[hash * 4 + 2]),
+                  (long long)(revenue[ITEM] - items[ITEM]));
+      }
+    }
+  }
+}
+
+// Q4.1 build kernel for a key-only hash table (run_q41 launches it for the
+// supplier table with filter_col = s_region and dim_key = s_suppkey).
+//
+// Applies BlockPredEQ(1) to filter_col and passes the flagged dim_key values
+// to BlockBuildSelectivePHT_1 for a table of num_slots slots.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void q41_kernel1(int *filter_col, int *dim_key, int num_tuples,
+                            int *hash_table, int num_slots)
+{
+  // `column` first holds the filter column, then the keys.
+  int column[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+
+  // Every tile is full except possibly the last one.
+  const int tile_start = blockIdx.x * TILE_SIZE;
+  const int tile_count = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  int tile_rows = TILE_SIZE;
+  if (blockIdx.x == tile_count - 1)
+  {
+    tile_rows = num_tuples - tile_start;
+  }
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + tile_start,
+                                                  column, tile_rows);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(column, 1, keep, tile_rows);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_start, column,
+                                                  tile_rows);
+  BlockBuildSelectivePHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      column, keep, hash_table, num_slots, tile_rows);
+}
+
+// Q4.1 build kernel for a key-only hash table (run_q41 launches it for the
+// part table with filter_col = p_mfgr and dim_key = p_partkey).
+//
+// Applies BlockPredEQ(0) followed by BlockPredOrEQ(1) to filter_col --
+// presumably keeping rows whose value is 0 or 1 -- and passes the flagged
+// dim_key values to BlockBuildSelectivePHT_1 for a table of num_slots slots.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void q41_kernel3(int *filter_col, int *dim_key, int num_tuples,
+                            int *hash_table, int num_slots)
+{
+  // `column` first holds the filter column, then the keys.
+  int column[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+
+  // Every tile is full except possibly the last one.
+  const int tile_start = blockIdx.x * TILE_SIZE;
+  const int tile_count = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  int tile_rows = TILE_SIZE;
+  if (blockIdx.x == tile_count - 1)
+  {
+    tile_rows = num_tuples - tile_start;
+  }
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + tile_start,
+                                                  column, tile_rows);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(column, 0, keep, tile_rows);
+  BlockPredOrEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(column, 1, keep,
+                                                      tile_rows);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_start, column,
+                                                  tile_rows);
+  BlockBuildSelectivePHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      column, keep, hash_table, num_slots, tile_rows);
+}
+
+// Q4.1 build kernel for a key/value hash table (run_q41 launches it for the
+// customer table with filter_col = c_region, dim_key = c_custkey and
+// dim_val = c_nation).
+//
+// Applies BlockPredEQ(1) to filter_col and passes the flagged
+// (dim_key, dim_val) pairs to BlockBuildSelectivePHT_2 for a table of
+// num_slots slots.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void q41_kernel2(int *filter_col, int *dim_key, int *dim_val,
+                            int num_tuples, int *hash_table, int num_slots)
+{
+  // `column` first holds the filter column, then the keys.
+  int column[ITEMS_PER_THREAD];
+  int payload[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+
+  // Every tile is full except possibly the last one.
+  const int tile_start = blockIdx.x * TILE_SIZE;
+  const int tile_count = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  int tile_rows = TILE_SIZE;
+  if (blockIdx.x == tile_count - 1)
+  {
+    tile_rows = num_tuples - tile_start;
+  }
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + tile_start,
+                                                  column, tile_rows);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(column, 1, keep, tile_rows);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_start, column,
+                                                  tile_rows);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_start, payload,
+                                                  tile_rows);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      column, payload, keep, hash_table, num_slots, tile_rows);
+}
+
+// Q4.1 build kernel for the date hash table (run_q41 passes
+// dim_key = d_datekey and dim_val = d_year).
+//
+// No filter is applied: the flags come straight from InitFlags, and every
+// (dim_key, dim_val) pair of the tile is passed to BlockBuildSelectivePHT_2
+// together with val_min as the key minimum.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void q41_kernel4(int *dim_key, int *dim_val, int num_tuples,
+                            int *hash_table, int num_slots, int val_min)
+{
+  int keys[ITEMS_PER_THREAD];
+  int payload[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+
+  // Every tile is full except possibly the last one.
+  const int tile_start = blockIdx.x * TILE_SIZE;
+  const int tile_count = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  int tile_rows = TILE_SIZE;
+  if (blockIdx.x == tile_count - 1)
+  {
+    tile_rows = num_tuples - tile_start;
+  }
+
+  InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(keep);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_start, keys,
+                                                  tile_rows);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_start, payload,
+                                                  tile_rows);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, payload, keep, hash_table, num_slots, val_min, tile_rows);
+}
+
+// Host driver for Q4.1.
+//
+// Allocates and zeroes the result buffer (7 * 25 groups, 4 ints each: two
+// int keys plus one 64-bit sum) and four hash tables, launches
+// q41_kernel1..4 to build the supplier, customer, part and date tables and
+// q41_kernel5 to probe lineorder, copies the result buffer to a temporary
+// host array, and releases all device buffers. The host copy is freed
+// immediately and not returned to the caller.
+//
+// All column pointers are handed straight to the kernels, so they must be
+// device-accessible; the *_len arguments are the row counts of the tables.
+void run_q41(int *lo_orderdate, int *lo_custkey, int *lo_partkey,
+             int *lo_suppkey, int *lo_revenue, int *lo_supplycost, int lo_len,
+             int *d_datekey, int *d_year, int d_len, int *p_partkey,
+             int *p_mfgr, int p_len, int *s_suppkey, int *s_region, int s_len,
+             int *c_custkey, int *c_region, int *c_nation, int c_len,
+             cub::CachingDeviceAllocator &g_allocator)
+{
+  // Sizes
+  const int d_val_len = 19981230 - 19920101 + 1;
+  const int res_size = ((1998 - 1992 + 1) * 25);
+  const int ht_entries = 4; // int,int,long long
+  const int res_array_size = res_size * ht_entries;
+
+  const size_t res_bytes = res_array_size * sizeof(int);
+  const size_t ht_d_bytes = 2 * d_val_len * sizeof(int);
+  const size_t ht_s_bytes = 2 * s_len * sizeof(int);
+  const size_t ht_c_bytes = 2 * c_len * sizeof(int);
+  const size_t ht_p_bytes = 2 * p_len * sizeof(int);
+
+  // Device buffers
+  int *res;
+  int *ht_d, *ht_c, *ht_s, *ht_p;
+  CubDebugExit(g_allocator.DeviceAllocate((void **)&res, res_bytes));
+  CubDebugExit(cudaMemset(res, 0, res_bytes));
+  CubDebugExit(g_allocator.DeviceAllocate((void **)&ht_d, ht_d_bytes));
+  CubDebugExit(g_allocator.DeviceAllocate((void **)&ht_s, ht_s_bytes));
+  CubDebugExit(g_allocator.DeviceAllocate((void **)&ht_c, ht_c_bytes));
+  CubDebugExit(g_allocator.DeviceAllocate((void **)&ht_p, ht_p_bytes));
+  CubDebugExit(cudaMemset(ht_d, 0, ht_d_bytes));
+  CubDebugExit(cudaMemset(ht_s, 0, ht_s_bytes));
+  CubDebugExit(cudaMemset(ht_c, 0, ht_c_bytes));
+  CubDebugExit(cudaMemset(ht_p, 0, ht_p_bytes));
+
+  // Launch geometry: 128 threads x 4 items per thread = one tile per block.
+  const int tile_items = 128 * 4;
+  const int d_val_min = 19920101;
+  const int s_blocks = (s_len + tile_items - 1) / tile_items;
+  const int c_blocks = (c_len + tile_items - 1) / tile_items;
+  const int p_blocks = (p_len + tile_items - 1) / tile_items;
+  const int d_blocks = (d_len + tile_items - 1) / tile_items;
+  const int lo_blocks = (lo_len + tile_items - 1) / tile_items;
+
+  // Build the four dimension tables, then probe.
+  q41_kernel1<128, 4>
+      <<<s_blocks, 128>>>(s_region, s_suppkey, s_len, ht_s, s_len);
+  q41_kernel2<128, 4>
+      <<<c_blocks, 128>>>(c_region, c_custkey, c_nation, c_len, ht_c, c_len);
+  q41_kernel3<128, 4>
+      <<<p_blocks, 128>>>(p_mfgr, p_partkey, p_len, ht_p, p_len);
+  q41_kernel4<128, 4>
+      <<<d_blocks, 128>>>(d_datekey, d_year, d_len, ht_d, d_val_len, d_val_min);
+  q41_kernel5<128, 4><<<lo_blocks, 128>>>(
+      lo_orderdate, lo_partkey, lo_custkey, lo_suppkey, lo_revenue,
+      lo_supplycost, lo_len, ht_p, p_len, ht_s, s_len, ht_c, c_len, ht_d,
+      d_val_len, res);
+
+  // Fetch the result, then drop it again.
+  int *h_res = new int[res_array_size];
+  CubDebugExit(cudaMemcpy(h_res, res, res_bytes, cudaMemcpyDeviceToHost));
+  delete[] h_res;
+
+  CLEANUP(ht_d);
+  CLEANUP(ht_s);
+  CLEANUP(ht_c);
+  CLEANUP(ht_p);
+  CLEANUP(res);
+}
+
+// Q4.2 probe kernel over lineorder. One thread block processes one tile of
+// TILE_SIZE rows (TILE_SIZE is defined outside this kernel).
+//
+// Probe order: supplier (ht_s, fetches the payload -- run_q42 builds it from
+// s_nation), customer (ht_c, existence only), part (ht_p, fetches the payload
+// -- run_q42 builds it from p_category), date (ht_d with key minimum
+// 19920101, fetches the year). After each probe the block returns early when
+// IsTerm reports true for the selection flags.
+//
+// res layout: 6 ints per group -> {year, s_nation, category, unused int,
+// 64-bit sum in ints 4..5}. The sum accumulates lo_revenue - lo_supplycost
+// and is updated with a 64-bit atomicAdd, so `res` must be 8-byte aligned
+// (slot * 6 + 4 is always even).
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void q42_kernel5(int *lo_orderdate, int *lo_partkey, int *lo_custkey,
+                            int *lo_suppkey, int *lo_revenue,
+                            int *lo_supplycost, int lo_len, int *ht_p,
+                            int p_len, int *ht_s, int s_len, int *ht_c,
+                            int c_len, int *ht_d, int d_len, int *res)
+{
+  // Per-thread working set; `scratch` holds the current probe keys and, at
+  // the end, lo_supplycost.
+  int scratch[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+  int category[ITEMS_PER_THREAD];
+  int s_nation[ITEMS_PER_THREAD];
+  int year[ITEMS_PER_THREAD];
+  int revenue[ITEMS_PER_THREAD];
+
+  // Every tile is full except possibly the last one.
+  const int tile_start = blockIdx.x * TILE_SIZE;
+  const int tile_count = (lo_len + TILE_SIZE - 1) / TILE_SIZE;
+  int tile_rows = TILE_SIZE;
+  if (blockIdx.x == tile_count - 1)
+  {
+    tile_rows = lo_len - tile_start;
+  }
+
+  InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(keep);
+
+  // Supplier: fetches s_nation.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_suppkey + tile_start,
+                                                  scratch, tile_rows);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      scratch, s_nation, keep, ht_s, s_len, tile_rows);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keep))
+  {
+    return;
+  }
+
+  // Customer: existence probe only.
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_custkey + tile_start, scratch, tile_rows, keep);
+  BlockProbeAndPHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      scratch, keep, ht_c, c_len, tile_rows);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keep))
+  {
+    return;
+  }
+
+  // Part: fetches the category.
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_partkey + tile_start, scratch, tile_rows, keep);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      scratch, category, keep, ht_p, p_len, tile_rows);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keep))
+  {
+    return;
+  }
+
+  // Date: fetches the year; 19920101 is the key minimum of ht_d.
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_orderdate + tile_start, scratch, tile_rows, keep);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      scratch, year, keep, ht_d, d_len, 19920101, tile_rows);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keep))
+  {
+    return;
+  }
+
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_revenue + tile_start, revenue, tile_rows, keep);
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_supplycost + tile_start, scratch, tile_rows, keep);
+
+  // Number of slots in res: 7 years x 25 x 25.
+  const int group_count = (1998 - 1992 + 1) * 25 * 25;
+
+#pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+  {
+    const bool in_tile = threadIdx.x + (BLOCK_THREADS * ITEM) < tile_rows;
+    if (in_tile && keep[ITEM])
+    {
+      const int slot = ((year[ITEM] - 1992) * 25 * 25 + s_nation[ITEM] * 25 +
+                        category[ITEM]) %
+                       group_count;
+      int *group = res + slot * 6;
+      group[0] = year[ITEM];
+      group[1] = s_nation[ITEM];
+      group[2] = category[ITEM];
+      atomicAdd(reinterpret_cast<unsigned long long *>(&group[4]),
+                (long long)(revenue[ITEM] - scratch[ITEM]));
+    }
+  }
+}
+
+// Q4.2 build kernel for a key-only hash table (run_q42 launches it for the
+// customer table with filter_col = c_region and dim_key = c_custkey).
+//
+// Applies BlockPredEQ(1) to filter_col and passes the flagged dim_key values
+// to BlockBuildSelectivePHT_1 for a table of num_slots slots.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void q42_kernel2(int *filter_col, int *dim_key, int num_tuples,
+                            int *hash_table, int num_slots)
+{
+  // `column` first holds the filter column, then the keys.
+  int column[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+
+  // Every tile is full except possibly the last one.
+  const int tile_start = blockIdx.x * TILE_SIZE;
+  const int tile_count = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  int tile_rows = TILE_SIZE;
+  if (blockIdx.x == tile_count - 1)
+  {
+    tile_rows = num_tuples - tile_start;
+  }
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + tile_start,
+                                                  column, tile_rows);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(column, 1, keep, tile_rows);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_start, column,
+                                                  tile_rows);
+  BlockBuildSelectivePHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      column, keep, hash_table, num_slots, tile_rows);
+}
+
+// Q4.2 build kernel for a key/value hash table (run_q42 launches it for the
+// part table with filter_col = p_mfgr, dim_key = p_partkey and
+// dim_val = p_category).
+//
+// Applies BlockPredEQ(0) followed by BlockPredOrEQ(1) to filter_col --
+// presumably keeping rows whose value is 0 or 1 -- and passes the flagged
+// (dim_key, dim_val) pairs to BlockBuildSelectivePHT_2 for a table of
+// num_slots slots.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void q42_kernel3(int *filter_col, int *dim_key, int *dim_val,
+                            int num_tuples, int *hash_table, int num_slots)
+{
+  // `column` first holds the filter column, then the keys.
+  int column[ITEMS_PER_THREAD];
+  int payload[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+
+  // Every tile is full except possibly the last one.
+  const int tile_start = blockIdx.x * TILE_SIZE;
+  const int tile_count = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  int tile_rows = TILE_SIZE;
+  if (blockIdx.x == tile_count - 1)
+  {
+    tile_rows = num_tuples - tile_start;
+  }
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + tile_start,
+                                                  column, tile_rows);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(column, 0, keep, tile_rows);
+  BlockPredOrEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(column, 1, keep,
+                                                      tile_rows);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_start, column,
+                                                  tile_rows);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_start, payload,
+                                                  tile_rows);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      column, payload, keep, hash_table, num_slots, tile_rows);
+}
+
+// Q4.2 build kernel for a key/value hash table (run_q42 launches it for the
+// supplier table with filter_col = s_region, dim_key = s_suppkey and
+// dim_val = s_nation).
+//
+// Applies BlockPredEQ(1) to filter_col and passes the flagged
+// (dim_key, dim_val) pairs to BlockBuildSelectivePHT_2 for a table of
+// num_slots slots.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void q42_kernel1(int *filter_col, int *dim_key, int *dim_val,
+                            int num_tuples, int *hash_table, int num_slots)
+{
+  // `column` first holds the filter column, then the keys.
+  int column[ITEMS_PER_THREAD];
+  int payload[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+
+  // Every tile is full except possibly the last one.
+  const int tile_start = blockIdx.x * TILE_SIZE;
+  const int tile_count = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  int tile_rows = TILE_SIZE;
+  if (blockIdx.x == tile_count - 1)
+  {
+    tile_rows = num_tuples - tile_start;
+  }
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + tile_start,
+                                                  column, tile_rows);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(column, 1, keep, tile_rows);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_start, column,
+                                                  tile_rows);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_start, payload,
+                                                  tile_rows);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      column, payload, keep, hash_table, num_slots, tile_rows);
+}
+
+// Q4.2 build kernel for the date hash table (run_q42 passes
+// dim_key = d_datekey and dim_val = d_year).
+//
+// Applies BlockPredEQ(1997) followed by BlockPredOrEQ(1998) to dim_val --
+// presumably keeping rows whose value is 1997 or 1998 -- and passes the
+// flagged (dim_key, dim_val) pairs to BlockBuildSelectivePHT_2 together with
+// val_min as the key minimum.
+// NOTE(review): InitFlags runs before BlockPredEQ; whether BlockPredEQ
+// overwrites or combines with existing flags is not visible here, so the
+// call is kept as is.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void q42_kernel4(int *dim_key, int *dim_val, int num_tuples,
+                            int *hash_table, int num_slots, int val_min)
+{
+  int payload[ITEMS_PER_THREAD];
+  int keys[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+
+  // Every tile is full except possibly the last one.
+  const int tile_start = blockIdx.x * TILE_SIZE;
+  const int tile_count = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  int tile_rows = TILE_SIZE;
+  if (blockIdx.x == tile_count - 1)
+  {
+    tile_rows = num_tuples - tile_start;
+  }
+
+  InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(keep);
+
+  // Filter on the value column.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_start, payload,
+                                                  tile_rows);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(payload, 1997, keep,
+                                                    tile_rows);
+  BlockPredOrEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(payload, 1998, keep,
+                                                      tile_rows);
+
+  // Insert the flagged rows.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_start, keys,
+                                                  tile_rows);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, payload, keep, hash_table, num_slots, val_min, tile_rows);
+}
+
+// Host driver for Q4.2.
+//
+// Allocates and zeroes the result buffer (7 * 25 * 25 groups, 6 ints each)
+// and four hash tables, launches q42_kernel1..4 to build the supplier,
+// customer, part and date tables and q42_kernel5 to probe lineorder, copies
+// the result buffer to a temporary host array, and releases all device
+// buffers. The host copy is freed immediately and not returned to the caller.
+//
+// All column pointers are handed straight to the kernels, so they must be
+// device-accessible; the *_len arguments are the row counts of the tables.
+void run_q42(int *lo_orderdate, int *lo_custkey, int *lo_partkey,
+             int *lo_suppkey, int *lo_revenue, int *lo_supplycost, int lo_len,
+             int *d_datekey, int *d_year, int d_len, int *p_partkey,
+             int *p_mfgr, int *p_category, int p_len, int *s_suppkey,
+             int *s_region, int *s_nation, int s_len, int *c_custkey,
+             int *c_region, int c_len,
+             cub::CachingDeviceAllocator &g_allocator)
+{
+  // Sizes
+  const int d_val_len = 19981230 - 19920101 + 1;
+  const int res_size = ((1998 - 1992 + 1) * 25 * 25);
+  const int ht_entries = 6;
+  const int res_array_size = res_size * ht_entries;
+
+  const size_t res_bytes = res_array_size * sizeof(int);
+  const size_t ht_d_bytes = 2 * d_val_len * sizeof(int);
+  const size_t ht_s_bytes = 2 * s_len * sizeof(int);
+  const size_t ht_c_bytes = 2 * c_len * sizeof(int);
+  const size_t ht_p_bytes = 2 * p_len * sizeof(int);
+
+  // Device buffers
+  int *res;
+  int *ht_d, *ht_c, *ht_s, *ht_p;
+  CubDebugExit(g_allocator.DeviceAllocate((void **)&res, res_bytes));
+  CubDebugExit(cudaMemset(res, 0, res_bytes));
+  CubDebugExit(g_allocator.DeviceAllocate((void **)&ht_d, ht_d_bytes));
+  CubDebugExit(g_allocator.DeviceAllocate((void **)&ht_s, ht_s_bytes));
+  CubDebugExit(g_allocator.DeviceAllocate((void **)&ht_c, ht_c_bytes));
+  CubDebugExit(g_allocator.DeviceAllocate((void **)&ht_p, ht_p_bytes));
+  CubDebugExit(cudaMemset(ht_d, 0, ht_d_bytes));
+  CubDebugExit(cudaMemset(ht_s, 0, ht_s_bytes));
+  CubDebugExit(cudaMemset(ht_c, 0, ht_c_bytes));
+  CubDebugExit(cudaMemset(ht_p, 0, ht_p_bytes));
+
+  // Launch geometry: 128 threads x 4 items per thread = one tile per block.
+  const int tile_items = 128 * 4;
+  const int d_val_min = 19920101;
+  const int s_blocks = (s_len + tile_items - 1) / tile_items;
+  const int c_blocks = (c_len + tile_items - 1) / tile_items;
+  const int p_blocks = (p_len + tile_items - 1) / tile_items;
+  const int d_blocks = (d_len + tile_items - 1) / tile_items;
+  const int lo_blocks = (lo_len + tile_items - 1) / tile_items;
+
+  // Build the four dimension tables, then probe.
+  q42_kernel1<128, 4>
+      <<<s_blocks, 128>>>(s_region, s_suppkey, s_nation, s_len, ht_s, s_len);
+  q42_kernel2<128, 4>
+      <<<c_blocks, 128>>>(c_region, c_custkey, c_len, ht_c, c_len);
+  q42_kernel3<128, 4>
+      <<<p_blocks, 128>>>(p_mfgr, p_partkey, p_category, p_len, ht_p, p_len);
+  q42_kernel4<128, 4>
+      <<<d_blocks, 128>>>(d_datekey, d_year, d_len, ht_d, d_val_len, d_val_min);
+  q42_kernel5<128, 4><<<lo_blocks, 128>>>(
+      lo_orderdate, lo_partkey, lo_custkey, lo_suppkey, lo_revenue,
+      lo_supplycost, lo_len, ht_p, p_len, ht_s, s_len, ht_c, c_len, ht_d,
+      d_val_len, res);
+
+  // Fetch the result, then drop it again.
+  int *h_res = new int[res_array_size];
+  CubDebugExit(cudaMemcpy(h_res, res, res_bytes, cudaMemcpyDeviceToHost));
+  delete[] h_res;
+
+  CLEANUP(ht_d);
+  CLEANUP(ht_s);
+  CLEANUP(ht_c);
+  CLEANUP(ht_p);
+  CLEANUP(res);
+}
+
+// Q4.3 probe kernel over lineorder. One thread block processes one tile of
+// TILE_SIZE rows (TILE_SIZE is defined outside this kernel).
+//
+// Probe order: supplier (ht_s, fetches a payload kept in `s_city`), customer
+// (ht_c, existence only), part (ht_p, fetches a payload kept in `brand`),
+// date (ht_d with key minimum 19920101, fetches the year). After each probe
+// the block returns early when IsTerm reports true for the selection flags.
+//
+// res layout: 4 ints per group -> {year, s_city, brand, sum}. The sum
+// accumulates lo_revenue - lo_supplycost with a 32-bit atomicAdd.
+// NOTE(review): q41_kernel5 and q42_kernel5 in this file use a 64-bit sum
+// for the same expression; confirm a 32-bit accumulator cannot overflow here.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void q43_kernel5(int *lo_orderdate, int *lo_partkey, int *lo_custkey,
+                            int *lo_suppkey, int *lo_revenue,
+                            int *lo_supplycost, int lo_len, int *ht_p,
+                            int p_len, int *ht_s, int s_len, int *ht_c,
+                            int c_len, int *ht_d, int d_len, int *res)
+{
+  // Per-thread working set; `scratch` holds the current probe keys and, at
+  // the end, lo_supplycost.
+  int scratch[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+  int brand[ITEMS_PER_THREAD];
+  int s_city[ITEMS_PER_THREAD];
+  int year[ITEMS_PER_THREAD];
+  int revenue[ITEMS_PER_THREAD];
+
+  // Every tile is full except possibly the last one.
+  const int tile_start = blockIdx.x * TILE_SIZE;
+  const int tile_count = (lo_len + TILE_SIZE - 1) / TILE_SIZE;
+  int tile_rows = TILE_SIZE;
+  if (blockIdx.x == tile_count - 1)
+  {
+    tile_rows = lo_len - tile_start;
+  }
+
+  InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(keep);
+
+  // Supplier: fetches s_city.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_suppkey + tile_start,
+                                                  scratch, tile_rows);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      scratch, s_city, keep, ht_s, s_len, tile_rows);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keep))
+  {
+    return;
+  }
+
+  // Customer: existence probe only.
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_custkey + tile_start, scratch, tile_rows, keep);
+  BlockProbeAndPHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      scratch, keep, ht_c, c_len, tile_rows);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keep))
+  {
+    return;
+  }
+
+  // Part: fetches the brand.
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_partkey + tile_start, scratch, tile_rows, keep);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      scratch, brand, keep, ht_p, p_len, tile_rows);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keep))
+  {
+    return;
+  }
+
+  // Date: fetches the year; 19920101 is the key minimum of ht_d.
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_orderdate + tile_start, scratch, tile_rows, keep);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      scratch, year, keep, ht_d, d_len, 19920101, tile_rows);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keep))
+  {
+    return;
+  }
+
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_revenue + tile_start, revenue, tile_rows, keep);
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_supplycost + tile_start, scratch, tile_rows, keep);
+
+  // Number of slots in res: 7 years x 250 x 1000.
+  const int group_count = (1998 - 1992 + 1) * 250 * 1000;
+
+#pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+  {
+    const bool in_tile = threadIdx.x + (BLOCK_THREADS * ITEM) < tile_rows;
+    if (in_tile && keep[ITEM])
+    {
+      const int slot = ((year[ITEM] - 1992) * 250 * 1000 +
+                        s_city[ITEM] * 1000 + brand[ITEM]) %
+                       group_count;
+      int *group = res + slot * 4;
+      group[0] = year[ITEM];
+      group[1] = s_city[ITEM];
+      group[2] = brand[ITEM];
+      atomicAdd(&group[3], (revenue[ITEM] - scratch[ITEM]));
+    }
+  }
+}
+
+// Build kernel: rows of the tile whose filter_col value passes
+// BlockPredEQ(..., 1, ...) hand their dim_key to hash_table (keys only).
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void q43_kernel2(int *filter_col, int *dim_key, int num_tuples,
+                            int *hash_table, int num_slots)
+{
+  int first_row = blockIdx.x * TILE_SIZE;
+  int last_tile = (num_tuples + TILE_SIZE - 1) / TILE_SIZE - 1;
+  // Full tiles hold TILE_SIZE rows; the last one holds whatever is left.
+  int rows_in_tile =
+      (blockIdx.x == last_tile) ? num_tuples - first_row : TILE_SIZE;
+
+  int column[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + first_row,
+                                                  column, rows_in_tile);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(column, 1, keep,
+                                                    rows_in_tile);
+
+  // The same buffer is reloaded with the keys before the build step.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + first_row, column,
+                                                  rows_in_tile);
+  BlockBuildSelectivePHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      column, keep, hash_table, num_slots, rows_in_tile);
+}
+
+// Build kernel: rows of the tile whose filter_col value passes
+// BlockPredEQ(..., 3, ...) hand their (dim_key, dim_val) pair to hash_table.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void q43_kernel3(int *filter_col, int *dim_key, int *dim_val,
+                            int num_tuples, int *hash_table, int num_slots)
+{
+  int first_row = blockIdx.x * TILE_SIZE;
+  int last_tile = (num_tuples + TILE_SIZE - 1) / TILE_SIZE - 1;
+  // Only the final tile can be shorter than TILE_SIZE.
+  int rows_in_tile =
+      (blockIdx.x == last_tile) ? num_tuples - first_row : TILE_SIZE;
+
+  int keys[ITEMS_PER_THREAD];
+  int payload[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+
+  // keys first serves as scratch space for the filter column.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + first_row, keys,
+                                                  rows_in_tile);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, 3, keep,
+                                                    rows_in_tile);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + first_row, keys,
+                                                  rows_in_tile);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + first_row, payload,
+                                                  rows_in_tile);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, payload, keep, hash_table, num_slots, rows_in_tile);
+}
+
+// Build kernel: rows of the tile whose filter_col value passes
+// BlockPredEQ(..., 24, ...) hand their (dim_key, dim_val) pair to hash_table.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void q43_kernel1(int *filter_col, int *dim_key, int *dim_val,
+                            int num_tuples, int *hash_table, int num_slots)
+{
+  int first_row = blockIdx.x * TILE_SIZE;
+  int last_tile = (num_tuples + TILE_SIZE - 1) / TILE_SIZE - 1;
+  // Full tiles hold TILE_SIZE rows; the last one holds whatever is left.
+  int rows_in_tile =
+      (blockIdx.x == last_tile) ? num_tuples - first_row : TILE_SIZE;
+
+  int keys[ITEMS_PER_THREAD];
+  int payload[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+
+  // The filter column is staged in keys before the real keys overwrite it.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + first_row, keys,
+                                                  rows_in_tile);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, 24, keep,
+                                                    rows_in_tile);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + first_row, keys,
+                                                  rows_in_tile);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + first_row, payload,
+                                                  rows_in_tile);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, payload, keep, hash_table, num_slots, rows_in_tile);
+}
+
+// Build kernel without a separate filter column: dim_val is tested against
+// 1997 (BlockPredEQ) and 1998 (BlockPredOrEQ), and the flagged
+// (dim_key, dim_val) pairs go to BlockBuildSelectivePHT_2 along with val_min.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void q43_kernel4(int *dim_key, int *dim_val, int num_tuples,
+                            int *hash_table, int num_slots, int val_min)
+{
+  int first_row = blockIdx.x * TILE_SIZE;
+  int last_tile = (num_tuples + TILE_SIZE - 1) / TILE_SIZE - 1;
+  // Only the final tile can be shorter than TILE_SIZE.
+  int rows_in_tile =
+      (blockIdx.x == last_tile) ? num_tuples - first_row : TILE_SIZE;
+
+  int values[ITEMS_PER_THREAD];
+  int keys[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+
+  InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(keep);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + first_row, values,
+                                                  rows_in_tile);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(values, 1997, keep,
+                                                    rows_in_tile);
+  BlockPredOrEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(values, 1998, keep,
+                                                      rows_in_tile);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + first_row, keys,
+                                                  rows_in_tile);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, values, keep, hash_table, num_slots, val_min,
+      rows_in_tile);
+}
+
+// Runs Q4.3 once: four build kernels fill ht_s/ht_c/ht_p/ht_d, q43_kernel5 fills res.
+void run_q43(int *lo_orderdate, int *lo_custkey, int *lo_partkey,
+             int *lo_suppkey, int *lo_revenue, int *lo_supplycost, int lo_len,
+             int *d_datekey, int *d_year, int d_len, int *p_partkey,
+             int *p_category, int *p_brand1, int p_len, int *s_suppkey,
+             int *s_nation, int *s_city, int s_len, int *c_custkey,
+             int *c_region, int c_len,
+             cub::CachingDeviceAllocator &g_allocator)
+{
+  // Result table: group_slots slots of ints_per_slot ints each, zeroed up front.
+  int group_slots = (1998 - 1992 + 1) * 250 * 1000;
+  int ints_per_slot = 4;
+  int res_ints = group_slots * ints_per_slot;
+  int date_slots = 19981230 - 19920101 + 1; // slot count handed to the date kernels
+  int *res, *ht_d, *ht_s, *ht_c, *ht_p;
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&res, res_ints * sizeof(int)));
+  CubDebugExit(cudaMemset(res, 0, res_ints * sizeof(int)));
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&ht_d, 2 * date_slots * sizeof(int)));
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&ht_s, 2 * s_len * sizeof(int)));
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&ht_c, 2 * c_len * sizeof(int)));
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&ht_p, 2 * p_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_d, 0, 2 * date_slots * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_s, 0, 2 * s_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_c, 0, 2 * c_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_p, 0, 2 * p_len * sizeof(int)));
+
+  // One block of 128 threads per 512-row tile, for every kernel below.
+  int rows_per_block = 128 * 4;
+  int date_key_min = 19920101;
+  q43_kernel1<128, 4><<<(s_len + rows_per_block - 1) / rows_per_block, 128>>>(
+      s_nation, s_suppkey, s_city, s_len, ht_s, s_len);
+  q43_kernel2<128, 4><<<(c_len + rows_per_block - 1) / rows_per_block, 128>>>(
+      c_region, c_custkey, c_len, ht_c, c_len);
+  q43_kernel3<128, 4><<<(p_len + rows_per_block - 1) / rows_per_block, 128>>>(
+      p_category, p_partkey, p_brand1, p_len, ht_p, p_len);
+  q43_kernel4<128, 4><<<(d_len + rows_per_block - 1) / rows_per_block, 128>>>(
+      d_datekey, d_year, d_len, ht_d, date_slots, date_key_min);
+  q43_kernel5<128, 4><<<(lo_len + rows_per_block - 1) / rows_per_block, 128>>>(
+      lo_orderdate, lo_partkey, lo_custkey, lo_suppkey, lo_revenue,
+      lo_supplycost, lo_len, ht_p, p_len, ht_s, s_len, ht_c, c_len, ht_d,
+      date_slots, res);
+
+  // Copy the result table to the host; the host copy is released right away.
+  int *host_res = new int[res_ints];
+  CubDebugExit(cudaMemcpy(host_res, res, res_ints * sizeof(int),
+                          cudaMemcpyDeviceToHost));
+  delete[] host_res;
+  CLEANUP(ht_d);
+  CLEANUP(ht_s);
+  CLEANUP(ht_c);
+  CLEANUP(ht_p);
+  CLEANUP(res);
+}
+
+/**
+ * Main: loads and uploads every column once, then runs all 13 run_q* functions per trial. No buffer is freed here.
+ */
+int main(int argc, char **argv)
+{
+  int num_trials = 3;
+
+  // Parse the command line; "t" overrides the default trial count
+  CommandLineArgs args(argc, argv);
+  args.GetCmdLineArgument("t", num_trials);
+
+  // Print usage and exit when "help" is given
+  if (args.CheckCmdLineFlag("help"))
+  {
+    printf("%s "
+           "[--t=<num trials>] "
+           "[--v] "
+           "\n",
+           argv[0]);
+    exit(0);
+  }
+
+  // Initialize device
+  CubDebugExit(args.DeviceInit());
+
+  // Load host data: one loadColumn<int> call per column, sized by the table's *_LEN constant
+  int *h_lo_orderdate = loadColumn<int>("lo_orderdate", LO_LEN);
+  int *h_lo_custkey = loadColumn<int>("lo_custkey", LO_LEN);
+  int *h_lo_partkey = loadColumn<int>("lo_partkey", LO_LEN);
+  int *h_lo_suppkey = loadColumn<int>("lo_suppkey", LO_LEN);
+  int *h_lo_discount = loadColumn<int>("lo_discount", LO_LEN);
+  int *h_lo_quantity = loadColumn<int>("lo_quantity", LO_LEN);
+  int *h_lo_extendedprice = loadColumn<int>("lo_extendedprice", LO_LEN);
+  int *h_lo_revenue = loadColumn<int>("lo_revenue", LO_LEN);
+  int *h_lo_supplycost = loadColumn<int>("lo_supplycost", LO_LEN);
+
+  int *h_p_partkey = loadColumn<int>("p_partkey", P_LEN);
+  int *h_p_brand1 = loadColumn<int>("p_brand1", P_LEN);
+  int *h_p_category = loadColumn<int>("p_category", P_LEN);
+  int *h_p_mfgr = loadColumn<int>("p_mfgr", P_LEN);
+
+  int *h_s_suppkey = loadColumn<int>("s_suppkey", S_LEN);
+  int *h_s_region = loadColumn<int>("s_region", S_LEN);
+  int *h_s_nation = loadColumn<int>("s_nation", S_LEN);
+  int *h_s_city = loadColumn<int>("s_city", S_LEN);
+
+  int *h_c_custkey = loadColumn<int>("c_custkey", C_LEN);
+  int *h_c_nation = loadColumn<int>("c_nation", C_LEN);
+  int *h_c_region = loadColumn<int>("c_region", C_LEN);
+  int *h_c_city = loadColumn<int>("c_city", C_LEN);
+
+  int *h_d_datekey = loadColumn<int>("d_datekey", D_LEN);
+  int *h_d_year = loadColumn<int>("d_year", D_LEN);
+  int *h_d_yearmonthnum = loadColumn<int>("d_yearmonthnum", D_LEN);
+
+  // Load device data: each host column is passed to loadToGPU together with g_allocator
+  int *d_lo_orderdate = loadToGPU<int>(h_lo_orderdate, LO_LEN, g_allocator);
+  int *d_lo_custkey = loadToGPU<int>(h_lo_custkey, LO_LEN, g_allocator);
+  int *d_lo_partkey = loadToGPU<int>(h_lo_partkey, LO_LEN, g_allocator);
+  int *d_lo_suppkey = loadToGPU<int>(h_lo_suppkey, LO_LEN, g_allocator);
+  int *d_lo_discount = loadToGPU<int>(h_lo_discount, LO_LEN, g_allocator);
+  int *d_lo_quantity = loadToGPU<int>(h_lo_quantity, LO_LEN, g_allocator);
+  int *d_lo_extendedprice =
+      loadToGPU<int>(h_lo_extendedprice, LO_LEN, g_allocator);
+  int *d_lo_revenue = loadToGPU<int>(h_lo_revenue, LO_LEN, g_allocator);
+  int *d_lo_supplycost = loadToGPU<int>(h_lo_supplycost, LO_LEN, g_allocator);
+
+  int *d_p_partkey = loadToGPU<int>(h_p_partkey, P_LEN, g_allocator);
+  int *d_p_brand1 = loadToGPU<int>(h_p_brand1, P_LEN, g_allocator);
+  int *d_p_category = loadToGPU<int>(h_p_category, P_LEN, g_allocator);
+  int *d_p_mfgr = loadToGPU<int>(h_p_mfgr, P_LEN, g_allocator);
+
+  int *d_s_suppkey = loadToGPU<int>(h_s_suppkey, S_LEN, g_allocator);
+  int *d_s_region = loadToGPU<int>(h_s_region, S_LEN, g_allocator);
+  int *d_s_nation = loadToGPU<int>(h_s_nation, S_LEN, g_allocator);
+  int *d_s_city = loadToGPU<int>(h_s_city, S_LEN, g_allocator);
+
+  int *d_c_custkey = loadToGPU<int>(h_c_custkey, C_LEN, g_allocator);
+  int *d_c_region = loadToGPU<int>(h_c_region, C_LEN, g_allocator);
+  int *d_c_nation = loadToGPU<int>(h_c_nation, C_LEN, g_allocator);
+  int *d_c_city = loadToGPU<int>(h_c_city, C_LEN, g_allocator);
+
+  int *d_d_datekey = loadToGPU<int>(h_d_datekey, D_LEN, g_allocator);
+  int *d_d_year = loadToGPU<int>(h_d_year, D_LEN, g_allocator);
+  int *d_d_yearmonthnum = loadToGPU<int>(h_d_yearmonthnum, D_LEN, g_allocator);
+  // Run queries: seeds holds one id per query (0..12 select run_q11..run_q43 in the switch below).
+  // The ids are shuffled once, before the trial loop, so every trial uses the same order.
+  std::vector<int> seeds = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
+  auto rng = std::default_random_engine{};
+  std::shuffle(std::begin(seeds), std::end(seeds), rng);
+  for (int t = 0; t < num_trials; t++)
+  {
+    for (int s : seeds)
+    {
+      switch (s)
+      {
+      case 0:
+        run_q11(d_lo_orderdate, d_lo_discount, d_lo_quantity,
+                d_lo_extendedprice, LO_LEN, g_allocator);
+        break;
+      case 1:
+        run_q12(d_lo_orderdate, d_lo_discount, d_lo_quantity,
+                d_lo_extendedprice, LO_LEN, g_allocator);
+        break;
+      case 2:
+        run_q13(d_lo_orderdate, d_lo_discount, d_lo_quantity,
+                d_lo_extendedprice, LO_LEN, g_allocator);
+        break;
+      case 3:
+        run_q21(d_lo_orderdate, d_lo_partkey, d_lo_suppkey, d_lo_revenue,
+                LO_LEN, d_p_partkey, d_p_brand1, d_p_category, P_LEN,
+                d_d_datekey, d_d_year, D_LEN, d_s_suppkey, d_s_region, S_LEN,
+                g_allocator);
+        break;
+      case 4:
+        run_q22(d_lo_orderdate, d_lo_partkey, d_lo_suppkey, d_lo_revenue,
+                LO_LEN, d_p_partkey, d_p_brand1, P_LEN, d_d_datekey, d_d_year,
+                D_LEN, d_s_suppkey, d_s_region, S_LEN, g_allocator);
+        break;
+      case 5:
+        run_q23(d_lo_orderdate, d_lo_partkey, d_lo_suppkey, d_lo_revenue,
+                LO_LEN, d_p_partkey, d_p_brand1, P_LEN, d_d_datekey, d_d_year,
+                D_LEN, d_s_suppkey, d_s_region, S_LEN, g_allocator);
+        break;
+      case 6:
+        run_q31(d_lo_orderdate, d_lo_custkey, d_lo_suppkey, d_lo_revenue,
+                LO_LEN, d_d_datekey, d_d_year, D_LEN, d_s_suppkey, d_s_region,
+                d_s_nation, S_LEN, d_c_custkey, d_c_region, d_c_nation, C_LEN,
+                g_allocator);
+        break;
+      case 7:
+        run_q32(d_lo_orderdate, d_lo_custkey, d_lo_suppkey, d_lo_revenue,
+                LO_LEN, d_d_datekey, d_d_year, D_LEN, d_s_suppkey, d_s_nation,
+                d_s_city, S_LEN, d_c_custkey, d_c_nation, d_c_city, C_LEN,
+                g_allocator);
+        break;
+      case 8:
+        run_q33(d_lo_orderdate, d_lo_custkey, d_lo_suppkey, d_lo_revenue,
+                LO_LEN, d_d_datekey, d_d_year, D_LEN, d_s_suppkey, d_s_city,
+                S_LEN, d_c_custkey, d_c_city, C_LEN, g_allocator);
+        break;
+      case 9:
+        run_q34(d_lo_orderdate, d_lo_custkey, d_lo_suppkey, d_lo_revenue,
+                LO_LEN, d_d_datekey, d_d_year, d_d_yearmonthnum, D_LEN,
+                d_s_suppkey, d_s_city, S_LEN, d_c_custkey, d_c_city, C_LEN,
+                g_allocator);
+        break;
+      case 10:
+        run_q41(d_lo_orderdate, d_lo_custkey, d_lo_partkey, d_lo_suppkey,
+                d_lo_revenue, d_lo_supplycost, LO_LEN, d_d_datekey, d_d_year,
+                D_LEN, d_p_partkey, d_p_mfgr, P_LEN, d_s_suppkey, d_s_region,
+                S_LEN, d_c_custkey, d_c_region, d_c_nation, C_LEN, g_allocator);
+        break;
+      case 11:
+        run_q42(d_lo_orderdate, d_lo_custkey, d_lo_partkey, d_lo_suppkey,
+                d_lo_revenue, d_lo_supplycost, LO_LEN, d_d_datekey, d_d_year,
+                D_LEN, d_p_partkey, d_p_mfgr, d_p_category, P_LEN, d_s_suppkey,
+                d_s_region, d_s_nation, S_LEN, d_c_custkey, d_c_region, C_LEN,
+                g_allocator);
+        break;
+      case 12:
+        run_q43(d_lo_orderdate, d_lo_custkey, d_lo_partkey, d_lo_suppkey,
+                d_lo_revenue, d_lo_supplycost, LO_LEN, d_d_datekey, d_d_year,
+                D_LEN, d_p_partkey, d_p_category, d_p_brand1, P_LEN,
+                d_s_suppkey, d_s_nation, d_s_city, S_LEN, d_c_custkey,
+                d_c_region, C_LEN, g_allocator);
+        break;
+      }
+    }
+  }
+}
diff --git a/crystal-opt/src/ssb/gpu_utils.h b/crystal-opt/src/ssb/gpu_utils.h
new file mode 100644
index 0000000..1af7526
--- /dev/null
+++ b/crystal-opt/src/ssb/gpu_utils.h
@@ -0,0 +1,57 @@
+// MIT License
+
+// Copyright (c) 2023 Jiashen Cao
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#pragma once
+// Declares cudaEvent_t `start` and `stop` in the enclosing scope and creates both; nothing here destroys them.
+#define SETUP_TIMING() cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop);
+// Runs f between the `start`/`stop` events (which must be in scope, e.g. via SETUP_TIMING) and stores the elapsed time in t.
+#define TIME_FUNC(f,t) { \
+    cudaEventRecord(start, 0); \
+    f; \
+    cudaEventRecord(stop, 0); \
+    cudaEventSynchronize(stop); \
+    cudaEventElapsedTime(&t, start,stop); \
+}
+// Frees vec through the in-scope g_allocator when vec is non-null. Braced like TIME_FUNC so a later `else` cannot bind to the hidden `if`.
+#define CLEANUP(vec) { if (vec) CubDebugExit(g_allocator.DeviceFree(vec)); }
+// Points vec at a new allocation from the in-scope g_allocator; `size` is forwarded to DeviceAllocate unchanged and the result is checked by CubDebugExit.
+#define ALLOCATE(vec,size) CubDebugExit(g_allocator.DeviceAllocate((void**)&vec, size))
+
+template<typename T> // Copies numEntries elements of T from host src into a new device buffer and returns it.
+T* loadToGPU(T* src, int numEntries, cub::CachingDeviceAllocator& g_allocator) {
+  T* device_copy; // set by DeviceAllocate below
+  CubDebugExit(g_allocator.DeviceAllocate(reinterpret_cast<void**>(&device_copy), numEntries * sizeof(T)));
+  CubDebugExit(cudaMemcpy(device_copy, src, numEntries * sizeof(T), cudaMemcpyHostToDevice));
+  return device_copy;
+}
+// Rows per tile. Expands to the identifiers BLOCK_THREADS and ITEMS_PER_THREAD, so both must be in scope at the point of use.
+#define TILE_SIZE (BLOCK_THREADS * ITEMS_PER_THREAD)
+// Synchronizes the device, then reads cudaGetLastError(); on anything but cudaSuccess it prints the message and exits with -1.
+#define CHECK_ERROR() { \
+  cudaDeviceSynchronize(); \
+  cudaError_t error = cudaGetLastError(); \
+  if(error != cudaSuccess) \
+  { \
+    printf("CUDA error: %s\n", cudaGetErrorString(error)); \
+    exit(-1); \
+  } \
+}
diff --git a/crystal-opt/src/ssb/q11.cu b/crystal-opt/src/ssb/q11.cu
new file mode 100644
index 0000000..0d53d07
--- /dev/null
+++ b/crystal-opt/src/ssb/q11.cu
@@ -0,0 +1,204 @@
+// MIT License
+
+// Copyright (c) 2023 Jiashen Cao
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <curand.h>
+#include <iostream>
+#include <stdio.h>
+
+#include "cub/test/test_util.h"
+#include <cub/util_allocator.cuh>
+#include <cuda.h>
+
+#include "crystal.cuh"
+
+#include "gpu_utils.h"
+#include "ssb_utils.h"
+
+using namespace std;
+
+/**
+ * Globals shared by the functions in this file
+ */
+bool g_verbose = false; // Verbosity flag; NOTE(review): not read by this file's own code
+cub::CachingDeviceAllocator
+    g_allocator(true); // Device-memory allocator that main passes to loadToGPU() and runQuery()
+
+// Q1.1 kernel: each block filters one lineorder tile through the BlockPred*
+// helpers and adds the sum of discount * extendedprice of flagged rows to *revenue.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void QueryKernel(int *lo_orderdate, int *lo_discount,
+                            int *lo_quantity, int *lo_extendedprice,
+                            int lo_num_entries, unsigned long long *revenue) {
+  int items[ITEMS_PER_THREAD];
+  int selection_flags[ITEMS_PER_THREAD];
+  int items2[ITEMS_PER_THREAD];
+  long long sum = 0;
+
+  int tile_offset = blockIdx.x * TILE_SIZE;
+  int num_tiles = (lo_num_entries + TILE_SIZE - 1) / TILE_SIZE;
+  int num_tile_items = TILE_SIZE;
+
+  if (blockIdx.x == num_tiles - 1) {
+    num_tile_items = lo_num_entries - tile_offset;
+  }
+  // lo_orderdate against the bounds 19930000 and 19940000
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_orderdate + tile_offset,
+                                                  items, num_tile_items);
+  BlockPredGT<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, 19930000, selection_flags, num_tile_items);
+  BlockPredAndLT<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, 19940000, selection_flags, num_tile_items);
+  // lo_quantity against the bound 25
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_quantity + tile_offset, items, num_tile_items, selection_flags);
+  BlockPredAndLT<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, 25, selection_flags, num_tile_items);
+  // lo_discount against the bounds 1 and 3; items keeps the discounts for the product below
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_discount + tile_offset, items, num_tile_items, selection_flags);
+  BlockPredAndGTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, 1, selection_flags, num_tile_items);
+  BlockPredAndLTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, 3, selection_flags, num_tile_items);
+
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_extendedprice + tile_offset, items2, num_tile_items, selection_flags);
+
+#pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
+    if (threadIdx.x + (BLOCK_THREADS * ITEM) < num_tile_items)
+      if (selection_flags[ITEM]) // widen before multiplying: int * int can overflow
+        sum += static_cast<long long>(items[ITEM]) * items2[ITEM];
+  }
+
+  __syncthreads();
+
+  static __shared__ long long buffer[32];
+  unsigned long long aggregate =
+      BlockSum<long long, BLOCK_THREADS, ITEMS_PER_THREAD>(sum,
+                                                           (long long *)buffer);
+  __syncthreads();
+
+  if (threadIdx.x == 0) {
+    atomicAdd(revenue, aggregate);
+  }
+}
+
+// Runs Q1.1 once over lo_num_entries rows: prints the revenue and the total
+// wall-clock time, and returns the time measured between the two CUDA events.
+float runQuery(int *lo_orderdate, int *lo_discount, int *lo_quantity,
+               int *lo_extendedprice, int lo_num_entries,
+               cub::CachingDeviceAllocator &g_allocator) {
+  SETUP_TIMING();
+
+  float time_query;
+  chrono::high_resolution_clock::time_point st, finish;
+  st = chrono::high_resolution_clock::now();
+  cudaEventRecord(start, 0);
+
+  unsigned long long *d_sum = NULL;
+  CubDebugExit(g_allocator.DeviceAllocate((void **)&d_sum, sizeof(long long)));
+  // The kernel only ever adds to *d_sum, so a failed clear must not go unnoticed.
+  CubDebugExit(cudaMemset(d_sum, 0, sizeof(long long)));
+
+  // Run
+  int tile_items = 128 * 4;
+  int num_blocks = (lo_num_entries + tile_items - 1) / tile_items;
+  QueryKernel<128, 4><<<num_blocks, 128>>>(lo_orderdate, lo_discount,
+                                           lo_quantity, lo_extendedprice,
+                                           lo_num_entries, d_sum);
+
+  cudaEventRecord(stop, 0);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&time_query, start, stop);
+  // SETUP_TIMING creates both events on every call; destroy them so trials do not leak.
+  cudaEventDestroy(start);
+  cudaEventDestroy(stop);
+
+  unsigned long long revenue;
+  CubDebugExit(
+      cudaMemcpy(&revenue, d_sum, sizeof(long long), cudaMemcpyDeviceToHost));
+  finish = chrono::high_resolution_clock::now();
+  std::chrono::duration<double> diff = finish - st;
+  cout << "Revenue: " << revenue << endl;
+  cout << "Time Taken Total: " << diff.count() * 1000 << endl;
+  CLEANUP(d_sum);
+  return time_query;
+}
+
+/**
+ * Main: runs runQuery num_trials times and prints one JSON timing line per trial
+ */
+int main(int argc, char **argv) {
+  // Command line: "t" overrides the default number of trials
+  int num_trials = 3;
+  CommandLineArgs args(argc, argv);
+  args.GetCmdLineArgument("t", num_trials);
+
+  // "help" prints the usage string and exits before any device work
+  if (args.CheckCmdLineFlag("help")) {
+    printf("%s "
+           "[--t=<num trials>] "
+           "[--v] "
+           "\n",
+           argv[0]);
+    exit(0);
+  }
+
+  // Initialize device
+  CubDebugExit(args.DeviceInit());
+
+  // Host-side columns
+  int *host_orderdate = loadColumn<int>("lo_orderdate", LO_LEN);
+  int *host_discount = loadColumn<int>("lo_discount", LO_LEN);
+  int *host_quantity = loadColumn<int>("lo_quantity", LO_LEN);
+  int *host_extendedprice = loadColumn<int>("lo_extendedprice", LO_LEN);
+  int *host_datekey = loadColumn<int>("d_datekey", D_LEN);
+  int *host_year = loadColumn<int>("d_year", D_LEN);
+
+  cout << "** LOADED DATA **" << endl;
+  cout << "LO_LEN " << LO_LEN << endl;
+
+  // Device-side copies (the two date columns are uploaded but not passed to runQuery)
+  int *dev_orderdate = loadToGPU<int>(host_orderdate, LO_LEN, g_allocator);
+  int *dev_discount = loadToGPU<int>(host_discount, LO_LEN, g_allocator);
+  int *dev_quantity = loadToGPU<int>(host_quantity, LO_LEN, g_allocator);
+  int *dev_extendedprice =
+      loadToGPU<int>(host_extendedprice, LO_LEN, g_allocator);
+  int *dev_datekey = loadToGPU<int>(host_datekey, D_LEN, g_allocator);
+  int *dev_year = loadToGPU<int>(host_year, D_LEN, g_allocator);
+
+  cout << "** LOADED DATA TO GPU **" << endl;
+
+  for (int trial = 0; trial < num_trials; ++trial) {
+    const float time_query = runQuery(dev_orderdate, dev_discount, dev_quantity,
+                                      dev_extendedprice, LO_LEN, g_allocator);
+    cout << "{"
+         << "\"query\":11"
+         << ",\"time_query\":" << time_query << "}" << endl;
+  }
+
+  return 0;
+}
diff --git a/crystal-opt/src/ssb/q12.cu b/crystal-opt/src/ssb/q12.cu
new file mode 100644
index 0000000..572fd47
--- /dev/null
+++ b/crystal-opt/src/ssb/q12.cu
@@ -0,0 +1,206 @@
+// MIT License
+
+// Copyright (c) 2023 Jiashen Cao
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <curand.h>
+#include <iostream>
+#include <stdio.h>
+
+#include "cub/test/test_util.h"
+#include <cub/util_allocator.cuh>
+#include <cuda.h>
+
+#include "crystal.cuh"
+
+#include "gpu_utils.h"
+#include "ssb_utils.h"
+
+using namespace std;
+
+/**
+ * Globals shared by the functions in this file
+ */
+bool g_verbose = false; // Verbosity flag; NOTE(review): not read by this file's own code
+cub::CachingDeviceAllocator
+    g_allocator(true); // Device-memory allocator that main passes to loadToGPU() and runQuery()
+
+// Q1.2 kernel: each block filters one lineorder tile through the BlockPred*
+// helpers and adds the sum of discount * extendedprice of flagged rows to *revenue.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void DeviceSelectIf(int *lo_orderdate, int *lo_discount,
+                               int *lo_quantity, int *lo_extendedprice,
+                               int lo_num_entries,
+                               unsigned long long *revenue) {
+  int items[ITEMS_PER_THREAD];
+  int selection_flags[ITEMS_PER_THREAD];
+  int items2[ITEMS_PER_THREAD];
+  long long sum = 0;
+
+  int tile_offset = blockIdx.x * TILE_SIZE;
+  int num_tiles = (lo_num_entries + TILE_SIZE - 1) / TILE_SIZE;
+  int num_tile_items = TILE_SIZE;
+
+  if (blockIdx.x == num_tiles - 1) {
+    num_tile_items = lo_num_entries - tile_offset;
+  }
+  // lo_orderdate against the bounds 19940101 and 19940131
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_orderdate + tile_offset,
+                                                  items, num_tile_items);
+  BlockPredGTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, 19940101, selection_flags, num_tile_items);
+  BlockPredAndLTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, 19940131, selection_flags, num_tile_items);
+  // lo_quantity against the bounds 26 and 35
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_quantity + tile_offset, items, num_tile_items, selection_flags);
+  BlockPredAndGTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, 26, selection_flags, num_tile_items);
+  BlockPredAndLTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, 35, selection_flags, num_tile_items);
+  // lo_discount against the bounds 4 and 6; items keeps the discounts for the product below
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_discount + tile_offset, items, num_tile_items, selection_flags);
+  BlockPredAndGTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, 4, selection_flags, num_tile_items);
+  BlockPredAndLTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, 6, selection_flags, num_tile_items);
+
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_extendedprice + tile_offset, items2, num_tile_items, selection_flags);
+
+#pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
+    if (threadIdx.x + (BLOCK_THREADS * ITEM) < num_tile_items)
+      if (selection_flags[ITEM]) // widen before multiplying: int * int can overflow
+        sum += static_cast<long long>(items[ITEM]) * items2[ITEM];
+  }
+
+  __syncthreads();
+
+  static __shared__ long long buffer[32];
+  unsigned long long aggregate =
+      BlockSum<long long, BLOCK_THREADS, ITEMS_PER_THREAD>(sum,
+                                                           (long long *)buffer);
+  __syncthreads();
+
+  if (threadIdx.x == 0) {
+    atomicAdd(revenue, aggregate);
+  }
+}
+
+// Runs Q1.2 once over lo_num_entries rows: prints the revenue and the total
+// wall-clock time, and returns the time measured between the two CUDA events.
+float runQuery(int *lo_orderdate, int *lo_discount, int *lo_quantity,
+               int *lo_extendedprice, int lo_num_entries,
+               cub::CachingDeviceAllocator &g_allocator) {
+  SETUP_TIMING();
+
+  float time_query;
+  chrono::high_resolution_clock::time_point st, finish;
+  st = chrono::high_resolution_clock::now();
+  cudaEventRecord(start, 0);
+
+  unsigned long long *d_sum = NULL;
+  CubDebugExit(g_allocator.DeviceAllocate((void **)&d_sum, sizeof(long long)));
+  // The kernel only ever adds to *d_sum, so a failed clear must not go unnoticed.
+  CubDebugExit(cudaMemset(d_sum, 0, sizeof(long long)));
+
+  // Run
+  int tile_items = 128 * 4;
+  DeviceSelectIf<128, 4>
+      <<<(lo_num_entries + tile_items - 1) / tile_items, 128>>>(
+          lo_orderdate, lo_discount, lo_quantity, lo_extendedprice,
+          lo_num_entries, d_sum);
+
+  cudaEventRecord(stop, 0);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&time_query, start, stop);
+  // SETUP_TIMING creates both events on every call; destroy them so trials do not leak.
+  cudaEventDestroy(start);
+  cudaEventDestroy(stop);
+
+  unsigned long long revenue;
+  CubDebugExit(
+      cudaMemcpy(&revenue, d_sum, sizeof(long long), cudaMemcpyDeviceToHost));
+  finish = chrono::high_resolution_clock::now();
+  std::chrono::duration<double> diff = finish - st;
+  cout << "Revenue: " << revenue << endl;
+  cout << "Time Taken Total: " << diff.count() * 1000 << endl;
+  CLEANUP(d_sum);
+  return time_query;
+}
+
+/**
+ * Main: runs runQuery num_trials times and prints one JSON timing line per trial
+ */
+int main(int argc, char **argv) {
+  // Command line: "t" overrides the default number of trials
+  int num_trials = 3;
+  CommandLineArgs args(argc, argv);
+  args.GetCmdLineArgument("t", num_trials);
+
+  // "help" prints the usage string and exits before any device work
+  if (args.CheckCmdLineFlag("help")) {
+    printf("%s "
+           "[--t=<num trials>] "
+           "[--v] "
+           "\n",
+           argv[0]);
+    exit(0);
+  }
+
+  // Initialize device
+  CubDebugExit(args.DeviceInit());
+
+  // Host-side columns
+  int *host_orderdate = loadColumn<int>("lo_orderdate", LO_LEN);
+  int *host_discount = loadColumn<int>("lo_discount", LO_LEN);
+  int *host_quantity = loadColumn<int>("lo_quantity", LO_LEN);
+  int *host_extendedprice = loadColumn<int>("lo_extendedprice", LO_LEN);
+  int *host_datekey = loadColumn<int>("d_datekey", D_LEN);
+  int *host_year = loadColumn<int>("d_year", D_LEN);
+
+  cout << "** LOADED DATA **" << endl;
+
+  // Device-side copies (the two date columns are uploaded but not passed to runQuery)
+  int *dev_orderdate = loadToGPU<int>(host_orderdate, LO_LEN, g_allocator);
+  int *dev_discount = loadToGPU<int>(host_discount, LO_LEN, g_allocator);
+  int *dev_quantity = loadToGPU<int>(host_quantity, LO_LEN, g_allocator);
+  int *dev_extendedprice =
+      loadToGPU<int>(host_extendedprice, LO_LEN, g_allocator);
+  int *dev_datekey = loadToGPU<int>(host_datekey, D_LEN, g_allocator);
+  int *dev_year = loadToGPU<int>(host_year, D_LEN, g_allocator);
+
+  cout << "** LOADED DATA TO GPU **" << endl;
+
+  for (int trial = 0; trial < num_trials; ++trial) {
+    const float time_query = runQuery(dev_orderdate, dev_discount, dev_quantity,
+                                      dev_extendedprice, LO_LEN, g_allocator);
+    cout << "{"
+         << "\"query\":12"
+         << ",\"time_query\":" << time_query << "}" << endl;
+  }
+
+  return 0;
+}
diff --git a/crystal-opt/src/ssb/q13.cu b/crystal-opt/src/ssb/q13.cu
new file mode 100644
index 0000000..cb33971
--- /dev/null
+++ b/crystal-opt/src/ssb/q13.cu
@@ -0,0 +1,207 @@
+// MIT License
+
+// Copyright (c) 2023 Jiashen Cao
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <curand.h>
+#include <iostream>
+#include <stdio.h>
+
+#include "cub/test/test_util.h"
+#include <cub/util_allocator.cuh>
+#include <cuda.h>
+
+#include "crystal.cuh"
+
+#include "gpu_utils.h"
+#include "ssb_utils.h"
+
+using namespace std;
+
+/**
+ * Globals, constants and typedefs
+ */
+bool g_verbose = false; // Verbosity flag (console input/output); not used by this file's own code
+cub::CachingDeviceAllocator
+    g_allocator(true); // Caching device allocator that main() passes to loadToGPU() and runQuery()
+
+// q13 selection/aggregation kernel. Each thread block processes one tile of
+// rows: it applies the orderdate, quantity and discount range predicates and
+// adds the tile's sum of lo_discount * lo_extendedprice to *revenue.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void DeviceSelectIf(int *lo_orderdate, int *lo_discount,
+                               int *lo_quantity, int *lo_extendedprice,
+                               int lo_num_entries,
+                               unsigned long long *revenue) {
+  // Per-thread slots; slot ITEM is tile row threadIdx.x + BLOCK_THREADS * ITEM
+  int items[ITEMS_PER_THREAD];
+  int selection_flags[ITEMS_PER_THREAD];
+  int items2[ITEMS_PER_THREAD];
+  long long sum = 0;
+  // Only the last tile can hold fewer than TILE_SIZE rows
+  int tile_offset = blockIdx.x * TILE_SIZE;
+  int num_tiles = (lo_num_entries + TILE_SIZE - 1) / TILE_SIZE;
+  int num_tile_items =
+      (blockIdx.x == num_tiles - 1) ? lo_num_entries - tile_offset : TILE_SIZE;
+
+  // lo_orderdate: GTE 19940204 and LTE 19940210
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_orderdate + tile_offset,
+                                                  items, num_tile_items);
+  BlockPredGTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, 19940204, selection_flags, num_tile_items);
+  BlockPredAndLTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, 19940210, selection_flags, num_tile_items);
+  // lo_quantity: GTE 26 and LTE 35
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_quantity + tile_offset, items, num_tile_items, selection_flags);
+  BlockPredAndGTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, 26, selection_flags, num_tile_items);
+  BlockPredAndLTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, 35, selection_flags, num_tile_items);
+  // lo_discount: GTE 5 and LTE 7; stays in items for the product below
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_discount + tile_offset, items, num_tile_items, selection_flags);
+  BlockPredAndGTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, 5, selection_flags, num_tile_items);
+  BlockPredAndLTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, 7, selection_flags, num_tile_items);
+  // lo_extendedprice goes to items2
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_extendedprice + tile_offset, items2, num_tile_items, selection_flags);
+
+#pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
+    if (threadIdx.x + (BLOCK_THREADS * ITEM) < num_tile_items &&
+        selection_flags[ITEM]) {
+      // Widen first so the product is not evaluated (and wrapped) in int
+      sum += static_cast<long long>(items[ITEM]) * items2[ITEM];
+    }
+  }
+  __syncthreads();
+  // BlockSum() result (presumably the block-wide total) is added by thread 0
+  static __shared__ long long buffer[32];
+  unsigned long long aggregate =
+      BlockSum<long long, BLOCK_THREADS, ITEMS_PER_THREAD>(sum,
+                                                           (long long *)buffer);
+  __syncthreads();
+  if (threadIdx.x == 0) {
+    atomicAdd(revenue, aggregate);
+  }
+}
+
+// Runs q13 once: zeroes a device accumulator, launches DeviceSelectIf<128, 4>
+// over lo_num_entries rows, prints the revenue and the wall-clock time in ms,
+// releases the accumulator via CLEANUP() and returns the event-measured time.
+float runQuery(int *lo_orderdate, int *lo_discount, int *lo_quantity,
+               int *lo_extendedprice, int lo_num_entries,
+               cub::CachingDeviceAllocator &g_allocator) {
+  SETUP_TIMING(); // presumably declares the start/stop events used below
+
+  float time_query;
+  chrono::high_resolution_clock::time_point st, finish;
+  st = chrono::high_resolution_clock::now();
+  cudaEventRecord(start, 0);
+
+  unsigned long long *d_sum = NULL;
+  CubDebugExit(g_allocator.DeviceAllocate((void **)&d_sum, sizeof(long long)));
+  // Checked like the allocation: a failed clear would corrupt the total
+  CubDebugExit(cudaMemset(d_sum, 0, sizeof(long long)));
+
+  // One 128-thread block per tile of 128 * 4 rows
+  int tile_items = 128 * 4;
+  TIME_FUNC((DeviceSelectIf<128, 4>
+             <<<(lo_num_entries + tile_items - 1) / tile_items, 128>>>(
+                 lo_orderdate, lo_discount, lo_quantity, lo_extendedprice,
+                 lo_num_entries, d_sum)),
+            time_query);
+
+  cudaEventRecord(stop, 0);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&time_query, start, stop);
+
+  unsigned long long revenue;
+  CubDebugExit(
+      cudaMemcpy(&revenue, d_sum, sizeof(long long), cudaMemcpyDeviceToHost));
+
+  finish = chrono::high_resolution_clock::now();
+  std::chrono::duration<double> diff = finish - st;
+  cout << "Revenue: " << revenue << endl;
+  cout << "Time Taken Total: " << diff.count() * 1000 << endl;
+
+  CLEANUP(d_sum);
+  return time_query;
+}
+
+/**
+ * Benchmark driver: loads the lo_* and d_* columns on the host, copies them to
+ * the GPU and prints one JSON timing record per runQuery() trial.
+ */
+int main(int argc, char **argv) {
+  // Three trials by default; the "t" command-line argument is read into it
+  int num_trials = 3;
+  CommandLineArgs args(argc, argv);
+  args.GetCmdLineArgument("t", num_trials);
+
+  // A help request prints the synopsis and ends the program
+  const bool show_usage = args.CheckCmdLineFlag("help");
+  if (show_usage) {
+    printf("%s "
+           "[--t=<num trials>] "
+           "[--v] "
+           "\n",
+           argv[0]);
+    exit(0);
+  }
+
+  // Initialize device
+  CubDebugExit(args.DeviceInit());
+
+  // Host-side columns
+  int *h_lo_orderdate = loadColumn<int>("lo_orderdate", LO_LEN);
+  int *h_lo_discount = loadColumn<int>("lo_discount", LO_LEN);
+  int *h_lo_quantity = loadColumn<int>("lo_quantity", LO_LEN);
+  int *h_lo_extendedprice = loadColumn<int>("lo_extendedprice", LO_LEN);
+  int *h_d_datekey = loadColumn<int>("d_datekey", D_LEN);
+  int *h_d_year = loadColumn<int>("d_year", D_LEN);
+  cout << "** LOADED DATA **" << endl;
+
+  // Device-side copies
+  int *d_lo_orderdate = loadToGPU<int>(h_lo_orderdate, LO_LEN, g_allocator);
+  int *d_lo_discount = loadToGPU<int>(h_lo_discount, LO_LEN, g_allocator);
+  int *d_lo_quantity = loadToGPU<int>(h_lo_quantity, LO_LEN, g_allocator);
+  int *d_lo_extendedprice =
+      loadToGPU<int>(h_lo_extendedprice, LO_LEN, g_allocator);
+  int *d_d_datekey = loadToGPU<int>(h_d_datekey, D_LEN, g_allocator);
+  int *d_d_year = loadToGPU<int>(h_d_year, D_LEN, g_allocator);
+  cout << "** LOADED DATA TO GPU **" << endl;
+
+  for (int trial = 0; trial < num_trials; ++trial) {
+    const float time_query =
+        runQuery(d_lo_orderdate, d_lo_discount, d_lo_quantity,
+                 d_lo_extendedprice, LO_LEN, g_allocator);
+    cout << "{"
+         << "\"query\":13"
+         << ",\"time_query\":" << time_query << "}" << endl;
+  }
+  return 0;
+}
diff --git a/crystal-opt/src/ssb/q21.cu b/crystal-opt/src/ssb/q21.cu
new file mode 100644
index 0000000..3fe9a52
--- /dev/null
+++ b/crystal-opt/src/ssb/q21.cu
@@ -0,0 +1,336 @@
+// MIT License
+
+// Copyright (c) 2023 Jiashen Cao
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <curand.h>
+#include <iostream>
+#include <stdio.h>
+
+#include "cub/test/test_util.h"
+#include <cub/util_allocator.cuh>
+#include <cuda.h>
+
+#include "crystal.cuh"
+
+#include "gpu_utils.h"
+#include "ssb_utils.h"
+
+using namespace std;
+
+/**
+ * Globals, constants and typedefs
+ */
+bool g_verbose = false; // Verbosity flag (console input/output); not used by this file's own code
+cub::CachingDeviceAllocator
+    g_allocator(true); // Caching device allocator that main() passes to loadToGPU() and runQuery()
+
+// q21 probe kernel. Each thread block takes one tile of lineorder rows, probes
+// ht_s, ht_p and ht_d in turn (returning early whenever IsTerm() is true) and
+// adds lo_revenue of the selected rows to their (year, brand) slot in res.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void probe(int *lo_orderdate, int *lo_partkey, int *lo_suppkey,
+                      int *lo_revenue, int lo_len, int *ht_s, int s_len,
+                      int *ht_p, int p_len, int *ht_d, int d_len, int *res) {
+  // Per-thread working arrays for this block's tile
+  int keys[ITEMS_PER_THREAD];
+  int flags[ITEMS_PER_THREAD];
+  int brand[ITEMS_PER_THREAD];
+  int year[ITEMS_PER_THREAD];
+  int revenue[ITEMS_PER_THREAD];
+
+  // Tile bounds: only the final tile can be shorter than TILE_SIZE
+  const int first_row = blockIdx.x * TILE_SIZE;
+  const int last_tile = (lo_len + TILE_SIZE - 1) / TILE_SIZE - 1;
+  const int tile_rows =
+      (blockIdx.x == last_tile) ? lo_len - first_row : TILE_SIZE;
+
+  InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(flags);
+  // Supplier table probe
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_suppkey + first_row, keys,
+                                                  tile_rows);
+  BlockProbeAndPHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, flags, ht_s, s_len, tile_rows);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(flags)) { return; }
+  // Part table probe; brand[] receives the probe's output
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_partkey + first_row, keys, tile_rows, flags);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, brand, flags, ht_p, p_len, tile_rows);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(flags)) { return; }
+  // Date table probe (extra argument 19920101); year[] receives the output
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_orderdate + first_row, keys, tile_rows, flags);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, year, flags, ht_d, d_len, 19920101, tile_rows);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(flags)) { return; }
+
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_revenue + first_row, revenue, tile_rows, flags);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(flags)) { return; }
+
+#pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
+    const bool in_tile = (threadIdx.x + (BLOCK_THREADS * ITEM)) < tile_rows;
+    if (!in_tile || !flags[ITEM]) { continue; }
+    const int hash = (brand[ITEM] * 7 + (year[ITEM] - 1992)) %
+                     ((1998 - 1992 + 1) * (5 * 5 * 40));
+    int *slot = &res[hash * 4];
+    slot[0] = year[ITEM];
+    slot[1] = brand[ITEM];
+    atomicAdd(reinterpret_cast<unsigned long long *>(&slot[2]),
+              (long long)(revenue[ITEM]));
+  }
+}
+
+// Builds ht_s for q21 (launched with s_region as filter_col and s_suppkey as
+// dim_key): keys of rows whose filter value equals 1 go to the PHT builder.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_s(int *filter_col, int *dim_key, int num_tuples,
+                                  int *hash_table, int num_slots) {
+  int col[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+
+  // Tile bounds: only the final tile can be shorter than TILE_SIZE
+  const int first_row = blockIdx.x * TILE_SIZE;
+  const int last_tile = (num_tuples + TILE_SIZE - 1) / TILE_SIZE - 1;
+  const int tile_rows =
+      (blockIdx.x == last_tile) ? num_tuples - first_row : TILE_SIZE;
+  // Selection flags: filter_col == 1
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + first_row, col,
+                                                  tile_rows);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(col, 1, keep, tile_rows);
+
+  // col[] is reused for the keys of the same rows
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + first_row, col,
+                                                  tile_rows);
+  BlockBuildSelectivePHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      col, keep, hash_table, num_slots, tile_rows);
+}
+
+// Builds ht_p for q21 (launched with p_category, p_partkey and p_brand1): rows
+// whose filter value equals 1 hand their (dim_key, dim_val) pair to
+// BlockBuildSelectivePHT_2().
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_p(int *filter_col, int *dim_key, int *dim_val,
+                                  int num_tuples, int *hash_table,
+                                  int num_slots) {
+  int keys[ITEMS_PER_THREAD];
+  int vals[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+
+  // Tile bounds: only the final tile can be shorter than TILE_SIZE
+  const int first_row = blockIdx.x * TILE_SIZE;
+  const int last_tile = (num_tuples + TILE_SIZE - 1) / TILE_SIZE - 1;
+  const int tile_rows =
+      (blockIdx.x == last_tile) ? num_tuples - first_row : TILE_SIZE;
+  // Selection flags: filter_col == 1 (keys[] is scratch space here)
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + first_row,
+                                                  keys, tile_rows);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, 1, keep, tile_rows);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + first_row, keys,
+                                                  tile_rows);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + first_row, vals,
+                                                  tile_rows);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, vals, keep, hash_table, num_slots, tile_rows);
+}
+
+// Builds ht_d for q21: every (dim_key, dim_val) row of the tile is passed to
+// BlockBuildSelectivePHT_2() together with val_min; no predicate is applied.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_d(int *dim_key, int *dim_val, int num_tuples,
+                                  int *hash_table, int num_slots, int val_min) {
+  int keys[ITEMS_PER_THREAD];
+  int vals[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+
+  // Tile bounds: only the final tile can be shorter than TILE_SIZE
+  const int first_row = blockIdx.x * TILE_SIZE;
+  const int last_tile = (num_tuples + TILE_SIZE - 1) / TILE_SIZE - 1;
+  const int tile_rows =
+      (blockIdx.x == last_tile) ? num_tuples - first_row : TILE_SIZE;
+
+  // The flags come straight from InitFlags()
+  InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(keep);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + first_row, keys,
+                                                  tile_rows);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + first_row, vals,
+                                                  tile_rows);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, vals, keep, hash_table, num_slots, val_min, tile_rows);
+}
+
+// Runs q21 once: builds the supplier, part and date hash tables, launches
+// probe<128, 4>, prints every non-empty (year, brand, revenue) group and the
+// wall-clock time in ms, and returns the event-measured time.
+float runQuery(int *lo_orderdate, int *lo_partkey, int *lo_suppkey,
+               int *lo_revenue, int lo_len, int *p_partkey, int *p_brand1,
+               int *p_category, int p_len, int *d_datekey, int *d_year,
+               int d_len, int *s_suppkey, int *s_region, int s_len,
+               cub::CachingDeviceAllocator &g_allocator) {
+  SETUP_TIMING();
+
+  float time_query;
+  const chrono::high_resolution_clock::time_point st =
+      chrono::high_resolution_clock::now();
+  cudaEventRecord(start, 0);
+
+  // Table sizes in bytes: 2 ints per slot
+  const int d_val_min = 19920101;
+  const int d_val_len = 19981230 - d_val_min + 1;
+  const size_t ht_d_bytes = 2 * d_val_len * sizeof(int);
+  const size_t ht_p_bytes = 2 * p_len * sizeof(int);
+  const size_t ht_s_bytes = 2 * s_len * sizeof(int);
+
+  int *ht_d, *ht_p, *ht_s;
+  CubDebugExit(g_allocator.DeviceAllocate((void **)&ht_d, ht_d_bytes));
+  CubDebugExit(g_allocator.DeviceAllocate((void **)&ht_p, ht_p_bytes));
+  CubDebugExit(g_allocator.DeviceAllocate((void **)&ht_s, ht_s_bytes));
+  CubDebugExit(cudaMemset(ht_d, 0, ht_d_bytes));
+  CubDebugExit(cudaMemset(ht_p, 0, ht_p_bytes));
+  CubDebugExit(cudaMemset(ht_s, 0, ht_s_bytes));
+
+  // One 128-thread block per tile of 128 * 4 rows
+  const int tile_items = 128 * 4;
+  const int s_blocks = (s_len + tile_items - 1) / tile_items;
+  const int p_blocks = (p_len + tile_items - 1) / tile_items;
+  const int d_blocks = (d_len + tile_items - 1) / tile_items;
+  const int lo_blocks = (lo_len + tile_items - 1) / tile_items;
+
+  build_hashtable_s<128, 4><<<s_blocks, 128>>>(
+      s_region, s_suppkey, s_len, ht_s, s_len);
+  build_hashtable_p<128, 4><<<p_blocks, 128>>>(
+      p_category, p_partkey, p_brand1, p_len, ht_p, p_len);
+  build_hashtable_d<128, 4><<<d_blocks, 128>>>(
+      d_datekey, d_year, d_len, ht_d, d_val_len, d_val_min);
+
+  // Result table: 4 ints per group slot, zeroed before the probe
+  const int res_size = ((1998 - 1992 + 1) * (5 * 5 * 40));
+  const int res_array_size = res_size * 4;
+  const size_t res_bytes = res_array_size * sizeof(int);
+  int *res;
+  CubDebugExit(g_allocator.DeviceAllocate((void **)&res, res_bytes));
+  CubDebugExit(cudaMemset(res, 0, res_bytes));
+
+  probe<128, 4><<<lo_blocks, 128>>>(
+      lo_orderdate, lo_partkey, lo_suppkey, lo_revenue, lo_len, ht_s, s_len,
+      ht_p, p_len, ht_d, d_val_len, res);
+
+  cudaEventRecord(stop, 0);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&time_query, start, stop);
+
+  int *h_res = new int[res_array_size];
+  CubDebugExit(cudaMemcpy(h_res, res, res_bytes, cudaMemcpyDeviceToHost));
+  const chrono::high_resolution_clock::time_point finish =
+      chrono::high_resolution_clock::now();
+  std::chrono::duration<double> diff = finish - st;
+
+  // Slots whose first int (the year field) is 0 are skipped
+  int res_count = 0;
+  for (int i = 0; i < res_size; i++) {
+    int *group = &h_res[4 * i];
+    if (group[0] == 0) { continue; }
+    cout << group[0] << " " << group[1] << " "
+         << reinterpret_cast<unsigned long long *>(&group[2])[0] << endl;
+    res_count += 1;
+  }
+  cout << "Res Count: " << res_count << endl;
+  cout << "Time Taken Total: " << diff.count() * 1000 << endl;
+
+  delete[] h_res;
+  CLEANUP(res);
+  CLEANUP(ht_d);
+  CLEANUP(ht_p);
+  CLEANUP(ht_s);
+  return time_query;
+}
+
+/**
+ * Benchmark driver: loads the lo_*, p_*, d_* and s_* columns on the host,
+ * copies them to the GPU and prints one JSON record per runQuery() trial.
+ */
+int main(int argc, char **argv) {
+  // One trial by default; the "t" command-line argument is read into it
+  int num_trials = 1;
+  CommandLineArgs args(argc, argv);
+  args.GetCmdLineArgument("t", num_trials);
+
+  // A help request prints the synopsis and ends the program
+  const bool show_usage = args.CheckCmdLineFlag("help");
+  if (show_usage) {
+    printf("%s "
+           "[--t=<num trials>] "
+           "[--v] "
+           "\n",
+           argv[0]);
+    exit(0);
+  }
+
+  // Initialize device
+  CubDebugExit(args.DeviceInit());
+
+  // Host-side columns
+  int *h_lo_orderdate = loadColumn<int>("lo_orderdate", LO_LEN);
+  int *h_lo_partkey = loadColumn<int>("lo_partkey", LO_LEN);
+  int *h_lo_suppkey = loadColumn<int>("lo_suppkey", LO_LEN);
+  int *h_lo_revenue = loadColumn<int>("lo_revenue", LO_LEN);
+  int *h_p_partkey = loadColumn<int>("p_partkey", P_LEN);
+  int *h_p_brand1 = loadColumn<int>("p_brand1", P_LEN);
+  int *h_p_category = loadColumn<int>("p_category", P_LEN);
+
+  int *h_d_datekey = loadColumn<int>("d_datekey", D_LEN);
+  int *h_d_year = loadColumn<int>("d_year", D_LEN);
+
+  int *h_s_suppkey = loadColumn<int>("s_suppkey", S_LEN);
+  int *h_s_region = loadColumn<int>("s_region", S_LEN);
+
+  // Device-side copies
+  int *d_lo_orderdate = loadToGPU<int>(h_lo_orderdate, LO_LEN, g_allocator);
+  int *d_lo_partkey = loadToGPU<int>(h_lo_partkey, LO_LEN, g_allocator);
+  int *d_lo_suppkey = loadToGPU<int>(h_lo_suppkey, LO_LEN, g_allocator);
+  int *d_lo_revenue = loadToGPU<int>(h_lo_revenue, LO_LEN, g_allocator);
+
+  int *d_d_datekey = loadToGPU<int>(h_d_datekey, D_LEN, g_allocator);
+  int *d_d_year = loadToGPU<int>(h_d_year, D_LEN, g_allocator);
+  int *d_p_partkey = loadToGPU<int>(h_p_partkey, P_LEN, g_allocator);
+  int *d_p_brand1 = loadToGPU<int>(h_p_brand1, P_LEN, g_allocator);
+  int *d_p_category = loadToGPU<int>(h_p_category, P_LEN, g_allocator);
+  int *d_s_suppkey = loadToGPU<int>(h_s_suppkey, S_LEN, g_allocator);
+  int *d_s_region = loadToGPU<int>(h_s_region, S_LEN, g_allocator);
+
+  for (int trial = 0; trial < num_trials; ++trial) {
+    const float time_query = runQuery(
+        d_lo_orderdate, d_lo_partkey, d_lo_suppkey, d_lo_revenue, LO_LEN,
+        d_p_partkey, d_p_brand1, d_p_category, P_LEN, d_d_datekey, d_d_year,
+        D_LEN, d_s_suppkey, d_s_region, S_LEN, g_allocator);
+    cout << "{"
+         << "\"query\":21"
+         << ",\"time_query\":" << time_query << "}" << endl;
+  }
+
+  return 0;
+}
diff --git a/crystal-opt/src/ssb/q22.cu b/crystal-opt/src/ssb/q22.cu
new file mode 100644
index 0000000..6640329
--- /dev/null
+++ b/crystal-opt/src/ssb/q22.cu
@@ -0,0 +1,334 @@
+// MIT License
+
+// Copyright (c) 2023 Jiashen Cao
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <curand.h>
+#include <iostream>
+#include <stdio.h>
+
+#include "cub/test/test_util.h"
+#include <cub/util_allocator.cuh>
+#include <cuda.h>
+
+#include "crystal.cuh"
+
+#include "gpu_utils.h"
+#include "ssb_utils.h"
+
+using namespace std;
+
+/**
+ * Globals, constants and typedefs
+ */
+bool g_verbose = false; // Verbosity flag (console input/output); not used by this file's own code
+cub::CachingDeviceAllocator
+    g_allocator(true); // Caching device allocator that main() passes to loadToGPU() and runQuery()
+
+// q22 probe kernel. Each thread block takes one tile of lineorder rows, probes
+// ht_s, ht_p and ht_d in turn (returning early whenever IsTerm() is true) and
+// adds lo_revenue of the selected rows to their (year, brand) slot in res.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void probe(int *lo_orderdate, int *lo_partkey, int *lo_suppkey,
+                      int *lo_revenue, int lo_len, int *ht_s, int s_len,
+                      int *ht_p, int p_len, int *ht_d, int d_len, int *res) {
+  // Per-thread working arrays for this block's tile
+  int keys[ITEMS_PER_THREAD];
+  int flags[ITEMS_PER_THREAD];
+  int brand[ITEMS_PER_THREAD];
+  int year[ITEMS_PER_THREAD];
+  int revenue[ITEMS_PER_THREAD];
+
+  // Tile bounds: only the final tile can be shorter than TILE_SIZE
+  const int first_row = blockIdx.x * TILE_SIZE;
+  const int last_tile = (lo_len + TILE_SIZE - 1) / TILE_SIZE - 1;
+  const int tile_rows =
+      (blockIdx.x == last_tile) ? lo_len - first_row : TILE_SIZE;
+
+  InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(flags);
+  // Supplier table probe
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_suppkey + first_row, keys,
+                                                  tile_rows);
+  BlockProbeAndPHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, flags, ht_s, s_len, tile_rows);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(flags)) { return; }
+  // Part table probe; brand[] receives the probe's output
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_partkey + first_row, keys, tile_rows, flags);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, brand, flags, ht_p, p_len, tile_rows);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(flags)) { return; }
+  // Date table probe (extra argument 19920101); year[] receives the output
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_orderdate + first_row, keys, tile_rows, flags);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, year, flags, ht_d, d_len, 19920101, tile_rows);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(flags)) { return; }
+
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_revenue + first_row, revenue, tile_rows, flags);
+
+#pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
+    const bool in_tile = (threadIdx.x + (BLOCK_THREADS * ITEM)) < tile_rows;
+    if (!in_tile || !flags[ITEM]) { continue; }
+    const int hash = (brand[ITEM] * 7 + (year[ITEM] - 1992)) %
+                     ((1998 - 1992 + 1) * (5 * 5 * 40));
+    int *slot = &res[hash * 4];
+    slot[0] = year[ITEM];
+    slot[1] = brand[ITEM];
+    atomicAdd(reinterpret_cast<unsigned long long *>(&slot[2]),
+              (long long)(revenue[ITEM]));
+  }
+}
+
+// Builds ht_s for q22 (launched with s_region as filter_col and s_suppkey as
+// dim_key): keys of rows whose filter value equals 2 go to the PHT builder.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_s(int *filter_col, int *dim_key, int num_tuples,
+                                  int *hash_table, int num_slots) {
+  int col[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+
+  // Tile bounds: only the final tile can be shorter than TILE_SIZE
+  const int first_row = blockIdx.x * TILE_SIZE;
+  const int last_tile = (num_tuples + TILE_SIZE - 1) / TILE_SIZE - 1;
+  const int tile_rows =
+      (blockIdx.x == last_tile) ? num_tuples - first_row : TILE_SIZE;
+  // Selection flags: filter_col == 2
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + first_row, col,
+                                                  tile_rows);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(col, 2, keep, tile_rows);
+
+  // col[] is reused for the keys of the same rows
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + first_row, col,
+                                                  tile_rows);
+  BlockBuildSelectivePHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      col, keep, hash_table, num_slots, tile_rows);
+}
+
+// Builds ht_p for q22: rows whose dim_val passes GTE 260 and LTE 267 hand their
+// (dim_key, dim_val) pair to BlockBuildSelectivePHT_2().
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_p(int *dim_key, int *dim_val, int num_tuples,
+                                  int *hash_table, int num_slots) {
+  int vals[ITEMS_PER_THREAD];
+  int keys[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+
+  // Tile bounds: only the final tile can be shorter than TILE_SIZE
+  const int first_row = blockIdx.x * TILE_SIZE;
+  const int last_tile = (num_tuples + TILE_SIZE - 1) / TILE_SIZE - 1;
+  const int tile_rows =
+      (blockIdx.x == last_tile) ? num_tuples - first_row : TILE_SIZE;
+  // Selection flags come from the range test on dim_val
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + first_row, vals,
+                                                  tile_rows);
+  BlockPredGTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      vals, 260, keep, tile_rows);
+  BlockPredAndLTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      vals, 267, keep, tile_rows);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + first_row, keys,
+                                                  tile_rows);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, vals, keep, hash_table, num_slots, tile_rows);
+}
+
+// Builds ht_d for q22: every (dim_key, dim_val) row of the tile is passed to
+// BlockBuildSelectivePHT_2() together with val_min; no predicate is applied.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_d(int *dim_key, int *dim_val, int num_tuples,
+                                  int *hash_table, int num_slots, int val_min) {
+  int keys[ITEMS_PER_THREAD];
+  int vals[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+
+  // Tile bounds: only the final tile can be shorter than TILE_SIZE
+  const int first_row = blockIdx.x * TILE_SIZE;
+  const int last_tile = (num_tuples + TILE_SIZE - 1) / TILE_SIZE - 1;
+  const int tile_rows =
+      (blockIdx.x == last_tile) ? num_tuples - first_row : TILE_SIZE;
+
+  // The flags come straight from InitFlags()
+  InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(keep);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + first_row, keys,
+                                                  tile_rows);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + first_row, vals,
+                                                  tile_rows);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, vals, keep, hash_table, num_slots, val_min, tile_rows);
+}
+
+// Runs q22 once: builds the supplier, part and date hash tables, launches
+// probe<128, 4>, prints every non-empty (year, brand, revenue) group and the
+// wall-clock time in ms, and returns the event-measured time.
+float runQuery(int *lo_orderdate, int *lo_partkey, int *lo_suppkey,
+               int *lo_revenue, int lo_len, int *p_partkey, int *p_brand1,
+               int p_len, int *d_datekey, int *d_year, int d_len,
+               int *s_suppkey, int *s_region, int s_len,
+               cub::CachingDeviceAllocator &g_allocator) {
+  SETUP_TIMING();
+
+  float time_query;
+  chrono::high_resolution_clock::time_point st, finish;
+  st = chrono::high_resolution_clock::now();
+  cudaEventRecord(start, 0);
+
+  int *ht_d, *ht_p, *ht_s;
+  int d_val_len = 19981230 - 19920101 + 1;
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&ht_d, 2 * d_val_len * sizeof(int)));
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&ht_p, 2 * p_len * sizeof(int)));
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&ht_s, 2 * s_len * sizeof(int)));
+
+  CubDebugExit(cudaMemset(ht_d, 0, 2 * d_val_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_p, 0, 2 * p_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_s, 0, 2 * s_len * sizeof(int)));
+
+  int tile_items = 128 * 4;
+  build_hashtable_s<128, 4><<<(s_len + tile_items - 1) / tile_items, 128>>>(
+      s_region, s_suppkey, s_len, ht_s, s_len);
+
+  build_hashtable_p<128, 4><<<(p_len + tile_items - 1) / tile_items, 128>>>(
+      p_partkey, p_brand1, p_len, ht_p, p_len);
+
+  int d_val_min = 19920101;
+  build_hashtable_d<128, 4><<<(d_len + tile_items - 1) / tile_items, 128>>>(
+      d_datekey, d_year, d_len, ht_d, d_val_len, d_val_min);
+
+  int *res;
+  int res_size = ((1998 - 1992 + 1) * 1000);
+  int res_array_size = res_size * 4;
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&res, res_array_size * sizeof(int)));
+
+  CubDebugExit(cudaMemset(res, 0, res_array_size * sizeof(int)));
+
+  // Run
+  probe<128, 4><<<(lo_len + tile_items - 1) / tile_items, 128>>>(
+      lo_orderdate, lo_partkey, lo_suppkey, lo_revenue, lo_len, ht_s, s_len,
+      ht_p, p_len, ht_d, d_val_len, res);
+
+  cudaEventRecord(stop, 0);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&time_query, start, stop);
+
+  int *h_res = new int[res_array_size];
+  CubDebugExit(cudaMemcpy(h_res, res, res_array_size * sizeof(int),
+                          cudaMemcpyDeviceToHost));
+
+  finish = chrono::high_resolution_clock::now();
+  std::chrono::duration<double> diff = finish - st;
+
+  cout << "Result:" << endl;
+  int res_count = 0;
+  for (int i = 0; i < res_size; i++) {
+    if (h_res[4 * i] != 0) {
+      cout << h_res[4 * i] << " " << h_res[4 * i + 1] << " "
+           << reinterpret_cast<unsigned long long *>(&h_res[4 * i + 2])[0]
+           << endl;
+      res_count += 1;
+    }
+  }
+
+  cout << "Res Count: " << res_count << endl;
+  cout << "Time Taken Total: " << diff.count() * 1000 << endl;
+
+  delete[] h_res;
+  // res is allocated on every call, so release it along with the hash tables
+  CLEANUP(res);
+  CLEANUP(ht_d);
+  CLEANUP(ht_p);
+  CLEANUP(ht_s);
+
+  return time_query;
+}
+
+/**
+ * Benchmark driver: loads the lo_*, p_*, d_* and s_* columns on the host,
+ * copies them to the GPU and prints one JSON record per runQuery() trial.
+ */
+int main(int argc, char **argv) {
+  // Three trials by default; the "t" command-line argument is read into it
+  int num_trials = 3;
+  CommandLineArgs args(argc, argv);
+  args.GetCmdLineArgument("t", num_trials);
+
+  // A help request prints the synopsis and ends the program
+  const bool show_usage = args.CheckCmdLineFlag("help");
+  if (show_usage) {
+    printf("%s "
+           "[--t=<num trials>] "
+           "[--v] "
+           "\n",
+           argv[0]);
+    exit(0);
+  }
+
+  // Initialize device
+  CubDebugExit(args.DeviceInit());
+
+  // Host-side columns
+  int *h_lo_orderdate = loadColumn<int>("lo_orderdate", LO_LEN);
+  int *h_lo_partkey = loadColumn<int>("lo_partkey", LO_LEN);
+  int *h_lo_suppkey = loadColumn<int>("lo_suppkey", LO_LEN);
+  int *h_lo_revenue = loadColumn<int>("lo_revenue", LO_LEN);
+
+  int *h_p_partkey = loadColumn<int>("p_partkey", P_LEN);
+  int *h_p_brand1 = loadColumn<int>("p_brand1", P_LEN);
+
+  int *h_d_datekey = loadColumn<int>("d_datekey", D_LEN);
+  int *h_d_year = loadColumn<int>("d_year", D_LEN);
+
+  int *h_s_suppkey = loadColumn<int>("s_suppkey", S_LEN);
+  int *h_s_region = loadColumn<int>("s_region", S_LEN);
+
+  // Device-side copies
+  int *d_lo_orderdate = loadToGPU<int>(h_lo_orderdate, LO_LEN, g_allocator);
+  int *d_lo_partkey = loadToGPU<int>(h_lo_partkey, LO_LEN, g_allocator);
+  int *d_lo_suppkey = loadToGPU<int>(h_lo_suppkey, LO_LEN, g_allocator);
+  int *d_lo_revenue = loadToGPU<int>(h_lo_revenue, LO_LEN, g_allocator);
+
+  int *d_d_datekey = loadToGPU<int>(h_d_datekey, D_LEN, g_allocator);
+  int *d_d_year = loadToGPU<int>(h_d_year, D_LEN, g_allocator);
+  int *d_p_partkey = loadToGPU<int>(h_p_partkey, P_LEN, g_allocator);
+  int *d_p_brand1 = loadToGPU<int>(h_p_brand1, P_LEN, g_allocator);
+  int *d_s_suppkey = loadToGPU<int>(h_s_suppkey, S_LEN, g_allocator);
+  int *d_s_region = loadToGPU<int>(h_s_region, S_LEN, g_allocator);
+
+  for (int trial = 0; trial < num_trials; ++trial) {
+    const float time_query = runQuery(
+        d_lo_orderdate, d_lo_partkey, d_lo_suppkey, d_lo_revenue, LO_LEN,
+        d_p_partkey, d_p_brand1, P_LEN, d_d_datekey, d_d_year, D_LEN,
+        d_s_suppkey, d_s_region, S_LEN, g_allocator);
+    cout << "{"
+         << "\"query\":22"
+         << ",\"time_query\":" << time_query << "}" << endl;
+  }
+
+  return 0;
+}
diff --git a/crystal-opt/src/ssb/q23.cu b/crystal-opt/src/ssb/q23.cu
new file mode 100644
index 0000000..e9b4273
--- /dev/null
+++ b/crystal-opt/src/ssb/q23.cu
@@ -0,0 +1,326 @@
+// MIT License
+
+// Copyright (c) 2023 Jiashen Cao
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <curand.h>
+#include <iostream>
+#include <stdio.h>
+
+#include "cub/test/test_util.h"
+#include <cub/util_allocator.cuh>
+#include <cuda.h>
+
+#include "crystal.cuh"
+
+#include "gpu_utils.h"
+#include "ssb_utils.h"
+
+using namespace std;
+
+/**
+ * Globals, constants and typedefs
+ */
+bool g_verbose = false; // Verbosity flag; nothing in this file reads it
+cub::CachingDeviceAllocator
+    g_allocator(true); // Device allocator handed to loadToGPU and runQuery
+
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void probe(int *lo_orderdate, int *lo_partkey, int *lo_suppkey,
+                      int *lo_revenue, int lo_len, int *ht_s, int s_len,
+                      int *ht_p, int p_len, int *ht_d, int d_len, int *res) {
+  // Probes ht_s, ht_p and ht_d for one lineorder tile and accumulates revenue.
+  int items[ITEMS_PER_THREAD];
+  int selection_flags[ITEMS_PER_THREAD];
+  int brand[ITEMS_PER_THREAD];
+  int year[ITEMS_PER_THREAD];
+  int revenue[ITEMS_PER_THREAD];
+
+  int tile_offset = blockIdx.x * TILE_SIZE;
+  int num_tiles = (lo_len + TILE_SIZE - 1) / TILE_SIZE;
+  int num_tile_items = TILE_SIZE;
+  // The last tile holds only the rows that remain after tile_offset.
+  if (blockIdx.x == num_tiles - 1) {
+    num_tile_items = lo_len - tile_offset;
+  }
+
+  InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags);
+  // Stage 1: lo_suppkey vs ht_s; the kernel exits early when IsTerm is true.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_suppkey + tile_offset,
+                                                  items, num_tile_items);
+  BlockProbeAndPHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, selection_flags, ht_s, s_len, num_tile_items);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags)) { return; }
+  // Stage 2: lo_partkey vs ht_p; the probe call also receives the brand array.
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_partkey + tile_offset, items, num_tile_items, selection_flags);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, brand, selection_flags, ht_p, p_len, num_tile_items);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags)) { return; }
+  // Stage 3: lo_orderdate vs ht_d (called with 19920101); receives year.
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_orderdate + tile_offset, items, num_tile_items, selection_flags);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, year, selection_flags, ht_d, d_len, 19920101, num_tile_items);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags)) { return; }
+
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_revenue + tile_offset, revenue, num_tile_items, selection_flags);
+  // Slot layout (4 ints): [0]=year, [1]=brand, [2..3]=64-bit revenue sum.
+#pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
+    if ((threadIdx.x + (BLOCK_THREADS * ITEM)) < num_tile_items) {
+      if (selection_flags[ITEM]) {
+        int hash = (brand[ITEM] * 7 + (year[ITEM] - 1992)) %
+                   ((1998 - 1992 + 1) * (5 * 5 * 40));
+        res[hash * 4] = year[ITEM]; // plain (non-atomic) store
+        res[hash * 4 + 1] = brand[ITEM]; // plain (non-atomic) store
+        atomicAdd(reinterpret_cast<unsigned long long *>(&res[hash * 4 + 2]),
+                  (long long)(revenue[ITEM]));
+      }
+    }
+  }
+}
+
+// Build kernel for ht_s: dim_key rows selected by BlockPredEQ(filter_col, 3).
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_s(int *filter_col, int *dim_key, int num_tuples,
+                                  int *hash_table, int num_slots) {
+  // One tile per block; only the final tile can be shorter than TILE_SIZE.
+  int first_row = blockIdx.x * TILE_SIZE;
+  int last_tile = (num_tuples + TILE_SIZE - 1) / TILE_SIZE - 1;
+  int rows_in_tile =
+      (blockIdx.x == last_tile) ? num_tuples - first_row : TILE_SIZE;
+
+  int keep[ITEMS_PER_THREAD];
+  int column[ITEMS_PER_THREAD];
+
+  // `column` first holds the filter values, then is reused for the keys.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + first_row,
+                                                  column, rows_in_tile);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(column, 3, keep,
+                                                    rows_in_tile);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + first_row, column,
+                                                  rows_in_tile);
+  BlockBuildSelectivePHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      column, keep, hash_table, num_slots, rows_in_tile);
+}
+
+// Build kernel for ht_p: rows selected by BlockPredEQ(dim_val, 260) are handed
+// to BlockBuildSelectivePHT_2 as (dim_key, dim_val) pairs.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_p(int *dim_key, int *dim_val, int num_tuples,
+                                  int *hash_table, int num_slots) {
+  // One tile per block; only the final tile can be shorter than TILE_SIZE.
+  int first_row = blockIdx.x * TILE_SIZE;
+  int last_tile = (num_tuples + TILE_SIZE - 1) / TILE_SIZE - 1;
+  int rows_in_tile =
+      (blockIdx.x == last_tile) ? num_tuples - first_row : TILE_SIZE;
+
+  int keep[ITEMS_PER_THREAD];
+  int vals[ITEMS_PER_THREAD];
+  int keys[ITEMS_PER_THREAD];
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + first_row, vals,
+                                                  rows_in_tile);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(vals, 260, keep,
+                                                    rows_in_tile);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + first_row, keys,
+                                                  rows_in_tile);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, vals, keep, hash_table, num_slots, rows_in_tile);
+}
+
+// Build kernel for ht_d: no predicate is applied; the flags come straight from
+// InitFlags and BlockBuildSelectivePHT_2 receives (dim_key, dim_val), val_min.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_d(int *dim_key, int *dim_val, int num_tuples,
+                                  int *hash_table, int num_slots, int val_min) {
+  // One tile per block; only the final tile can be shorter than TILE_SIZE.
+  int first_row = blockIdx.x * TILE_SIZE;
+  int last_tile = (num_tuples + TILE_SIZE - 1) / TILE_SIZE - 1;
+  int rows_in_tile =
+      (blockIdx.x == last_tile) ? num_tuples - first_row : TILE_SIZE;
+
+  int keep[ITEMS_PER_THREAD];
+  int keys[ITEMS_PER_THREAD];
+  int vals[ITEMS_PER_THREAD];
+
+  InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(keep);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + first_row, keys,
+                                                  rows_in_tile);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + first_row, vals,
+                                                  rows_in_tile);
+
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, vals, keep, hash_table, num_slots, val_min,
+      rows_in_tile);
+}
+
+// Runs SSB Q2.3 once, prints the result groups, and returns the milliseconds
+// between the start/stop CUDA events (hash-table setup through the probe).
+float runQuery(int *lo_orderdate, int *lo_partkey, int *lo_suppkey,
+               int *lo_revenue, int lo_len, int *p_partkey, int *p_brand1,
+               int p_len, int *d_datekey, int *d_year, int d_len,
+               int *s_suppkey, int *s_region, int s_len,
+               cub::CachingDeviceAllocator &g_allocator) {
+  SETUP_TIMING();
+  float time_query;
+  chrono::high_resolution_clock::time_point st, finish;
+  st = chrono::high_resolution_clock::now();
+  cudaEventRecord(start, 0);
+
+  int *ht_d, *ht_p, *ht_s;
+  int d_val_len = 19981230 - 19920101 + 1;
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&ht_d, 2 * d_val_len * sizeof(int)));
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&ht_p, 2 * p_len * sizeof(int)));
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&ht_s, 2 * s_len * sizeof(int)));
+
+  CubDebugExit(cudaMemset(ht_d, 0, 2 * d_val_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_p, 0, 2 * p_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_s, 0, 2 * s_len * sizeof(int)));
+
+  int tile_items = 128 * 4;
+  build_hashtable_s<128, 4><<<(s_len + tile_items - 1) / tile_items, 128>>>(
+      s_region, s_suppkey, s_len, ht_s, s_len);
+
+  build_hashtable_p<128, 4><<<(p_len + tile_items - 1) / tile_items, 128>>>(
+      p_partkey, p_brand1, p_len, ht_p, p_len);
+
+  int d_val_min = 19920101;
+  build_hashtable_d<128, 4><<<(d_len + tile_items - 1) / tile_items, 128>>>(
+      d_datekey, d_year, d_len, ht_d, d_val_len, d_val_min);
+
+  int *res;
+  int res_size = ((1998 - 1992 + 1) * 1000);
+  int res_array_size = res_size * 4;
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&res, res_array_size * sizeof(int)));
+  CubDebugExit(cudaMemset(res, 0, res_array_size * sizeof(int)));
+
+  // Run
+  probe<128, 4><<<(lo_len + tile_items - 1) / tile_items, 128>>>(
+      lo_orderdate, lo_partkey, lo_suppkey, lo_revenue, lo_len, ht_s, s_len,
+      ht_p, p_len, ht_d, d_val_len, res);
+
+  cudaEventRecord(stop, 0);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&time_query, start, stop);
+
+  int *h_res = new int[res_array_size];
+  CubDebugExit(cudaMemcpy(h_res, res, res_array_size * sizeof(int),
+                          cudaMemcpyDeviceToHost));
+  finish = chrono::high_resolution_clock::now();
+  std::chrono::duration<double> diff = finish - st;
+  // Hand the per-call buffers back so repeated trials do not leak device memory.
+  CubDebugExit(g_allocator.DeviceFree(res));
+  CubDebugExit(g_allocator.DeviceFree(ht_s));
+  CubDebugExit(g_allocator.DeviceFree(ht_p));
+  CubDebugExit(g_allocator.DeviceFree(ht_d));
+
+  cout << "Result:" << endl;
+  int res_count = 0;
+  for (int i = 0; i < res_size; i++) {
+    if (h_res[4 * i] != 0) {
+      cout << h_res[4 * i] << " " << h_res[4 * i + 1] << " "
+           << reinterpret_cast<unsigned long long *>(&h_res[4 * i + 2])[0]
+           << endl;
+      res_count += 1;
+    }
+  }
+
+  cout << "Res Count: " << res_count << endl;
+  cout << "Time Taken Total: " << diff.count() * 1000 << endl;
+  delete[] h_res;
+  return time_query;
+}
+
+/**
+ * Main: loads the SSB columns needed by Q2.3, uploads them to the GPU and runs
+ * the query num_trials times (default 3, overridable with --t=<n>).
+ * Each run prints a JSON line with the time that runQuery returned.
+ */
+int main(int argc, char **argv) {
+  int num_trials = 3;
+
+  // Parse the command line
+  CommandLineArgs args(argc, argv);
+  args.GetCmdLineArgument("t", num_trials);
+
+  // --help prints the usage string and stops
+  if (args.CheckCmdLineFlag("help")) {
+    printf("%s [--t=<num trials>] [--v] \n", argv[0]);
+    exit(0);
+  }
+
+  // Initialize the device
+  CubDebugExit(args.DeviceInit());
+
+  // Host copies: lineorder columns first, then part, date and supplier.
+  // NOTE: neither the host nor the device buffers are released before exit.
+  int *host_orderdate = loadColumn<int>("lo_orderdate", LO_LEN);
+  int *host_partkey = loadColumn<int>("lo_partkey", LO_LEN);
+  int *host_suppkey = loadColumn<int>("lo_suppkey", LO_LEN);
+  int *host_revenue = loadColumn<int>("lo_revenue", LO_LEN);
+
+  int *host_p_key = loadColumn<int>("p_partkey", P_LEN);
+  int *host_p_brand = loadColumn<int>("p_brand1", P_LEN);
+
+  int *host_d_key = loadColumn<int>("d_datekey", D_LEN);
+  int *host_d_year = loadColumn<int>("d_year", D_LEN);
+
+  int *host_s_key = loadColumn<int>("s_suppkey", S_LEN);
+  int *host_s_region = loadColumn<int>("s_region", S_LEN);
+
+  // Device copies: lineorder, then date, part and supplier.
+  int *dev_orderdate = loadToGPU<int>(host_orderdate, LO_LEN, g_allocator);
+  int *dev_partkey = loadToGPU<int>(host_partkey, LO_LEN, g_allocator);
+  int *dev_suppkey = loadToGPU<int>(host_suppkey, LO_LEN, g_allocator);
+  int *dev_revenue = loadToGPU<int>(host_revenue, LO_LEN, g_allocator);
+
+  int *dev_d_key = loadToGPU<int>(host_d_key, D_LEN, g_allocator);
+  int *dev_d_year = loadToGPU<int>(host_d_year, D_LEN, g_allocator);
+
+  int *dev_p_key = loadToGPU<int>(host_p_key, P_LEN, g_allocator);
+  int *dev_p_brand = loadToGPU<int>(host_p_brand, P_LEN, g_allocator);
+
+  int *dev_s_key = loadToGPU<int>(host_s_key, S_LEN, g_allocator);
+  int *dev_s_region = loadToGPU<int>(host_s_region, S_LEN, g_allocator);
+
+  // One JSON line per trial, emitted after whatever runQuery itself prints.
+  for (int trial = 0; trial < num_trials; ++trial) {
+    float elapsed =
+        runQuery(dev_orderdate, dev_partkey, dev_suppkey, dev_revenue, LO_LEN,
+                 dev_p_key, dev_p_brand, P_LEN, dev_d_key, dev_d_year, D_LEN,
+                 dev_s_key, dev_s_region, S_LEN, g_allocator);
+    cout << "{\"query\":23"
+         << ",\"time_query\":" << elapsed << "}" << endl;
+  }
+
+  return 0;
+}
diff --git a/crystal-opt/src/ssb/q31.cu b/crystal-opt/src/ssb/q31.cu
new file mode 100644
index 0000000..70eb4ee
--- /dev/null
+++ b/crystal-opt/src/ssb/q31.cu
@@ -0,0 +1,349 @@
+// MIT License
+
+// Copyright (c) 2023 Jiashen Cao
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <curand.h>
+#include <iostream>
+#include <stdio.h>
+
+#include "cub/test/test_util.h"
+#include <cub/util_allocator.cuh>
+#include <cuda.h>
+
+#include "crystal.cuh"
+
+#include "gpu_utils.h"
+#include "ssb_utils.h"
+
+using namespace std;
+
+/**
+ * Globals, constants and typedefs
+ */
+bool g_verbose = false; // Verbosity flag; nothing in this file reads it
+cub::CachingDeviceAllocator
+    g_allocator(true); // Device allocator handed to loadToGPU and runQuery
+
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void probe(int *lo_orderdate, int *lo_custkey, int *lo_suppkey,
+                      int *lo_revenue, int lo_len, int *ht_s, int s_len,
+                      int *ht_c, int c_len, int *ht_d, int d_len, int *res) {
+  // Probes ht_s, ht_c and ht_d for one lineorder tile and accumulates revenue.
+  int items[ITEMS_PER_THREAD];
+  int selection_flags[ITEMS_PER_THREAD];
+  int c_nation[ITEMS_PER_THREAD];
+  int s_nation[ITEMS_PER_THREAD];
+  int year[ITEMS_PER_THREAD];
+  int revenue[ITEMS_PER_THREAD];
+
+  int tile_offset = blockIdx.x * TILE_SIZE;
+  int num_tiles = (lo_len + TILE_SIZE - 1) / TILE_SIZE;
+  int num_tile_items = TILE_SIZE;
+  // The last tile holds only the rows that remain after tile_offset.
+  if (blockIdx.x == num_tiles - 1) {
+    num_tile_items = lo_len - tile_offset;
+  }
+
+  InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags);
+  // Stage 1: lo_suppkey vs ht_s (receives s_nation); exits when IsTerm is true.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_suppkey + tile_offset,
+                                                  items, num_tile_items);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, s_nation, selection_flags, ht_s, s_len, num_tile_items);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags)) { return; }
+  // Stage 2: lo_custkey vs ht_c; the probe call also receives c_nation.
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_custkey + tile_offset, items, num_tile_items, selection_flags);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, c_nation, selection_flags, ht_c, c_len, num_tile_items);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags)) { return; }
+  // Stage 3: lo_orderdate vs ht_d (called with 19920101); receives year.
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_orderdate + tile_offset, items, num_tile_items, selection_flags);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, year, selection_flags, ht_d, d_len, 19920101, num_tile_items);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags)) { return; }
+
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_revenue + tile_offset, revenue, num_tile_items, selection_flags);
+  // Slot layout (6 ints): [0]=year, [1]=c_nation, [2]=s_nation, [4..5]=sum.
+#pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
+    if ((threadIdx.x + (BLOCK_THREADS * ITEM)) < num_tile_items) {
+      if (selection_flags[ITEM]) {
+        int hash = (s_nation[ITEM] * 25 * 7 + c_nation[ITEM] * 7 +
+                    (year[ITEM] - 1992)) %
+                   ((1998 - 1992 + 1) * 25 * 25);
+        res[hash * 6] = year[ITEM];
+        res[hash * 6 + 1] = c_nation[ITEM];
+        res[hash * 6 + 2] = s_nation[ITEM];
+        // res[hash * 6 + 3] is never written; the 64-bit sum starts at + 4.
+        atomicAdd(reinterpret_cast<unsigned long long *>(&res[hash * 6 + 4]),
+                  (long long)(revenue[ITEM]));
+      }
+    }
+  }
+}
+
+// Build kernel for ht_s: rows selected by BlockPredEQ(filter_col, 2) are handed
+// to BlockBuildSelectivePHT_2 as (dim_key, dim_val) pairs.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_s(int *filter_col, int *dim_key, int *dim_val,
+                                  int num_tuples, int *hash_table,
+                                  int num_slots) {
+  // One tile per block; only the final tile can be shorter than TILE_SIZE.
+  int first_row = blockIdx.x * TILE_SIZE;
+  int last_tile = (num_tuples + TILE_SIZE - 1) / TILE_SIZE - 1;
+  int rows_in_tile =
+      (blockIdx.x == last_tile) ? num_tuples - first_row : TILE_SIZE;
+
+  int keep[ITEMS_PER_THREAD];
+  int keys[ITEMS_PER_THREAD]; // holds the filter values until the key load
+  int vals[ITEMS_PER_THREAD];
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + first_row, keys,
+                                                  rows_in_tile);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, 2, keep,
+                                                    rows_in_tile);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + first_row, keys,
+                                                  rows_in_tile);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + first_row, vals,
+                                                  rows_in_tile);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, vals, keep, hash_table, num_slots, rows_in_tile);
+}
+
+// Build kernel for ht_c: rows selected by BlockPredEQ(filter_col, 2) are handed
+// to BlockBuildSelectivePHT_2 as (dim_key, dim_val) pairs.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_c(int *filter_col, int *dim_key, int *dim_val,
+                                  int num_tuples, int *hash_table,
+                                  int num_slots) {
+  // One tile per block; only the final tile can be shorter than TILE_SIZE.
+  int first_row = blockIdx.x * TILE_SIZE;
+  int last_tile = (num_tuples + TILE_SIZE - 1) / TILE_SIZE - 1;
+  int rows_in_tile =
+      (blockIdx.x == last_tile) ? num_tuples - first_row : TILE_SIZE;
+
+  int keep[ITEMS_PER_THREAD];
+  int keys[ITEMS_PER_THREAD]; // holds the filter values until the key load
+  int vals[ITEMS_PER_THREAD];
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + first_row, keys,
+                                                  rows_in_tile);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, 2, keep,
+                                                    rows_in_tile);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + first_row, keys,
+                                                  rows_in_tile);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + first_row, vals,
+                                                  rows_in_tile);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, vals, keep, hash_table, num_slots, rows_in_tile);
+}
+
+// Build kernel for ht_d: dim_val is run through the 1992/1997 predicates, then
+// BlockBuildSelectivePHT_2 gets the caller's val_min (was hard-coded 19920101).
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_d(int *dim_key, int *dim_val, int num_tuples,
+                                  int *hash_table, int num_slots, int val_min) {
+  int items[ITEMS_PER_THREAD];
+  int items2[ITEMS_PER_THREAD];
+  int selection_flags[ITEMS_PER_THREAD];
+  // One tile per block; the last tile may hold fewer than TILE_SIZE rows.
+  int tile_offset = blockIdx.x * TILE_SIZE;
+  int num_tiles = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  int num_tile_items = TILE_SIZE;
+  if (blockIdx.x == num_tiles - 1) {
+    num_tile_items = num_tuples - tile_offset;
+  }
+  // NOTE(review): confirm in pred.cuh that BlockPredLTE keeps the GTE result.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_offset, items,
+                                                  num_tile_items);
+  BlockPredGTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, 1992, selection_flags, num_tile_items);
+  BlockPredLTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, 1997, selection_flags, num_tile_items);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_offset, items2,
+                                                  num_tile_items);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items2, items, selection_flags, hash_table, num_slots, val_min,
+      num_tile_items);
+}
+
+// Runs SSB Q3.1 once, prints the result groups, and returns the milliseconds
+// between the start/stop CUDA events (hash-table setup through the probe).
+float runQuery(int *lo_orderdate, int *lo_custkey, int *lo_suppkey,
+               int *lo_revenue, int lo_len, int *d_datekey, int *d_year,
+               int d_len, int *s_suppkey, int *s_region, int *s_nation,
+               int s_len, int *c_custkey, int *c_region, int *c_nation,
+               int c_len, cub::CachingDeviceAllocator &g_allocator) {
+  SETUP_TIMING();
+  float time_query;
+  chrono::high_resolution_clock::time_point st, finish;
+  st = chrono::high_resolution_clock::now();
+  cudaEventRecord(start, 0);
+
+  int *ht_d, *ht_c, *ht_s;
+  int d_val_len = 19981230 - 19920101 + 1;
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&ht_d, 2 * d_val_len * sizeof(int)));
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&ht_s, 2 * s_len * sizeof(int)));
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&ht_c, 2 * c_len * sizeof(int)));
+  // Zero all three tables; ht_c used to be probed without ever being cleared.
+  CubDebugExit(cudaMemset(ht_d, 0, 2 * d_val_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_s, 0, 2 * s_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_c, 0, 2 * c_len * sizeof(int)));
+
+  int tile_items = 128 * 4;
+  build_hashtable_s<128, 4><<<(s_len + tile_items - 1) / tile_items, 128>>>(
+      s_region, s_suppkey, s_nation, s_len, ht_s, s_len);
+
+  build_hashtable_c<128, 4><<<(c_len + tile_items - 1) / tile_items, 128>>>(
+      c_region, c_custkey, c_nation, c_len, ht_c, c_len);
+
+  int d_val_min = 19920101;
+  build_hashtable_d<128, 4><<<(d_len + tile_items - 1) / tile_items, 128>>>(
+      d_datekey, d_year, d_len, ht_d, d_val_len, d_val_min);
+
+  int *res;
+  int res_size = ((1998 - 1992 + 1) * 25 * 25);
+  int res_array_size = res_size * 6;
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&res, res_array_size * sizeof(int)));
+  CubDebugExit(cudaMemset(res, 0, res_array_size * sizeof(int)));
+
+  // Run
+  probe<128, 4><<<(lo_len + tile_items - 1) / tile_items, 128>>>(
+      lo_orderdate, lo_custkey, lo_suppkey, lo_revenue, lo_len, ht_s, s_len,
+      ht_c, c_len, ht_d, d_val_len, res);
+
+  cudaEventRecord(stop, 0);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&time_query, start, stop);
+
+  int *h_res = new int[res_array_size];
+  CubDebugExit(cudaMemcpy(h_res, res, res_array_size * sizeof(int),
+                          cudaMemcpyDeviceToHost));
+  finish = chrono::high_resolution_clock::now();
+  std::chrono::duration<double> diff = finish - st;
+  // Hand the per-call buffers back so repeated trials do not leak device memory.
+  CubDebugExit(g_allocator.DeviceFree(res));
+  CubDebugExit(g_allocator.DeviceFree(ht_c));
+  CubDebugExit(g_allocator.DeviceFree(ht_s));
+  CubDebugExit(g_allocator.DeviceFree(ht_d));
+
+  cout << "Result:" << endl;
+  int res_count = 0;
+  for (int i = 0; i < res_size; i++) {
+    if (h_res[6 * i] != 0) {
+      cout << h_res[6 * i] << " " << h_res[6 * i + 1] << " " << h_res[6 * i + 2]
+           << " "
+           << reinterpret_cast<unsigned long long *>(&h_res[6 * i + 4])[0]
+           << endl;
+      res_count += 1;
+    }
+  }
+
+  cout << "Res Count: " << res_count << endl;
+  cout << "Time Taken Total: " << diff.count() * 1000 << endl;
+  delete[] h_res;
+  return time_query;
+}
+
+/**
+ * Main: loads the SSB columns needed by Q3.1, uploads them to the GPU and runs
+ * the query num_trials times (default 1, overridable with --t=<n>).
+ * Each run prints a JSON line with the time that runQuery returned.
+ */
+int main(int argc, char **argv) {
+  int num_trials = 1;
+
+  // Parse the command line
+  CommandLineArgs args(argc, argv);
+  args.GetCmdLineArgument("t", num_trials);
+
+  // --help prints the usage string and stops
+  if (args.CheckCmdLineFlag("help")) {
+    printf("%s [--t=<num trials>] [--v] \n", argv[0]);
+    exit(0);
+  }
+
+  // Initialize the device
+  CubDebugExit(args.DeviceInit());
+
+  // Host copies: lineorder columns first, then date, supplier and customer.
+  // NOTE: neither the host nor the device buffers are released before exit.
+  int *host_orderdate = loadColumn<int>("lo_orderdate", LO_LEN);
+  int *host_custkey = loadColumn<int>("lo_custkey", LO_LEN);
+  int *host_suppkey = loadColumn<int>("lo_suppkey", LO_LEN);
+  int *host_revenue = loadColumn<int>("lo_revenue", LO_LEN);
+
+  int *host_d_key = loadColumn<int>("d_datekey", D_LEN);
+  int *host_d_year = loadColumn<int>("d_year", D_LEN);
+
+  int *host_s_key = loadColumn<int>("s_suppkey", S_LEN);
+  int *host_s_nation = loadColumn<int>("s_nation", S_LEN);
+  int *host_s_region = loadColumn<int>("s_region", S_LEN);
+
+  int *host_c_key = loadColumn<int>("c_custkey", C_LEN);
+  int *host_c_nation = loadColumn<int>("c_nation", C_LEN);
+  int *host_c_region = loadColumn<int>("c_region", C_LEN);
+
+  cout << "** LOADED DATA **" << endl;
+
+  // Device copies: lineorder, then date, supplier and customer.
+  int *dev_orderdate = loadToGPU<int>(host_orderdate, LO_LEN, g_allocator);
+  int *dev_custkey = loadToGPU<int>(host_custkey, LO_LEN, g_allocator);
+  int *dev_suppkey = loadToGPU<int>(host_suppkey, LO_LEN, g_allocator);
+  int *dev_revenue = loadToGPU<int>(host_revenue, LO_LEN, g_allocator);
+
+  int *dev_d_key = loadToGPU<int>(host_d_key, D_LEN, g_allocator);
+  int *dev_d_year = loadToGPU<int>(host_d_year, D_LEN, g_allocator);
+
+  int *dev_s_key = loadToGPU<int>(host_s_key, S_LEN, g_allocator);
+  int *dev_s_region = loadToGPU<int>(host_s_region, S_LEN, g_allocator);
+  int *dev_s_nation = loadToGPU<int>(host_s_nation, S_LEN, g_allocator);
+
+  int *dev_c_key = loadToGPU<int>(host_c_key, C_LEN, g_allocator);
+  int *dev_c_region = loadToGPU<int>(host_c_region, C_LEN, g_allocator);
+  int *dev_c_nation = loadToGPU<int>(host_c_nation, C_LEN, g_allocator);
+
+  cout << "** LOADED DATA TO GPU **" << endl;
+
+  // One JSON line per trial, emitted after whatever runQuery itself prints.
+  for (int trial = 0; trial < num_trials; ++trial) {
+    float elapsed = runQuery(
+        dev_orderdate, dev_custkey, dev_suppkey, dev_revenue, LO_LEN,
+        dev_d_key, dev_d_year, D_LEN, dev_s_key, dev_s_region, dev_s_nation,
+        S_LEN, dev_c_key, dev_c_region, dev_c_nation, C_LEN, g_allocator);
+    cout << "{\"query\":31"
+         << ",\"time_query\":" << elapsed << "}" << endl;
+  }
+
+  return 0;
+}
diff --git a/crystal-opt/src/ssb/q32.cu b/crystal-opt/src/ssb/q32.cu
new file mode 100644
index 0000000..cf9f16a
--- /dev/null
+++ b/crystal-opt/src/ssb/q32.cu
@@ -0,0 +1,343 @@
+// MIT License
+
+// Copyright (c) 2023 Jiashen Cao
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <curand.h>
+#include <iostream>
+#include <stdio.h>
+
+#include "cub/test/test_util.h"
+#include <cub/util_allocator.cuh>
+#include <cuda.h>
+
+#include "crystal.cuh"
+
+#include "gpu_utils.h"
+#include "ssb_utils.h"
+
+using namespace std;
+
+/**
+ * File-scope globals. Nothing in this file's own code reads g_verbose.
+ */
+bool g_verbose = false; // Whether to display input/output to console
+cub::CachingDeviceAllocator
+    g_allocator(true); // Caching device allocator; true = CUB skip_cleanup
+
+// Probe kernel: each thread block takes one TILE_SIZE slice of lineorder, runs
+// BlockProbeAndPHT_2 against ht_s, ht_c and ht_d in turn, and accumulates the
+// revenue of surviving rows into a 4-int (year, customer, supplier, sum) row.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void probe(int *lo_orderdate, int *lo_custkey, int *lo_suppkey,
+                      int *lo_revenue, int lo_len, int *ht_s, int s_len,
+                      int *ht_c, int c_len, int *ht_d, int d_len, int *res) {
+  int keys[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+  int cust_val[ITEMS_PER_THREAD];
+  int supp_val[ITEMS_PER_THREAD];
+  int order_year[ITEMS_PER_THREAD];
+  int rev[ITEMS_PER_THREAD];
+
+  const int tile_start = blockIdx.x * TILE_SIZE;
+  const int last_tile = (lo_len + TILE_SIZE - 1) / TILE_SIZE - 1;
+  // Only the final tile may hold fewer than TILE_SIZE rows.
+  const int valid_items =
+      (blockIdx.x == last_tile) ? lo_len - tile_start : TILE_SIZE;
+
+  InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(keep);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_suppkey + tile_start,
+                                                  keys, valid_items);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, supp_val, keep, ht_s, s_len, valid_items);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keep)) { return; }
+
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_custkey + tile_start, keys, valid_items, keep);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, cust_val, keep, ht_c, c_len, valid_items);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keep)) { return; }
+
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_orderdate + tile_start, keys, valid_items, keep);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, order_year, keep, ht_d, d_len, 19920101, valid_items);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keep)) { return; }
+
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_revenue + tile_start, rev, valid_items, keep);
+
+  // NOTE(review): plain stores below assume equal slot => equal group values.
+#pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
+    if ((threadIdx.x + (BLOCK_THREADS * ITEM)) < valid_items && keep[ITEM]) {
+      const int slot = (supp_val[ITEM] * 250 * 7 + cust_val[ITEM] * 7 +
+                        (order_year[ITEM] - 1992)) %
+                       ((1998 - 1992 + 1) * 250 * 250);
+      int *row = res + slot * 4;
+      row[0] = order_year[ITEM];
+      row[1] = cust_val[ITEM];
+      row[2] = supp_val[ITEM];
+      atomicAdd(&row[3], rev[ITEM]);
+    }
+  }
+}
+
+// Build kernel for the supplier table: rows whose filter_col equals 24 are
+// handed to BlockBuildSelectivePHT_2 as (dim_key, dim_val) pairs.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_s(int *filter_col, int *dim_key, int *dim_val,
+                                  int num_tuples, int *hash_table,
+                                  int num_slots) {
+  int keys[ITEMS_PER_THREAD];
+  int vals[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+
+  const int tile_start = blockIdx.x * TILE_SIZE;
+  const int last_tile = (num_tuples + TILE_SIZE - 1) / TILE_SIZE - 1;
+  const int valid_items =
+      (blockIdx.x == last_tile) ? num_tuples - tile_start : TILE_SIZE;
+
+  // `keys` first holds the filter column, then is reused for the join keys.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + tile_start,
+                                                  keys, valid_items);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, 24, keep,
+                                                    valid_items);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_start, keys,
+                                                  valid_items);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_start, vals,
+                                                  valid_items);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, vals, keep, hash_table, num_slots, valid_items);
+}
+
+// Build kernel for the customer table: rows whose filter_col equals 24 are
+// handed to BlockBuildSelectivePHT_2 as (dim_key, dim_val) pairs.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_c(int *filter_col, int *dim_key, int *dim_val,
+                                  int num_tuples, int *hash_table,
+                                  int num_slots) {
+  int keys[ITEMS_PER_THREAD];
+  int vals[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+
+  const int tile_start = blockIdx.x * TILE_SIZE;
+  const int last_tile = (num_tuples + TILE_SIZE - 1) / TILE_SIZE - 1;
+  const int valid_items =
+      (blockIdx.x == last_tile) ? num_tuples - tile_start : TILE_SIZE;
+
+  // The filter column is loaded into `keys` first and overwritten afterwards.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + tile_start,
+                                                  keys, valid_items);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, 24, keep,
+                                                    valid_items);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_start, keys,
+                                                  valid_items);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_start, vals,
+                                                  valid_items);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, vals, keep, hash_table, num_slots, valid_items);
+}
+
+// Build kernel for the date table: rows with 1992 <= dim_val <= 1997 are
+// handed to BlockBuildSelectivePHT_2 as (dim_key, dim_val) pairs.
+// val_min is forwarded as that helper's extra key-minimum argument (callers
+// pass 19920101); it must equal the constant the probe kernel uses for ht_d.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_d(int *dim_key, int *dim_val, int num_tuples,
+                                  int *hash_table, int num_slots, int val_min) {
+  int years[ITEMS_PER_THREAD];
+  int date_keys[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+
+  const int tile_start = blockIdx.x * TILE_SIZE;
+  const int last_tile = (num_tuples + TILE_SIZE - 1) / TILE_SIZE - 1;
+  const int valid_items =
+      (blockIdx.x == last_tile) ? num_tuples - tile_start : TILE_SIZE;
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_start, years,
+                                                  valid_items);
+  BlockPredGTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(years, 1992, keep,
+                                                     valid_items);
+  BlockPredAndLTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(years, 1997, keep,
+                                                        valid_items);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_start,
+                                                  date_keys, valid_items);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      date_keys, years, keep, hash_table, num_slots, val_min, valid_items);
+}
+
+// Runs SSB Q3.2 once: builds the supplier, customer and date hash tables,
+// probes lineorder, prints the grouped result and returns the GPU time in
+// milliseconds measured by CUDA events around the builds and the probe.
+float runQuery(int *lo_orderdate, int *lo_custkey, int *lo_suppkey,
+               int *lo_revenue, int lo_len, int *d_datekey, int *d_year,
+               int d_len, int *s_suppkey, int *s_nation, int *s_city, int s_len,
+               int *c_custkey, int *c_nation, int *c_city, int c_len,
+               cub::CachingDeviceAllocator &g_allocator) {
+  SETUP_TIMING();
+
+  float time_query;
+  cudaEventRecord(start, 0);
+
+  int *ht_d, *ht_c, *ht_s;
+  int d_val_len = 19981230 - 19920101 + 1;
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&ht_d, 2 * d_val_len * sizeof(int)));
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&ht_s, 2 * s_len * sizeof(int)));
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&ht_c, 2 * c_len * sizeof(int)));
+
+  // DeviceAllocate returns uninitialized memory, so all three tables (ht_c
+  // included) are zeroed before the build kernels populate them.
+  CubDebugExit(cudaMemset(ht_d, 0, 2 * d_val_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_s, 0, 2 * s_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_c, 0, 2 * c_len * sizeof(int)));
+
+  int tile_items = 128 * 4;
+  build_hashtable_s<128, 4><<<(s_len + tile_items - 1) / tile_items, 128>>>(
+      s_nation, s_suppkey, s_city, s_len, ht_s, s_len);
+  build_hashtable_c<128, 4><<<(c_len + tile_items - 1) / tile_items, 128>>>(
+      c_nation, c_custkey, c_city, c_len, ht_c, c_len);
+
+  int d_val_min = 19920101;
+  build_hashtable_d<128, 4><<<(d_len + tile_items - 1) / tile_items, 128>>>(
+      d_datekey, d_year, d_len, ht_d, d_val_len, d_val_min);
+
+  int *res;
+  int res_size = ((1998 - 1992 + 1) * 250 * 250);
+  int res_array_size = res_size * 4;
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&res, res_array_size * sizeof(int)));
+  CubDebugExit(cudaMemset(res, 0, res_array_size * sizeof(int)));
+
+  // Run
+  probe<128, 4><<<(lo_len + tile_items - 1) / tile_items, 128>>>(
+      lo_orderdate, lo_custkey, lo_suppkey, lo_revenue, lo_len, ht_s, s_len,
+      ht_c, c_len, ht_d, d_val_len, res);
+
+  cudaEventRecord(stop, 0);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&time_query, start, stop);
+
+  int *h_res = new int[res_array_size];
+  CubDebugExit(cudaMemcpy(h_res, res, res_array_size * sizeof(int),
+                          cudaMemcpyDeviceToHost));
+  // Release per-run buffers so repeated trials do not leak device memory.
+  CubDebugExit(g_allocator.DeviceFree(ht_d));
+  CubDebugExit(g_allocator.DeviceFree(ht_s));
+  CubDebugExit(g_allocator.DeviceFree(ht_c));
+  CubDebugExit(g_allocator.DeviceFree(res));
+
+  cout << "Result:" << endl;
+  int res_count = 0;
+  for (int i = 0; i < res_size; i++) {
+    if (h_res[4 * i] != 0) {
+      cout << h_res[4 * i] << " " << h_res[4 * i + 1] << " " << h_res[4 * i + 2]
+           << " " << h_res[4 * i + 3] << endl;
+      res_count += 1;
+    }
+  }
+  cout << "Res Count: " << res_count << endl;
+  delete[] h_res;
+
+  return time_query;
+}
+
+/**
+ * Main
+ */
+int main(int argc, char **argv) {
+  // Number of runQuery repetitions; "t" on the command line overrides it.
+  int num_trials = 3;
+
+  // Initialize command line
+  CommandLineArgs args(argc, argv);
+  args.GetCmdLineArgument("t", num_trials);
+
+  // Print usage and stop before any device work.
+  // NOTE(review): the usage text lists --v, but this function never reads it.
+  if (args.CheckCmdLineFlag("help")) {
+    printf("%s [--t=<num trials>] [--v] \n", argv[0]);
+    exit(0);
+  }
+
+  // Initialize device
+  CubDebugExit(args.DeviceInit());
+
+  // Columns read by this query, loaded by name with their table lengths.
+  int *h_lo_orderdate = loadColumn<int>("lo_orderdate", LO_LEN);
+  int *h_lo_custkey = loadColumn<int>("lo_custkey", LO_LEN);
+  int *h_lo_suppkey = loadColumn<int>("lo_suppkey", LO_LEN);
+  int *h_lo_revenue = loadColumn<int>("lo_revenue", LO_LEN);
+
+  int *h_d_datekey = loadColumn<int>("d_datekey", D_LEN);
+  int *h_d_year = loadColumn<int>("d_year", D_LEN);
+
+  int *h_s_suppkey = loadColumn<int>("s_suppkey", S_LEN);
+  int *h_s_nation = loadColumn<int>("s_nation", S_LEN);
+  int *h_s_city = loadColumn<int>("s_city", S_LEN);
+
+  int *h_c_custkey = loadColumn<int>("c_custkey", C_LEN);
+  int *h_c_nation = loadColumn<int>("c_nation", C_LEN);
+  int *h_c_city = loadColumn<int>("c_city", C_LEN);
+
+  cout << "** LOADED DATA **" << endl;
+
+  // Each loaded column is passed through loadToGPU together with g_allocator.
+  // NOTE(review): none of the h_* / d_* buffers is released in this function.
+  int *d_lo_orderdate = loadToGPU<int>(h_lo_orderdate, LO_LEN, g_allocator);
+  int *d_lo_custkey = loadToGPU<int>(h_lo_custkey, LO_LEN, g_allocator);
+  int *d_lo_suppkey = loadToGPU<int>(h_lo_suppkey, LO_LEN, g_allocator);
+  int *d_lo_revenue = loadToGPU<int>(h_lo_revenue, LO_LEN, g_allocator);
+
+  int *d_d_datekey = loadToGPU<int>(h_d_datekey, D_LEN, g_allocator);
+  int *d_d_year = loadToGPU<int>(h_d_year, D_LEN, g_allocator);
+
+  int *d_s_suppkey = loadToGPU<int>(h_s_suppkey, S_LEN, g_allocator);
+  int *d_s_nation = loadToGPU<int>(h_s_nation, S_LEN, g_allocator);
+  int *d_s_city = loadToGPU<int>(h_s_city, S_LEN, g_allocator);
+
+  int *d_c_custkey = loadToGPU<int>(h_c_custkey, C_LEN, g_allocator);
+  int *d_c_nation = loadToGPU<int>(h_c_nation, C_LEN, g_allocator);
+  int *d_c_city = loadToGPU<int>(h_c_city, C_LEN, g_allocator);
+
+  cout << "** LOADED DATA TO GPU **" << endl;
+
+  // Run the query num_trials times; each run prints one JSON line carrying
+  // the value returned by runQuery.
+  for (int trial = 0; trial < num_trials; ++trial) {
+    const float time_query = runQuery(
+        d_lo_orderdate, d_lo_custkey, d_lo_suppkey, d_lo_revenue, LO_LEN,
+        d_d_datekey, d_d_year, D_LEN, d_s_suppkey, d_s_nation, d_s_city, S_LEN,
+        d_c_custkey, d_c_nation, d_c_city, C_LEN, g_allocator);
+    cout << "{\"query\":32,\"time_query\":" << time_query << "}" << endl;
+  }
+
+  return 0;
+}
diff --git a/crystal-opt/src/ssb/q33.cu b/crystal-opt/src/ssb/q33.cu
new file mode 100644
index 0000000..1048f79
--- /dev/null
+++ b/crystal-opt/src/ssb/q33.cu
@@ -0,0 +1,317 @@
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <curand.h>
+#include <iostream>
+#include <stdio.h>
+
+#include "cub/test/test_util.h"
+#include <cub/util_allocator.cuh>
+#include <cuda.h>
+
+#include "crystal.cuh"
+
+#include "gpu_utils.h"
+#include "ssb_utils.h"
+
+using namespace std;
+
+/**
+ * File-scope globals. Nothing in this file's own code reads g_verbose.
+ */
+bool g_verbose = false; // Whether to display input/output to console
+cub::CachingDeviceAllocator
+    g_allocator(true); // Caching device allocator; true = CUB skip_cleanup
+
+// Probe kernel: each thread block takes one TILE_SIZE slice of lineorder, runs
+// BlockProbeAndPHT_2 against ht_s, ht_c and ht_d in turn, and accumulates the
+// revenue of surviving rows into a 4-int (year, customer, supplier, sum) row.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void probe(int *lo_orderdate, int *lo_custkey, int *lo_suppkey,
+                      int *lo_revenue, int lo_len, int *ht_s, int s_len,
+                      int *ht_c, int c_len, int *ht_d, int d_len, int *res) {
+  int keys[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+  int cust_val[ITEMS_PER_THREAD];
+  int supp_val[ITEMS_PER_THREAD];
+  int order_year[ITEMS_PER_THREAD];
+  int rev[ITEMS_PER_THREAD];
+
+  const int tile_start = blockIdx.x * TILE_SIZE;
+  const int last_tile = (lo_len + TILE_SIZE - 1) / TILE_SIZE - 1;
+  // Only the final tile may hold fewer than TILE_SIZE rows.
+  const int valid_items =
+      (blockIdx.x == last_tile) ? lo_len - tile_start : TILE_SIZE;
+
+  InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(keep);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_suppkey + tile_start,
+                                                  keys, valid_items);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, supp_val, keep, ht_s, s_len, valid_items);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keep)) { return; }
+
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_custkey + tile_start, keys, valid_items, keep);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, cust_val, keep, ht_c, c_len, valid_items);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keep)) { return; }
+
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_orderdate + tile_start, keys, valid_items, keep);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, order_year, keep, ht_d, d_len, 19920101, valid_items);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keep)) { return; }
+
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_revenue + tile_start, rev, valid_items, keep);
+
+  // NOTE(review): plain stores below assume equal slot => equal group values.
+#pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
+    if ((threadIdx.x + (BLOCK_THREADS * ITEM)) < valid_items && keep[ITEM]) {
+      const int slot = (supp_val[ITEM] * 250 * 7 + cust_val[ITEM] * 7 +
+                        (order_year[ITEM] - 1992)) %
+                       ((1998 - 1992 + 1) * 250 * 250);
+      int *row = res + slot * 4;
+      row[0] = order_year[ITEM];
+      row[1] = cust_val[ITEM];
+      row[2] = supp_val[ITEM];
+      atomicAdd(&row[3], rev[ITEM]);
+    }
+  }
+}
+
+// Build kernel for the supplier table: dim_val is filtered with
+// BlockPredEQ(231) then BlockPredOrEQ(235); kept rows go into hash_table.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_s(int *dim_key, int *dim_val, int num_tuples,
+                                  int *hash_table, int num_slots) {
+  int vals[ITEMS_PER_THREAD];
+  int keys[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+
+  const int tile_start = blockIdx.x * TILE_SIZE;
+  const int last_tile = (num_tuples + TILE_SIZE - 1) / TILE_SIZE - 1;
+  const int valid_items =
+      (blockIdx.x == last_tile) ? num_tuples - tile_start : TILE_SIZE;
+
+  // The value column doubles as the filter column and the stored payload.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_start, vals,
+                                                  valid_items);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(vals, 231, keep,
+                                                    valid_items);
+  BlockPredOrEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(vals, 235, keep,
+                                                      valid_items);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_start, keys,
+                                                  valid_items);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, vals, keep, hash_table, num_slots, valid_items);
+}
+
+// Build kernel for the customer table: dim_val is filtered with
+// BlockPredEQ(231) then BlockPredOrEQ(235); kept rows go into hash_table.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_c(int *dim_key, int *dim_val, int num_tuples,
+                                  int *hash_table, int num_slots) {
+  int vals[ITEMS_PER_THREAD];
+  int keys[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+
+  const int tile_start = blockIdx.x * TILE_SIZE;
+  const int last_tile = (num_tuples + TILE_SIZE - 1) / TILE_SIZE - 1;
+  const int valid_items =
+      (blockIdx.x == last_tile) ? num_tuples - tile_start : TILE_SIZE;
+
+  // dim_val serves as both the predicate input and the value stored per key.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_start, vals,
+                                                  valid_items);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(vals, 231, keep,
+                                                    valid_items);
+  BlockPredOrEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(vals, 235, keep,
+                                                      valid_items);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_start, keys,
+                                                  valid_items);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, vals, keep, hash_table, num_slots, valid_items);
+}
+
+// Build kernel for the date table: rows with 1992 <= dim_val <= 1997 are
+// handed to BlockBuildSelectivePHT_2 as (dim_key, dim_val) pairs.
+// The upper bound uses BlockPredAndLTE so it combines with the >= 1992 test.
+// val_min (callers pass 19920101) must match the probe kernel's constant.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_d(int *dim_key, int *dim_val, int num_tuples,
+                                  int *hash_table, int num_slots, int val_min) {
+  int years[ITEMS_PER_THREAD];
+  int date_keys[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+
+  const int tile_start = blockIdx.x * TILE_SIZE;
+  const int last_tile = (num_tuples + TILE_SIZE - 1) / TILE_SIZE - 1;
+  const int valid_items =
+      (blockIdx.x == last_tile) ? num_tuples - tile_start : TILE_SIZE;
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_start, years,
+                                                  valid_items);
+  BlockPredGTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(years, 1992, keep,
+                                                     valid_items);
+  BlockPredAndLTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(years, 1997, keep,
+                                                        valid_items);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_start,
+                                                  date_keys, valid_items);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      date_keys, years, keep, hash_table, num_slots, val_min, valid_items);
+}
+
+// Runs SSB Q3.3 once: builds the three hash tables, probes lineorder, prints
+// the grouped result and returns the CUDA-event GPU time in milliseconds.
+float runQuery(int *lo_orderdate, int *lo_custkey, int *lo_suppkey,
+               int *lo_revenue, int lo_len, int *d_datekey, int *d_year,
+               int d_len, int *s_suppkey, int *s_city, int s_len,
+               int *c_custkey, int *c_city, int c_len,
+               cub::CachingDeviceAllocator &g_allocator) {
+  SETUP_TIMING();
+
+  float time_query;
+  chrono::high_resolution_clock::time_point st, finish;
+  st = chrono::high_resolution_clock::now();
+  cudaEventRecord(start, 0);
+
+  int *ht_d, *ht_c, *ht_s;
+  int d_val_len = 19981230 - 19920101 + 1;
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&ht_d, 2 * d_val_len * sizeof(int)));
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&ht_s, 2 * s_len * sizeof(int)));
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&ht_c, 2 * c_len * sizeof(int)));
+
+  // DeviceAllocate returns uninitialized memory: zero every table, ht_c too.
+  CubDebugExit(cudaMemset(ht_d, 0, 2 * d_val_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_s, 0, 2 * s_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_c, 0, 2 * c_len * sizeof(int)));
+
+  int tile_items = 128 * 4;
+  build_hashtable_s<128, 4><<<(s_len + tile_items - 1) / tile_items, 128>>>(
+      s_suppkey, s_city, s_len, ht_s, s_len);
+  build_hashtable_c<128, 4><<<(c_len + tile_items - 1) / tile_items, 128>>>(
+      c_custkey, c_city, c_len, ht_c, c_len);
+
+  int d_val_min = 19920101;
+  build_hashtable_d<128, 4><<<(d_len + tile_items - 1) / tile_items, 128>>>(
+      d_datekey, d_year, d_len, ht_d, d_val_len, d_val_min);
+
+  int *res;
+  int res_size = ((1998 - 1992 + 1) * 250 * 250);
+  int res_array_size = res_size * 4;
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&res, res_array_size * sizeof(int)));
+  CubDebugExit(cudaMemset(res, 0, res_array_size * sizeof(int)));
+
+  // Run
+  probe<128, 4><<<(lo_len + tile_items - 1) / tile_items, 128>>>(
+      lo_orderdate, lo_custkey, lo_suppkey, lo_revenue, lo_len, ht_s, s_len,
+      ht_c, c_len, ht_d, d_val_len, res);
+
+  cudaEventRecord(stop, 0);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&time_query, start, stop);
+
+  int *h_res = new int[res_array_size];
+  CubDebugExit(cudaMemcpy(h_res, res, res_array_size * sizeof(int),
+                          cudaMemcpyDeviceToHost));
+  finish = chrono::high_resolution_clock::now();
+  std::chrono::duration<double> diff = finish - st;
+  // Release per-run buffers (after timing) so trials do not leak device memory.
+  CubDebugExit(g_allocator.DeviceFree(ht_d));
+  CubDebugExit(g_allocator.DeviceFree(ht_s));
+  CubDebugExit(g_allocator.DeviceFree(ht_c));
+  CubDebugExit(g_allocator.DeviceFree(res));
+
+  cout << "Result:" << endl;
+  int res_count = 0;
+  for (int i = 0; i < res_size; i++) {
+    if (h_res[4 * i] != 0) {
+      cout << h_res[4 * i] << " " << h_res[4 * i + 1] << " " << h_res[4 * i + 2]
+           << " " << h_res[4 * i + 3] << endl;
+      res_count += 1;
+    }
+  }
+  cout << "Res Count: " << res_count << endl;
+  cout << "Time Taken Total: " << diff.count() * 1000 << endl;
+  delete[] h_res;
+  return time_query;
+}
+
+/**
+ * Main
+ */
+int main(int argc, char **argv) {
+  // Number of runQuery repetitions; "t" on the command line overrides it.
+  int num_trials = 3;
+
+  // Initialize command line
+  CommandLineArgs args(argc, argv);
+  args.GetCmdLineArgument("t", num_trials);
+
+  // Print usage and stop before any device work.
+  // NOTE(review): the usage text lists --v, but this function never reads it.
+  if (args.CheckCmdLineFlag("help")) {
+    printf("%s [--t=<num trials>] [--v] \n", argv[0]);
+    exit(0);
+  }
+
+  // Initialize device
+  CubDebugExit(args.DeviceInit());
+
+  // Columns read by this query, loaded by name with their table lengths.
+  int *h_lo_orderdate = loadColumn<int>("lo_orderdate", LO_LEN);
+  int *h_lo_custkey = loadColumn<int>("lo_custkey", LO_LEN);
+  int *h_lo_suppkey = loadColumn<int>("lo_suppkey", LO_LEN);
+  int *h_lo_revenue = loadColumn<int>("lo_revenue", LO_LEN);
+
+  int *h_d_datekey = loadColumn<int>("d_datekey", D_LEN);
+  int *h_d_year = loadColumn<int>("d_year", D_LEN);
+
+  int *h_s_suppkey = loadColumn<int>("s_suppkey", S_LEN);
+  int *h_s_city = loadColumn<int>("s_city", S_LEN);
+
+  int *h_c_custkey = loadColumn<int>("c_custkey", C_LEN);
+  int *h_c_city = loadColumn<int>("c_city", C_LEN);
+
+  cout << "** LOADED DATA **" << endl;
+
+  // Each loaded column is passed through loadToGPU together with g_allocator.
+  // NOTE(review): none of the h_* / d_* buffers is released in this function.
+  int *d_lo_orderdate = loadToGPU<int>(h_lo_orderdate, LO_LEN, g_allocator);
+  int *d_lo_custkey = loadToGPU<int>(h_lo_custkey, LO_LEN, g_allocator);
+  int *d_lo_suppkey = loadToGPU<int>(h_lo_suppkey, LO_LEN, g_allocator);
+  int *d_lo_revenue = loadToGPU<int>(h_lo_revenue, LO_LEN, g_allocator);
+
+  int *d_d_datekey = loadToGPU<int>(h_d_datekey, D_LEN, g_allocator);
+  int *d_d_year = loadToGPU<int>(h_d_year, D_LEN, g_allocator);
+
+  int *d_s_suppkey = loadToGPU<int>(h_s_suppkey, S_LEN, g_allocator);
+  int *d_s_city = loadToGPU<int>(h_s_city, S_LEN, g_allocator);
+
+  int *d_c_custkey = loadToGPU<int>(h_c_custkey, C_LEN, g_allocator);
+  int *d_c_city = loadToGPU<int>(h_c_city, C_LEN, g_allocator);
+
+  cout << "** LOADED DATA TO GPU **" << endl;
+
+  // Run the query num_trials times; each run prints one JSON line carrying
+  // the value returned by runQuery.
+  for (int trial = 0; trial < num_trials; ++trial) {
+    const float time_query =
+        runQuery(d_lo_orderdate, d_lo_custkey, d_lo_suppkey, d_lo_revenue,
+                 LO_LEN, d_d_datekey, d_d_year, D_LEN, d_s_suppkey, d_s_city,
+                 S_LEN, d_c_custkey, d_c_city, C_LEN, g_allocator);
+    cout << "{\"query\":33,\"time_query\":" << time_query << "}" << endl;
+  }
+
+  return 0;
+}
diff --git a/crystal-opt/src/ssb/q34.cu b/crystal-opt/src/ssb/q34.cu
new file mode 100644
index 0000000..7326075
--- /dev/null
+++ b/crystal-opt/src/ssb/q34.cu
@@ -0,0 +1,365 @@
+// MIT License
+
+// Copyright (c) 2023 Jiashen Cao
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <curand.h>
+#include <iostream>
+#include <stdio.h>
+
+#include "cub/test/test_util.h"
+#include <cub/util_allocator.cuh>
+#include <cuda.h>
+
+#include "crystal.cuh"
+
+#include "gpu_utils.h"
+#include "ssb_utils.h"
+
+using namespace std;
+
+/**
+ * Globals, constants and typedefs
+ */
+bool g_verbose = false; // Whether to display input/output to console
+cub::CachingDeviceAllocator
+    g_allocator(true); // Caching allocator for device memory; `true` is presumably CUB's skip_cleanup flag (TODO confirm)
+
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void probe(int *lo_orderdate, int *lo_custkey, int *lo_suppkey,
+                      int *lo_revenue, int lo_len, int *ht_s, int s_len,
+                      int *ht_c, int c_len, int *ht_d, int d_len, int *res) {
+  // Q3.4 probe: each thread block takes one TILE_SIZE slice of lineorder,
+  // probes ht_s, ht_c and ht_d in turn, and adds the revenue of surviving
+  // rows into `res` (four ints per group: year, ht_c value, ht_s value, sum).
+  int keys[ITEMS_PER_THREAD];
+  int flags[ITEMS_PER_THREAD];
+  int cust_val[ITEMS_PER_THREAD];
+  int supp_val[ITEMS_PER_THREAD];
+  int order_year[ITEMS_PER_THREAD];
+  int rev[ITEMS_PER_THREAD];
+
+  const int tile_offset = blockIdx.x * TILE_SIZE;
+  const int num_tiles = (lo_len + TILE_SIZE - 1) / TILE_SIZE;
+  int num_tile_items = // only the last tile can be short
+      (blockIdx.x == num_tiles - 1) ? lo_len - tile_offset : TILE_SIZE;
+
+  InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(flags);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_suppkey + tile_offset,
+                                                  keys, num_tile_items);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, supp_val, flags, ht_s, s_len, num_tile_items);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(flags)) { return; }
+
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_custkey + tile_offset, keys, num_tile_items, flags);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, cust_val, flags, ht_c, c_len, num_tile_items);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(flags)) { return; }
+
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_orderdate + tile_offset, keys, num_tile_items, flags);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, order_year, flags, ht_d, d_len, 19920101, num_tile_items);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(flags)) { return; }
+
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_revenue + tile_offset, rev, num_tile_items, flags);
+
+  const int num_groups = (1998 - 1992 + 1) * 250 * 250;
+#pragma unroll
+  for (int i = 0; i < ITEMS_PER_THREAD; ++i) {
+    const bool in_tile = (threadIdx.x + (BLOCK_THREADS * i)) < num_tile_items;
+    if (in_tile && flags[i]) {
+      const int slot =
+          (supp_val[i] * 250 * 7 + cust_val[i] * 7 + (order_year[i] - 1992)) %
+          num_groups;
+      int *group = res + slot * 4;
+      group[0] = order_year[i];
+      group[1] = cust_val[i];
+      group[2] = supp_val[i];
+      atomicAdd(&group[3], rev[i]);
+    }
+  }
+}
+
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_s(int *dim_key, int *dim_val, int num_tuples,
+                                  int *hash_table, int num_slots) {
+  // Supplier table for Q3.4: rows whose dim_val is 231 or 235 are selected and
+  // handed to BlockBuildSelectivePHT_2 as (dim_key, dim_val).
+  int vals[ITEMS_PER_THREAD];
+  int keys[ITEMS_PER_THREAD];
+  int flags[ITEMS_PER_THREAD];
+
+  const int tile_offset = blockIdx.x * TILE_SIZE;
+  const int num_tiles = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  int num_tile_items = // only the last tile can be short
+      (blockIdx.x == num_tiles - 1) ? num_tuples - tile_offset : TILE_SIZE;
+
+  // The value column is both the filter input and the stored payload.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_offset, vals,
+                                                  num_tile_items);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(vals, 231, flags,
+                                                    num_tile_items);
+  BlockPredOrEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      vals, 235, flags, num_tile_items);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_offset, keys,
+                                                  num_tile_items);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, vals, flags, hash_table, num_slots, num_tile_items);
+}
+
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_c(int *dim_key, int *dim_val, int num_tuples,
+                                  int *hash_table, int num_slots) {
+  // Customer table for Q3.4: rows whose dim_val is 231 or 235 are selected and
+  // handed to BlockBuildSelectivePHT_2 as (dim_key, dim_val).
+  int vals[ITEMS_PER_THREAD];
+  int keys[ITEMS_PER_THREAD];
+  int flags[ITEMS_PER_THREAD];
+
+  const int tile_offset = blockIdx.x * TILE_SIZE;
+  const int num_tiles = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  int num_tile_items = // only the last tile can be short
+      (blockIdx.x == num_tiles - 1) ? num_tuples - tile_offset : TILE_SIZE;
+
+  // Filter on the value column, which is also what gets stored per key.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_offset, vals,
+                                                  num_tile_items);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(vals, 231, flags,
+                                                    num_tile_items);
+  BlockPredOrEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      vals, 235, flags, num_tile_items);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_offset, keys,
+                                                  num_tile_items);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, vals, flags, hash_table, num_slots, num_tile_items);
+}
+
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_d(int *filter_col, int *dim_key, int *dim_val,
+                                  int num_tuples, int *hash_table,
+                                  int num_slots, int val_min) {
+  // Date table for Q3.4: rows with filter_col == 199712 are handed to
+  // BlockBuildSelectivePHT_2 as (dim_key, dim_val). val_min is forwarded as
+  // the builder's extra key argument so the build follows the caller's value.
+  int items[ITEMS_PER_THREAD];
+  int items2[ITEMS_PER_THREAD];
+  int selection_flags[ITEMS_PER_THREAD];
+
+  int tile_offset = blockIdx.x * TILE_SIZE;
+  int num_tiles = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  int num_tile_items = // only the last tile can be short
+      (blockIdx.x == num_tiles - 1) ? num_tuples - tile_offset : TILE_SIZE;
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + tile_offset,
+                                                  items, num_tile_items);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, 199712, selection_flags, num_tile_items);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_offset, items,
+                                                  num_tile_items);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_offset, items2,
+                                                  num_tile_items);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, items2, selection_flags, hash_table, num_slots, val_min,
+      num_tile_items);
+}
+
+float runQuery(int *lo_orderdate, int *lo_custkey, int *lo_suppkey,
+               int *lo_revenue, int lo_len, int *d_datekey, int *d_year,
+               int *d_yearmonthnum, int d_len, int *s_suppkey, int *s_city,
+               int s_len, int *c_custkey, int *c_city, int c_len,
+               cub::CachingDeviceAllocator &g_allocator) {
+  // Runs SSB Q3.4 once: builds the supplier/customer/date hash tables, probes
+  // lineorder against them, prints the grouped result and returns the time
+  // between the `start` and `stop` events (milliseconds).
+  SETUP_TIMING();
+
+  float time_query;
+  chrono::high_resolution_clock::time_point st, finish;
+  st = chrono::high_resolution_clock::now();
+
+  cudaEventRecord(start, 0);
+
+  int *ht_d, *ht_c, *ht_s;
+  int d_val_len = 19981230 - 19920101 + 1;
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&ht_d, 2 * d_val_len * sizeof(int)));
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&ht_s, 2 * s_len * sizeof(int)));
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&ht_c, 2 * c_len * sizeof(int)));
+
+  // Zero every table, ht_c included: the probe kernel reads all three.
+  CubDebugExit(cudaMemset(ht_d, 0, 2 * d_val_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_s, 0, 2 * s_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_c, 0, 2 * c_len * sizeof(int)));
+
+  int tile_items = 128 * 4;
+  build_hashtable_s<128, 4><<<(s_len + tile_items - 1) / tile_items, 128>>>(
+      s_suppkey, s_city, s_len, ht_s, s_len);
+
+  build_hashtable_c<128, 4><<<(c_len + tile_items - 1) / tile_items, 128>>>(
+      c_custkey, c_city, c_len, ht_c, c_len);
+
+  int d_val_min = 19920101;
+  build_hashtable_d<128, 4><<<(d_len + tile_items - 1) / tile_items, 128>>>(
+      d_yearmonthnum, d_datekey, d_year, d_len, ht_d, d_val_len, d_val_min);
+
+  int *res;
+  int res_size = ((1998 - 1992 + 1) * 250 * 250);
+  int res_array_size = res_size * 4;
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&res, res_array_size * sizeof(int)));
+
+  CubDebugExit(cudaMemset(res, 0, res_array_size * sizeof(int)));
+
+  probe<128, 4><<<(lo_len + tile_items - 1) / tile_items, 128>>>(
+      lo_orderdate, lo_custkey, lo_suppkey, lo_revenue, lo_len, ht_s, s_len,
+      ht_c, c_len, ht_d, d_val_len, res);
+
+  cudaEventRecord(stop, 0);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&time_query, start, stop);
+
+  int *h_res = new int[res_array_size];
+  CubDebugExit(cudaMemcpy(h_res, res, res_array_size * sizeof(int),
+                          cudaMemcpyDeviceToHost));
+  finish = chrono::high_resolution_clock::now();
+  std::chrono::duration<double> diff = finish - st;
+
+  cout << "Result:" << endl;
+  int res_count = 0;
+  for (int i = 0; i < res_size; i++) {
+    if (h_res[4 * i] != 0) {
+      cout << h_res[4 * i] << " " << h_res[4 * i + 1] << " " << h_res[4 * i + 2]
+           << " " << h_res[4 * i + 3] << endl;
+      res_count += 1;
+    }
+  }
+
+  cout << "Res Count: " << res_count << endl;
+  cout << "Time Taken Total: " << diff.count() * 1000 << endl;
+
+  delete[] h_res;
+  // Hand the per-query device buffers back so repeated trials do not leak.
+  CubDebugExit(g_allocator.DeviceFree(ht_d));
+  CubDebugExit(g_allocator.DeviceFree(ht_s));
+  CubDebugExit(g_allocator.DeviceFree(ht_c));
+  CubDebugExit(g_allocator.DeviceFree(res));
+  return time_query;
+}
+
+// Murmur3-style mix of one 32-bit key (fixed seed, key length 4). Uses
+// unsigned arithmetic: signed overflow is undefined, and `>>` on a negative
+// int is not guaranteed to be a logical shift, which breaks the rotations.
+int murmur(int k) {
+  unsigned h = 0xcd2e2c20u;
+  unsigned x = (unsigned)k * 0xcc9e2d51u;
+  x = (x << 15) | (x >> 17); // rotate left by 15; assumes 32-bit unsigned
+  x *= 0x1b873593u;
+  h ^= x;
+  h = (h << 13) | (h >> 19); // rotate left by 13
+  h = (h * 5u) + 0xe6546b64u;
+  h ^= 4u; // key length in bytes
+  h = (h ^ (h >> 16)) * 0x85ebca6bu;
+  h = (h ^ (h >> 13)) * 0xc2b2ae35u;
+  h ^= h >> 16;
+  return (int)h;
+}
+
+/**
+ * Main
+ */
+int main(int argc, char **argv) {
+  // Entry point for the Q3.4 benchmark: loads the needed SSB columns, copies
+  // them to the GPU and runs the query `num_trials` times (default 3, --t=N),
+  // printing one JSON line with the time returned by runQuery per trial.
+  int num_trials = 3;
+
+  CommandLineArgs args(argc, argv);
+  args.GetCmdLineArgument("t", num_trials);
+  g_verbose = args.CheckCmdLineFlag("v"); // advertised in the usage text
+
+  if (args.CheckCmdLineFlag("help")) {
+    printf("%s "
+           "[--t=<num trials>] "
+           "[--v] "
+           "\n",
+           argv[0]);
+    exit(0);
+  }
+
+  CubDebugExit(args.DeviceInit());
+
+  int *h_lo_orderdate = loadColumn<int>("lo_orderdate", LO_LEN);
+  int *h_lo_custkey = loadColumn<int>("lo_custkey", LO_LEN);
+  int *h_lo_suppkey = loadColumn<int>("lo_suppkey", LO_LEN);
+  int *h_lo_revenue = loadColumn<int>("lo_revenue", LO_LEN);
+
+  int *h_d_datekey = loadColumn<int>("d_datekey", D_LEN);
+  int *h_d_year = loadColumn<int>("d_year", D_LEN);
+  int *h_d_yearmonthnum = loadColumn<int>("d_yearmonthnum", D_LEN);
+
+  int *h_s_suppkey = loadColumn<int>("s_suppkey", S_LEN);
+  int *h_s_city = loadColumn<int>("s_city", S_LEN);
+
+  int *h_c_custkey = loadColumn<int>("c_custkey", C_LEN);
+  int *h_c_city = loadColumn<int>("c_city", C_LEN);
+
+  cout << "** LOADED DATA **" << endl;
+
+  int *d_lo_orderdate = loadToGPU<int>(h_lo_orderdate, LO_LEN, g_allocator);
+  int *d_lo_custkey = loadToGPU<int>(h_lo_custkey, LO_LEN, g_allocator);
+  int *d_lo_suppkey = loadToGPU<int>(h_lo_suppkey, LO_LEN, g_allocator);
+  int *d_lo_revenue = loadToGPU<int>(h_lo_revenue, LO_LEN, g_allocator);
+
+  int *d_d_datekey = loadToGPU<int>(h_d_datekey, D_LEN, g_allocator);
+  int *d_d_year = loadToGPU<int>(h_d_year, D_LEN, g_allocator);
+  int *d_d_yearmonthnum = loadToGPU<int>(h_d_yearmonthnum, D_LEN, g_allocator);
+
+  int *d_s_suppkey = loadToGPU<int>(h_s_suppkey, S_LEN, g_allocator);
+  int *d_s_city = loadToGPU<int>(h_s_city, S_LEN, g_allocator);
+
+  int *d_c_custkey = loadToGPU<int>(h_c_custkey, C_LEN, g_allocator);
+  int *d_c_city = loadToGPU<int>(h_c_city, C_LEN, g_allocator);
+
+  cout << "** LOADED DATA TO GPU **" << endl;
+
+  for (int t = 0; t < num_trials; t++) {
+    float time_query = runQuery(
+        d_lo_orderdate, d_lo_custkey, d_lo_suppkey, d_lo_revenue, LO_LEN,
+        d_d_datekey, d_d_year, d_d_yearmonthnum, D_LEN, d_s_suppkey, d_s_city,
+        S_LEN, d_c_custkey, d_c_city, C_LEN, g_allocator);
+    cout << "{"
+         << "\"query\":34"
+         << ",\"time_query\":" << time_query << "}" << endl;
+  }
+
+  return 0;
+}
diff --git a/crystal-opt/src/ssb/q41.cu b/crystal-opt/src/ssb/q41.cu
new file mode 100644
index 0000000..7922fcd
--- /dev/null
+++ b/crystal-opt/src/ssb/q41.cu
@@ -0,0 +1,438 @@
+// MIT License
+
+// Copyright (c) 2023 Jiashen Cao
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <curand.h>
+#include <iostream>
+#include <stdio.h>
+
+#include "cub/test/test_util.h"
+#include <cub/util_allocator.cuh>
+#include <cuda.h>
+
+#include "crystal.cuh"
+
+#include "gpu_utils.h"
+#include "ssb_utils.h"
+
+using namespace std;
+
+/**
+ * Globals, constants and typedefs
+ */
+bool g_verbose = false; // Whether to display input/output to console
+cub::CachingDeviceAllocator
+    g_allocator(true); // Caching allocator for device memory; `true` is presumably CUB's skip_cleanup flag (TODO confirm)
+
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void probe(int *lo_orderdate, int *lo_partkey, int *lo_custkey,
+                      int *lo_suppkey, int *lo_revenue, int *lo_supplycost,
+                      int lo_len, int *ht_p, int p_len, int *ht_s, int s_len,
+                      int *ht_c, int c_len, int *ht_d, int d_len, int *res) {
+  // Q4.1 probe: each thread block takes one TILE_SIZE slice of lineorder,
+  // probes ht_s, ht_c, ht_p and ht_d in turn, and accumulates
+  // lo_revenue - lo_supplycost of surviving rows into `res`. Each group uses
+  // four ints: year, ht_c value, and a 64-bit sum in the last two.
+  int items[ITEMS_PER_THREAD];
+  int selection_flags[ITEMS_PER_THREAD];
+  int c_nation[ITEMS_PER_THREAD];
+  int year[ITEMS_PER_THREAD];
+  int revenue[ITEMS_PER_THREAD];
+
+  int tile_offset = blockIdx.x * TILE_SIZE;
+  int num_tiles = (lo_len + TILE_SIZE - 1) / TILE_SIZE;
+  int num_tile_items = TILE_SIZE;
+
+  if (blockIdx.x == num_tiles - 1) {
+    num_tile_items = lo_len - tile_offset;
+  }
+
+  InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_suppkey + tile_offset,
+                                                  items, num_tile_items);
+  BlockProbeAndPHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, selection_flags, ht_s, s_len, num_tile_items);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags)) { return; }
+
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_custkey + tile_offset, items, num_tile_items, selection_flags);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, c_nation, selection_flags, ht_c, c_len, num_tile_items);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags)) { return; }
+
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_partkey + tile_offset, items, num_tile_items, selection_flags);
+  BlockProbeAndPHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, selection_flags, ht_p, p_len, num_tile_items);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags)) { return; }
+
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_orderdate + tile_offset, items, num_tile_items, selection_flags);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, year, selection_flags, ht_d, d_len, 19920101, num_tile_items);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags)) { return; }
+
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_revenue + tile_offset, revenue, num_tile_items, selection_flags);
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_supplycost + tile_offset, items, num_tile_items, selection_flags);
+
+#pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
+    if (threadIdx.x + (BLOCK_THREADS * ITEM) < num_tile_items) {
+      if (selection_flags[ITEM]) {
+        int hash = (c_nation[ITEM] * 7 + (year[ITEM] - 1992)) %
+                   ((1998 - 1992 + 1) * 25);
+        res[hash * 4] = year[ITEM];
+        res[hash * 4 + 1] = c_nation[ITEM];
+        // Even int index: the 64-bit slot is 8-byte aligned whenever res is.
+        atomicAdd(reinterpret_cast<unsigned long long *>(&res[hash * 4 + 2]),
+                  (long long)(revenue[ITEM] - items[ITEM]));
+      }
+    }
+  }
+}
+
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_s(int *filter_col, int *dim_key, int num_tuples,
+                                  int *hash_table, int num_slots) {
+  // Supplier key set for Q4.1: rows with filter_col == 1 are selected and
+  // their dim_key is handed to BlockBuildSelectivePHT_1.
+  int col[ITEMS_PER_THREAD];
+  int flags[ITEMS_PER_THREAD];
+
+  const int tile_offset = blockIdx.x * TILE_SIZE;
+  const int num_tiles = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  int num_tile_items = // only the last tile can be short
+      (blockIdx.x == num_tiles - 1) ? num_tuples - tile_offset : TILE_SIZE;
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + tile_offset,
+                                                  col, num_tile_items);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(col, 1, flags,
+                                                    num_tile_items);
+
+  // The same buffer is reused for the keys once the flags are computed.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_offset, col,
+                                                  num_tile_items);
+  BlockBuildSelectivePHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      col, flags, hash_table, num_slots, num_tile_items);
+}
+
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_p(int *filter_col, int *dim_key, int num_tuples,
+                                  int *hash_table, int num_slots) {
+  // Part key set for Q4.1: rows whose filter_col is 0 or 1 are selected and
+  // their dim_key is handed to BlockBuildSelectivePHT_1.
+  int col[ITEMS_PER_THREAD];
+  int flags[ITEMS_PER_THREAD];
+
+  const int tile_offset = blockIdx.x * TILE_SIZE;
+  const int num_tiles = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  int num_tile_items = // only the last tile can be short
+      (blockIdx.x == num_tiles - 1) ? num_tuples - tile_offset : TILE_SIZE;
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + tile_offset,
+                                                  col, num_tile_items);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(col, 0, flags,
+                                                    num_tile_items);
+  BlockPredOrEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(col, 1, flags,
+                                                      num_tile_items);
+
+  // The same buffer is reused for the keys once the flags are computed.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_offset, col,
+                                                  num_tile_items);
+  BlockBuildSelectivePHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      col, flags, hash_table, num_slots, num_tile_items);
+}
+
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_c(int *filter_col, int *dim_key, int *dim_val,
+                                  int num_tuples, int *hash_table,
+                                  int num_slots) {
+  // Customer table for Q4.1: rows with filter_col == 1 are selected and
+  // handed to BlockBuildSelectivePHT_2 as (dim_key, dim_val).
+  int keys[ITEMS_PER_THREAD];
+  int vals[ITEMS_PER_THREAD];
+  int flags[ITEMS_PER_THREAD];
+
+  const int tile_offset = blockIdx.x * TILE_SIZE;
+  const int num_tiles = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  int num_tile_items = // only the last tile can be short
+      (blockIdx.x == num_tiles - 1) ? num_tuples - tile_offset : TILE_SIZE;
+
+  // `keys` first carries the filter column, then is reloaded with dim_key.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + tile_offset,
+                                                  keys, num_tile_items);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, 1, flags,
+                                                    num_tile_items);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_offset, keys,
+                                                  num_tile_items);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_offset, vals,
+                                                  num_tile_items);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, vals, flags, hash_table, num_slots, num_tile_items);
+}
+
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_d(int *dim_key, int *dim_val, int num_tuples,
+                                  int *hash_table, int num_slots, int val_min) {
+  // Date table for Q4.1: no predicate is applied, the flags come straight
+  // from InitFlags. (dim_key, dim_val) and val_min are handed to
+  // BlockBuildSelectivePHT_2.
+  int keys[ITEMS_PER_THREAD];
+  int vals[ITEMS_PER_THREAD];
+  int flags[ITEMS_PER_THREAD];
+
+  const int tile_offset = blockIdx.x * TILE_SIZE;
+  const int num_tiles = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  int num_tile_items = // only the last tile can be short
+      (blockIdx.x == num_tiles - 1) ? num_tuples - tile_offset : TILE_SIZE;
+
+  InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(flags);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_offset, keys,
+                                                  num_tile_items);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_offset, vals,
+                                                  num_tile_items);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, vals, flags, hash_table, num_slots, val_min,
+      num_tile_items);
+}
+
+float runQuery(int *lo_orderdate, int *lo_custkey, int *lo_partkey,
+               int *lo_suppkey, int *lo_revenue, int *lo_supplycost, int lo_len,
+               int *d_datekey, int *d_year, int d_len, int *p_partkey,
+               int *p_mfgr, int p_len, int *s_suppkey, int *s_region, int s_len,
+               int *c_custkey, int *c_region, int *c_nation, int c_len,
+               cub::CachingDeviceAllocator &g_allocator) {
+  // Runs SSB Q4.1 once: builds the supplier/customer/part/date hash tables,
+  // probes lineorder against them, prints the grouped result and returns the
+  // time between the `start` and `stop` events (milliseconds). Device buffers
+  // allocated here are handed back to the allocator before returning.
+  SETUP_TIMING();
+
+  float time_query;
+  chrono::high_resolution_clock::time_point st, finish;
+  st = chrono::high_resolution_clock::now();
+
+  cudaEventRecord(start, 0);
+
+  int *ht_d, *ht_c, *ht_s, *ht_p;
+  int d_val_len = 19981230 - 19920101 + 1;
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&ht_d, 2 * d_val_len * sizeof(int)));
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&ht_s, 2 * s_len * sizeof(int)));
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&ht_c, 2 * c_len * sizeof(int)));
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&ht_p, 2 * p_len * sizeof(int)));
+
+  CubDebugExit(cudaMemset(ht_d, 0, 2 * d_val_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_s, 0, 2 * s_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_c, 0, 2 * c_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_p, 0, 2 * p_len * sizeof(int)));
+
+  int tile_items = 128 * 4;
+  build_hashtable_s<128, 4><<<(s_len + tile_items - 1) / tile_items, 128>>>(
+      s_region, s_suppkey, s_len, ht_s, s_len);
+  /*CHECK_ERROR();*/
+
+  build_hashtable_c<128, 4><<<(c_len + tile_items - 1) / tile_items, 128>>>(
+      c_region, c_custkey, c_nation, c_len, ht_c, c_len);
+  /*CHECK_ERROR();*/
+
+  build_hashtable_p<128, 4><<<(p_len + tile_items - 1) / tile_items, 128>>>(
+      p_mfgr, p_partkey, p_len, ht_p, p_len);
+  /*CHECK_ERROR();*/
+
+  int d_val_min = 19920101;
+  build_hashtable_d<128, 4><<<(d_len + tile_items - 1) / tile_items, 128>>>(
+      d_datekey, d_year, d_len, ht_d, d_val_len, d_val_min);
+  /*CHECK_ERROR();*/
+
+  // Disabled debug aid: counts the occupied slots of every hash table.
+#if 0
+  int *h_ht_s = new int[s_len * 2];
+  int *h_ht_c = new int[c_len * 2];
+  int *h_ht_p = new int[p_len * 2];
+  int *h_ht_d = new int[d_val_len * 2];
+
+  int num_s = 0 , num_c = 0, num_d = 0, num_p = 0;
+
+  CubDebugExit(cudaMemcpy(h_ht_s, ht_s, 2 * s_len * sizeof(int), cudaMemcpyDeviceToHost));
+  for (int i=0; i<s_len; i++) if (h_ht_s[i*2] != 0) num_s += 1;
+
+  cout << "Num Matched" << " " << num_s << " " << s_len << endl;
+
+  CubDebugExit(cudaMemcpy(h_ht_d, ht_d, 2 * d_val_len * sizeof(int), cudaMemcpyDeviceToHost));
+  for (int i=0; i<d_val_len; i++) if (h_ht_d[i*2] != 0) num_d += 1;
+
+  cout << "Num Matched" << " " << num_d << " " << d_len << endl;
+
+  CubDebugExit(cudaMemcpy(h_ht_c, ht_c, 2 * c_len * sizeof(int), cudaMemcpyDeviceToHost));
+  for (int i=0; i<c_len; i++) if (h_ht_c[i*2] != 0) num_c += 1;
+
+  cout << "Num Matched" << " " << num_c << " " << c_len << endl;
+
+  CubDebugExit(cudaMemcpy(h_ht_p, ht_p, 2 * p_len * sizeof(int), cudaMemcpyDeviceToHost));
+  for (int i=0; i<p_len; i++) if (h_ht_p[i*2] != 0) num_p += 1;
+
+  cout << "Num Matched" << " " << num_p << " " << p_len << endl;
+#endif
+
+  int *res;
+  int res_size = ((1998 - 1992 + 1) * 25);
+  int ht_entries = 4; // int,int,long long
+  int res_array_size = res_size * ht_entries;
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&res, res_array_size * sizeof(int)));
+
+  CubDebugExit(cudaMemset(res, 0, res_array_size * sizeof(int)));
+
+  // Run
+  probe<128, 4><<<(lo_len + tile_items - 1) / tile_items, 128>>>(
+      lo_orderdate, lo_partkey, lo_custkey, lo_suppkey, lo_revenue,
+      lo_supplycost, lo_len, ht_p, p_len, ht_s, s_len, ht_c, c_len, ht_d,
+      d_val_len, res);
+
+  cudaEventRecord(stop, 0);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&time_query, start, stop);
+
+  int *h_res = new int[res_array_size];
+  CubDebugExit(cudaMemcpy(h_res, res, res_array_size * sizeof(int),
+                          cudaMemcpyDeviceToHost));
+  finish = chrono::high_resolution_clock::now();
+  std::chrono::duration<double> diff = finish - st;
+
+  cout << "Result:" << endl;
+  int res_count = 0;
+  // Ints 2..3 of each group hold the 64-bit sum written by the probe kernel.
+  for (int i = 0; i < res_size; i++) {
+    if (h_res[4 * i] != 0) {
+      cout << h_res[4 * i] << " " << h_res[4 * i + 1] << " "
+           << reinterpret_cast<unsigned long long *>(&h_res[4 * i + 2])[0]
+           << endl;
+      res_count += 1;
+    }
+  }
+
+  cout << "Res Count: " << res_count << endl;
+  cout << "Time Taken Total: " << diff.count() * 1000 << endl;
+
+  delete[] h_res;
+  // Hand the per-query device buffers back so repeated trials do not leak.
+  CubDebugExit(g_allocator.DeviceFree(ht_d));
+  CubDebugExit(g_allocator.DeviceFree(ht_s));
+  CubDebugExit(g_allocator.DeviceFree(ht_c));
+  CubDebugExit(g_allocator.DeviceFree(ht_p));
+  CubDebugExit(g_allocator.DeviceFree(res));
+
+  return time_query;
+}
+
+/**
+ * Main
+ */
+int main(int argc, char **argv) {
+  // Entry point for the Q4.1 benchmark: loads the needed SSB columns, copies
+  // them to the GPU and runs the query `num_trials` times (default 1, --t=N),
+  // printing one JSON line with the time returned by runQuery per trial.
+  int num_trials = 1;
+
+  CommandLineArgs args(argc, argv);
+  args.GetCmdLineArgument("t", num_trials);
+  g_verbose = args.CheckCmdLineFlag("v"); // advertised in the usage text
+
+  if (args.CheckCmdLineFlag("help")) {
+    printf("%s "
+           "[--t=<num trials>] "
+           "[--v] "
+           "\n",
+           argv[0]);
+    exit(0);
+  }
+
+  CubDebugExit(args.DeviceInit());
+
+  int *h_lo_orderdate = loadColumn<int>("lo_orderdate", LO_LEN);
+  int *h_lo_suppkey = loadColumn<int>("lo_suppkey", LO_LEN);
+  int *h_lo_custkey = loadColumn<int>("lo_custkey", LO_LEN);
+  int *h_lo_partkey = loadColumn<int>("lo_partkey", LO_LEN);
+  int *h_lo_revenue = loadColumn<int>("lo_revenue", LO_LEN);
+  int *h_lo_supplycost = loadColumn<int>("lo_supplycost", LO_LEN);
+
+  // Only d_datekey and d_year are passed to runQuery for this query.
+  int *h_d_datekey = loadColumn<int>("d_datekey", D_LEN);
+  int *h_d_year = loadColumn<int>("d_year", D_LEN);
+
+  int *h_s_suppkey = loadColumn<int>("s_suppkey", S_LEN);
+  int *h_s_region = loadColumn<int>("s_region", S_LEN);
+
+  int *h_p_partkey = loadColumn<int>("p_partkey", P_LEN);
+  int *h_p_mfgr = loadColumn<int>("p_mfgr", P_LEN);
+
+  int *h_c_custkey = loadColumn<int>("c_custkey", C_LEN);
+  int *h_c_region = loadColumn<int>("c_region", C_LEN);
+  int *h_c_nation = loadColumn<int>("c_nation", C_LEN);
+
+  cout << "** LOADED DATA **" << endl;
+
+  int *d_lo_orderdate = loadToGPU<int>(h_lo_orderdate, LO_LEN, g_allocator);
+  int *d_lo_custkey = loadToGPU<int>(h_lo_custkey, LO_LEN, g_allocator);
+  int *d_lo_suppkey = loadToGPU<int>(h_lo_suppkey, LO_LEN, g_allocator);
+  int *d_lo_partkey = loadToGPU<int>(h_lo_partkey, LO_LEN, g_allocator);
+  int *d_lo_revenue = loadToGPU<int>(h_lo_revenue, LO_LEN, g_allocator);
+  int *d_lo_supplycost = loadToGPU<int>(h_lo_supplycost, LO_LEN, g_allocator);
+
+  int *d_d_datekey = loadToGPU<int>(h_d_datekey, D_LEN, g_allocator);
+  int *d_d_year = loadToGPU<int>(h_d_year, D_LEN, g_allocator);
+
+  int *d_p_partkey = loadToGPU<int>(h_p_partkey, P_LEN, g_allocator);
+  int *d_p_mfgr = loadToGPU<int>(h_p_mfgr, P_LEN, g_allocator);
+
+  int *d_s_suppkey = loadToGPU<int>(h_s_suppkey, S_LEN, g_allocator);
+  int *d_s_region = loadToGPU<int>(h_s_region, S_LEN, g_allocator);
+
+  int *d_c_custkey = loadToGPU<int>(h_c_custkey, C_LEN, g_allocator);
+  int *d_c_region = loadToGPU<int>(h_c_region, C_LEN, g_allocator);
+  int *d_c_nation = loadToGPU<int>(h_c_nation, C_LEN, g_allocator);
+
+  cout << "** LOADED DATA TO GPU **" << endl;
+
+  for (int t = 0; t < num_trials; t++) {
+    float time_query = runQuery(
+        d_lo_orderdate, d_lo_custkey, d_lo_partkey, d_lo_suppkey, d_lo_revenue,
+        d_lo_supplycost, LO_LEN, d_d_datekey, d_d_year, D_LEN, d_p_partkey,
+        d_p_mfgr, P_LEN, d_s_suppkey, d_s_region, S_LEN, d_c_custkey,
+        d_c_region, d_c_nation, C_LEN, g_allocator);
+    cout << "{"
+         << "\"query\":41"
+         << ",\"time_query\":" << time_query << "}" << endl;
+  }
+
+  return 0;
+}
diff --git a/crystal-opt/src/ssb/q42.cu b/crystal-opt/src/ssb/q42.cu
new file mode 100644
index 0000000..08f8852
--- /dev/null
+++ b/crystal-opt/src/ssb/q42.cu
@@ -0,0 +1,411 @@
+// MIT License
+
+// Copyright (c) 2023 Jiashen Cao
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <curand.h>
+#include <iostream>
+#include <stdio.h>
+
+#include "cub/test/test_util.h"
+#include <cub/util_allocator.cuh>
+#include <cuda.h>
+
+#include "crystal.cuh"
+
+#include "gpu_utils.h"
+#include "ssb_utils.h"
+
+using namespace std;
+
+/**
+ * Globals, constants and typedefs
+ */
+bool g_verbose = false; // Display input/output to console (never read here)
+cub::CachingDeviceAllocator
+    g_allocator(true); // Caching device allocator passed to loadToGPU/runQuery
+
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void probe(int *lo_orderdate, int *lo_partkey, int *lo_custkey,
+                      int *lo_suppkey, int *lo_revenue, int *lo_supplycost,
+                      int lo_len, int *ht_p, int p_len, int *ht_s, int s_len,
+                      int *ht_c, int c_len, int *ht_d, int d_len, int *res) {
+  // Per-thread working arrays, each holding ITEMS_PER_THREAD tile entries
+  int items[ITEMS_PER_THREAD];
+  int selection_flags[ITEMS_PER_THREAD];
+  int category[ITEMS_PER_THREAD];
+  int s_nation[ITEMS_PER_THREAD];
+  int year[ITEMS_PER_THREAD];
+  int revenue[ITEMS_PER_THREAD];
+
+  int tile_offset = blockIdx.x * TILE_SIZE;
+  int num_tiles = (lo_len + TILE_SIZE - 1) / TILE_SIZE;
+  int num_tile_items = TILE_SIZE;
+  // Only the last tile can be short; it gets whatever rows remain.
+  if (blockIdx.x == num_tiles - 1) {
+    num_tile_items = lo_len - tile_offset;
+  }
+
+  InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags);
+  // Supplier probe: lo_suppkey keys against ht_s, s_nation as output array.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_suppkey + tile_offset,
+                                                  items, num_tile_items);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, s_nation, selection_flags, ht_s, s_len, num_tile_items);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags)) { return; }
+  // Customer probe against ht_c; this helper variant takes no output array.
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_custkey + tile_offset, items, num_tile_items, selection_flags);
+  BlockProbeAndPHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, selection_flags, ht_c, c_len, num_tile_items);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags)) { return; }
+  // Part probe: lo_partkey keys against ht_p, category as output array.
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_partkey + tile_offset, items, num_tile_items, selection_flags);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, category, selection_flags, ht_p, p_len, num_tile_items);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags)) { return; }
+  // Date probe against ht_d; 19920101 equals runQuery's d_val_min.
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_orderdate + tile_offset, items, num_tile_items, selection_flags);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, year, selection_flags, ht_d, d_len, 19920101, num_tile_items);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags)) { return; }
+  // Load both measures; items is reused to hold lo_supplycost.
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_revenue + tile_offset, revenue, num_tile_items, selection_flags);
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_supplycost + tile_offset, items, num_tile_items, selection_flags);
+
+#pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
+    if (threadIdx.x + (BLOCK_THREADS * ITEM) < num_tile_items) {
+      if (selection_flags[ITEM]) {
+        // Slot index from (year, s_nation, category). A slot is six ints:
+        // [0..2] group key, [4..5] 64-bit sum of (revenue - supplycost).
+        int hash = ((year[ITEM] - 1992) * 25 * 25 + s_nation[ITEM] * 25 +
+                    category[ITEM]) %
+                   ((1998 - 1992 + 1) * 25 * 25);
+        res[hash * 6] = year[ITEM];
+        res[hash * 6 + 1] = s_nation[ITEM];
+        res[hash * 6 + 2] = category[ITEM];
+        atomicAdd(reinterpret_cast<unsigned long long *>(&res[hash * 6 + 4]),
+                  (long long)(revenue[ITEM] - items[ITEM]));
+      }
+    }
+  }
+}
+
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_c(int *filter_col, int *dim_key, int num_tuples,
+                                  int *hash_table, int num_slots) {
+  // Builds a key-only hash table from the rows where filter_col equals 1.
+  int items[ITEMS_PER_THREAD];
+  int selection_flags[ITEMS_PER_THREAD];
+
+  int tile_offset = blockIdx.x * TILE_SIZE;
+  int num_tiles = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  int num_tile_items = TILE_SIZE;
+  if (blockIdx.x == num_tiles - 1) { // only the last tile can be short
+    num_tile_items = num_tuples - tile_offset;
+  }
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + tile_offset,
+                                                  items, num_tile_items);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, 1, selection_flags,
+                                                    num_tile_items);
+  // items is reused for the keys; presumably only flagged keys are inserted.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_offset, items,
+                                                  num_tile_items);
+  BlockBuildSelectivePHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, selection_flags, hash_table, num_slots, num_tile_items);
+}
+
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_p(int *filter_col, int *dim_key, int *dim_val,
+                                  int num_tuples, int *hash_table,
+                                  int num_slots) {
+  // Builds a key/value hash table from rows where filter_col is 0 or 1.
+  int items[ITEMS_PER_THREAD];
+  int items2[ITEMS_PER_THREAD];
+  int selection_flags[ITEMS_PER_THREAD];
+
+  int tile_offset = blockIdx.x * TILE_SIZE;
+  int num_tiles = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  int num_tile_items = TILE_SIZE;
+  if (blockIdx.x == num_tiles - 1) { // only the last tile can be short
+    num_tile_items = num_tuples - tile_offset;
+  }
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + tile_offset,
+                                                  items, num_tile_items);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, 0, selection_flags,
+                                                    num_tile_items);
+  BlockPredOrEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, 1, selection_flags,
+                                                      num_tile_items);
+  // Keys go to items, values to items2; presumably only flagged pairs are kept.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_offset, items,
+                                                  num_tile_items);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_offset, items2,
+                                                  num_tile_items);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, items2, selection_flags, hash_table, num_slots, num_tile_items);
+}
+
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_s(int *filter_col, int *dim_key, int *dim_val,
+                                  int num_tuples, int *hash_table,
+                                  int num_slots) {
+  // Builds a key/value hash table from the rows where filter_col equals 1.
+  int items[ITEMS_PER_THREAD];
+  int items2[ITEMS_PER_THREAD];
+  int selection_flags[ITEMS_PER_THREAD];
+
+  int tile_offset = blockIdx.x * TILE_SIZE;
+  int num_tiles = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  int num_tile_items = TILE_SIZE;
+  if (blockIdx.x == num_tiles - 1) { // only the last tile can be short
+    num_tile_items = num_tuples - tile_offset;
+  }
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + tile_offset,
+                                                  items, num_tile_items);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, 1, selection_flags,
+                                                    num_tile_items);
+  // Keys go to items, values to items2; presumably only flagged pairs are kept.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_offset, items,
+                                                  num_tile_items);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_offset, items2,
+                                                  num_tile_items);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, items2, selection_flags, hash_table, num_slots, num_tile_items);
+}
+
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_d(int *dim_key, int *dim_val, int num_tuples,
+                                  int *hash_table, int num_slots, int val_min) {
+  // Builds a key/value hash table from rows whose dim_val is 1997 or 1998.
+  int items[ITEMS_PER_THREAD];
+  int items2[ITEMS_PER_THREAD];
+  int selection_flags[ITEMS_PER_THREAD];
+
+  int tile_offset = blockIdx.x * TILE_SIZE;
+  int num_tiles = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  int num_tile_items = TILE_SIZE;
+  if (blockIdx.x == num_tiles - 1) { // only the last tile can be short
+    num_tile_items = num_tuples - tile_offset;
+  }
+
+  InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_offset, items,
+                                                  num_tile_items);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, 1997, selection_flags, num_tile_items);
+  BlockPredOrEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, 1998, selection_flags, num_tile_items);
+  // Keys land in items2 here, so the builder receives (items2, items, val_min).
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_offset, items2,
+                                                  num_tile_items);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items2, items, selection_flags, hash_table, num_slots, val_min,
+      num_tile_items);
+}
+
+float runQuery(int *lo_orderdate, int *lo_custkey, int *lo_partkey,
+               int *lo_suppkey, int *lo_revenue, int *lo_supplycost, int lo_len,
+               int *d_datekey, int *d_year, int d_len, int *p_partkey,
+               int *p_mfgr, int *p_category, int p_len, int *s_suppkey,
+               int *s_region, int *s_nation, int s_len, int *c_custkey,
+               int *c_region, int c_len,
+               cub::CachingDeviceAllocator &g_allocator) {
+  // One trial: build the four dimension hash tables, run the probe kernel and
+  // print the non-empty groups. Returns the cudaEventElapsedTime measurement.
+  SETUP_TIMING();
+  float time_query;
+  auto st = chrono::high_resolution_clock::now();
+  cudaEventRecord(start, 0);
+
+  int *ht_d, *ht_c, *ht_s, *ht_p;
+  int d_val_len = 19981230 - 19920101 + 1; // slot count used for ht_d
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&ht_d, 2 * d_val_len * sizeof(int)));
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&ht_s, 2 * s_len * sizeof(int)));
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&ht_c, 2 * c_len * sizeof(int)));
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&ht_p, 2 * p_len * sizeof(int)));
+
+  CubDebugExit(cudaMemset(ht_d, 0, 2 * d_val_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_s, 0, 2 * s_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_c, 0, 2 * c_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_p, 0, 2 * p_len * sizeof(int)));
+
+  int tile_items = 128 * 4; // matches the <128, 4> kernel instantiations
+  build_hashtable_s<128, 4><<<(s_len + tile_items - 1) / tile_items, 128>>>(
+      s_region, s_suppkey, s_nation, s_len, ht_s, s_len);
+
+  build_hashtable_c<128, 4><<<(c_len + tile_items - 1) / tile_items, 128>>>(
+      c_region, c_custkey, c_len, ht_c, c_len);
+
+  build_hashtable_p<128, 4><<<(p_len + tile_items - 1) / tile_items, 128>>>(
+      p_mfgr, p_partkey, p_category, p_len, ht_p, p_len);
+
+  int d_val_min = 19920101;
+  build_hashtable_d<128, 4><<<(d_len + tile_items - 1) / tile_items, 128>>>(
+      d_datekey, d_year, d_len, ht_d, d_val_len, d_val_min);
+
+  int *res;
+  int res_size = ((1998 - 1992 + 1) * 25 * 25);
+  int ht_entries = 6; // ints per result slot, see the probe kernel
+  int res_array_size = res_size * ht_entries;
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&res, res_array_size * sizeof(int)));
+
+  CubDebugExit(cudaMemset(res, 0, res_array_size * sizeof(int)));
+
+  // Run
+  probe<128, 4><<<(lo_len + tile_items - 1) / tile_items, 128>>>(
+      lo_orderdate, lo_partkey, lo_custkey, lo_suppkey, lo_revenue,
+      lo_supplycost, lo_len, ht_p, p_len, ht_s, s_len, ht_c, c_len, ht_d,
+      d_val_len, res);
+
+  cudaEventRecord(stop, 0);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&time_query, start, stop);
+
+  int *h_res = new int[res_array_size];
+  CubDebugExit(cudaMemcpy(h_res, res, res_array_size * sizeof(int),
+                          cudaMemcpyDeviceToHost));
+  auto finish = chrono::high_resolution_clock::now();
+  std::chrono::duration<double> diff = finish - st;
+
+  cout << "Result:" << endl;
+  int res_count = 0;
+  for (int i = 0; i < res_size; i++) {
+    if (h_res[6 * i] != 0) {
+      cout << h_res[6 * i] << " " << h_res[6 * i + 1] << " " << h_res[6 * i + 2]
+           << " "
+           << reinterpret_cast<unsigned long long *>(&h_res[6 * i + 4])[0]
+           << endl;
+      res_count += 1;
+    }
+  }
+
+  cout << "Res Count: " << res_count << endl;
+  cout << "Time Taken Total: " << diff.count() * 1000 << endl;
+
+  delete[] h_res;
+  // Give this trial's device buffers back so repeated trials do not leak.
+  CubDebugExit(g_allocator.DeviceFree(ht_d));
+  CubDebugExit(g_allocator.DeviceFree(ht_s));
+  CubDebugExit(g_allocator.DeviceFree(ht_c));
+  CubDebugExit(g_allocator.DeviceFree(ht_p));
+  CubDebugExit(g_allocator.DeviceFree(res));
+  return time_query;
+}
+
+/**
+ * Main
+ */
+int main(int argc, char **argv) {
+  int num_trials = 3; // default; the "t" command-line argument replaces it
+
+  // Initialize command line
+  CommandLineArgs args(argc, argv);
+  args.GetCmdLineArgument("t", num_trials);
+
+  // Print usage
+  if (args.CheckCmdLineFlag("help")) {
+    printf("%s "
+           "[--t=<num trials>] "
+           "[--v] "
+           "\n",
+           argv[0]);
+    exit(0);
+  }
+
+  // Initialize device
+  CubDebugExit(args.DeviceInit());
+
+  // Host copies (h_*) of exactly the columns this query passes to runQuery.
+  int *h_lo_orderdate = loadColumn<int>("lo_orderdate", LO_LEN);
+  int *h_lo_suppkey = loadColumn<int>("lo_suppkey", LO_LEN);
+  int *h_lo_custkey = loadColumn<int>("lo_custkey", LO_LEN);
+  int *h_lo_partkey = loadColumn<int>("lo_partkey", LO_LEN);
+  int *h_lo_revenue = loadColumn<int>("lo_revenue", LO_LEN);
+  int *h_lo_supplycost = loadColumn<int>("lo_supplycost", LO_LEN);
+
+  int *h_d_datekey = loadColumn<int>("d_datekey", D_LEN);
+  int *h_d_year = loadColumn<int>("d_year", D_LEN);
+
+  int *h_s_suppkey = loadColumn<int>("s_suppkey", S_LEN);
+  int *h_s_region = loadColumn<int>("s_region", S_LEN);
+  int *h_s_nation = loadColumn<int>("s_nation", S_LEN);
+
+  int *h_p_partkey = loadColumn<int>("p_partkey", P_LEN);
+  int *h_p_mfgr = loadColumn<int>("p_mfgr", P_LEN);
+  int *h_p_category = loadColumn<int>("p_category", P_LEN);
+
+  int *h_c_custkey = loadColumn<int>("c_custkey", C_LEN);
+  int *h_c_region = loadColumn<int>("c_region", C_LEN);
+
+  cout << "** LOADED DATA **" << endl;
+
+  int *d_lo_orderdate = loadToGPU<int>(h_lo_orderdate, LO_LEN, g_allocator);
+  int *d_lo_custkey = loadToGPU<int>(h_lo_custkey, LO_LEN, g_allocator);
+  int *d_lo_suppkey = loadToGPU<int>(h_lo_suppkey, LO_LEN, g_allocator);
+  int *d_lo_partkey = loadToGPU<int>(h_lo_partkey, LO_LEN, g_allocator);
+  int *d_lo_revenue = loadToGPU<int>(h_lo_revenue, LO_LEN, g_allocator);
+  int *d_lo_supplycost = loadToGPU<int>(h_lo_supplycost, LO_LEN, g_allocator);
+
+  int *d_d_datekey = loadToGPU<int>(h_d_datekey, D_LEN, g_allocator);
+  int *d_d_year = loadToGPU<int>(h_d_year, D_LEN, g_allocator);
+
+  int *d_p_partkey = loadToGPU<int>(h_p_partkey, P_LEN, g_allocator);
+  int *d_p_mfgr = loadToGPU<int>(h_p_mfgr, P_LEN, g_allocator);
+  int *d_p_category = loadToGPU<int>(h_p_category, P_LEN, g_allocator);
+
+  int *d_s_suppkey = loadToGPU<int>(h_s_suppkey, S_LEN, g_allocator);
+  int *d_s_region = loadToGPU<int>(h_s_region, S_LEN, g_allocator);
+  int *d_s_nation = loadToGPU<int>(h_s_nation, S_LEN, g_allocator);
+
+  int *d_c_custkey = loadToGPU<int>(h_c_custkey, C_LEN, g_allocator);
+  int *d_c_region = loadToGPU<int>(h_c_region, C_LEN, g_allocator);
+
+  cout << "** LOADED DATA TO GPU **" << endl;
+
+  for (int t = 0; t < num_trials; t++) {
+    float time_query;
+    time_query = runQuery(
+        d_lo_orderdate, d_lo_custkey, d_lo_partkey, d_lo_suppkey, d_lo_revenue,
+        d_lo_supplycost, LO_LEN, d_d_datekey, d_d_year, D_LEN, d_p_partkey,
+        d_p_mfgr, d_p_category, P_LEN, d_s_suppkey, d_s_region, d_s_nation,
+        S_LEN, d_c_custkey, d_c_region, C_LEN, g_allocator);
+    cout << "{"
+         << "\"query\":42"
+         << ",\"time_query\":" << time_query << "}" << endl;
+  }
+
+  return 0;
+}
diff --git a/crystal-opt/src/ssb/q43.cu b/crystal-opt/src/ssb/q43.cu
new file mode 100644
index 0000000..bee27c2
--- /dev/null
+++ b/crystal-opt/src/ssb/q43.cu
@@ -0,0 +1,405 @@
+// MIT License
+
+// Copyright (c) 2023 Jiashen Cao
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <curand.h>
+#include <iostream>
+#include <stdio.h>
+
+#include "cub/test/test_util.h"
+#include <cub/util_allocator.cuh>
+#include <cuda.h>
+
+#include "crystal.cuh"
+
+#include "gpu_utils.h"
+#include "ssb_utils.h"
+
+using namespace std;
+
+/**
+ * Globals, constants and typedefs
+ */
+bool g_verbose = false; // Display input/output to console (never read here)
+cub::CachingDeviceAllocator
+    g_allocator(true); // Caching device allocator passed to loadToGPU/runQuery
+
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void probe(int *lo_orderdate, int *lo_partkey, int *lo_custkey,
+                      int *lo_suppkey, int *lo_revenue, int *lo_supplycost,
+                      int lo_len, int *ht_p, int p_len, int *ht_s, int s_len,
+                      int *ht_c, int c_len, int *ht_d, int d_len, int *res) {
+  int items[ITEMS_PER_THREAD];
+  int selection_flags[ITEMS_PER_THREAD];
+  int brand[ITEMS_PER_THREAD];
+  int s_city[ITEMS_PER_THREAD];
+  int year[ITEMS_PER_THREAD];
+  int revenue[ITEMS_PER_THREAD];
+
+  int tile_offset = blockIdx.x * TILE_SIZE;
+  int num_tiles = (lo_len + TILE_SIZE - 1) / TILE_SIZE;
+  int num_tile_items = TILE_SIZE;
+  // Only the last tile can be short; it gets whatever rows remain.
+  if (blockIdx.x == num_tiles - 1) {
+    num_tile_items = lo_len - tile_offset;
+  }
+
+  InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags);
+  // Supplier probe: lo_suppkey keys against ht_s, s_city as output array.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_suppkey + tile_offset,
+                                                  items, num_tile_items);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, s_city, selection_flags, ht_s, s_len, num_tile_items);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags)) { return; }
+  // Customer probe against ht_c; this helper variant takes no output array.
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_custkey + tile_offset, items, num_tile_items, selection_flags);
+  BlockProbeAndPHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, selection_flags, ht_c, c_len, num_tile_items);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags)) { return; }
+  // Part probe: lo_partkey keys against ht_p, brand as output array.
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_partkey + tile_offset, items, num_tile_items, selection_flags);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, brand, selection_flags, ht_p, p_len, num_tile_items);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags)) { return; }
+  // Date probe against ht_d; 19920101 equals runQuery's d_val_min.
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_orderdate + tile_offset, items, num_tile_items, selection_flags);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, year, selection_flags, ht_d, d_len, 19920101, num_tile_items);
+  if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags)) { return; }
+  // Load both measures; items is reused to hold lo_supplycost.
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_revenue + tile_offset, revenue, num_tile_items, selection_flags);
+  BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      lo_supplycost + tile_offset, items, num_tile_items, selection_flags);
+
+#pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
+    if (threadIdx.x + (BLOCK_THREADS * ITEM) < num_tile_items) {
+      if (selection_flags[ITEM]) {
+        // Slot index from (year, s_city, brand). A slot is four ints:
+        // [0..2] group key, [3] int sum of (revenue - supplycost).
+        int hash = ((year[ITEM] - 1992) * 250 * 1000 + s_city[ITEM] * 1000 +
+                    brand[ITEM]) %
+                   ((1998 - 1992 + 1) * 250 * 1000);
+        res[hash * 4] = year[ITEM];
+        res[hash * 4 + 1] = s_city[ITEM];
+        res[hash * 4 + 2] = brand[ITEM];
+        atomicAdd(&res[hash * 4 + 3], (revenue[ITEM] - items[ITEM]));
+      }
+    }
+  }
+}
+
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_c(int *filter_col, int *dim_key, int num_tuples,
+                                  int *hash_table, int num_slots) {
+  // Builds a key-only hash table from the rows where filter_col equals 1.
+  int items[ITEMS_PER_THREAD];
+  int selection_flags[ITEMS_PER_THREAD];
+
+  int tile_offset = blockIdx.x * TILE_SIZE;
+  int num_tiles = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  int num_tile_items = TILE_SIZE;
+  if (blockIdx.x == num_tiles - 1) { // only the last tile can be short
+    num_tile_items = num_tuples - tile_offset;
+  }
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + tile_offset,
+                                                  items, num_tile_items);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, 1, selection_flags,
+                                                    num_tile_items);
+  // items is reused for the keys; presumably only flagged keys are inserted.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_offset, items,
+                                                  num_tile_items);
+  BlockBuildSelectivePHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, selection_flags, hash_table, num_slots, num_tile_items);
+}
+
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_p(int *filter_col, int *dim_key, int *dim_val,
+                                  int num_tuples, int *hash_table,
+                                  int num_slots) {
+  // Builds a key/value hash table from the rows where filter_col equals 3.
+  int items[ITEMS_PER_THREAD];
+  int items2[ITEMS_PER_THREAD];
+  int selection_flags[ITEMS_PER_THREAD];
+
+  int tile_offset = blockIdx.x * TILE_SIZE;
+  int num_tiles = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  int num_tile_items = TILE_SIZE;
+  if (blockIdx.x == num_tiles - 1) { // only the last tile can be short
+    num_tile_items = num_tuples - tile_offset;
+  }
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + tile_offset,
+                                                  items, num_tile_items);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, 3, selection_flags,
+                                                    num_tile_items);
+  // Keys go to items, values to items2; presumably only flagged pairs are kept.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_offset, items,
+                                                  num_tile_items);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_offset, items2,
+                                                  num_tile_items);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, items2, selection_flags, hash_table, num_slots, num_tile_items);
+}
+
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_s(int *filter_col, int *dim_key, int *dim_val,
+                                  int num_tuples, int *hash_table,
+                                  int num_slots) {
+  // Builds a key/value hash table from the rows where filter_col equals 24.
+  int items[ITEMS_PER_THREAD];
+  int items2[ITEMS_PER_THREAD];
+  int selection_flags[ITEMS_PER_THREAD];
+
+  int tile_offset = blockIdx.x * TILE_SIZE;
+  int num_tiles = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  int num_tile_items = TILE_SIZE;
+  if (blockIdx.x == num_tiles - 1) { // only the last tile can be short
+    num_tile_items = num_tuples - tile_offset;
+  }
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + tile_offset,
+                                                  items, num_tile_items);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, 24, selection_flags,
+                                                    num_tile_items);
+  // Keys go to items, values to items2; presumably only flagged pairs are kept.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_offset, items,
+                                                  num_tile_items);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_offset, items2,
+                                                  num_tile_items);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, items2, selection_flags, hash_table, num_slots, num_tile_items);
+}
+
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_d(int *dim_key, int *dim_val, int num_tuples,
+                                  int *hash_table, int num_slots, int val_min) {
+  // Builds a key/value hash table from rows whose dim_val is 1997 or 1998.
+  int items[ITEMS_PER_THREAD];
+  int items2[ITEMS_PER_THREAD];
+  int selection_flags[ITEMS_PER_THREAD];
+
+  int tile_offset = blockIdx.x * TILE_SIZE;
+  int num_tiles = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  int num_tile_items = TILE_SIZE;
+  if (blockIdx.x == num_tiles - 1) { // only the last tile can be short
+    num_tile_items = num_tuples - tile_offset;
+  }
+
+  InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_offset, items,
+                                                  num_tile_items);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, 1997, selection_flags, num_tile_items);
+  BlockPredOrEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, 1998, selection_flags, num_tile_items);
+  // Keys land in items2 here, so the builder receives (items2, items, val_min).
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_offset, items2,
+                                                  num_tile_items);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items2, items, selection_flags, hash_table, num_slots, val_min,
+      num_tile_items);
+}
+
+float runQuery(int *lo_orderdate, int *lo_custkey, int *lo_partkey,
+               int *lo_suppkey, int *lo_revenue, int *lo_supplycost, int lo_len,
+               int *d_datekey, int *d_year, int d_len, int *p_partkey,
+               int *p_category, int *p_brand1, int p_len, int *s_suppkey,
+               int *s_nation, int *s_city, int s_len, int *c_custkey,
+               int *c_region, int c_len,
+               cub::CachingDeviceAllocator &g_allocator) {
+  // One trial: build the four dimension hash tables, run the probe kernel and
+  // print the non-empty groups. Returns the cudaEventElapsedTime measurement.
+  SETUP_TIMING();
+  float time_query;
+  auto st = chrono::high_resolution_clock::now();
+  cudaEventRecord(start, 0);
+
+  int *ht_d, *ht_c, *ht_s, *ht_p;
+  int d_val_len = 19981230 - 19920101 + 1; // slot count used for ht_d
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&ht_d, 2 * d_val_len * sizeof(int)));
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&ht_s, 2 * s_len * sizeof(int)));
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&ht_c, 2 * c_len * sizeof(int)));
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&ht_p, 2 * p_len * sizeof(int)));
+
+  CubDebugExit(cudaMemset(ht_d, 0, 2 * d_val_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_s, 0, 2 * s_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_c, 0, 2 * c_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_p, 0, 2 * p_len * sizeof(int)));
+
+  int tile_items = 128 * 4; // matches the <128, 4> kernel instantiations
+  build_hashtable_s<128, 4><<<(s_len + tile_items - 1) / tile_items, 128>>>(
+      s_nation, s_suppkey, s_city, s_len, ht_s, s_len);
+
+  build_hashtable_c<128, 4><<<(c_len + tile_items - 1) / tile_items, 128>>>(
+      c_region, c_custkey, c_len, ht_c, c_len);
+
+  build_hashtable_p<128, 4><<<(p_len + tile_items - 1) / tile_items, 128>>>(
+      p_category, p_partkey, p_brand1, p_len, ht_p, p_len);
+
+  int d_val_min = 19920101;
+  build_hashtable_d<128, 4><<<(d_len + tile_items - 1) / tile_items, 128>>>(
+      d_datekey, d_year, d_len, ht_d, d_val_len, d_val_min);
+
+  int *res;
+  int res_size = ((1998 - 1992 + 1) * 250 * 1000);
+  int ht_entries = 4; // ints per result slot, see the probe kernel
+  int res_array_size = res_size * ht_entries;
+  CubDebugExit(
+      g_allocator.DeviceAllocate((void **)&res, res_array_size * sizeof(int)));
+
+  CubDebugExit(cudaMemset(res, 0, res_array_size * sizeof(int)));
+
+  // Run
+  probe<128, 4><<<(lo_len + tile_items - 1) / tile_items, 128>>>(
+      lo_orderdate, lo_partkey, lo_custkey, lo_suppkey, lo_revenue,
+      lo_supplycost, lo_len, ht_p, p_len, ht_s, s_len, ht_c, c_len, ht_d,
+      d_val_len, res);
+
+  cudaEventRecord(stop, 0);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&time_query, start, stop);
+
+  int *h_res = new int[res_array_size];
+  CubDebugExit(cudaMemcpy(h_res, res, res_array_size * sizeof(int),
+                          cudaMemcpyDeviceToHost));
+  auto finish = chrono::high_resolution_clock::now();
+  std::chrono::duration<double> diff = finish - st;
+
+  cout << "Result:" << endl; // printed after `finish` so it is not timed
+  int res_count = 0;
+  for (int i = 0; i < res_size; i++) {
+    if (h_res[4 * i] != 0) {
+      cout << h_res[4 * i] << " " << h_res[4 * i + 1] << " " << h_res[4 * i + 2]
+           << " " << h_res[4 * i + 3] << endl;
+      res_count += 1;
+    }
+  }
+
+  cout << "Res Count: " << res_count << endl;
+  cout << "Time Taken Total: " << diff.count() * 1000 << endl;
+
+  delete[] h_res;
+  // Give this trial's device buffers back so repeated trials do not leak.
+  CubDebugExit(g_allocator.DeviceFree(ht_d));
+  CubDebugExit(g_allocator.DeviceFree(ht_s));
+  CubDebugExit(g_allocator.DeviceFree(ht_c));
+  CubDebugExit(g_allocator.DeviceFree(ht_p));
+  CubDebugExit(g_allocator.DeviceFree(res));
+
+  return time_query;
+}
+
+/**
+ * Main
+ */
+int main(int argc, char **argv) {
+  int num_trials = 3; // default; the "t" command-line argument replaces it
+
+  // Initialize command line
+  CommandLineArgs args(argc, argv);
+  args.GetCmdLineArgument("t", num_trials);
+
+  // Print usage
+  if (args.CheckCmdLineFlag("help")) {
+    printf("%s "
+           "[--t=<num trials>] "
+           "[--v] "
+           "\n",
+           argv[0]);
+    exit(0);
+  }
+
+  // Initialize device
+  CubDebugExit(args.DeviceInit());
+  // Host copies (h_*) of the columns this query passes to runQuery.
+  int *h_lo_orderdate = loadColumn<int>("lo_orderdate", LO_LEN);
+  int *h_lo_suppkey = loadColumn<int>("lo_suppkey", LO_LEN);
+  int *h_lo_custkey = loadColumn<int>("lo_custkey", LO_LEN);
+  int *h_lo_partkey = loadColumn<int>("lo_partkey", LO_LEN);
+  int *h_lo_revenue = loadColumn<int>("lo_revenue", LO_LEN);
+  int *h_lo_supplycost = loadColumn<int>("lo_supplycost", LO_LEN);
+
+  int *h_d_datekey = loadColumn<int>("d_datekey", D_LEN);
+  int *h_d_year = loadColumn<int>("d_year", D_LEN);
+
+  int *h_s_suppkey = loadColumn<int>("s_suppkey", S_LEN);
+  int *h_s_nation = loadColumn<int>("s_nation", S_LEN);
+  int *h_s_city = loadColumn<int>("s_city", S_LEN);
+
+  int *h_p_partkey = loadColumn<int>("p_partkey", P_LEN);
+  int *h_p_category = loadColumn<int>("p_category", P_LEN);
+  int *h_p_brand1 = loadColumn<int>("p_brand1", P_LEN);
+
+  int *h_c_custkey = loadColumn<int>("c_custkey", C_LEN);
+  int *h_c_region = loadColumn<int>("c_region", C_LEN);
+
+  cout << "** LOADED DATA **" << endl;
+  // Device-side pointers (d_*) returned by loadToGPU for each host column.
+  int *d_lo_orderdate = loadToGPU<int>(h_lo_orderdate, LO_LEN, g_allocator);
+  int *d_lo_custkey = loadToGPU<int>(h_lo_custkey, LO_LEN, g_allocator);
+  int *d_lo_suppkey = loadToGPU<int>(h_lo_suppkey, LO_LEN, g_allocator);
+  int *d_lo_partkey = loadToGPU<int>(h_lo_partkey, LO_LEN, g_allocator);
+  int *d_lo_revenue = loadToGPU<int>(h_lo_revenue, LO_LEN, g_allocator);
+  int *d_lo_supplycost = loadToGPU<int>(h_lo_supplycost, LO_LEN, g_allocator);
+
+  int *d_d_datekey = loadToGPU<int>(h_d_datekey, D_LEN, g_allocator);
+  int *d_d_year = loadToGPU<int>(h_d_year, D_LEN, g_allocator);
+
+  int *d_p_partkey = loadToGPU<int>(h_p_partkey, P_LEN, g_allocator);
+  int *d_p_category = loadToGPU<int>(h_p_category, P_LEN, g_allocator);
+  int *d_p_brand1 = loadToGPU<int>(h_p_brand1, P_LEN, g_allocator);
+
+  int *d_s_suppkey = loadToGPU<int>(h_s_suppkey, S_LEN, g_allocator);
+  int *d_s_nation = loadToGPU<int>(h_s_nation, S_LEN, g_allocator);
+  int *d_s_city = loadToGPU<int>(h_s_city, S_LEN, g_allocator);
+
+  int *d_c_custkey = loadToGPU<int>(h_c_custkey, C_LEN, g_allocator);
+  int *d_c_region = loadToGPU<int>(h_c_region, C_LEN, g_allocator);
+
+  cout << "** LOADED DATA TO GPU **" << endl;
+  // One runQuery call and one JSON timing line per trial.
+  for (int t = 0; t < num_trials; t++) {
+    float time_query;
+    time_query = runQuery(
+        d_lo_orderdate, d_lo_custkey, d_lo_partkey, d_lo_suppkey, d_lo_revenue,
+        d_lo_supplycost, LO_LEN, d_d_datekey, d_d_year, D_LEN, d_p_partkey,
+        d_p_category, d_p_brand1, P_LEN, d_s_suppkey, d_s_nation, d_s_city,
+        S_LEN, d_c_custkey, d_c_region, C_LEN, g_allocator);
+    cout << "{"
+         << "\"query\":43"
+         << ",\"time_query\":" << time_query << "}" << endl;
+  }
+
+  return 0;
+}
diff --git a/crystal-opt/src/ssb/ssb_utils.h b/crystal-opt/src/ssb/ssb_utils.h
new file mode 100644
index 0000000..a347b14
--- /dev/null
+++ b/crystal-opt/src/ssb/ssb_utils.h
@@ -0,0 +1,177 @@
+#include <fstream>
+#include <iostream>
+#include <string>
+
+/*#include <cuda.h>*/
+/*#include <cub/util_allocator.cuh>*/
+
+using namespace std;
+
+#define SF 10
+
+#define LOAD_TYPE 0
+
+#define BASE_PATH ""
+
+#if SF == 1
+#define DATA_DIR BASE_PATH "sf1_column_bin/"
+#define LO_LEN 6001171
+#define P_LEN 200000
+#define S_LEN 2000
+#define C_LEN 30000
+#define D_LEN 2556
+#elif SF == 2
+#define DATA_DIR BASE_PATH "sf2_column_bin/"
+#define LO_LEN 11998051
+#define P_LEN 400000
+#define S_LEN 4000
+#define C_LEN 60000
+#define D_LEN 2556
+#elif SF == 4
+#define DATA_DIR BASE_PATH "sf4_column_bin/"
+#define LO_LEN 23996670
+#define P_LEN 600000
+#define S_LEN 8000
+#define C_LEN 120000
+#define D_LEN 2556
+#elif SF == 8
+#define DATA_DIR BASE_PATH "sf8_column_bin/"
+#define LO_LEN 47989129
+#define P_LEN 800000
+#define S_LEN 16000
+#define C_LEN 240000
+#define D_LEN 2556
+#elif SF == 10
+#define DATA_DIR BASE_PATH "/home/ubuntu/fff/gpu/data/ssb/data/s10_columnar/" // NOTE(review): absolute, machine-specific path
+#define LO_LEN 59986214
+#define P_LEN 800000
+#define S_LEN 20000
+#define C_LEN 300000
+#define D_LEN 2556
+#elif SF == 16
+#define DATA_DIR BASE_PATH "/home/ubuntu/fff/gpu/data/ssb/data/s1_columnar/" // NOTE(review): directory is named s1_columnar in the SF == 16 branch; confirm it is intended
+#define LO_LEN 95988758
+#define P_LEN 1000000
+#define S_LEN 32000
+#define C_LEN 480000
+#define D_LEN 2556
+#elif SF == 32
+#define DATA_DIR BASE_PATH "sf32_column_bin/"
+#define LO_LEN 192000754
+#define P_LEN 1200000
+#define S_LEN 64000
+#define C_LEN 960000
+#define D_LEN 2556
+#elif SF == 64
+#define DATA_DIR BASE_PATH "sf64_column_bin/"
+#define LO_LEN 384016864
+#define P_LEN 1400000
+#define S_LEN 128000
+#define C_LEN 1920000
+#define D_LEN 2556
+#elif SF == 128
+#define DATA_DIR BASE_PATH "sf128_column_bin/"
+#define LO_LEN 768047048
+#define P_LEN 1600000
+#define S_LEN 256000
+#define C_LEN 3840000
+#define D_LEN 2556
+#else // 20
+#define DATA_DIR BASE_PATH "s20_columnar/"
+#define LO_LEN 119994746
+#define P_LEN 1000000
+#define S_LEN 40000
+#define C_LEN 600000
+#define D_LEN 2556
+#endif
+
+// Linear search: returns the first position in arr[0..len) equal to val, or -1.
+int index_of(string *arr, int len, string val) {
+  int pos = 0;
+  while (pos < len && !(arr[pos] == val))
+    ++pos;
+  return pos < len ? pos : -1;
+}
+
+string lookup(string col_name) { // Maps a column name to "<TABLE><position of the name in that table's list>".
+  string lineorder[] = {"lo_orderkey",      "lo_linenumber",    "lo_custkey",
+                        "lo_partkey",       "lo_suppkey",       "lo_orderdate",
+                        "lo_orderpriority", "lo_shippriority",  "lo_quantity",
+                        "lo_extendedprice", "lo_ordtotalprice", "lo_discount",
+                        "lo_revenue",       "lo_supplycost",    "lo_tax",
+                        "lo_commitdate",    "lo_shipmode"};
+  string part[] = {"p_partkey", "p_name", "p_mfgr", "p_category", "p_brand1",
+                   "p_color",   "p_type", "p_size", "p_container"};
+  string supplier[] = {"s_suppkey", "s_name",   "s_address", "s_city",
+                       "s_nation",  "s_region", "s_phone"};
+  string customer[] = {"c_custkey", "c_name",   "c_address", "c_city",
+                       "c_nation",  "c_region", "c_phone",   "c_mktsegment"};
+  string date[] = {"d_datekey",
+                   "d_date",
+                   "d_dayofweek",
+                   "d_month",
+                   "d_year",
+                   "d_yearmonthnum",
+                   "d_yearmonth",
+                   "d_daynuminweek",
+                   "d_daynuminmonth",
+                   "d_daynuminyear",
+                   "d_sellingseason",
+                   "d_lastdayinweekfl",
+                   "d_lastdayinmonthfl",
+                   "d_holidayfl",
+                   "d_weekdayfl"};
+  // The first character selects the table; the literal lengths below must match the array sizes above.
+  if (col_name[0] == 'l') {
+    int index = index_of(lineorder, 17, col_name);
+    return "LINEORDER" + to_string(index); // index is -1 for a name not in the list, giving "LINEORDER-1"
+  } else if (col_name[0] == 's') {
+    int index = index_of(supplier, 7, col_name);
+    return "SUPPLIER" + to_string(index);
+  } else if (col_name[0] == 'c') {
+    int index = index_of(customer, 8, col_name);
+    return "CUSTOMER" + to_string(index);
+  } else if (col_name[0] == 'p') {
+    int index = index_of(part, 9, col_name);
+    return "PART" + to_string(index);
+  } else if (col_name[0] == 'd') {
+    int index = index_of(date, 15, col_name);
+    return "DDATE" + to_string(index);
+  }
+  // No recognised table prefix: an empty string is returned.
+  return "";
+}
+
+// Loads num_entries values of type T from the file DATA_DIR + lookup(col_name)
+// into a new[] buffer that the caller owns. Returns NULL if the file cannot be
+// opened. The outcome of read() is not checked, so a short file goes unreported.
+template <typename T> T *loadColumn(string col_name, int num_entries) {
+  string filename = DATA_DIR + lookup(col_name);
+  ifstream colData(filename.c_str(), ios::in | ios::binary);
+  if (!colData) return NULL;  // nothing allocated yet, so this path cannot leak
+  T *h_col = new T[num_entries];
+  colData.read((char *)h_col, num_entries * sizeof(T));
+  return h_col;
+}
+
+template <typename T> // Writes num_entries * sizeof(T) bytes starting at h_col to the file DATA_DIR + lookup(col_name).
+int storeColumn(string col_name, int num_entries, int *h_col) { // returns 0, or -1 if the file cannot be opened
+  string filename = DATA_DIR + lookup(col_name);
+  ofstream colData(filename.c_str(), ios::out | ios::binary);
+  if (!colData) {
+    return -1;
+  }
+  // NOTE(review): h_col is int* while the byte count uses sizeof(T); confirm callers only use T = int.
+  colData.write((char *)h_col, num_entries * sizeof(T));
+  return 0;
+}
+
+/*int main() {*/
+// int *h_col = new int[10];
+// for (int i=0; i<10; i++) h_col[i] = i;
+// storeColumn<int>("test", 10, h_col);
+// int *l_col = loadColumn<int>("test", 10);
+// for (int i=0; i<10; i++) cout << l_col[i] << " ";
+// cout << endl;
+// return 0;
+/*}*/
\ No newline at end of file
diff --git a/crystal/CMakeLists.txt b/crystal/CMakeLists.txt
new file mode 100644
index 0000000..2721300
--- /dev/null
+++ b/crystal/CMakeLists.txt
@@ -0,0 +1,3 @@
+# Source : -------------------------------------------------------------------------------------------------------------
+add_subdirectory(src)
+
diff --git a/crystal/LICENSE b/crystal/LICENSE
new file mode 100644
index 0000000..beb8041
--- /dev/null
+++ b/crystal/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2020 Anil Shanbhag
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/crystal/README.md b/crystal/README.md
new file mode 100644
index 0000000..47bd715
--- /dev/null
+++ b/crystal/README.md
@@ -0,0 +1,79 @@
+Crystal GPU Library
+=================
+
+The Crystal library provides a collection of block-wide device functions for building high-performance implementations of SQL queries on GPUs.
+
+The package contains:
+
+* Crystal: `crystal/` contains the block-wide device functions
+* Implementations: `src/` contains SQL query operator implementations and implementations of 13 queries of the Star Schema Benchmark
+
+For full details of Crystal, see our [paper](http://anilshanbhag.in/static/papers/crystal_sigmod20.pdf).
+
+```
+@inproceedings{shanbhag2020crystal,
+  author = {Shanbhag, Anil and Madden, Samuel and Yu, Xiangyao},
+  title = {A Study of the Fundamental Performance Characteristics of GPUs and CPUs for Database Analytics},
+  year = {2020},
+  url = {https://doi.org/10.1145/3318464.3380595},
+  doi = {10.1145/3318464.3380595},
+  booktitle = {Proceedings of the 2020 ACM SIGMOD International Conference on Management of Data},
+  pages = {1617–1632},
+  numpages = {16},
+  location = {Portland, OR, USA},
+  series = {SIGMOD ’20}
+}
+```
+
+Usage
+----
+
+To use Crystal:
+
+* Copy the `crystal` directory into your project.
+* Include Crystal
+```
+#include "crystal/crystal.cuh"
+```
+* Add the crystal directory to your include path
+
+To run the operator implementations:
+
+* Compile and run the operator. E.g.,
+```
+make bin/ops/project
+./bin/ops/project
+```
+
+To run the Star Schema Benchmark implementation:
+
+* Generate the test dataset
+
+```
+cd test/
+
+# Generate the test generator / transformer binaries
+cd ssb/dbgen
+make
+cd ../loader
+make 
+cd ../../
+
+# Generate the test data and transform into columnar layout
+# Substitute <SF> with appropriate scale factor (eg: 1)
+python util.py ssb <SF> gen
+python util.py ssb <SF> transform
+```
+
+* Configure the benchmark settings
+```
+cd src/ssb/
+# Edit SF and BASE_PATH in ssb_utils.h
+```
+
+* To run a query, for example q11:
+```
+make bin/ssb/q11
+./bin/ssb/q11
+```
+
diff --git a/crystal/src/CMakeLists.txt b/crystal/src/CMakeLists.txt
new file mode 100644
index 0000000..466035d
--- /dev/null
+++ b/crystal/src/CMakeLists.txt
@@ -0,0 +1,43 @@
+add_library(crystal STATIC ops/join.cu ops/project.cu) # static library built from the two operator sources
+target_include_directories(crystal PUBLIC ops)
+target_include_directories(crystal PUBLIC ssb)
+target_include_directories(crystal PUBLIC crystal)
+# One executable per SSB query source file; each one links against the crystal library.
+add_executable(crystal_q11 ssb/q11.cu)
+target_link_libraries(crystal_q11 crystal)
+
+add_executable(crystal_q12 ssb/q12.cu)
+target_link_libraries(crystal_q12 crystal)
+
+add_executable(crystal_q13 ssb/q13.cu)
+target_link_libraries(crystal_q13 crystal)
+
+add_executable(crystal_q21 ssb/q21.cu)
+target_link_libraries(crystal_q21 crystal)
+
+add_executable(crystal_q22 ssb/q22.cu)
+target_link_libraries(crystal_q22 crystal)
+
+add_executable(crystal_q23 ssb/q23.cu)
+target_link_libraries(crystal_q23 crystal)
+
+add_executable(crystal_q31 ssb/q31.cu)
+target_link_libraries(crystal_q31 crystal)
+
+add_executable(crystal_q32 ssb/q32.cu)
+target_link_libraries(crystal_q32 crystal)
+
+add_executable(crystal_q33 ssb/q33.cu)
+target_link_libraries(crystal_q33 crystal)
+
+add_executable(crystal_q34 ssb/q34.cu)
+target_link_libraries(crystal_q34 crystal)
+
+add_executable(crystal_q41 ssb/q41.cu)
+target_link_libraries(crystal_q41 crystal)
+
+add_executable(crystal_q42 ssb/q42.cu)
+target_link_libraries(crystal_q42 crystal)
+
+add_executable(crystal_q43 ssb/q43.cu)
+target_link_libraries(crystal_q43 crystal)
\ No newline at end of file
diff --git a/crystal/src/crystal/crystal.cuh b/crystal/src/crystal/crystal.cuh
new file mode 100644
index 0000000..ddce5b8
--- /dev/null
+++ b/crystal/src/crystal/crystal.cuh
@@ -0,0 +1,9 @@
+#pragma once
+
+// Block-wide functions
+#include "load.cuh"
+#include "pred.cuh"
+#include "store.cuh"
+#include "reduce.cuh"
+#include "join.cuh"
+
diff --git a/crystal/src/crystal/join.cuh b/crystal/src/crystal/join.cuh
new file mode 100644
index 0000000..eabb74c
--- /dev/null
+++ b/crystal/src/crystal/join.cuh
@@ -0,0 +1,311 @@
+#pragma once
+
+#define HASH(X,Y,Z) (((X)-(Z)) % (Y)) // (X - Z) mod Y; each argument is parenthesized so expression arguments expand correctly
+
+// Probes a key-only hash table. For every item whose selection flag is set, the
+// flag becomes 1 when slot HASH(item, ht_len, keys_min) of ht is non-zero and 0
+// when that slot is zero. Items whose flag is already 0 are not looked up.
+// tid is not referenced by this overload.
+template<typename K, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockProbeDirectAndPHT_1(
+    int tid,
+    K (&items)[ITEMS_PER_THREAD],
+    int (&selection_flags)[ITEMS_PER_THREAD],
+    K* ht,
+    int ht_len,
+    K keys_min) {
+  #pragma unroll
+  for (int i = 0; i < ITEMS_PER_THREAD; ++i) {
+    if (!selection_flags[i]) {
+      continue;
+    }
+
+    const int idx = HASH(items[i], ht_len, keys_min);
+    // Non-zero slot keeps the item, zero slot drops it.
+    selection_flags[i] = (ht[idx] != 0) ? 1 : 0;
+  }
+}
+
+// Bounds-checked probe of a key-only hash table. Item ITEM of thread tid is
+// examined only when tid + ITEM * BLOCK_THREADS < num_items and its selection
+// flag is set; its flag then becomes 1 if slot HASH(item, ht_len, keys_min) of
+// ht is non-zero and 0 if that slot is zero. Every other flag is left as it
+// was.
+template<typename K, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockProbeDirectAndPHT_1(
+    int tid,
+    K (&items)[ITEMS_PER_THREAD],
+    int (&selection_flags)[ITEMS_PER_THREAD],
+    K* ht,
+    int ht_len,
+    K keys_min,
+    int num_items) {
+  #pragma unroll
+  for (int i = 0; i < ITEMS_PER_THREAD; ++i) {
+    const bool in_range = tid + (i * BLOCK_THREADS) < num_items;
+    if (!in_range || !selection_flags[i]) {
+      continue;
+    }
+
+    const int idx = HASH(items[i], ht_len, keys_min);
+    // Non-zero slot keeps the item, zero slot drops it.
+    selection_flags[i] = (ht[idx] != 0) ? 1 : 0;
+  }
+}
+
+// Block-level probe of a key-only hash table, using threadIdx.x as the thread
+// index. A full tile (num_items == BLOCK_THREADS * ITEMS_PER_THREAD) takes the
+// unguarded overload; any other num_items takes the bounds-checked overload.
+template<typename K, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockProbeAndPHT_1(
+    K (&items)[ITEMS_PER_THREAD],
+    int (&selection_flags)[ITEMS_PER_THREAD],
+    K* ht, int ht_len, K keys_min, int num_items) {
+  if (num_items != BLOCK_THREADS * ITEMS_PER_THREAD) {
+    BlockProbeDirectAndPHT_1<K, BLOCK_THREADS, ITEMS_PER_THREAD>(
+        threadIdx.x, items, selection_flags, ht, ht_len, keys_min, num_items);
+    return;
+  }
+  BlockProbeDirectAndPHT_1<K, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      threadIdx.x, items, selection_flags, ht, ht_len, keys_min);
+}
+
+// Convenience overload of BlockProbeAndPHT_1 that probes with keys_min = 0.
+template<typename K, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockProbeAndPHT_1(
+    K (&items)[ITEMS_PER_THREAD],
+    int (&selection_flags)[ITEMS_PER_THREAD],
+    K* ht, int ht_len, int num_items) {
+  const K no_offset = 0;  // HASH then subtracts nothing from the key
+  BlockProbeAndPHT_1<K, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, selection_flags, ht, ht_len, no_offset, num_items);
+}
+
+template<typename K, typename V, int BLOCK_THREADS, int ITEMS_PER_THREAD> // Probes a table stored as element pairs (2*hash, 2*hash + 1) of ht.
+__device__ __forceinline__ void BlockProbeDirectAndPHT_2(
+    int tid, // not referenced by this overload
+    K  (&keys)[ITEMS_PER_THREAD], // keys to look up
+    V  (&res)[ITEMS_PER_THREAD], // receives the looked-up value for items that hit
+    int  (&selection_flags)[ITEMS_PER_THREAD], // only set flags are probed; a miss clears the flag
+    K* ht,
+    int ht_len,
+    K keys_min
+    ) {
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+    if (selection_flags[ITEM]) {
+      int hash = HASH(keys[ITEM], ht_len, keys_min);
+      // Reads elements 2*hash and 2*hash + 1 of ht as one 64-bit word; assumes K is 4 bytes wide — TODO confirm.
+      uint64_t slot = *reinterpret_cast<uint64_t*>(&ht[hash << 1]);
+      if (slot != 0) {
+        res[ITEM] = (slot >> 32); // upper 32 bits of the 64-bit word
+      } else {
+        selection_flags[ITEM] = 0; // the whole 64-bit word is zero
+      }
+    }
+  }
+}
+
+template<typename K, typename V, int BLOCK_THREADS, int ITEMS_PER_THREAD> // Bounds-checked probe of a table stored as element pairs (2*hash, 2*hash + 1) of ht.
+__device__ __forceinline__ void BlockProbeDirectAndPHT_2(
+    int tid, // thread index used for the num_items bounds check
+    K  (&items)[ITEMS_PER_THREAD], // keys to look up
+    V  (&res)[ITEMS_PER_THREAD], // receives the looked-up value for items that hit
+    int  (&selection_flags)[ITEMS_PER_THREAD], // only set flags are probed; a miss clears the flag
+    K* ht,
+    int ht_len,
+    K keys_min,
+    int num_items // items at tid + ITEM * BLOCK_THREADS >= num_items are skipped entirely
+    ) {
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+    if (tid + (ITEM * BLOCK_THREADS) < num_items) {
+      if (selection_flags[ITEM]) {
+        int hash = HASH(items[ITEM], ht_len, keys_min);
+        // Reads elements 2*hash and 2*hash + 1 of ht as one 64-bit word; assumes K is 4 bytes wide — TODO confirm.
+        uint64_t slot = *reinterpret_cast<uint64_t*>(&ht[hash << 1]);
+        if (slot != 0) {
+          res[ITEM] = (slot >> 32); // upper 32 bits of the 64-bit word
+        } else {
+          selection_flags[ITEM] = 0; // the whole 64-bit word is zero
+        }
+      }
+    }
+  }
+}
+
+// Block-level probe of a key/value hash table, using threadIdx.x as the thread
+// index. A full tile (num_items == BLOCK_THREADS * ITEMS_PER_THREAD) takes the
+// unguarded overload; any other num_items takes the bounds-checked overload.
+template<typename K, typename V, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockProbeAndPHT_2(
+    K (&keys)[ITEMS_PER_THREAD],
+    V (&res)[ITEMS_PER_THREAD],
+    int (&selection_flags)[ITEMS_PER_THREAD],
+    K* ht, int ht_len, K keys_min, int num_items) {
+  if (num_items != BLOCK_THREADS * ITEMS_PER_THREAD) {
+    BlockProbeDirectAndPHT_2<K, V, BLOCK_THREADS, ITEMS_PER_THREAD>(
+        threadIdx.x, keys, res, selection_flags, ht, ht_len, keys_min, num_items);
+    return;
+  }
+  BlockProbeDirectAndPHT_2<K, V, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      threadIdx.x, keys, res, selection_flags, ht, ht_len, keys_min);
+}
+
+// Convenience overload of BlockProbeAndPHT_2 that probes with keys_min = 0.
+template<typename K, typename V, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockProbeAndPHT_2(
+    K (&keys)[ITEMS_PER_THREAD],
+    V (&res)[ITEMS_PER_THREAD],
+    int (&selection_flags)[ITEMS_PER_THREAD],
+    K* ht, int ht_len, int num_items) {
+  const K no_offset = 0;  // HASH then subtracts nothing from the key
+  BlockProbeAndPHT_2<K, V, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, res, selection_flags, ht, ht_len, no_offset, num_items);
+}
+
+// Inserts every selected key into a key-only hash table: the key is written to
+// slot HASH(key, ht_len, keys_min) of ht with atomicCAS, i.e. only if that slot
+// still holds 0. tid is not referenced by this overload.
+template<typename K, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockBuildDirectSelectivePHT_1(
+    int tid,
+    K  (&keys)[ITEMS_PER_THREAD],
+    int  (&selection_flags)[ITEMS_PER_THREAD],
+    K* ht,
+    int ht_len, K keys_min) {
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+    if (selection_flags[ITEM]) {
+      int hash = HASH(keys[ITEM], ht_len, keys_min);
+      atomicCAS(&ht[hash], 0, keys[ITEM]);  // the previous slot value is not needed
+    }
+  }
+}
+
+// Bounds-checked insert into a key-only hash table. Item ITEM of thread tid is
+// inserted only when tid + ITEM * BLOCK_THREADS < num_items and its selection
+// flag is set; the key is written to slot HASH(key, ht_len, keys_min) of ht
+// with atomicCAS, i.e. only if that slot still holds 0.
+template<typename K, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockBuildDirectSelectivePHT_1(
+    int tid,
+    K  (&items)[ITEMS_PER_THREAD],
+    int  (&selection_flags)[ITEMS_PER_THREAD],
+    K* ht,
+    int ht_len,
+    K keys_min,
+    int num_items) {
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+    if (tid + (ITEM * BLOCK_THREADS) < num_items && selection_flags[ITEM]) {
+      int hash = HASH(items[ITEM], ht_len, keys_min);
+      atomicCAS(&ht[hash], 0, items[ITEM]);  // the previous slot value is not needed
+    }
+  }
+}
+
+// Block-level insert into a key-only hash table, using threadIdx.x as the thread
+// index. A full tile (num_items == BLOCK_THREADS * ITEMS_PER_THREAD) takes the
+// unguarded overload; any other num_items takes the bounds-checked overload.
+template<typename K, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockBuildSelectivePHT_1(
+    K (&keys)[ITEMS_PER_THREAD],
+    int (&selection_flags)[ITEMS_PER_THREAD],
+    K* ht, int ht_len, K keys_min, int num_items) {
+  if (num_items != BLOCK_THREADS * ITEMS_PER_THREAD) {
+    BlockBuildDirectSelectivePHT_1<K, BLOCK_THREADS, ITEMS_PER_THREAD>(
+        threadIdx.x, keys, selection_flags, ht, ht_len, keys_min, num_items);
+    return;
+  }
+  BlockBuildDirectSelectivePHT_1<K, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      threadIdx.x, keys, selection_flags, ht, ht_len, keys_min);
+}
+
+// Convenience overload of BlockBuildSelectivePHT_1 that builds with keys_min = 0.
+template<typename K, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockBuildSelectivePHT_1(
+    K (&keys)[ITEMS_PER_THREAD],
+    int (&selection_flags)[ITEMS_PER_THREAD],
+    K* ht, int ht_len, int num_items) {
+  const K no_offset = 0;  // HASH then subtracts nothing from the key
+  BlockBuildSelectivePHT_1<K, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, selection_flags, ht, ht_len, no_offset, num_items);
+}
+
+// Inserts every selected (key, value) pair into a table stored as element pairs:
+// the key goes to ht[2*hash] via atomicCAS (only if that element is still 0) and
+// the value goes to ht[2*hash + 1]. tid is not referenced by this overload.
+template<typename K, typename V, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockBuildDirectSelectivePHT_2(
+    int tid,
+    K  (&keys)[ITEMS_PER_THREAD],
+    V  (&res)[ITEMS_PER_THREAD],
+    int  (&selection_flags)[ITEMS_PER_THREAD],
+    K* ht, int ht_len, K keys_min) {
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+    if (selection_flags[ITEM]) {
+      int hash = HASH(keys[ITEM], ht_len, keys_min);
+      atomicCAS(&ht[hash << 1], 0, keys[ITEM]);  // the previous slot value is not needed
+      // NOTE(review): the value is stored even if the CAS did not install this key; confirm keys never share a slot.
+      ht[(hash << 1) + 1] = res[ITEM];
+    }
+  }
+}
+
+// Bounds-checked insert of (key, value) pairs into a table stored as element
+// pairs. Item ITEM of thread tid is inserted only when tid + ITEM * BLOCK_THREADS
+// < num_items and its flag is set: key to ht[2*hash] via atomicCAS, value to ht[2*hash + 1].
+template<typename K, typename V, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockBuildDirectSelectivePHT_2(
+    int tid,
+    K  (&keys)[ITEMS_PER_THREAD],
+    V  (&res)[ITEMS_PER_THREAD],
+    int  (&selection_flags)[ITEMS_PER_THREAD],
+    K* ht,
+    int ht_len,
+    K keys_min,
+    int num_items) {
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+    if (tid + (ITEM * BLOCK_THREADS) < num_items && selection_flags[ITEM]) {
+      int hash = HASH(keys[ITEM], ht_len, keys_min);
+      atomicCAS(&ht[hash << 1], 0, keys[ITEM]);  // the previous slot value is not needed
+      // NOTE(review): the value is stored even if the CAS did not install this key; confirm keys never share a slot.
+      ht[(hash << 1) + 1] = res[ITEM];
+    }
+  }
+}
+
+// Block-level insert into a key/value hash table, using threadIdx.x as the thread
+// index. A full tile takes the unguarded overload, otherwise the bounds-checked one.
+template<typename K, typename V, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockBuildSelectivePHT_2(
+    K (&keys)[ITEMS_PER_THREAD],
+    V (&res)[ITEMS_PER_THREAD],
+    int (&selection_flags)[ITEMS_PER_THREAD],
+    K* ht,
+    int ht_len,
+    K keys_min,
+    int num_items) {
+  if (num_items != BLOCK_THREADS * ITEMS_PER_THREAD) {
+    BlockBuildDirectSelectivePHT_2<K, V, BLOCK_THREADS, ITEMS_PER_THREAD>(
+        threadIdx.x, keys, res, selection_flags, ht, ht_len, keys_min, num_items);
+    return;
+  }
+  BlockBuildDirectSelectivePHT_2<K, V, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      threadIdx.x, keys, res, selection_flags, ht, ht_len, keys_min);
+}
+
+// Convenience overload of BlockBuildSelectivePHT_2 that builds with keys_min = 0.
+template<typename K, typename V, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockBuildSelectivePHT_2(
+    K (&keys)[ITEMS_PER_THREAD],
+    V (&res)[ITEMS_PER_THREAD],
+    int (&selection_flags)[ITEMS_PER_THREAD],
+    K* ht, int ht_len, int num_items) {
+  const K no_offset = 0;  // HASH then subtracts nothing from the key
+  BlockBuildSelectivePHT_2<K, V, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, res, selection_flags, ht, ht_len, no_offset, num_items);
+}
diff --git a/crystal/src/crystal/load.cuh b/crystal/src/crystal/load.cuh
new file mode 100644
index 0000000..bf18fe8
--- /dev/null
+++ b/crystal/src/crystal/load.cuh
@@ -0,0 +1,97 @@
+#pragma once
+
+// Unguarded tile load: thread tid copies block_itr[tid + k * BLOCK_THREADS] into
+// items[k] for k = 0 .. ITEMS_PER_THREAD - 1. No bounds check is performed.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockLoadDirect(
+    const unsigned int tid,
+    T* block_itr,
+    T (&items)[ITEMS_PER_THREAD]) {
+  T* const lane_ptr = block_itr + tid;
+  #pragma unroll
+  for (int k = 0; k < ITEMS_PER_THREAD; ++k) {
+    items[k] = lane_ptr[k * BLOCK_THREADS];
+  }
+}
+
+// Bounds-checked tile load: thread tid copies block_itr[tid + k * BLOCK_THREADS]
+// into items[k] only when tid + k * BLOCK_THREADS < num_items. Entries of items
+// that fall outside that range are not written.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockLoadDirect(
+    const unsigned int tid,
+    T* block_itr,
+    T (&items)[ITEMS_PER_THREAD],
+    int num_items) {
+  T* const lane_ptr = block_itr + tid;
+  #pragma unroll
+  for (int k = 0; k < ITEMS_PER_THREAD; ++k) {
+    const bool in_range = tid + (k * BLOCK_THREADS) < num_items;
+    if (in_range) items[k] = lane_ptr[k * BLOCK_THREADS];
+  }
+}
+
+// Loads one tile starting at inp into items, using threadIdx.x as the thread
+// index. A full tile (num_items == BLOCK_THREADS * ITEMS_PER_THREAD) takes the
+// unguarded overload; any other num_items takes the bounds-checked overload.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockLoad(
+    T* inp,
+    T (&items)[ITEMS_PER_THREAD],
+    int num_items) {
+  if (num_items != BLOCK_THREADS * ITEMS_PER_THREAD) {
+    BlockLoadDirect<T, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, inp, items, num_items);
+    return;
+  }
+  BlockLoadDirect<T, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, inp, items);
+}
+
+#if 0
+
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockLoadDirect(
+    int tid,
+    T* block_itr,
+    T  (&items)[ITEMS_PER_THREAD]
+    ) {
+  T* thread_itr = block_itr + tid;
+
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+    items[ITEM] = thread_itr[ITEM * BLOCK_THREADS];
+  }
+}
+
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockLoadDirect(
+    int tid,
+    T* block_itr,
+    T  (&items)[ITEMS_PER_THREAD]
+    int num_items
+    ) {
+  T* thread_itr = block_itr + tid;
+
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+    if (tid + (ITEM * BLOCK_THREADS) < num_items) {
+      items[ITEM] = thread_itr[ITEM * BLOCK_THREADS];
+    }
+  }
+}
+
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockLoad(
+    T* inp,
+    T  (&items)[ITEMS_PER_THREAD]
+    int num_items
+    ) {
+  T* block_itr = inp + blockIdx.x * blockDim.x;
+
+  if ((BLOCK_THREADS * ITEMS_PER_THREAD) == num_items) {
+    BlockLoadDirect(threadIdx.x, block_itr, items);
+  } else {
+    BlockLoadDirect(threadIdx.x, block_itr, items, num_items);
+  }
+}
+
+#endif
diff --git a/crystal/src/crystal/pred.cuh b/crystal/src/crystal/pred.cuh
new file mode 100644
index 0000000..491f96e
--- /dev/null
+++ b/crystal/src/crystal/pred.cuh
@@ -0,0 +1,335 @@
+#pragma once
+
+// Sets all ITEMS_PER_THREAD selection flags of the calling thread to 1.
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void InitFlags(
+    int (&selection_flags)[ITEMS_PER_THREAD]) {
+  #pragma unroll
+  for (int i = 0; i < ITEMS_PER_THREAD; ++i) {
+    selection_flags[i] = 1;
+  }
+}
+
+// Overwrites every flag with select_op(item); tid is not referenced here.
+template<typename T, typename SelectOp, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredDirect(
+    int tid,
+    T (&items)[ITEMS_PER_THREAD],
+    SelectOp select_op,
+    int (&selection_flags)[ITEMS_PER_THREAD]) {
+  #pragma unroll
+  for (int i = 0; i < ITEMS_PER_THREAD; ++i) {
+    selection_flags[i] = select_op(items[i]);
+  }
+}
+
+// Bounds-checked predicate: flag i is overwritten with select_op(items[i]) only
+// when tid + i * BLOCK_THREADS < num_items; out-of-range flags are not touched.
+template<typename T, typename SelectOp, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredDirect(
+    int tid,
+    T (&items)[ITEMS_PER_THREAD],
+    SelectOp select_op,
+    int (&selection_flags)[ITEMS_PER_THREAD],
+    int num_items) {
+  #pragma unroll
+  for (int i = 0; i < ITEMS_PER_THREAD; ++i) {
+    const bool in_range = tid + (i * BLOCK_THREADS) < num_items;
+    if (in_range) selection_flags[i] = select_op(items[i]);
+  }
+}
+
+// Evaluates select_op over the tile with threadIdx.x as the thread index. A full
+// tile takes the unguarded overload, any other num_items the bounds-checked one.
+template<typename T, typename SelectOp, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPred(
+    T (&items)[ITEMS_PER_THREAD], SelectOp select_op,
+    int (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+  if (num_items != BLOCK_THREADS * ITEMS_PER_THREAD) {
+    BlockPredDirect<T, SelectOp, BLOCK_THREADS, ITEMS_PER_THREAD>(
+        threadIdx.x, items, select_op, selection_flags, num_items);
+    return;
+  }
+  BlockPredDirect<T, SelectOp, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      threadIdx.x, items, select_op, selection_flags);
+}
+
+// ANDs every flag with select_op(item); select_op is skipped when the flag is 0.
+template<typename T, typename SelectOp, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredAndDirect(
+    int tid,
+    T (&items)[ITEMS_PER_THREAD],
+    SelectOp select_op,
+    int (&selection_flags)[ITEMS_PER_THREAD]) {
+  #pragma unroll
+  for (int i = 0; i < ITEMS_PER_THREAD; ++i) {
+    selection_flags[i] = selection_flags[i] && select_op(items[i]);
+  }
+}
+
+// Bounds-checked AND: flag i is ANDed with select_op(items[i]) only when
+// tid + i * BLOCK_THREADS < num_items; out-of-range flags are not touched.
+template<typename T, typename SelectOp, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredAndDirect(
+    int tid,
+    T (&items)[ITEMS_PER_THREAD],
+    SelectOp select_op,
+    int (&selection_flags)[ITEMS_PER_THREAD],
+    int num_items) {
+  #pragma unroll
+  for (int i = 0; i < ITEMS_PER_THREAD; ++i) {
+    const bool in_range = tid + (i * BLOCK_THREADS) < num_items;
+    if (in_range) selection_flags[i] = selection_flags[i] && select_op(items[i]);
+  }
+}
+
+// ANDs select_op into the flags with threadIdx.x as the thread index. A full
+// tile takes the unguarded overload, any other num_items the bounds-checked one.
+template<typename T, typename SelectOp, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredAnd(
+    T (&items)[ITEMS_PER_THREAD], SelectOp select_op,
+    int (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+  if (num_items != BLOCK_THREADS * ITEMS_PER_THREAD) {
+    BlockPredAndDirect<T, SelectOp, BLOCK_THREADS, ITEMS_PER_THREAD>(
+        threadIdx.x, items, select_op, selection_flags, num_items);
+    return;
+  }
+  BlockPredAndDirect<T, SelectOp, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      threadIdx.x, items, select_op, selection_flags);
+}
+
+// ORs every flag with select_op(item); select_op is skipped when the flag is set.
+template<typename T, typename SelectOp, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredOrDirect(
+    int tid,
+    T (&items)[ITEMS_PER_THREAD],
+    SelectOp select_op,
+    int (&selection_flags)[ITEMS_PER_THREAD]) {
+  #pragma unroll
+  for (int i = 0; i < ITEMS_PER_THREAD; ++i) {
+    selection_flags[i] = selection_flags[i] || select_op(items[i]);
+  }
+}
+
+// Bounds-checked OR: flag i is ORed with select_op(items[i]) only when
+// tid + i * BLOCK_THREADS < num_items; out-of-range flags are not touched.
+template<typename T, typename SelectOp, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredOrDirect(
+    int tid,
+    T (&items)[ITEMS_PER_THREAD],
+    SelectOp select_op,
+    int (&selection_flags)[ITEMS_PER_THREAD],
+    int num_items) {
+  #pragma unroll
+  for (int i = 0; i < ITEMS_PER_THREAD; ++i) {
+    const bool in_range = tid + (i * BLOCK_THREADS) < num_items;
+    if (in_range) selection_flags[i] = selection_flags[i] || select_op(items[i]);
+  }
+}
+
+// ORs select_op into the flags with threadIdx.x as the thread index. A full
+// tile takes the unguarded overload, any other num_items the bounds-checked one.
+template<typename T, typename SelectOp, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredOr(
+    T (&items)[ITEMS_PER_THREAD], SelectOp select_op,
+    int (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+  if (num_items != BLOCK_THREADS * ITEMS_PER_THREAD) {
+    BlockPredOrDirect<T, SelectOp, BLOCK_THREADS, ITEMS_PER_THREAD>(
+        threadIdx.x, items, select_op, selection_flags, num_items);
+    return;
+  }
+  BlockPredOrDirect<T, SelectOp, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      threadIdx.x, items, select_op, selection_flags);
+}
+
+// Unary predicate functor: operator()(a) returns (a < compare), where compare
+// is the value captured by the constructor.
+template<typename T>
+struct LessThan {
+  T compare;
+
+  __device__ __forceinline__ LessThan(T compare) : compare(compare) {}
+
+  __device__ __forceinline__ bool operator()(const T &a) const {
+    const bool holds = (a < compare);
+    return holds;
+  }
+};
+
+// Unary predicate functor: operator()(a) returns (a > compare), where compare
+// is the value captured by the constructor.
+template<typename T>
+struct GreaterThan {
+  T compare;
+
+  __device__ __forceinline__ GreaterThan(T compare) : compare(compare) {}
+
+  __device__ __forceinline__ bool operator()(const T &a) const {
+    const bool holds = (a > compare);
+    return holds;
+  }
+};
+
+// Unary predicate functor: operator()(a) returns (a <= compare), where compare
+// is the value captured by the constructor.
+template<typename T>
+struct LessThanEq {
+  T compare;
+
+  __device__ __forceinline__ LessThanEq(T compare) : compare(compare) {}
+
+  __device__ __forceinline__ bool operator()(const T &a) const {
+    const bool holds = (a <= compare);
+    return holds;
+  }
+};
+
+// Unary predicate functor: operator()(a) returns (a >= compare), where compare
+// is the value captured by the constructor.
+template<typename T>
+struct GreaterThanEq {
+  T compare;
+
+  __device__ __forceinline__ GreaterThanEq(T compare) : compare(compare) {}
+
+  __device__ __forceinline__ bool operator()(const T &a) const {
+    const bool holds = (a >= compare);
+    return holds;
+  }
+};
+
+// Unary predicate functor: operator()(a) returns (a == compare), where compare
+// is the value captured by the constructor.
+template<typename T>
+struct Eq {
+  T compare;
+
+  __device__ __forceinline__ Eq(T compare) : compare(compare) {}
+
+  __device__ __forceinline__ bool operator()(const T &a) const {
+    const bool holds = (a == compare);
+    return holds;
+  }
+};
+
+// Overwrites the flags with (item < compare) by forwarding to BlockPred.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredLT(
+    T (&items)[ITEMS_PER_THREAD],
+    T compare,
+    int (&selection_flags)[ITEMS_PER_THREAD],
+    int num_items) {
+  BlockPred<T, LessThan<T>, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, LessThan<T>(compare), selection_flags, num_items);
+}
+
+// ANDs the flags with (item < compare) by forwarding to BlockPredAnd.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredAndLT(
+    T (&items)[ITEMS_PER_THREAD],
+    T compare,
+    int (&selection_flags)[ITEMS_PER_THREAD],
+    int num_items) {
+  BlockPredAnd<T, LessThan<T>, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, LessThan<T>(compare), selection_flags, num_items);
+}
+
+// Overwrites the flags with (item > compare) by forwarding to BlockPred.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredGT(
+    T (&items)[ITEMS_PER_THREAD],
+    T compare,
+    int (&selection_flags)[ITEMS_PER_THREAD],
+    int num_items) {
+  BlockPred<T, GreaterThan<T>, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, GreaterThan<T>(compare), selection_flags, num_items);
+}
+
+// ANDs the flags with (item > compare) by forwarding to BlockPredAnd.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredAndGT(
+    T (&items)[ITEMS_PER_THREAD],
+    T compare,
+    int (&selection_flags)[ITEMS_PER_THREAD],
+    int num_items) {
+  BlockPredAnd<T, GreaterThan<T>, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, GreaterThan<T>(compare), selection_flags, num_items);
+}
+
+// Overwrites the flags with (item <= compare) by forwarding to BlockPred.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredLTE(
+    T (&items)[ITEMS_PER_THREAD],
+    T compare,
+    int (&selection_flags)[ITEMS_PER_THREAD],
+    int num_items) {
+  BlockPred<T, LessThanEq<T>, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, LessThanEq<T>(compare), selection_flags, num_items);
+}
+
+// ANDs the flags with (item <= compare) by forwarding to BlockPredAnd.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredAndLTE(
+    T (&items)[ITEMS_PER_THREAD],
+    T compare,
+    int (&selection_flags)[ITEMS_PER_THREAD],
+    int num_items) {
+  BlockPredAnd<T, LessThanEq<T>, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, LessThanEq<T>(compare), selection_flags, num_items);
+}
+
+// Overwrites the flags with (item >= compare) by forwarding to BlockPred.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredGTE(
+    T (&items)[ITEMS_PER_THREAD],
+    T compare,
+    int (&selection_flags)[ITEMS_PER_THREAD],
+    int num_items) {
+  BlockPred<T, GreaterThanEq<T>, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, GreaterThanEq<T>(compare), selection_flags, num_items);
+}
+
+// ANDs the flags with (item >= compare) by forwarding to BlockPredAnd.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredAndGTE(
+    T (&items)[ITEMS_PER_THREAD],
+    T compare,
+    int (&selection_flags)[ITEMS_PER_THREAD],
+    int num_items) {
+  BlockPredAnd<T, GreaterThanEq<T>, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, GreaterThanEq<T>(compare), selection_flags, num_items);
+}
+
+// Overwrites the flags with (item == compare) by forwarding to BlockPred.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredEQ(
+    T (&items)[ITEMS_PER_THREAD],
+    T compare,
+    int (&selection_flags)[ITEMS_PER_THREAD],
+    int num_items) {
+  BlockPred<T, Eq<T>, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, Eq<T>(compare), selection_flags, num_items);
+}
+
+// ANDs the flags with (item == compare) by forwarding to BlockPredAnd.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredAndEQ(
+    T (&items)[ITEMS_PER_THREAD],
+    T compare,
+    int (&selection_flags)[ITEMS_PER_THREAD],
+    int num_items) {
+  BlockPredAnd<T, Eq<T>, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, Eq<T>(compare), selection_flags, num_items);
+}
+
+// ORs the flags with (item == compare) by forwarding to BlockPredOr.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredOrEQ(
+    T (&items)[ITEMS_PER_THREAD],
+    T compare,
+    int (&selection_flags)[ITEMS_PER_THREAD],
+    int num_items) {
+  BlockPredOr<T, Eq<T>, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, Eq<T>(compare), selection_flags, num_items);
+}
+
diff --git a/crystal/src/crystal/reduce.cuh b/crystal/src/crystal/reduce.cuh
new file mode 100644
index 0000000..ff0baca
--- /dev/null
+++ b/crystal/src/crystal/reduce.cuh
@@ -0,0 +1,53 @@
+#pragma once
+
+// Block-wide sum of one value per thread. `shared` is scratch space that
+// receives one partial sum per warp (indexed by threadIdx.x / 32). Only
+// thread 0 is left holding the sum over all warps; other threads return
+// partial or zero values. NOTE(review): the full 0xffffffff shuffle mask
+// suggests blockDim.x is expected to be a multiple of 32 - confirm.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ T BlockSum(T item, T* shared) {
+  const int kWarpSize = 32;
+  const int lane_id = threadIdx.x % kWarpSize;
+  const int warp_id = threadIdx.x / kWarpSize;
+  T acc = item;
+  __syncthreads();
+  // Pass 1: shuffle-reduce inside each warp; lane 0 ends with the warp's sum.
+  for (int delta = kWarpSize / 2; delta > 0; delta /= 2) {
+    acc += __shfl_down_sync(0xffffffff, acc, delta);
+  }
+  if (lane_id == 0) {
+    shared[warp_id] = acc;
+  }
+
+  __syncthreads();
+
+  // Pass 2: the first blockDim.x / 32 threads each pick up one warp sum,
+  // all other threads start from zero; warp 0 then reduces once more.
+  if (threadIdx.x < blockDim.x / kWarpSize) {
+    acc = shared[lane_id];
+  } else {
+    acc = 0;
+  }
+  if (warp_id == 0) {
+    for (int delta = kWarpSize / 2; delta > 0; delta /= 2) {
+      acc += __shfl_down_sync(0xffffffff, acc, delta);
+    }
+  }
+  return acc;
+}
+
+// Sums the calling thread's ITEMS_PER_THREAD values, then reduces the
+// per-thread totals across the block through the scalar BlockSum overload.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ T BlockSum(
+    T (&items)[ITEMS_PER_THREAD],
+    T* shared) {
+  T thread_sum = 0;
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+    thread_sum += items[ITEM];
+  }
+  // BLOCK_THREADS / ITEMS_PER_THREAD cannot be deduced from (T, T*): pass them.
+  return BlockSum<T, BLOCK_THREADS, ITEMS_PER_THREAD>(thread_sum, shared);
+}
diff --git a/crystal/src/crystal/store.cuh b/crystal/src/crystal/store.cuh
new file mode 100644
index 0000000..a99d5b4
--- /dev/null
+++ b/crystal/src/crystal/store.cuh
@@ -0,0 +1,98 @@
+#pragma once
+
+// Writes every item of the calling thread to block_itr. Item ITEM of thread
+// `tid` goes to offset tid + ITEM * BLOCK_THREADS; there is no bounds check.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockStoreDirect(
+    int tid,
+    T* block_itr,
+    T (&items)[ITEMS_PER_THREAD]) {
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
+    const int slot = tid + ITEM * BLOCK_THREADS;
+    block_itr[slot] = items[ITEM];
+  }
+}
+
+// Bounds-checked variant of BlockStoreDirect. Item ITEM of thread `tid` is
+// written to offset tid + ITEM * BLOCK_THREADS only when that offset is
+// below num_items; other items are skipped.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockStoreDirect(
+    int tid,
+    T* block_itr,
+    T (&items)[ITEMS_PER_THREAD],
+    int num_items) {
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
+    const int slot = tid + ITEM * BLOCK_THREADS;
+    if (slot >= num_items) continue;
+    block_itr[slot] = items[ITEM];
+  }
+}
+
+// Stores one tile to `out`. A full tile (num_items equal to BLOCK_THREADS *
+// ITEMS_PER_THREAD) takes the unchecked path, anything else the checked one.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockStore(
+    T* out,
+    T (&items)[ITEMS_PER_THREAD],
+    int num_items) {
+  const bool full_tile = (num_items == (BLOCK_THREADS * ITEMS_PER_THREAD));
+  if (!full_tile) {
+    BlockStoreDirect<T, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, out, items, num_items);
+    return;
+  }
+  BlockStoreDirect<T, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, out, items);
+}
+
+#if 0
+// Disabled draft, excluded from compilation by the #if 0 above. As written it assigns thread_itr[...] into items (a load), and two parameter lists lack a comma.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockStoreDirect(
+    int tid,
+    T* block_itr,
+    T  (&items)[ITEMS_PER_THREAD]
+    ) {
+  T* thread_itr = block_itr + tid;
+
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+    items[ITEM] = thread_itr[ITEM * BLOCK_THREADS];
+  }
+}
+
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockStoreDirect(
+    int tid,
+    T* block_itr,
+    T  (&items)[ITEMS_PER_THREAD]
+    int num_items
+    ) {
+  T* thread_itr = block_itr + tid;
+
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+    if (tid + (ITEM * BLOCK_THREADS) < num_items) {
+      items[ITEM] = thread_itr[ITEM * BLOCK_THREADS];
+    }
+  }
+}
+
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockStore(
+    T* inp,
+    T  (&items)[ITEMS_PER_THREAD]
+    int num_items
+    ) {
+  T* block_itr = inp + blockIdx.x * blockDim.x;
+
+  if ((BLOCK_THREADS * ITEMS_PER_THREAD) == num_items) {
+    BlockStoreDirect(threadIdx.x, block_itr, items);
+  } else {
+    BlockStoreDirect(threadIdx.x, block_itr, items, num_items);
+  }
+}
+
+#endif
+
diff --git a/crystal/src/ops/join.cu b/crystal/src/ops/join.cu
new file mode 100644
index 0000000..4944191
--- /dev/null
+++ b/crystal/src/ops/join.cu
@@ -0,0 +1,220 @@
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <iostream>
+#include <stdio.h>
+#include <curand.h>
+
+#include <cuda.h>
+#include <cub/util_allocator.cuh>
+#include "cub/test/test_util.h"
+
+#include "crystal.cuh"
+
+#include "utils/generator.h"
+#include "utils/gpu_utils.h"
+
+using namespace std;
+
+#define DEBUG 1  // when non-zero, hashJoin prints its per-phase timings to stdout
+
+// Build-side kernel. Each thread block loads one tile of dim_key / dim_val
+// and passes it, with freshly initialised selection flags, to
+// BlockBuildSelectivePHT_2 along with the hash table and its slot count.
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_kernel(int *dim_key, int *dim_val, int num_tuples, int *hash_table, int num_slots) {
+  int keys[ITEMS_PER_THREAD];
+  int vals[ITEMS_PER_THREAD];
+  int selection_flags[ITEMS_PER_THREAD];
+
+  // Only the last tile can hold fewer than TILE_SIZE tuples.
+  const int tile_offset = blockIdx.x * TILE_SIZE;
+  const int last_tile = (num_tuples + TILE_SIZE - 1) / TILE_SIZE - 1;
+  const int num_tile_items = (blockIdx.x == last_tile) ? (num_tuples - tile_offset) : TILE_SIZE;
+
+  InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_offset, keys, num_tile_items);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_offset, vals, num_tile_items);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, vals, selection_flags,
+      hash_table, num_slots, num_tile_items);
+}
+
+// Probe-side kernel. Each thread block loads one tile of fact_fkey / fact_val,
+// probes the hash table through BlockProbeAndPHT_2, and accumulates
+// fact_val * join value over in-range rows whose selection flag is still set.
+// Per-block totals are added to *res with atomicAdd.
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void probe_kernel(int *fact_fkey, int *fact_val, int num_tuples,
+    int *hash_table, int num_slots, unsigned long long *res) {
+  int selection_flags[ITEMS_PER_THREAD];
+  int keys[ITEMS_PER_THREAD];
+  int vals[ITEMS_PER_THREAD];
+  int join_vals[ITEMS_PER_THREAD];
+  unsigned long long sum = 0;
+
+  int tile_offset = blockIdx.x * TILE_SIZE;
+  int num_tiles = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  int num_tile_items = TILE_SIZE;
+  if (blockIdx.x == num_tiles - 1) {
+    num_tile_items = num_tuples - tile_offset;
+  }
+
+  InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(fact_fkey + tile_offset, keys, num_tile_items);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(fact_val + tile_offset, vals, num_tile_items);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, join_vals, selection_flags,
+      hash_table, num_slots, num_tile_items);
+
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
+    if ((threadIdx.x + (BLOCK_THREADS * ITEM) < num_tile_items) && selection_flags[ITEM]) {
+      // Widen before multiplying: an int * int product overflows for large inputs.
+      sum += (long long)vals[ITEM] * join_vals[ITEM];
+    }
+  }
+
+  __syncthreads();
+  static __shared__ long long buffer[32];  // scratch for BlockSum, one slot per warp
+  unsigned long long aggregate = BlockSum<long long, BLOCK_THREADS, ITEMS_PER_THREAD>(sum, (long long*)buffer);
+  __syncthreads();
+
+  // Only thread 0 adds the block's aggregate to the global result.
+  if (threadIdx.x == 0) {
+    atomicAdd(res, aggregate);
+  }
+}
+
+struct TimeKeeper {
+  float time_build;  // build_kernel time as measured by TIME_FUNC
+  float time_probe;  // probe_kernel time as measured by TIME_FUNC
+  float time_extra;  // hashJoin stores the hash-table memset time here
+  float time_total;  // hashJoin stores build + probe + memset time here
+};
+
+// Runs one build + probe join over device-resident columns and returns the
+// TIME_FUNC timings; `res` is freed without being copied back to the host.
+TimeKeeper hashJoin(int* d_dim_key, int* d_dim_val, int* d_fact_fkey, int* d_fact_val, int num_dim, int num_fact, cub::CachingDeviceAllocator&  g_allocator) {
+  SETUP_TIMING();
+
+  int* hash_table = NULL;
+  unsigned long long* res = NULL;
+  int num_slots = num_dim;
+  float time_build, time_probe, time_memset, time_memset2;
+
+  ALLOCATE(hash_table, sizeof(int) * 2 * num_dim);
+  ALLOCATE(res, sizeof(long long));
+
+  TIME_FUNC(cudaMemset(hash_table, 0, num_slots * sizeof(int) * 2), time_memset);
+  TIME_FUNC(cudaMemset(res, 0, sizeof(long long)), time_memset2);
+
+  int tile_items = 128*4;
+  TIME_FUNC((build_kernel<128, 4><<<(num_dim + tile_items - 1)/tile_items, 128>>>(d_dim_key, d_dim_val, num_dim, hash_table, num_slots)), time_build);
+  TIME_FUNC((probe_kernel<128, 4><<<(num_fact + tile_items - 1)/tile_items, 128>>>(d_fact_fkey, d_fact_val, num_fact, hash_table, num_slots, res)), time_probe);
+#if DEBUG
+  cout << "{" << "\"time_memset\":" << time_memset
+      << ",\"time_build\":" << time_build
+      << ",\"time_probe\":" << time_probe << "}" << endl;
+#endif
+  CLEANUP(hash_table);
+  CLEANUP(res);
+  cudaEventDestroy(start);  // events were created by SETUP_TIMING()
+  cudaEventDestroy(stop);
+  TimeKeeper t = {time_build, time_probe, time_memset, time_build + time_probe + time_memset};
+  return t;
+}
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+bool                    g_verbose = false;  // Whether to display input/output to console
+cub::CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory
+
+// CLEANUP is also defined, identically, in utils/gpu_utils.h (included above).
+#define CLEANUP(vec) if(vec)CubDebugExit(g_allocator.DeviceFree(vec))
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+// Benchmark driver: builds the host relations, uploads them, and prints hashJoin timings as JSON.
+int main(int argc, char** argv) {
+  int num_fact           = 256 << 20;
+  int num_dim            = 16 << 20;
+  int num_trials         = 3;
+
+  // Initialize command line
+  CommandLineArgs args(argc, argv);
+  args.GetCmdLineArgument("n", num_fact);
+  args.GetCmdLineArgument("d", num_dim);
+  args.GetCmdLineArgument("t", num_trials);
+
+  // Print usage
+  if (args.CheckCmdLineFlag("help")) {
+    printf("%s "
+        "[--n=<num fact>] "
+        "[--d=<num dim>] "
+        "[--t=<num trials>] "
+        "[--device=<device-id>] "
+        "[--v] "
+        "\n", argv[0]);
+    exit(0);
+  }
+
+  // create_relation_fk divides by num_dim, and both sizes feed allocations.
+  if (num_fact <= 0 || num_dim <= 0) {
+    fprintf(stderr, "--n and --d must be positive\n");
+    exit(1);
+  }
+
+  // Initialize device
+  CubDebugExit(args.DeviceInit());
+
+  // Allocate problem device arrays
+  int *d_dim_key = NULL, *d_dim_val = NULL;
+  int *d_fact_fkey = NULL, *d_fact_val = NULL;
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&d_dim_key, sizeof(int) * num_dim));
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&d_dim_val, sizeof(int) * num_dim));
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&d_fact_fkey, sizeof(int) * num_fact));
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&d_fact_val, sizeof(int) * num_fact));
+
+  int *h_dim_key = NULL, *h_dim_val = NULL;
+  int *h_fact_fkey = NULL, *h_fact_val = NULL;
+  if (create_relation_pk(h_dim_key, h_dim_val, num_dim) != 0 ||
+      create_relation_fk(h_fact_fkey, h_fact_val, num_fact, num_dim) != 0) {
+    fprintf(stderr, "failed to generate input relations\n");
+    exit(1);
+  }
+
+  CubDebugExit(cudaMemcpy(d_dim_key, h_dim_key, sizeof(int) * num_dim, cudaMemcpyHostToDevice));
+  CubDebugExit(cudaMemcpy(d_dim_val, h_dim_val, sizeof(int) * num_dim, cudaMemcpyHostToDevice));
+  CubDebugExit(cudaMemcpy(d_fact_fkey, h_fact_fkey, sizeof(int) * num_fact, cudaMemcpyHostToDevice));
+  CubDebugExit(cudaMemcpy(d_fact_val, h_fact_val, sizeof(int) * num_fact, cudaMemcpyHostToDevice));
+  // The host copies are no longer needed once the data is on the device.
+  _mm_free(h_dim_key);
+  _mm_free(h_dim_val);
+  _mm_free(h_fact_fkey);
+  _mm_free(h_fact_val);
+
+  for (int j = 0; j < num_trials; j++) {
+    TimeKeeper t = hashJoin(d_dim_key, d_dim_val, d_fact_fkey, d_fact_val, num_dim, num_fact, g_allocator);
+    cout<< "{"
+        << "\"num_dim\":" << num_dim
+        << ",\"num_fact\":" << num_fact
+        << ",\"radix\":" << 0
+        << ",\"time_partition_build\":" << 0
+        << ",\"time_partition_probe\":" << 0
+        << ",\"time_partition_total\":" << 0
+        << ",\"time_build\":" << t.time_build
+        << ",\"time_probe\":" << t.time_probe
+        << ",\"time_extra\":" << t.time_extra
+        << ",\"time_join_total\":" << t.time_total
+        << "}" << endl;
+  }
+
+  CLEANUP(d_dim_key);
+  CLEANUP(d_dim_val);
+  CLEANUP(d_fact_fkey);
+  CLEANUP(d_fact_val);
+  return 0;
+}
+
diff --git a/crystal/src/ops/project.cu b/crystal/src/ops/project.cu
new file mode 100644
index 0000000..3340db6
--- /dev/null
+++ b/crystal/src/ops/project.cu
@@ -0,0 +1,176 @@
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <iostream>
+#include <stdio.h>
+#include <curand.h>
+#include <cmath>
+
+#include <cuda.h>
+#include <cub/util_allocator.cuh>
+#include "cub/test/test_util.h"
+
+#include "crystal.cuh"
+
+#include "utils/gpu_utils.h"
+
+using namespace std;
+
+
+//---------------------------------------------------------------------
+// Implements the projection operator.
+// Two variants: the linear combination 2*in1 + 3*in2, and the sigmoid of that value.
+//---------------------------------------------------------------------
+
+bool                    g_verbose = false;  // Whether to display input/output to console
+cub::CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory; the `true` argument is presumably CUB's skip_cleanup flag - confirm
+
+// Per-element projection res = 2*a + 3*b of two input columns. Each thread
+// block loads one tile from in1 / in2 and stores its results to out.
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void project(float* in1, float* in2, float* out, int num_items)
+{
+  float lhs[ITEMS_PER_THREAD];
+  float rhs[ITEMS_PER_THREAD];
+  float res[ITEMS_PER_THREAD];
+
+  const int tile_offset = blockIdx.x * TILE_SIZE;
+  const int last_tile = (num_items + TILE_SIZE - 1) / TILE_SIZE - 1;
+  const int num_tile_items = (blockIdx.x == last_tile) ? (num_items - tile_offset) : TILE_SIZE;
+
+  BlockLoad<float, BLOCK_THREADS, ITEMS_PER_THREAD>(in1 + tile_offset, lhs, num_tile_items);
+  BlockLoad<float, BLOCK_THREADS, ITEMS_PER_THREAD>(in2 + tile_offset, rhs, num_tile_items);
+
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+    // Slots beyond the tile are skipped; BlockStore is given the same bound.
+    if (threadIdx.x + (ITEM * BLOCK_THREADS) >= num_tile_items) {
+      continue;
+    }
+    res[ITEM] = 2*lhs[ITEM] + 3*rhs[ITEM];
+  }
+
+  BlockStore<float, BLOCK_THREADS, ITEMS_PER_THREAD>(out + tile_offset, res, num_tile_items);
+}
+
+// Per-element projection res = 1 / (1 + expf(-2*a - 3*b)) of two input
+// columns. Each thread block loads one tile and stores its results to out.
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void projectSigmoid(float* in1, float* in2, float* out, int num_items)
+{
+  float lhs[ITEMS_PER_THREAD];
+  float rhs[ITEMS_PER_THREAD];
+  float res[ITEMS_PER_THREAD];
+
+  const int tile_offset = blockIdx.x * TILE_SIZE;
+  const int last_tile = (num_items + TILE_SIZE - 1) / TILE_SIZE - 1;
+  const int num_tile_items = (blockIdx.x == last_tile) ? (num_items - tile_offset) : TILE_SIZE;
+
+  BlockLoad<float, BLOCK_THREADS, ITEMS_PER_THREAD>(in1 + tile_offset, lhs, num_tile_items);
+  BlockLoad<float, BLOCK_THREADS, ITEMS_PER_THREAD>(in2 + tile_offset, rhs, num_tile_items);
+
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+    // Slots beyond the tile are skipped; BlockStore is given the same bound.
+    if (threadIdx.x + (ITEM * BLOCK_THREADS) >= num_tile_items) {
+      continue;
+    }
+    res[ITEM] = 1.0f / (1.0f + expf(-2*lhs[ITEM] -3*rhs[ITEM]));
+  }
+
+  BlockStore<float, BLOCK_THREADS, ITEMS_PER_THREAD>(out + tile_offset, res, num_tile_items);
+}
+
+
+// Launches project<128,4> over num_items elements; returns the time measured by TIME_FUNC.
+float projectGPU(float* in1, float* in2, float* out, int num_items) {
+  SETUP_TIMING();
+  float time_proj;
+  int tile_items = 128*4;
+  int num_blocks = (num_items + tile_items - 1)/tile_items;
+  TIME_FUNC((project<128,4><<<num_blocks, 128>>>(in1, in2, out, num_items)), time_proj);
+  cudaEventDestroy(start); cudaEventDestroy(stop);  // release the SETUP_TIMING events
+  return time_proj;
+}
+
+// Launches projectSigmoid<128,4> over num_items elements; returns the time measured by TIME_FUNC.
+float projectSigmoidGPU(float* in1, float* in2, float* out, int num_items) {
+  SETUP_TIMING();
+  float time_proj;
+  int tile_items = 128*4;
+  int num_blocks = (num_items + tile_items - 1)/tile_items;
+  TIME_FUNC((projectSigmoid<128,4><<<num_blocks, 128>>>(in1, in2, out, num_items)), time_proj);
+  cudaEventDestroy(start); cudaEventDestroy(stop);  // release the SETUP_TIMING events
+  return time_proj;
+}
+
+/**
+ * Main: allocates two input columns and one output column on the device,
+ * fills the inputs with cuRAND uniform floats, and prints the time of both
+ * projection kernels as one JSON line per trial.
+ */
+int main(int argc, char** argv)
+{
+  int num_items           = 1<<28;
+  int num_trials          = 1;
+
+  // Initialize command line
+  CommandLineArgs args(argc, argv);
+  args.GetCmdLineArgument("n", num_items);
+  args.GetCmdLineArgument("t", num_trials);
+
+  // Print usage
+  if (args.CheckCmdLineFlag("help"))
+  {
+      printf("%s "
+          "[--n=<input items>] "
+          "[--t=<num trials>] "
+          "[--device=<device-id>] "
+          "[--v] "
+          "\n", argv[0]);
+      exit(0);
+  }
+
+  // Initialize device
+  CubDebugExit(args.DeviceInit());
+
+  // Allocate problem device arrays
+  float *d_in1 = NULL;
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in1, sizeof(float) * num_items));
+
+  float *d_in2 = NULL;
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in2, sizeof(float) * num_items));
+
+  float  *d_out = NULL;
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(float) * num_items));
+
+  // Fill both inputs with uniform random floats; any cuRAND failure is fatal.
+  curandGenerator_t generator;
+  int seed = 0;
+  if (curandCreateGenerator(&generator, CURAND_RNG_PSEUDO_DEFAULT) != CURAND_STATUS_SUCCESS ||
+      curandSetPseudoRandomGeneratorSeed(generator,seed) != CURAND_STATUS_SUCCESS ||
+      curandGenerateUniform(generator, d_in1, num_items) != CURAND_STATUS_SUCCESS ||
+      curandGenerateUniform(generator, d_in2, num_items) != CURAND_STATUS_SUCCESS) {
+    fprintf(stderr, "cuRAND input generation failed\n");
+    exit(1);
+  }
+
+  for (int t = 0; t < num_trials; t++) {
+    float time_proj_gpu = projectGPU(d_in1, d_in2, d_out, num_items);
+    float time_proj_sigmoid_gpu = projectSigmoidGPU(d_in1, d_in2, d_out, num_items);
+
+    cout<< "{"
+        << "\"time_proj_gpu\":" << time_proj_gpu
+        << ",\"time_proj_sigmoid_gpu\":" << time_proj_sigmoid_gpu
+        << "}" << endl;
+  }
+
+  // Cleanup
+  curandDestroyGenerator(generator);
+  if (d_in1) CubDebugExit(g_allocator.DeviceFree(d_in1));
+  if (d_in2) CubDebugExit(g_allocator.DeviceFree(d_in2));
+  if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
+
+  return 0;
+}
+
diff --git a/crystal/src/ops/utils/generator.h b/crystal/src/ops/utils/generator.h
new file mode 100644
index 0000000..a24039f
--- /dev/null
+++ b/crystal/src/ops/utils/generator.h
@@ -0,0 +1,377 @@
+#pragma once
+
+#include <iostream>
+#include <cstdio>              /* perror */
+#include <cstdlib>             /* posix_memalign */
+#include <immintrin.h>
+#include <thread>
+using namespace std;
+
+#define RAND_RANGE(N) ((double)rand() / ((double)RAND_MAX + 1) * (N))  // double in [0, N) for N > 0, drawn from rand()
+#define RANDR_RANGE(N) ((double)rand_r(&seed) / ((double)RAND_MAX + 1) * (N))  // same via rand_r(); needs a variable named `seed` in scope at the use site
+static int seeded = 0;  // set to 1 by check_seed() once srand() has been called
+
+/** Seeds rand() once per process. The seed is the constant 0, not the
+ *  current time, so every run generates the same data. */
+static void
+check_seed()
+{
+    if (seeded) return;
+    srand(0);
+    seeded = 1;
+}
+
+/**
+ * Shuffle arr[0 .. num_tuples) in place with the Knuth (Fisher-Yates)
+ * shuffle, drawing swap indices from rand() through RAND_RANGE.
+ *
+ * @param arr  array of num_tuples ints to permute
+ */
+void
+knuth_shuffle(int* arr, int num_tuples)
+{
+    for (int i = num_tuples - 1; i > 0; i--) {
+        /* j is drawn from [0, i] inclusive, so arr[i] may also stay put. */
+        int j   = RAND_RANGE(i + 1);
+        int tmp = arr[i];
+        arr[i]  = arr[j];
+        arr[j]  = tmp;
+    }
+}
+
+
+/**
+ * Fill arr with the values 1..num_tuples and then permute them with
+ * knuth_shuffle(). arr must already point to num_tuples ints.
+ */
+void
+random_unique_gen(int*& arr, int num_tuples)
+{
+  int filled = 0;
+  while (filled < num_tuples) {
+    arr[filled] = filled + 1;
+    ++filled;
+  }
+
+  /* permute the freshly numbered ids */
+  knuth_shuffle(arr, num_tuples);
+}
+
+/* Writes arr[i] = i for every i in 0 .. num_tuples - 1. */
+void
+dummy_initialize(int*& arr, int num_tuples) {
+    for (int idx = num_tuples - 1; idx >= 0; --idx)
+        arr[idx] = idx;
+}
+
+/** Allocates keys (a shuffled permutation of 1..num_tuples) and vals (0..num_tuples-1)
+ *  with _mm_malloc. Returns 0, or -1 on allocation failure with both outputs set to NULL. */
+int create_relation_pk(int*& keys, int*& vals, int num_tuples)
+{
+  check_seed();
+  keys = (int*)_mm_malloc(num_tuples * sizeof(int), 256);
+  vals = (int*)_mm_malloc(num_tuples * sizeof(int), 256);
+  if (!keys || !vals) {
+      perror("out of memory");
+      _mm_free(keys);   // release whichever buffer was obtained
+      _mm_free(vals);
+      keys = vals = NULL;
+      return -1;
+  }
+  random_unique_gen(keys, num_tuples);
+  dummy_initialize(vals, num_tuples);
+  return 0;
+}
+
+/** Allocates keys and vals (num_tuples ints each, via _mm_malloc). keys holds
+ *  num_tuples / maxid shuffled runs of 1..maxid followed by a shuffled run of
+ *  1..(num_tuples % maxid); vals is 0..num_tuples-1. Returns 0 on success, or
+ *  -1 with both outputs NULL if maxid <= 0 or an allocation fails. */
+int create_relation_fk(int*& keys, int*& vals, int num_tuples, const int maxid)
+{
+  if (maxid <= 0) { keys = vals = NULL; return -1; }   // guards the division and modulo below
+  check_seed();
+  keys = (int*)_mm_malloc(num_tuples * sizeof(int), 256);
+  vals = (int*)_mm_malloc(num_tuples * sizeof(int), 256);
+  if (!keys || !vals) {
+    perror("out of memory");
+    _mm_free(keys);   // release whichever buffer was obtained
+    _mm_free(vals);
+    keys = vals = NULL;
+    return -1;
+  }
+  const int iters = num_tuples / maxid;
+  for (int i = 0; i < iters; i++) {
+    int* tuples = keys + maxid * i;
+    random_unique_gen(tuples, maxid);
+  }
+  const int remainder = num_tuples % maxid;   // tail when not an exact multiple of maxid
+  if (remainder > 0) {
+    int* tuples = keys + maxid * iters;
+    random_unique_gen(tuples, remainder);
+  }
+  dummy_initialize(vals, num_tuples);
+  return 0;
+}
+
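+/* Disabled legacy generator code, kept for reference only. NOTE(review): operators appear to be missing in places (e.g. "n[i]  (n[i] >> 62)" was presumably an XOR) - do not re-enable as is.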
+typedef struct rand_state_64 {
+  uint64_t num[313];
+  size_t index;
+} rand64_t;
+
+rand64_t *rand64_init(uint64_t seed)
+{
+  rand64_t *state = malloc(sizeof(rand64_t));
+  uint64_t *n = state->num;
+  size_t i;
+  n[0] = seed;
+  for (i = 0 ; i != 311 ; ++i)
+    n[i + 1] = 6364136223846793005ull *
+               (n[i]  (n[i] >> 62)) + i + 1;
+  state->index = 312;
+  return state;
+}
+
+uint64_t rand64_next(rand64_t *state)
+{
+  uint64_t x, *n = state->num;
+  if (state->index == 312) {
+    size_t i = 0;
+    do {
+      x = n[i] & 0xffffffff80000000ull;
+      x |= n[i + 1] & 0x7fffffffull;
+      n[i] = n[i + 156]  (x >> 1);
+      n[i] = 0xb5026f5aa96619e9ull & -(x & 1);
+    } while (++i != 156);
+    n[312] = n[0];
+    do {
+      x = n[i] & 0xffffffff80000000ull;
+      x |= n[i + 1] & 0x7fffffffull;
+      n[i] = n[i - 156]  (x >> 1);
+      n[i] = 0xb5026f5aa96619e9ull & -(x & 1);
+    } while (++i != 312);
+    state->index = 0;
+  }
+  x = n[state->index++];
+  x = (x >> 29) & 0x5555555555555555ull;
+  x = (x << 17) & 0x71d67fffeda60000ull;
+  x = (x << 37) & 0xfff7eee000000000ull;
+  x = (x >> 43);
+  return x;
+}
+
+typedef struct rand_state_32 {
+  uint32_t num[625];
+  size_t index;
+} rand32_t;
+
+rand32_t *rand32_init(uint32_t seed)
+{
+  rand32_t *state = malloc(sizeof(rand32_t));
+  uint32_t *n = state->num;
+  size_t i;
+  n[0] = seed;
+  for (i = 0 ; i != 623 ; ++i)
+    n[i + 1] = 0x6c078965 * (n[i]  (n[i] >> 30));
+  state->index = 624;
+  return state;
+}
+
+uint32_t rand32_next(rand32_t *state)
+{
+  uint32_t y, *n = state->num;
+  if (state->index == 624) {
+    size_t i = 0;
+    do {
+      y = n[i] & 0x80000000;
+      y += n[i + 1] & 0x7fffffff;
+      n[i] = n[i + 397]  (y >> 1);
+      n[i] = 0x9908b0df & -(y & 1);
+    } while (++i != 227);
+    n[624] = n[0];
+    do {
+      y = n[i] & 0x80000000;
+      y += n[i + 1] & 0x7fffffff;
+      n[i] = n[i - 227]  (y >> 1);
+      n[i] = 0x9908b0df & -(y & 1);
+    } while (++i != 624);
+    state->index = 0;
+  }
+  y = n[state->index++];
+  y = (y >> 11);
+  y = (y << 7) & 0x9d2c5680;
+  y = (y << 15) & 0xefc60000;
+  y = (y >> 18);
+  return y;
+}
+
+static int hardware_threads(void)
+{
+  char name[64];
+  struct stat st;
+  int threads = -1;
+  do {
+    sprintf(name, "/sys/devices/system/cpu/cpu%d", ++threads);
+  } while (stat(name, &st) == 0);
+  return threads;
+}
+
+static void *mamalloc(size_t size)
+{
+  void *p = NULL;
+  return posix_memalign(&p, 64, size) ? NULL : p;
+}
+
+typedef struct {
+  pthread_t id;
+  int seed;
+  int thread;
+  int threads;
+  uint32_t hash_factor;
+  uint32_t invalid_key;
+  uint32_t *inner;
+  uint32_t *outer;
+  volatile uint32_t *table;
+  size_t inner_size;
+  size_t outer_size;
+  size_t table_size;
+  size_t join_size;
+  double selectivity;
+  pthread_barrier_t *barrier;
+} info_t;
+
+static void *run(void *arg)
+{
+  info_t *d = (info_t*) arg;
+  assert(pthread_equal(pthread_self(), d->id));
+  int thread = d->thread;
+  int threads = d->threads;
+  uint32_t hash_factor = d->hash_factor;
+  uint32_t invalid_key = d->invalid_key;
+  uint32_t *inner = d->inner;
+  uint32_t *outer = d->outer;
+  volatile uint32_t *table = d->table;
+  size_t i, o, t, h;
+  size_t inner_size = d->inner_size;
+  size_t outer_size = d->outer_size;
+  size_t table_size = d->table_size;
+  size_t inner_beg = (inner_size / threads) *  thread;
+  size_t inner_end = (inner_size / threads) * (thread + 1);
+  size_t outer_beg = (outer_size / threads) *  thread;
+  size_t outer_end = (outer_size / threads) * (thread + 1);
+  size_t table_beg = (table_size / threads) *  thread;
+  size_t table_end = (table_size / threads) * (thread + 1);
+  if (thread + 1 == threads) {
+    inner_end = inner_size;
+    outer_end = outer_size;
+    table_end = table_size;
+  }
+  for (t = table_beg ; t != table_end ; ++t)
+    table[t] = invalid_key;
+  pthread_barrier_wait(&d->barrier[0]);
+  rand32_t *gen = rand32_init(d->seed);
+  for (i = inner_beg ; i != inner_end ; ++i) {
+    int new_key_inserted = 0;
+    uint32_t key;
+    do {
+      do {
+        key = rand32_next(gen);
+      } while (key == invalid_key);
+      h = (uint32_t) (key * hash_factor);
+      h = (h * table_size) >> 32;
+      for (;;) {
+        if (table[h] == invalid_key &&
+            __sync_bool_compare_and_swap(&table[h], invalid_key, key)) {
+            new_key_inserted = 1;
+          break;
+        }
+        if (table[h] == key) break;
+        if (++h == table_size) h = 0;
+      }
+    } while (new_key_inserted == 0);
+    inner[i] = key;
+  }
+  pthread_barrier_wait(&d->barrier[1]);
+  size_t join_size = 0;
+  uint32_t limit = ~0;
+  limit *= d->selectivity;
+  for (o = outer_beg ; o != outer_end ; ++o) {
+    uint32_t key;
+    if (rand32_next(gen) <= limit) {
+      i = rand32_next(gen);
+      i = (i * inner_size) >> 32;
+      key = inner[i];
+      join_size++;
+    } else do {
+      do {
+        key = rand32_next(gen);
+      } while (key == invalid_key);
+      h = (uint32_t) (key * hash_factor);
+      h = (h * table_size) >> 32;
+      while (table[h] != invalid_key) {
+        if (table[h] == key) break;
+        if (++h == table_size) h = 0;
+      }
+    } while (table[h] == key);
+    outer[o] = key;
+  }
+  free(gen);
+  d->join_size = join_size;
+  pthread_exit(NULL);
+}
+
+size_t inner_outer(size_t inner_size, size_t outer_size, double selectivity,
+                   uint32_t **inner_p, uint32_t **outer_p)
+{
+  srand(time(NULL));
+  int t, threads = hardware_threads();
+  // input arguments
+  assert(inner_size <= 1000 * 1000 * 1000);
+  assert(selectivity >= 0.0 && selectivity <= 1.0);
+  // tables
+  uint32_t *inner = mamalloc((inner_size + 1) * sizeof(uint32_t));
+  uint32_t *outer = mamalloc(outer_size * sizeof(uint32_t));
+  size_t table_size = inner_size / 0.7;
+  uint32_t *table = malloc(table_size * sizeof(uint32_t));
+  // constants
+  uint32_t hash_factor = (rand() << 1) | 1;
+  uint32_t invalid_key = rand() * rand();
+  // barriers
+  int b, barriers = 2;
+  pthread_barrier_t barrier[barriers];
+  for (b = 0 ; b != barriers ; ++b)
+    pthread_barrier_init(&barrier[b], NULL, threads);
+  // run threads
+  info_t info[threads];
+  for (t = 0 ; t != threads ; ++t) {
+    info[t].seed = rand();
+    info[t].thread = t;
+    info[t].threads = threads;
+    info[t].hash_factor = hash_factor;
+    info[t].invalid_key = invalid_key;
+    info[t].selectivity = selectivity;
+    info[t].inner = inner;
+    info[t].outer = outer;
+    info[t].table = table;
+    info[t].inner_size = inner_size;
+    info[t].outer_size = outer_size;
+    info[t].table_size = table_size;
+    info[t].barrier = barrier;
+    pthread_create(&info[t].id, NULL, run, (void*) &info[t]);
+  }
+  size_t join_size = 0;
+  for (t = 0 ; t != threads ; ++t) {
+    pthread_join(info[t].id, NULL);
+    join_size += info[t].join_size;
+  }
+  // cleanup
+  for (b = 0 ; b != barriers ; ++b)
+    pthread_barrier_destroy(&barrier[b]);
+  free(table);
+  // pass output
+  inner[inner_size] = invalid_key;
+  *inner_p = inner;
+  *outer_p = outer;
+  return join_size;
+}
+*/
diff --git a/crystal/src/ops/utils/gpu_utils.h b/crystal/src/ops/utils/gpu_utils.h
new file mode 100644
index 0000000..93ad8af
--- /dev/null
+++ b/crystal/src/ops/utils/gpu_utils.h
@@ -0,0 +1,35 @@
+#pragma once
+
+#define SETUP_TIMING() cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop);  // declares and creates the events TIME_FUNC uses; they are not destroyed here
+// TIME_FUNC(f, t): records `start`, evaluates f, records and waits on `stop`, then stores the elapsed time between the two events in t.
+#define TIME_FUNC(f,t) { \
+    cudaEventRecord(start, 0); \
+    f; \
+    cudaEventRecord(stop, 0); \
+    cudaEventSynchronize(stop); \
+    cudaEventElapsedTime(&t, start,stop); \
+}
+// CLEANUP / ALLOCATE go through a variable named g_allocator that must be visible at the use site; the result is wrapped in CubDebugExit.
+#define CLEANUP(vec) if(vec)CubDebugExit(g_allocator.DeviceFree(vec))
+
+#define ALLOCATE(vec,size) CubDebugExit(g_allocator.DeviceAllocate((void**)&vec, size))
+// Returns a new g_allocator device buffer holding a copy of src[0 .. numEntries).
+template<typename T>
+T* loadToGPU(T* src, int numEntries, cub::CachingDeviceAllocator& g_allocator) {
+  const size_t num_bytes = sizeof(T) * numEntries;
+  T* device_copy;
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&device_copy, num_bytes));
+  CubDebugExit(cudaMemcpy(device_copy, src, num_bytes, cudaMemcpyHostToDevice));
+  return device_copy;
+}
+#define TILE_SIZE (BLOCK_THREADS * ITEMS_PER_THREAD)  // BLOCK_THREADS and ITEMS_PER_THREAD must be visible where this is used
+// CHECK_ERROR(): synchronizes the device, then prints the cudaGetLastError() message and exits with -1 if an error is reported.
+#define CHECK_ERROR() { \
+  cudaDeviceSynchronize(); \
+  cudaError_t error = cudaGetLastError(); \
+  if(error != cudaSuccess) \
+  { \
+    printf("CUDA error: %s\n", cudaGetErrorString(error)); \
+    exit(-1); \
+  } \
+}
diff --git a/crystal/src/ssb/gpu_utils.h b/crystal/src/ssb/gpu_utils.h
new file mode 100644
index 0000000..93ad8af
--- /dev/null
+++ b/crystal/src/ssb/gpu_utils.h
@@ -0,0 +1,35 @@
+#pragma once
+
+#define SETUP_TIMING() cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop);  // declares and creates the events TIME_FUNC uses; they are not destroyed here
+// TIME_FUNC(f, t): records `start`, evaluates f, records and waits on `stop`, then stores the elapsed time between the two events in t.
+#define TIME_FUNC(f,t) { \
+    cudaEventRecord(start, 0); \
+    f; \
+    cudaEventRecord(stop, 0); \
+    cudaEventSynchronize(stop); \
+    cudaEventElapsedTime(&t, start,stop); \
+}
+// CLEANUP / ALLOCATE go through a variable named g_allocator that must be visible at the use site; the result is wrapped in CubDebugExit.
+#define CLEANUP(vec) if(vec)CubDebugExit(g_allocator.DeviceFree(vec))
+
+#define ALLOCATE(vec,size) CubDebugExit(g_allocator.DeviceAllocate((void**)&vec, size))
+// Returns a new g_allocator device buffer holding a copy of src[0 .. numEntries).
+template<typename T>
+T* loadToGPU(T* src, int numEntries, cub::CachingDeviceAllocator& g_allocator) {
+  const size_t num_bytes = sizeof(T) * numEntries;
+  T* device_copy;
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&device_copy, num_bytes));
+  CubDebugExit(cudaMemcpy(device_copy, src, num_bytes, cudaMemcpyHostToDevice));
+  return device_copy;
+}
+#define TILE_SIZE (BLOCK_THREADS * ITEMS_PER_THREAD)  // BLOCK_THREADS and ITEMS_PER_THREAD must be visible where this is used
+// CHECK_ERROR(): synchronizes the device, then prints the cudaGetLastError() message and exits with -1 if an error is reported.
+#define CHECK_ERROR() { \
+  cudaDeviceSynchronize(); \
+  cudaError_t error = cudaGetLastError(); \
+  if(error != cudaSuccess) \
+  { \
+    printf("CUDA error: %s\n", cudaGetErrorString(error)); \
+    exit(-1); \
+  } \
+}
diff --git a/crystal/src/ssb/q11.cu b/crystal/src/ssb/q11.cu
new file mode 100644
index 0000000..ebec888
--- /dev/null
+++ b/crystal/src/ssb/q11.cu
@@ -0,0 +1,168 @@
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <iostream>
+#include <stdio.h>
+#include <curand.h>
+
+#include <cuda.h>
+#include <cub/util_allocator.cuh>
+#include "cub/test/test_util.h"
+
+#include "crystal.cuh"
+
+#include "gpu_utils.h"
+#include "ssb_utils.h"
+
+using namespace std;
+
+/**
+ * Globals, constants and typedefs
+ */
+bool                    g_verbose = false;  // Whether to display input/output to console
+cub::CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory
+
+// SSB Q1.1 kernel: every thread block filters one tile of lineorder rows and atomically adds the
+// tile's sum of lo_extendedprice * lo_discount over the selected rows to *revenue.
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void QueryKernel(int* lo_orderdate, int* lo_discount, int* lo_quantity, int* lo_extendedprice,
+    int lo_num_entries, unsigned long long* revenue) {
+  // Per-thread arrays for the tile; slot ITEM is treated as tile position threadIdx.x + BLOCK_THREADS * ITEM
+  int items[ITEMS_PER_THREAD];
+  int selection_flags[ITEMS_PER_THREAD];
+  int items2[ITEMS_PER_THREAD];
+  long long sum = 0;
+
+  int tile_offset = blockIdx.x * TILE_SIZE;
+  int num_tiles = (lo_num_entries + TILE_SIZE - 1) / TILE_SIZE;
+  int num_tile_items = TILE_SIZE;
+  // Only the last tile can be partial
+  if (blockIdx.x == num_tiles - 1) {
+    num_tile_items = lo_num_entries - tile_offset;
+  }
+  // Date filter (as named by the BlockPred* helpers): 19930000 < lo_orderdate < 19940000
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_orderdate + tile_offset, items, num_tile_items);
+  BlockPredGT<int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, 19930000, selection_flags, num_tile_items);
+  BlockPredAndLT<int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, 19940000, selection_flags, num_tile_items);
+  // Quantity filter: lo_quantity < 25
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_quantity + tile_offset, items, num_tile_items);
+  BlockPredAndLT<int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, 25, selection_flags, num_tile_items);
+  // Discount filter: 1 <= lo_discount <= 3; `items` keeps the discounts for the product below
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_discount + tile_offset, items, num_tile_items);
+  BlockPredAndGTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, 1, selection_flags, num_tile_items);
+  BlockPredAndLTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, 3, selection_flags, num_tile_items);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_extendedprice + tile_offset, items2, num_tile_items);
+  // Widen before multiplying so the product is formed in 64 bits rather than in int
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+  {
+    if ((threadIdx.x + (BLOCK_THREADS * ITEM) < num_tile_items))
+      if (selection_flags[ITEM])
+        sum += (long long)items[ITEM] * items2[ITEM];
+  }
+
+  __syncthreads();
+
+  static __shared__ long long buffer[32];
+  unsigned long long aggregate = BlockSum<long long, BLOCK_THREADS, ITEMS_PER_THREAD>(sum, (long long*)buffer);
+  __syncthreads();
+  // One atomic update per block, issued by thread 0
+  if (threadIdx.x == 0) {
+    atomicAdd(revenue, aggregate);
+  }
+}
+
+// Runs SSB Q1.1 once on device-resident columns: prints the revenue and the host wall-clock time,
+// and returns the CUDA-event time (ms) that spans accumulator setup and the kernel.
+float runQuery(int* lo_orderdate, int* lo_discount, int* lo_quantity, int* lo_extendedprice,
+    int lo_num_entries, cub::CachingDeviceAllocator&  g_allocator) {
+  SETUP_TIMING();
+  float time_query;
+  chrono::high_resolution_clock::time_point st, finish;
+  st = chrono::high_resolution_clock::now();
+
+  cudaEventRecord(start, 0);  // the timed region starts before the accumulator is allocated
+  unsigned long long* d_sum = NULL;
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&d_sum, sizeof(long long)));
+  CubDebugExit(cudaMemset(d_sum, 0, sizeof(long long)));
+
+  // One thread block per tile of 128 threads x 4 items
+  int tile_items = 128*4;
+  int num_blocks = (lo_num_entries + tile_items - 1)/tile_items;
+  QueryKernel<128,4><<<num_blocks, 128>>>(lo_orderdate,
+          lo_discount, lo_quantity, lo_extendedprice, lo_num_entries, d_sum);
+
+  cudaEventRecord(stop, 0);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&time_query, start,stop);
+
+  unsigned long long revenue;
+  CubDebugExit(cudaMemcpy(&revenue, d_sum, sizeof(long long), cudaMemcpyDeviceToHost));
+
+  finish = chrono::high_resolution_clock::now();
+  std::chrono::duration<double> diff = finish - st;
+
+  cout << "Revenue: " << revenue << endl;
+  cout << "Time Taken Total: " << diff.count() * 1000 << endl;
+
+  CLEANUP(d_sum);
+  cudaEventDestroy(start);  // release the per-call events created by SETUP_TIMING()
+  cudaEventDestroy(stop);
+  return time_query;
+}
+
+/**
+ * Entry point for query 11: loads the columns on the host, copies them to the GPU and calls runQuery num_trials times (override with --t), printing one JSON timing line per trial.
+ */
+int main(int argc, char** argv)
+{
+  int num_trials          = 1; // FLS_CHG
+
+  // Initialize command line
+  CommandLineArgs args(argc, argv);
+  args.GetCmdLineArgument("t", num_trials);
+
+  // Print usage
+  if (args.CheckCmdLineFlag("help"))
+  {
+      printf("%s "
+          "[--t=<num trials>] "
+          "[--v] "
+          "\n", argv[0]);
+      exit(0);
+  }
+
+  // Initialize device
+  CubDebugExit(args.DeviceInit());
+  // Load columns on the host; d_datekey and d_year are loaded here but never passed to runQuery in this file
+  int *h_lo_orderdate = loadColumn<int>("lo_orderdate", LO_LEN);
+  int *h_lo_discount = loadColumn<int>("lo_discount", LO_LEN);
+  int *h_lo_quantity = loadColumn<int>("lo_quantity", LO_LEN);
+  int *h_lo_extendedprice = loadColumn<int>("lo_extendedprice", LO_LEN);
+  int *h_d_datekey = loadColumn<int>("d_datekey", D_LEN);
+  int *h_d_year = loadColumn<int>("d_year", D_LEN);
+
+  cout << "** LOADED DATA **" << endl;
+  cout << "LO_LEN " << LO_LEN << endl;
+  // Copy every loaded column to the device; these buffers are not freed before main returns
+  int *d_lo_orderdate = loadToGPU<int>(h_lo_orderdate, LO_LEN, g_allocator);
+  int *d_lo_discount = loadToGPU<int>(h_lo_discount, LO_LEN, g_allocator);
+  int *d_lo_quantity = loadToGPU<int>(h_lo_quantity, LO_LEN, g_allocator);
+  int *d_lo_extendedprice = loadToGPU<int>(h_lo_extendedprice, LO_LEN, g_allocator);
+  int *d_d_datekey = loadToGPU<int>(h_d_datekey, D_LEN, g_allocator);
+  int *d_d_year = loadToGPU<int>(h_d_year, D_LEN, g_allocator);
+
+  cout << "** LOADED DATA TO GPU **" << endl;
+  // One JSON line per trial carrying the time returned by runQuery
+  for (int t = 0; t < num_trials; t++) {
+    float time_query;
+    time_query = runQuery(d_lo_orderdate, d_lo_discount, d_lo_quantity, d_lo_extendedprice, LO_LEN, g_allocator);
+    cout<< "{"
+        << "\"query\":11" 
+        << ",\"time_query\":" << time_query
+        << "}" << endl;
+  }
+
+  return 0;
+}
+
diff --git a/crystal/src/ssb/q12.cu b/crystal/src/ssb/q12.cu
new file mode 100644
index 0000000..b3a16d1
--- /dev/null
+++ b/crystal/src/ssb/q12.cu
@@ -0,0 +1,167 @@
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <iostream>
+#include <stdio.h>
+#include <curand.h>
+
+#include <cuda.h>
+#include <cub/util_allocator.cuh>
+#include "cub/test/test_util.h"
+
+#include "crystal.cuh"
+
+#include "gpu_utils.h"
+#include "ssb_utils.h"
+
+using namespace std;
+
+/**
+ * Globals, constants and typedefs
+ */
+bool                    g_verbose = false;  // Whether to display input/output to console
+cub::CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory
+
+// SSB Q1.2 kernel: every thread block filters one tile of lineorder rows and atomically adds the
+// tile's sum of lo_extendedprice * lo_discount over the selected rows to *revenue.
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void DeviceSelectIf(int* lo_orderdate, int* lo_discount, int* lo_quantity, int* lo_extendedprice,
+    int lo_num_entries, unsigned long long* revenue) {
+  // Per-thread arrays for the tile; slot ITEM is treated as tile position threadIdx.x + BLOCK_THREADS * ITEM
+  int items[ITEMS_PER_THREAD];
+  int selection_flags[ITEMS_PER_THREAD];
+  int items2[ITEMS_PER_THREAD];
+  long long sum = 0;
+
+  int tile_offset = blockIdx.x * TILE_SIZE;
+  int num_tiles = (lo_num_entries + TILE_SIZE - 1) / TILE_SIZE;
+  int num_tile_items = TILE_SIZE;
+  // Only the last tile can be partial
+  if (blockIdx.x == num_tiles - 1) {
+    num_tile_items = lo_num_entries - tile_offset;
+  }
+  // Date filter (as named by the BlockPred* helpers): 19940101 <= lo_orderdate <= 19940131
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_orderdate + tile_offset, items, num_tile_items);
+  BlockPredGTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, 19940101, selection_flags, num_tile_items);
+  BlockPredAndLTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, 19940131, selection_flags, num_tile_items);
+  // Quantity filter: 26 <= lo_quantity <= 35
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_quantity + tile_offset, items, num_tile_items);
+  BlockPredAndGTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, 26, selection_flags, num_tile_items);
+  BlockPredAndLTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, 35, selection_flags, num_tile_items);
+  // Discount filter: 4 <= lo_discount <= 6; `items` keeps the discounts for the product below
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_discount + tile_offset, items, num_tile_items);
+  BlockPredAndGTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, 4, selection_flags, num_tile_items);
+  BlockPredAndLTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, 6, selection_flags, num_tile_items);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_extendedprice + tile_offset, items2, num_tile_items);
+  // Widen before multiplying so the product is formed in 64 bits rather than in int
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+  {
+    if (threadIdx.x + (BLOCK_THREADS * ITEM) < num_tile_items)
+      if (selection_flags[ITEM])
+        sum += (long long)items[ITEM] * items2[ITEM];
+  }
+
+  __syncthreads();
+
+  static __shared__ long long buffer[32];
+  unsigned long long aggregate = BlockSum<long long, BLOCK_THREADS, ITEMS_PER_THREAD>(sum, (long long*)buffer);
+  __syncthreads();
+  // One atomic update per block, issued by thread 0
+  if (threadIdx.x == 0) {
+    atomicAdd(revenue, aggregate);
+  }
+}
+
+// Runs SSB Q1.2 once on device-resident columns: prints the revenue and the host wall-clock time,
+// and returns the CUDA-event time (ms) that spans accumulator setup and the kernel.
+float runQuery(int* lo_orderdate, int* lo_discount, int* lo_quantity, int* lo_extendedprice,
+    int lo_num_entries, cub::CachingDeviceAllocator&  g_allocator) {
+  SETUP_TIMING();
+  float time_query;
+  chrono::high_resolution_clock::time_point st, finish;
+  st = chrono::high_resolution_clock::now();
+
+  cudaEventRecord(start, 0);  // the timed region starts before the accumulator is allocated
+  unsigned long long* d_sum = NULL;
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&d_sum, sizeof(long long)));
+  CubDebugExit(cudaMemset(d_sum, 0, sizeof(long long)));
+
+  // One thread block per tile of 128 threads x 4 items
+  int tile_items = 128*4;
+  DeviceSelectIf<128,4><<<(lo_num_entries + tile_items - 1)/tile_items, 128>>>(lo_orderdate,
+          lo_discount, lo_quantity, lo_extendedprice, lo_num_entries, d_sum);
+
+  cudaEventRecord(stop, 0);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&time_query, start,stop);
+
+  unsigned long long revenue;
+  CubDebugExit(cudaMemcpy(&revenue, d_sum, sizeof(long long), cudaMemcpyDeviceToHost));
+
+  finish = chrono::high_resolution_clock::now();
+  std::chrono::duration<double> diff = finish - st;
+
+  cout << "Revenue: " << revenue << endl;
+  cout << "Time Taken Total: " << diff.count() * 1000 << endl;
+
+  CLEANUP(d_sum);
+  cudaEventDestroy(start);  // release the per-call events created by SETUP_TIMING()
+  cudaEventDestroy(stop);
+  return time_query;
+}
+
+/**
+ * Entry point for query 12: loads the columns on the host, copies them to the GPU and calls runQuery num_trials times (override with --t), printing one JSON timing line per trial.
+ */
+int main(int argc, char** argv)
+{
+  int num_trials          = 3;
+
+  // Initialize command line
+  CommandLineArgs args(argc, argv);
+  args.GetCmdLineArgument("t", num_trials);
+
+  // Print usage
+  if (args.CheckCmdLineFlag("help"))
+  {
+      printf("%s "
+          "[--t=<num trials>] "
+          "[--v] "
+          "\n", argv[0]);
+      exit(0);
+  }
+
+  // Initialize device
+  CubDebugExit(args.DeviceInit());
+  // Load columns on the host; d_datekey and d_year are loaded here but never passed to runQuery in this file
+  int *h_lo_orderdate = loadColumn<int>("lo_orderdate", LO_LEN);
+  int *h_lo_discount = loadColumn<int>("lo_discount", LO_LEN);
+  int *h_lo_quantity = loadColumn<int>("lo_quantity", LO_LEN);
+  int *h_lo_extendedprice = loadColumn<int>("lo_extendedprice", LO_LEN);
+  int *h_d_datekey = loadColumn<int>("d_datekey", D_LEN);
+  int *h_d_year = loadColumn<int>("d_year", D_LEN);
+
+  cout << "** LOADED DATA **" << endl;
+  // Copy every loaded column to the device; these buffers are not freed before main returns
+  int *d_lo_orderdate = loadToGPU<int>(h_lo_orderdate, LO_LEN, g_allocator);
+  int *d_lo_discount = loadToGPU<int>(h_lo_discount, LO_LEN, g_allocator);
+  int *d_lo_quantity = loadToGPU<int>(h_lo_quantity, LO_LEN, g_allocator);
+  int *d_lo_extendedprice = loadToGPU<int>(h_lo_extendedprice, LO_LEN, g_allocator);
+  int *d_d_datekey = loadToGPU<int>(h_d_datekey, D_LEN, g_allocator);
+  int *d_d_year = loadToGPU<int>(h_d_year, D_LEN, g_allocator);
+
+  cout << "** LOADED DATA TO GPU **" << endl;
+  // One JSON line per trial carrying the time returned by runQuery
+  for (int t = 0; t < num_trials; t++) {
+    float time_query;
+    time_query = runQuery(d_lo_orderdate, d_lo_discount, d_lo_quantity, d_lo_extendedprice, LO_LEN, g_allocator);
+    cout<< "{"
+        << "\"query\":12" 
+        << ",\"time_query\":" << time_query
+        << "}" << endl;
+  }
+
+  return 0;
+}
+
diff --git a/crystal/src/ssb/q13.cu b/crystal/src/ssb/q13.cu
new file mode 100644
index 0000000..cc73bc9
--- /dev/null
+++ b/crystal/src/ssb/q13.cu
@@ -0,0 +1,167 @@
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <iostream>
+#include <stdio.h>
+#include <curand.h>
+
+#include <cuda.h>
+#include <cub/util_allocator.cuh>
+#include "cub/test/test_util.h"
+
+#include "crystal.cuh"
+
+#include "gpu_utils.h"
+#include "ssb_utils.h"
+
+using namespace std;
+
+/**
+ * Globals, constants and typedefs
+ */
+bool                    g_verbose = false;  // Whether to display input/output to console
+cub::CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory
+
+// SSB Q1.3 kernel: every thread block filters one tile of lineorder rows and atomically adds the
+// tile's sum of lo_extendedprice * lo_discount over the selected rows to *revenue.
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void DeviceSelectIf(int* lo_orderdate, int* lo_discount, int* lo_quantity, int* lo_extendedprice,
+    int lo_num_entries, unsigned long long* revenue) {
+  // Per-thread arrays for the tile; slot ITEM is treated as tile position threadIdx.x + BLOCK_THREADS * ITEM
+  int items[ITEMS_PER_THREAD];
+  int selection_flags[ITEMS_PER_THREAD];
+  int items2[ITEMS_PER_THREAD];
+  long long sum = 0;
+
+  int tile_offset = blockIdx.x * TILE_SIZE;
+  int num_tiles = (lo_num_entries + TILE_SIZE - 1) / TILE_SIZE;
+  int num_tile_items = TILE_SIZE;
+  // Only the last tile can be partial
+  if (blockIdx.x == num_tiles - 1) {
+    num_tile_items = lo_num_entries - tile_offset;
+  }
+  // Date filter (as named by the BlockPred* helpers): 19940204 <= lo_orderdate <= 19940210
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_orderdate + tile_offset, items, num_tile_items);
+  BlockPredGTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, 19940204, selection_flags, num_tile_items);
+  BlockPredAndLTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, 19940210, selection_flags, num_tile_items);
+  // Quantity filter: 26 <= lo_quantity <= 35
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_quantity + tile_offset, items, num_tile_items);
+  BlockPredAndGTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, 26, selection_flags, num_tile_items);
+  BlockPredAndLTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, 35, selection_flags, num_tile_items);
+  // Discount filter: 5 <= lo_discount <= 7; `items` keeps the discounts for the product below
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_discount + tile_offset, items, num_tile_items);
+  BlockPredAndGTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, 5, selection_flags, num_tile_items);
+  BlockPredAndLTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, 7, selection_flags, num_tile_items);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_extendedprice + tile_offset, items2, num_tile_items);
+  // Widen before multiplying so the product is formed in 64 bits rather than in int
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+  {
+    if ((threadIdx.x + (BLOCK_THREADS * ITEM) < num_tile_items))
+      if (selection_flags[ITEM])
+        sum += (long long)items[ITEM] * items2[ITEM];
+  }
+
+  __syncthreads();
+
+  static __shared__ long long buffer[32];
+  unsigned long long aggregate = BlockSum<long long, BLOCK_THREADS, ITEMS_PER_THREAD>(sum, (long long*)buffer);
+  __syncthreads();
+  // One atomic update per block, issued by thread 0
+  if (threadIdx.x == 0) {
+    atomicAdd(revenue, aggregate);
+  }
+}
+
+// Runs SSB Q1.3 once on device-resident columns: prints the revenue and the host wall-clock time,
+// and returns the CUDA-event time (ms) that spans accumulator setup and the kernel.
+float runQuery(int* lo_orderdate, int* lo_discount, int* lo_quantity, int* lo_extendedprice,
+    int lo_num_entries, cub::CachingDeviceAllocator&  g_allocator) {
+  SETUP_TIMING();
+  float time_query;
+  chrono::high_resolution_clock::time_point st, finish;
+  st = chrono::high_resolution_clock::now();
+
+  cudaEventRecord(start, 0);  // recorded once, so the timed region covers allocation, memset and the kernel
+  unsigned long long* d_sum = NULL;
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&d_sum, sizeof(long long)));
+  CubDebugExit(cudaMemset(d_sum, 0, sizeof(long long)));
+
+  // One thread block per tile of 128 threads x 4 items; launched directly so `start` is not re-recorded
+  int tile_items = 128*4;
+  DeviceSelectIf<128,4><<<(lo_num_entries + tile_items - 1)/tile_items, 128>>>(lo_orderdate,
+          lo_discount, lo_quantity, lo_extendedprice, lo_num_entries, d_sum);
+
+  cudaEventRecord(stop, 0);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&time_query, start,stop);
+
+  unsigned long long revenue;
+  CubDebugExit(cudaMemcpy(&revenue, d_sum, sizeof(long long), cudaMemcpyDeviceToHost));
+
+  finish = chrono::high_resolution_clock::now();
+  std::chrono::duration<double> diff = finish - st;
+
+  cout << "Revenue: " << revenue << endl;
+  cout << "Time Taken Total: " << diff.count() * 1000 << endl;
+
+  CLEANUP(d_sum);
+  cudaEventDestroy(start);  // release the per-call events created by SETUP_TIMING()
+  cudaEventDestroy(stop);
+  return time_query;
+}
+
+/**
+ * Entry point for query 13: loads the columns on the host, copies them to the GPU and calls runQuery num_trials times (override with --t), printing one JSON timing line per trial.
+ */
+int main(int argc, char** argv)
+{
+  int num_trials          = 3;
+
+  // Initialize command line
+  CommandLineArgs args(argc, argv);
+  args.GetCmdLineArgument("t", num_trials);
+
+  // Print usage
+  if (args.CheckCmdLineFlag("help"))
+  {
+      printf("%s "
+          "[--t=<num trials>] "
+          "[--v] "
+          "\n", argv[0]);
+      exit(0);
+  }
+
+  // Initialize device
+  CubDebugExit(args.DeviceInit());
+  // Load columns on the host; d_datekey and d_year are loaded here but never passed to runQuery in this file
+  int *h_lo_orderdate = loadColumn<int>("lo_orderdate", LO_LEN);
+  int *h_lo_discount = loadColumn<int>("lo_discount", LO_LEN);
+  int *h_lo_quantity = loadColumn<int>("lo_quantity", LO_LEN);
+  int *h_lo_extendedprice = loadColumn<int>("lo_extendedprice", LO_LEN);
+  int *h_d_datekey = loadColumn<int>("d_datekey", D_LEN);
+  int *h_d_year = loadColumn<int>("d_year", D_LEN);
+
+  cout << "** LOADED DATA **" << endl;
+  // Copy every loaded column to the device; these buffers are not freed before main returns
+  int *d_lo_orderdate = loadToGPU<int>(h_lo_orderdate, LO_LEN, g_allocator);
+  int *d_lo_discount = loadToGPU<int>(h_lo_discount, LO_LEN, g_allocator);
+  int *d_lo_quantity = loadToGPU<int>(h_lo_quantity, LO_LEN, g_allocator);
+  int *d_lo_extendedprice = loadToGPU<int>(h_lo_extendedprice, LO_LEN, g_allocator);
+  int *d_d_datekey = loadToGPU<int>(h_d_datekey, D_LEN, g_allocator);
+  int *d_d_year = loadToGPU<int>(h_d_year, D_LEN, g_allocator);
+
+  cout << "** LOADED DATA TO GPU **" << endl;
+  // One JSON line per trial carrying the time returned by runQuery
+  for (int t = 0; t < num_trials; t++) {
+    float time_query;
+    time_query = runQuery(d_lo_orderdate, d_lo_discount, d_lo_quantity, d_lo_extendedprice, LO_LEN, g_allocator);
+    cout<< "{"
+        << "\"query\":13" 
+        << ",\"time_query\":" << time_query
+        << "}" << endl;
+  }
+
+  return 0;
+}
+
diff --git a/crystal/src/ssb/q21.cu b/crystal/src/ssb/q21.cu
new file mode 100644
index 0000000..ac8e560
--- /dev/null
+++ b/crystal/src/ssb/q21.cu
@@ -0,0 +1,286 @@
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <iostream>
+#include <stdio.h>
+#include <curand.h>
+
+#include <cuda.h>
+#include <cub/util_allocator.cuh>
+#include "cub/test/test_util.h"
+
+#include "crystal.cuh"
+
+#include "gpu_utils.h"
+#include "ssb_utils.h"
+
+using namespace std;
+
+/**
+ * Globals, constants and typedefs
+ */
+bool                    g_verbose = false;  // Whether to display input/output to console
+cub::CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory
+// Probe kernel for query 21: each block takes one lineorder tile, probes the supplier, part and date hash tables, and accumulates lo_revenue per (year, brand) slot of `res`.
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void probe(int* lo_orderdate, int* lo_partkey, int* lo_suppkey, int* lo_revenue, int lo_len,
+    int* ht_s, int s_len,
+    int* ht_p, int p_len,
+    int* ht_d, int d_len,
+    int* res) {
+  // Load a tile striped across threads
+  int items[ITEMS_PER_THREAD];
+  int selection_flags[ITEMS_PER_THREAD];
+  int brand[ITEMS_PER_THREAD];
+  int year[ITEMS_PER_THREAD];
+  int revenue[ITEMS_PER_THREAD];
+
+  int tile_offset = blockIdx.x * TILE_SIZE;
+  int num_tiles = (lo_len + TILE_SIZE - 1) / TILE_SIZE;
+  int num_tile_items = TILE_SIZE;
+  // Only the last tile can be partial
+  if (blockIdx.x == num_tiles - 1) {
+    num_tile_items = lo_len - tile_offset;
+  }
+
+  InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags);
+  // Supplier probe updates selection_flags only; the part and date probes below also fill `brand` and `year`
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_suppkey + tile_offset, items, num_tile_items);
+  BlockProbeAndPHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, selection_flags, ht_s, s_len, num_tile_items);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_partkey + tile_offset, items, num_tile_items);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, brand, selection_flags,
+      ht_p, p_len, num_tile_items);
+  // 19920101 is the same key minimum the host passes as val_min when building ht_d
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_orderdate + tile_offset, items, num_tile_items);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, year, selection_flags,
+      ht_d, d_len, 19920101, num_tile_items);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_revenue + tile_offset, revenue, num_tile_items);
+  // Each group owns 4 ints of res: [year, brand, 64-bit revenue sum occupying ints 2-3]
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
+    if ((threadIdx.x + (BLOCK_THREADS * ITEM)) < num_tile_items) {
+      if (selection_flags[ITEM]) {
+        int hash = (brand[ITEM] * 7 +  (year[ITEM] - 1992)) % ((1998-1992+1) * (5*5*40));
+        res[hash * 4] = year[ITEM];
+        res[hash * 4 + 1] = brand[ITEM];
+        atomicAdd(reinterpret_cast<unsigned long long*>(&res[hash * 4 + 2]), (long long)(revenue[ITEM]));
+      }
+    }
+  }
+}
+// Builds the supplier hash table: rows are selected with BlockPredEQ(filter_col, 1) and their dim_key values are passed to BlockBuildSelectivePHT_1 (keys only).
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_s(int *filter_col, int *dim_key, int num_tuples, int *hash_table, int num_slots) {
+  int items[ITEMS_PER_THREAD];
+  int selection_flags[ITEMS_PER_THREAD];
+
+  int tile_offset = blockIdx.x * TILE_SIZE;
+  int num_tiles = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  int num_tile_items = TILE_SIZE;
+  // Only the last tile can be partial
+  if (blockIdx.x == num_tiles - 1) {
+    num_tile_items = num_tuples - tile_offset;
+  }
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + tile_offset, items, num_tile_items);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, 1, selection_flags, num_tile_items);
+  // `items` is reused: it now holds the keys to insert
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_offset, items, num_tile_items);
+  BlockBuildSelectivePHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, selection_flags, 
+      hash_table, num_slots, num_tile_items);
+}
+// Builds the part hash table: rows are selected with BlockPredEQ(filter_col, 1) and their (dim_key, dim_val) pairs are passed to BlockBuildSelectivePHT_2.
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_p(int *filter_col, int *dim_key, int *dim_val, int num_tuples, int *hash_table, int num_slots) {
+  int items[ITEMS_PER_THREAD];
+  int items2[ITEMS_PER_THREAD];
+  int selection_flags[ITEMS_PER_THREAD];
+
+  int tile_offset = blockIdx.x * TILE_SIZE;
+  int num_tiles = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  int num_tile_items = TILE_SIZE;
+  // Only the last tile can be partial
+  if (blockIdx.x == num_tiles - 1) {
+    num_tile_items = num_tuples - tile_offset;
+  }
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + tile_offset, items, num_tile_items);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, 1, selection_flags, num_tile_items);
+  // `items` is reused for the keys; `items2` carries the values stored with them
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_offset, items, num_tile_items);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_offset, items2, num_tile_items);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, items2, selection_flags, 
+      hash_table, num_slots, num_tile_items);
+}
+// Builds the date hash table from every row (flags come from InitFlags, no predicate is applied): (dim_key, dim_val) pairs are passed to BlockBuildSelectivePHT_2 together with val_min.
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_d(int *dim_key, int *dim_val, int num_tuples, int *hash_table, int num_slots, int val_min) {
+  int items[ITEMS_PER_THREAD];
+  int items2[ITEMS_PER_THREAD];
+  int selection_flags[ITEMS_PER_THREAD];
+
+  int tile_offset = blockIdx.x * TILE_SIZE;
+  int num_tiles = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  int num_tile_items = TILE_SIZE;
+  // Only the last tile can be partial
+  if (blockIdx.x == num_tiles - 1) {
+    num_tile_items = num_tuples - tile_offset;
+  }
+
+  InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags);
+  // `items` holds the keys and `items2` the values stored with them
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_offset, items, num_tile_items);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_offset, items2, num_tile_items);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, items2, selection_flags, 
+      hash_table, num_slots, val_min, num_tile_items);
+}
+
+// Runs SSB Q2.1 once: builds the supplier/part/date hash tables, probes them with lineorder,
+// prints the non-empty result groups and returns the CUDA-event time in milliseconds.
+float runQuery(int* lo_orderdate, int* lo_partkey, int* lo_suppkey, int* lo_revenue, int lo_len,
+    int* p_partkey, int* p_brand1, int* p_category, int p_len,
+    int *d_datekey, int* d_year, int d_len,
+    int *s_suppkey, int* s_region, int s_len,
+    cub::CachingDeviceAllocator&  g_allocator) {
+  SETUP_TIMING();
+  float time_query;
+  chrono::high_resolution_clock::time_point st, finish;
+  st = chrono::high_resolution_clock::now();
+
+  cudaEventRecord(start, 0);  // the timed region covers allocation, memset, the builds and the probe
+
+  int *ht_d, *ht_p, *ht_s;
+  int d_val_len = 19981230 - 19920101 + 1;
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_d, 2 * d_val_len * sizeof(int)));
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_p, 2 * p_len * sizeof(int)));
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_s, 2 * s_len * sizeof(int)));
+
+  CubDebugExit(cudaMemset(ht_d, 0, 2 * d_val_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_p, 0, 2 * p_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_s, 0, 2 * s_len * sizeof(int)));
+
+  int tile_items = 128*4;
+
+  build_hashtable_s<128,4><<<(s_len + tile_items - 1)/tile_items, 128>>>(s_region, s_suppkey, s_len, ht_s, s_len);
+  /*CHECK_ERROR();*/
+
+  build_hashtable_p<128,4><<<(p_len + tile_items - 1)/tile_items, 128>>>(p_category, p_partkey, p_brand1, p_len, ht_p, p_len);
+  /*CHECK_ERROR();*/
+
+  int d_val_min = 19920101;
+  build_hashtable_d<128,4><<<(d_len + tile_items - 1)/tile_items, 128>>>(
+      d_datekey, d_year, d_len, ht_d, d_val_len, d_val_min);
+  /*CHECK_ERROR();*/
+
+  int *res;
+  int res_size = ((1998-1992+1) * (5 * 5 * 40));
+  int res_array_size = res_size * 4;
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&res, res_array_size * sizeof(int)));
+  CubDebugExit(cudaMemset(res, 0, res_array_size * sizeof(int)));
+
+  probe<128,4><<<(lo_len + tile_items - 1)/tile_items, 128>>>(lo_orderdate,
+          lo_partkey, lo_suppkey, lo_revenue, lo_len, ht_s, s_len, ht_p, p_len, ht_d, d_val_len, res);
+
+  cudaEventRecord(stop, 0);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&time_query, start,stop);
+
+  int* h_res = new int[res_array_size];
+  CubDebugExit(cudaMemcpy(h_res, res, res_array_size * sizeof(int), cudaMemcpyDeviceToHost));
+  finish = chrono::high_resolution_clock::now();
+  std::chrono::duration<double> diff = finish - st;
+
+  int res_count = 0;
+  for (int i=0; i<res_size; i++) {
+    if (h_res[4*i] != 0) {
+      cout << "{" <<h_res[4*i] << "," << h_res[4*i + 1] << "," << reinterpret_cast<unsigned long long*>(&h_res[4*i + 2])[0] << "},//" << endl;
+      res_count += 1;
+    }
+  }
+
+  cout << "Res Count: " << res_count << endl;
+  cout << "Time Taken Total: " << diff.count() * 1000 << endl;
+
+  delete[] h_res;
+  CLEANUP(res);
+  CLEANUP(ht_d);
+  CLEANUP(ht_p);
+  CLEANUP(ht_s);
+  cudaEventDestroy(start);  // release the per-call events created by SETUP_TIMING()
+  cudaEventDestroy(stop);
+  return time_query;
+}
+
+/**
+ * Entry point for query 21: loads the lineorder, part, date and supplier columns on the host, copies them to the GPU and calls runQuery num_trials times (override with --t), printing one JSON timing line per trial.
+ */
+int main(int argc, char** argv)
+{
+  int num_trials          = 1;
+
+  // Initialize command line
+  CommandLineArgs args(argc, argv);
+  args.GetCmdLineArgument("t", num_trials);
+
+  // Print usage
+  if (args.CheckCmdLineFlag("help"))
+  {
+      printf("%s "
+          "[--t=<num trials>] "
+          "[--v] "
+          "\n", argv[0]);
+      exit(0);
+  }
+
+  // Initialize device
+  CubDebugExit(args.DeviceInit());
+  // Load every column on the host
+  int *h_lo_orderdate = loadColumn<int>("lo_orderdate", LO_LEN);
+  int *h_lo_partkey = loadColumn<int>("lo_partkey", LO_LEN);
+  int *h_lo_suppkey = loadColumn<int>("lo_suppkey", LO_LEN);
+  int *h_lo_revenue = loadColumn<int>("lo_revenue", LO_LEN);
+
+  int *h_p_partkey = loadColumn<int>("p_partkey", P_LEN);
+  int *h_p_brand1 = loadColumn<int>("p_brand1", P_LEN);
+  int *h_p_category = loadColumn<int>("p_category", P_LEN);
+
+  int *h_d_datekey = loadColumn<int>("d_datekey", D_LEN);
+  int *h_d_year = loadColumn<int>("d_year", D_LEN);
+
+  int *h_s_suppkey = loadColumn<int>("s_suppkey", S_LEN);
+  int *h_s_region = loadColumn<int>("s_region", S_LEN);
+  // Copy every loaded column to the device; these buffers are not freed before main returns
+  int *d_lo_orderdate = loadToGPU<int>(h_lo_orderdate, LO_LEN, g_allocator);
+  int *d_lo_partkey = loadToGPU<int>(h_lo_partkey, LO_LEN, g_allocator);
+  int *d_lo_suppkey = loadToGPU<int>(h_lo_suppkey, LO_LEN, g_allocator);
+  int *d_lo_revenue = loadToGPU<int>(h_lo_revenue, LO_LEN, g_allocator);
+
+  int *d_d_datekey = loadToGPU<int>(h_d_datekey, D_LEN, g_allocator);
+  int *d_d_year = loadToGPU<int>(h_d_year, D_LEN, g_allocator);
+
+  int *d_p_partkey = loadToGPU<int>(h_p_partkey, P_LEN, g_allocator);
+  int *d_p_brand1  = loadToGPU<int>(h_p_brand1, P_LEN, g_allocator);
+  int *d_p_category = loadToGPU<int>(h_p_category, P_LEN, g_allocator);
+
+  int *d_s_suppkey = loadToGPU<int>(h_s_suppkey, S_LEN, g_allocator);
+  int *d_s_region = loadToGPU<int>(h_s_region, S_LEN, g_allocator);
+  // One JSON line per trial carrying the time returned by runQuery
+  for (int t = 0; t < num_trials; t++) {
+    float time_query;
+    time_query = runQuery(
+        d_lo_orderdate, d_lo_partkey, d_lo_suppkey, d_lo_revenue, LO_LEN,
+        d_p_partkey, d_p_brand1, d_p_category, P_LEN,
+        d_d_datekey, d_d_year, D_LEN,
+        d_s_suppkey, d_s_region, S_LEN,
+        g_allocator);
+    cout<< "{"
+        << "\"query\":21"
+        << ",\"time_query\":" << time_query
+        << "}" << endl;
+  }
+
+  return 0;
+}
+
diff --git a/crystal/src/ssb/q22.cu b/crystal/src/ssb/q22.cu
new file mode 100644
index 0000000..fb9dbb7
--- /dev/null
+++ b/crystal/src/ssb/q22.cu
@@ -0,0 +1,286 @@
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <iostream>
+#include <stdio.h>
+#include <curand.h>
+
+#include <cuda.h>
+#include <cub/util_allocator.cuh>
+#include "cub/test/test_util.h"
+
+#include "crystal.cuh"
+
+#include "gpu_utils.h"
+#include "ssb_utils.h"
+
+using namespace std;
+
+/**
+ * Globals, constants and typedefs
+ */
+// NOTE(review): the usage text in main() advertises --v, but nothing in this
+// file assigns g_verbose after this initialisation — confirm whether it is needed.
+bool                    g_verbose = false;  // Whether to display input/output to console
+// NOTE(review): the `true` argument is presumably CUB's skip_cleanup option —
+// confirm against the vendored CUB version.
+cub::CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory
+
+
+/**
+ * Probe kernel for SSB Q2.2. Each thread block handles one tile of the
+ * fact-table (lo_*) columns: the supplier, part and date keys are probed
+ * against ht_s, ht_p and ht_d in that order, and every row whose selection
+ * flag is still set afterwards adds its revenue to its (year, brand) group.
+ *
+ * res stores 4 ints per group: [0] year, [1] brand, [2..3] a 64-bit revenue
+ * sum that is updated with atomicAdd.
+ */
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void probe(int* lo_orderdate, int* lo_partkey, int* lo_suppkey, int* lo_revenue, int lo_len,
+    int* ht_s, int s_len,
+    int* ht_p, int p_len,
+    int* ht_d, int d_len,
+    int* res) {
+  int keys[ITEMS_PER_THREAD];     // scratch buffer, reused for each key column
+  int flags[ITEMS_PER_THREAD];
+  int brand[ITEMS_PER_THREAD];
+  int year[ITEMS_PER_THREAD];
+  int revenue[ITEMS_PER_THREAD];
+
+  const int tile_offset = blockIdx.x * TILE_SIZE;
+  const int num_tiles = (lo_len + TILE_SIZE - 1) / TILE_SIZE;
+  // Only the final tile can be partially filled.
+  const int num_tile_items = (blockIdx.x == num_tiles - 1) ? lo_len - tile_offset : TILE_SIZE;
+
+  InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(flags);
+
+  // Supplier probe: only updates the flags.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_suppkey + tile_offset, keys, num_tile_items);
+  BlockProbeAndPHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, flags, ht_s, s_len, num_tile_items);
+
+  // Part probe: also fetches the brand stored with the key.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_partkey + tile_offset, keys, num_tile_items);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, brand, flags,
+      ht_p, p_len, num_tile_items);
+
+  // Date probe: fetches the year. 19920101 is presumably the key offset the
+  // date table was built with — confirm against the join primitives.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_orderdate + tile_offset, keys, num_tile_items);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, year, flags,
+      ht_d, d_len, 19920101, num_tile_items);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_revenue + tile_offset, revenue, num_tile_items);
+
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
+    // Skip positions past the end of the tile and rows that failed a probe.
+    const bool in_tile = (threadIdx.x + (BLOCK_THREADS * ITEM)) < num_tile_items;
+    if (!in_tile || !flags[ITEM]) continue;
+
+    const int group = (brand[ITEM] * 7 +  (year[ITEM] - 1992)) % ((1998-1992+1) * (5*5*40));
+    int* slot = res + group * 4;
+    slot[0] = year[ITEM];
+    slot[1] = brand[ITEM];
+    atomicAdd(reinterpret_cast<unsigned long long*>(slot + 2), (long long)(revenue[ITEM]));
+  }
+}
+
+/**
+ * Build kernel for the supplier hash table. Rows whose filter_col value
+ * equals 2 are flagged, and the flagged dim_key values are handed to
+ * BlockBuildSelectivePHT_1 for insertion into hash_table (num_slots slots).
+ */
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_s(int *filter_col, int *dim_key, int num_tuples, int *hash_table, int num_slots) {
+  int col[ITEMS_PER_THREAD];      // filter values first, then the keys
+  int flags[ITEMS_PER_THREAD];
+
+  const int tile_offset = blockIdx.x * TILE_SIZE;
+  const int num_tiles = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  // Only the final tile can be partially filled.
+  const int num_tile_items = (blockIdx.x == num_tiles - 1) ? num_tuples - tile_offset : TILE_SIZE;
+
+  // The equality predicate decides which rows are inserted.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + tile_offset, col, num_tile_items);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(col, 2, flags, num_tile_items);
+
+  // The buffer is free again once the flags are computed, so load the keys into it.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_offset, col, num_tile_items);
+  BlockBuildSelectivePHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(col, flags,
+      hash_table, num_slots, num_tile_items);
+}
+
+/**
+ * Build kernel for the part hash table. Rows with 260 <= dim_val <= 267 are
+ * flagged and passed, as (dim_key, dim_val) pairs, to
+ * BlockBuildSelectivePHT_2 for insertion into hash_table (num_slots slots).
+ */
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_p(int *dim_key, int *dim_val, int num_tuples, int *hash_table, int num_slots) {
+  int vals[ITEMS_PER_THREAD];
+  int keys[ITEMS_PER_THREAD];
+  int flags[ITEMS_PER_THREAD];
+
+  const int tile_offset = blockIdx.x * TILE_SIZE;
+  const int num_tiles = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  // Only the final tile can be partially filled.
+  const int num_tile_items = (blockIdx.x == num_tiles - 1) ? num_tuples - tile_offset : TILE_SIZE;
+
+  // Range filter on the value column: lower bound, then AND the upper bound.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_offset, vals, num_tile_items);
+  BlockPredGTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(vals, 260, flags, num_tile_items);
+  BlockPredAndLTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(vals, 267, flags, num_tile_items);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_offset, keys, num_tile_items);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, vals, flags,
+      hash_table, num_slots, num_tile_items);
+}
+
+/**
+ * Build kernel for the date hash table. No predicate is applied: the flags
+ * are set up by InitFlags and every (dim_key, dim_val) pair of the tile is
+ * passed to BlockBuildSelectivePHT_2 together with val_min.
+ */
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_d(int *dim_key, int *dim_val, int num_tuples, int *hash_table, int num_slots, int val_min) {
+  int keys[ITEMS_PER_THREAD];
+  int vals[ITEMS_PER_THREAD];
+  int flags[ITEMS_PER_THREAD];
+
+  const int tile_offset = blockIdx.x * TILE_SIZE;
+  const int num_tiles = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  // Only the final tile can be partially filled.
+  const int num_tile_items = (blockIdx.x == num_tiles - 1) ? num_tuples - tile_offset : TILE_SIZE;
+
+  InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(flags);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_offset, keys, num_tile_items);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_offset, vals, num_tile_items);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, vals, flags,
+      hash_table, num_slots, val_min, num_tile_items);
+}
+
+
+/**
+ * Runs one trial of SSB Q2.2 on columns that already reside on the device.
+ *
+ * Builds the supplier, part and date hash tables, probes the fact table and
+ * prints every group with a non-zero year as "year brand revenue" to stdout,
+ * followed by the group count and the host-side wall time.
+ *
+ * @param g_allocator  allocator used for the hash tables and the result
+ *                     buffer; all four are released before returning.
+ * @return time reported by cudaEventElapsedTime between the start event
+ *         (recorded before the allocations) and the stop event (recorded
+ *         after the probe launch).
+ */
+float runQuery(int* lo_orderdate, int* lo_partkey, int* lo_suppkey, int* lo_revenue, int lo_len,
+    int* p_partkey, int* p_brand1, int p_len,
+    int *d_datekey, int* d_year, int d_len,
+    int *s_suppkey, int* s_region, int s_len,
+    cub::CachingDeviceAllocator&  g_allocator) {
+  SETUP_TIMING();
+
+  float time_query;
+  chrono::high_resolution_clock::time_point st, finish;
+  st = chrono::high_resolution_clock::now();
+
+  cudaEventRecord(start, 0);
+
+  // The date table is sized by the datekey range rather than by d_len.
+  int *ht_d, *ht_p, *ht_s;
+  int d_val_len = 19981230 - 19920101 + 1;
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_d, 2 * d_val_len * sizeof(int)));
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_p, 2 * p_len * sizeof(int)));
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_s, 2 * s_len * sizeof(int)));
+
+  CubDebugExit(cudaMemset(ht_d, 0, 2 * d_val_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_p, 0, 2 * p_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_s, 0, 2 * s_len * sizeof(int)));
+
+  // One block of 128 threads per tile of 128*4 rows.
+  int tile_items = 128*4;
+  build_hashtable_s<128,4><<<(s_len + tile_items - 1)/tile_items, 128>>>(s_region, s_suppkey, s_len, ht_s, s_len);
+
+  build_hashtable_p<128,4><<<(p_len + tile_items - 1)/tile_items, 128>>>(p_partkey, p_brand1, p_len, ht_p, p_len);
+
+  int d_val_min = 19920101;
+  build_hashtable_d<128,4><<<(d_len + tile_items - 1)/tile_items, 128>>>(d_datekey, d_year, d_len, ht_d, d_val_len, d_val_min);
+
+  // Result buffer: 4 ints per group (year, brand, 64-bit revenue sum).
+  int *res;
+  int res_size = ((1998-1992+1) * 1000);
+  int res_array_size = res_size * 4;
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&res, res_array_size * sizeof(int)));
+
+  CubDebugExit(cudaMemset(res, 0, res_array_size * sizeof(int)));
+
+  // Run
+  probe<128,4><<<(lo_len + tile_items - 1)/tile_items, 128>>>(lo_orderdate,
+          lo_partkey, lo_suppkey, lo_revenue, lo_len, ht_s, s_len, ht_p, p_len, ht_d, d_val_len, res);
+
+  cudaEventRecord(stop, 0);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&time_query, start,stop);
+
+  int* h_res = new int[res_array_size];
+  CubDebugExit(cudaMemcpy(h_res, res, res_array_size * sizeof(int), cudaMemcpyDeviceToHost));
+
+  // The host timer also covers the allocations and the device-to-host copy.
+  finish = chrono::high_resolution_clock::now();
+  std::chrono::duration<double> diff = finish - st;
+
+  cout << "Result:" << endl;
+  int res_count = 0;
+  for (int i=0; i<res_size; i++) {
+    // A zero year marks a group that no row was added to.
+    if (h_res[4*i] != 0) {
+      cout << h_res[4*i] << " " << h_res[4*i + 1] << " " << reinterpret_cast<unsigned long long*>(&h_res[4*i + 2])[0]  << endl;
+      res_count += 1;
+    }
+  }
+
+  cout << "Res Count: " << res_count << endl;
+  cout << "Time Taken Total: " << diff.count() * 1000 << endl;
+
+  delete[] h_res;
+
+  // Release every per-trial device buffer, including the result buffer,
+  // so repeated trials do not accumulate allocations.
+  CLEANUP(ht_d);
+  CLEANUP(ht_p);
+  CLEANUP(ht_s);
+  CLEANUP(res);
+
+  return time_query;
+}
+
+/**
+ * Entry point for Q2.2: loads the required columns, copies them to the GPU
+ * and runs the query num_trials times, printing one JSON timing line per trial.
+ */
+int main(int argc, char** argv)
+{
+  int num_trials = 3;  // default, overridden by --t=<n>
+
+  CommandLineArgs args(argc, argv);
+  args.GetCmdLineArgument("t", num_trials);
+
+  // Usage request: print and leave before touching the device.
+  if (args.CheckCmdLineFlag("help"))
+  {
+      printf("%s "
+          "[--t=<num trials>] "
+          "[--v] "
+          "\n", argv[0]);
+      exit(0);
+  }
+
+  CubDebugExit(args.DeviceInit());
+
+  // Host-side columns: fact table, then part, date and supplier.
+  int *host_lo_orderdate = loadColumn<int>("lo_orderdate", LO_LEN);
+  int *host_lo_partkey = loadColumn<int>("lo_partkey", LO_LEN);
+  int *host_lo_suppkey = loadColumn<int>("lo_suppkey", LO_LEN);
+  int *host_lo_revenue = loadColumn<int>("lo_revenue", LO_LEN);
+
+  int *host_p_partkey = loadColumn<int>("p_partkey", P_LEN);
+  int *host_p_brand1 = loadColumn<int>("p_brand1", P_LEN);
+
+  int *host_d_datekey = loadColumn<int>("d_datekey", D_LEN);
+  int *host_d_year = loadColumn<int>("d_year", D_LEN);
+
+  int *host_s_suppkey = loadColumn<int>("s_suppkey", S_LEN);
+  int *host_s_region = loadColumn<int>("s_region", S_LEN);
+
+  // Device-side copies.
+  int *dev_lo_orderdate = loadToGPU<int>(host_lo_orderdate, LO_LEN, g_allocator);
+  int *dev_lo_partkey = loadToGPU<int>(host_lo_partkey, LO_LEN, g_allocator);
+  int *dev_lo_suppkey = loadToGPU<int>(host_lo_suppkey, LO_LEN, g_allocator);
+  int *dev_lo_revenue = loadToGPU<int>(host_lo_revenue, LO_LEN, g_allocator);
+
+  int *dev_d_datekey = loadToGPU<int>(host_d_datekey, D_LEN, g_allocator);
+  int *dev_d_year = loadToGPU<int>(host_d_year, D_LEN, g_allocator);
+
+  int *dev_p_partkey = loadToGPU<int>(host_p_partkey, P_LEN, g_allocator);
+  int *dev_p_brand1  = loadToGPU<int>(host_p_brand1, P_LEN, g_allocator);
+
+  int *dev_s_suppkey = loadToGPU<int>(host_s_suppkey, S_LEN, g_allocator);
+  int *dev_s_region = loadToGPU<int>(host_s_region, S_LEN, g_allocator);
+
+  for (int trial = 0; trial < num_trials; ++trial) {
+    const float time_query = runQuery(
+        dev_lo_orderdate, dev_lo_partkey, dev_lo_suppkey, dev_lo_revenue, LO_LEN,
+        dev_p_partkey, dev_p_brand1, P_LEN,
+        dev_d_datekey, dev_d_year, D_LEN,
+        dev_s_suppkey, dev_s_region, S_LEN,
+        g_allocator);
+    cout<< "{"
+        << "\"query\":22"
+        << ",\"time_query\":" << time_query
+        << "}" << endl;
+  }
+
+  return 0;
+}
+
diff --git a/crystal/src/ssb/q23.cu b/crystal/src/ssb/q23.cu
new file mode 100644
index 0000000..0d00972
--- /dev/null
+++ b/crystal/src/ssb/q23.cu
@@ -0,0 +1,279 @@
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <iostream>
+#include <stdio.h>
+#include <curand.h>
+
+#include <cuda.h>
+#include <cub/util_allocator.cuh>
+#include "cub/test/test_util.h"
+
+#include "crystal.cuh"
+
+#include "gpu_utils.h"
+#include "ssb_utils.h"
+
+using namespace std;
+
+/**
+ * Globals, constants and typedefs
+ */
+// NOTE(review): the usage text in main() advertises --v, but nothing in this
+// file assigns g_verbose after this initialisation — confirm whether it is needed.
+bool                    g_verbose = false;  // Whether to display input/output to console
+// NOTE(review): the `true` argument is presumably CUB's skip_cleanup option —
+// confirm against the vendored CUB version.
+cub::CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory
+
+
+/**
+ * Probe kernel for SSB Q2.3. One thread block works on one tile of the
+ * fact-table (lo_*) columns. Supplier, part and date keys are looked up in
+ * ht_s, ht_p and ht_d in turn; rows still flagged after the third lookup
+ * contribute their revenue to the matching (year, brand) group.
+ *
+ * Layout of res, 4 ints per group: [0] year, [1] brand, [2..3] 64-bit
+ * revenue sum accumulated with atomicAdd.
+ */
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void probe(int* lo_orderdate, int* lo_partkey, int* lo_suppkey, int* lo_revenue, int lo_len,
+    int* ht_s, int s_len,
+    int* ht_p, int p_len,
+    int* ht_d, int d_len,
+    int* res) {
+  int probe_keys[ITEMS_PER_THREAD];   // reused for every key column
+  int keep[ITEMS_PER_THREAD];
+  int brand[ITEMS_PER_THREAD];
+  int year[ITEMS_PER_THREAD];
+  int revenue[ITEMS_PER_THREAD];
+
+  const int tile_offset = blockIdx.x * TILE_SIZE;
+  const int num_tiles = (lo_len + TILE_SIZE - 1) / TILE_SIZE;
+  // A short tile can only occur at the very end of the column.
+  const int num_tile_items = (blockIdx.x == num_tiles - 1) ? lo_len - tile_offset : TILE_SIZE;
+
+  InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(keep);
+
+  // Supplier lookup: flags only.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_suppkey + tile_offset, probe_keys, num_tile_items);
+  BlockProbeAndPHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(probe_keys, keep, ht_s, s_len, num_tile_items);
+
+  // Part lookup: flags plus the brand payload.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_partkey + tile_offset, probe_keys, num_tile_items);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(probe_keys, brand, keep,
+      ht_p, p_len, num_tile_items);
+
+  // Date lookup: flags plus the year payload. 19920101 is presumably the
+  // key offset used when ht_d was built — confirm against the join primitives.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_orderdate + tile_offset, probe_keys, num_tile_items);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(probe_keys, year, keep,
+      ht_d, d_len, 19920101, num_tile_items);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_revenue + tile_offset, revenue, num_tile_items);
+
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
+    // Ignore positions beyond the tile end and rows rejected by a lookup.
+    const bool in_tile = (threadIdx.x + (BLOCK_THREADS * ITEM)) < num_tile_items;
+    if (!in_tile || !keep[ITEM]) continue;
+
+    const int group = (brand[ITEM] * 7 +  (year[ITEM] - 1992)) % ((1998-1992+1) * (5*5*40));
+    int* slot = res + group * 4;
+    slot[0] = year[ITEM];
+    slot[1] = brand[ITEM];
+    atomicAdd(reinterpret_cast<unsigned long long*>(slot + 2), (long long)(revenue[ITEM]));
+  }
+}
+
+/**
+ * Build kernel for the supplier hash table. Rows whose filter_col value
+ * equals 3 are flagged, and the flagged dim_key values are handed to
+ * BlockBuildSelectivePHT_1 for insertion into hash_table (num_slots slots).
+ */
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_s(int *filter_col, int *dim_key, int num_tuples, int *hash_table, int num_slots) {
+  int col[ITEMS_PER_THREAD];      // filter values first, then the keys
+  int keep[ITEMS_PER_THREAD];
+
+  const int tile_offset = blockIdx.x * TILE_SIZE;
+  const int num_tiles = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  // A short tile can only occur at the very end of the column.
+  const int num_tile_items = (blockIdx.x == num_tiles - 1) ? num_tuples - tile_offset : TILE_SIZE;
+
+  // Flag the rows that pass the equality filter.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + tile_offset, col, num_tile_items);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(col, 3, keep, num_tile_items);
+
+  // Reuse the buffer for the keys now that the flags exist.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_offset, col, num_tile_items);
+  BlockBuildSelectivePHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(col, keep,
+      hash_table, num_slots, num_tile_items);
+}
+
+/**
+ * Build kernel for the part hash table. Rows whose dim_val equals 260 are
+ * flagged and passed, as (dim_key, dim_val) pairs, to
+ * BlockBuildSelectivePHT_2 for insertion into hash_table (num_slots slots).
+ */
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_p(int *dim_key, int *dim_val, int num_tuples, int *hash_table, int num_slots) {
+  int vals[ITEMS_PER_THREAD];
+  int keys[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+
+  const int tile_offset = blockIdx.x * TILE_SIZE;
+  const int num_tiles = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  // A short tile can only occur at the very end of the column.
+  const int num_tile_items = (blockIdx.x == num_tiles - 1) ? num_tuples - tile_offset : TILE_SIZE;
+
+  // Equality filter on the value column.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_offset, vals, num_tile_items);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(vals, 260, keep, num_tile_items);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_offset, keys, num_tile_items);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, vals, keep,
+      hash_table, num_slots, num_tile_items);
+}
+
+/**
+ * Build kernel for the date hash table. There is no filter: InitFlags sets
+ * up the flags and every (dim_key, dim_val) pair of the tile goes to
+ * BlockBuildSelectivePHT_2 along with val_min.
+ */
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_d(int *dim_key, int *dim_val, int num_tuples, int *hash_table, int num_slots, int val_min) {
+  int keys[ITEMS_PER_THREAD];
+  int vals[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+
+  const int tile_offset = blockIdx.x * TILE_SIZE;
+  const int num_tiles = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  // A short tile can only occur at the very end of the column.
+  const int num_tile_items = (blockIdx.x == num_tiles - 1) ? num_tuples - tile_offset : TILE_SIZE;
+
+  InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(keep);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_offset, keys, num_tile_items);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_offset, vals, num_tile_items);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, vals, keep,
+      hash_table, num_slots, val_min, num_tile_items);
+}
+
+/**
+ * Runs one trial of SSB Q2.3 on columns that already reside on the device.
+ *
+ * Builds the supplier, part and date hash tables, probes the fact table and
+ * prints every group with a non-zero year as "year brand revenue" to stdout,
+ * followed by the group count and the host-side wall time.
+ *
+ * @param g_allocator  allocator used for the hash tables and the result
+ *                     buffer; all four are released before returning.
+ * @return time reported by cudaEventElapsedTime between the start event
+ *         (recorded before the allocations) and the stop event (recorded
+ *         after the probe launch).
+ */
+float runQuery(int* lo_orderdate, int* lo_partkey, int* lo_suppkey, int* lo_revenue, int lo_len,
+    int* p_partkey, int* p_brand1, int p_len,
+    int *d_datekey, int* d_year, int d_len,
+    int *s_suppkey, int* s_region, int s_len,
+    cub::CachingDeviceAllocator&  g_allocator) {
+  SETUP_TIMING();
+
+  float time_query;
+  chrono::high_resolution_clock::time_point st, finish;
+  st = chrono::high_resolution_clock::now();
+
+  cudaEventRecord(start, 0);
+
+  // The date table is sized by the datekey range rather than by d_len.
+  int *ht_d, *ht_p, *ht_s;
+  int d_val_len = 19981230 - 19920101 + 1;
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_d, 2 * d_val_len * sizeof(int)));
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_p, 2 * p_len * sizeof(int)));
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_s, 2 * s_len * sizeof(int)));
+
+  CubDebugExit(cudaMemset(ht_d, 0, 2 * d_val_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_p, 0, 2 * p_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_s, 0, 2 * s_len * sizeof(int)));
+
+  // One block of 128 threads per tile of 128*4 rows.
+  int tile_items = 128*4;
+  build_hashtable_s<128,4><<<(s_len + tile_items - 1)/tile_items, 128>>>(s_region, s_suppkey, s_len, ht_s, s_len);
+
+  build_hashtable_p<128,4><<<(p_len + tile_items - 1)/tile_items, 128>>>(p_partkey, p_brand1, p_len, ht_p, p_len);
+
+  int d_val_min = 19920101;
+  build_hashtable_d<128,4><<<(d_len + tile_items - 1)/tile_items, 128>>>(d_datekey, d_year, d_len, ht_d, d_val_len, d_val_min);
+
+  // Result buffer: 4 ints per group (year, brand, 64-bit revenue sum).
+  int *res;
+  int res_size = ((1998-1992+1) * 1000);
+  int res_array_size = res_size * 4;
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&res, res_array_size * sizeof(int)));
+
+  CubDebugExit(cudaMemset(res, 0, res_array_size * sizeof(int)));
+
+  // Run
+  probe<128,4><<<(lo_len + tile_items - 1)/tile_items, 128>>>(lo_orderdate,
+          lo_partkey, lo_suppkey, lo_revenue, lo_len, ht_s, s_len, ht_p, p_len, ht_d, d_val_len, res);
+
+  cudaEventRecord(stop, 0);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&time_query, start,stop);
+
+  int* h_res = new int[res_array_size];
+  CubDebugExit(cudaMemcpy(h_res, res, res_array_size * sizeof(int), cudaMemcpyDeviceToHost));
+  // The host timer also covers the allocations and the device-to-host copy.
+  finish = chrono::high_resolution_clock::now();
+  std::chrono::duration<double> diff = finish - st;
+
+  cout << "Result:" << endl;
+  int res_count = 0;
+  for (int i=0; i<res_size; i++) {
+    // A zero year marks a group that no row was added to.
+    if (h_res[4*i] != 0) {
+      cout << h_res[4*i] << " " << h_res[4*i + 1] << " " << reinterpret_cast<unsigned long long*>(&h_res[4*i + 2])[0]  << endl;
+      res_count += 1;
+    }
+  }
+
+  cout << "Res Count: " << res_count << endl;
+  cout << "Time Taken Total: " << diff.count() * 1000 << endl;
+  delete[] h_res;
+
+  // Release the per-trial device buffers; without this every trial keeps
+  // its hash tables and result buffer allocated.
+  CLEANUP(ht_d);
+  CLEANUP(ht_p);
+  CLEANUP(ht_s);
+  CLEANUP(res);
+
+  return time_query;
+}
+
+
+/**
+ * Entry point for Q2.3: loads the required columns, copies them to the GPU
+ * and runs the query num_trials times, printing one JSON timing line per trial.
+ */
+int main(int argc, char** argv)
+{
+  int num_trials = 3;  // default, overridden by --t=<n>
+
+  CommandLineArgs args(argc, argv);
+  args.GetCmdLineArgument("t", num_trials);
+
+  // Usage request: print and leave before touching the device.
+  if (args.CheckCmdLineFlag("help"))
+  {
+      printf("%s "
+          "[--t=<num trials>] "
+          "[--v] "
+          "\n", argv[0]);
+      exit(0);
+  }
+
+  CubDebugExit(args.DeviceInit());
+
+  // Host-side columns: fact table, then part, date and supplier.
+  int *host_lo_orderdate = loadColumn<int>("lo_orderdate", LO_LEN);
+  int *host_lo_partkey = loadColumn<int>("lo_partkey", LO_LEN);
+  int *host_lo_suppkey = loadColumn<int>("lo_suppkey", LO_LEN);
+  int *host_lo_revenue = loadColumn<int>("lo_revenue", LO_LEN);
+
+  int *host_p_partkey = loadColumn<int>("p_partkey", P_LEN);
+  int *host_p_brand1 = loadColumn<int>("p_brand1", P_LEN);
+
+  int *host_d_datekey = loadColumn<int>("d_datekey", D_LEN);
+  int *host_d_year = loadColumn<int>("d_year", D_LEN);
+
+  int *host_s_suppkey = loadColumn<int>("s_suppkey", S_LEN);
+  int *host_s_region = loadColumn<int>("s_region", S_LEN);
+
+  // Device-side copies.
+  int *dev_lo_orderdate = loadToGPU<int>(host_lo_orderdate, LO_LEN, g_allocator);
+  int *dev_lo_partkey = loadToGPU<int>(host_lo_partkey, LO_LEN, g_allocator);
+  int *dev_lo_suppkey = loadToGPU<int>(host_lo_suppkey, LO_LEN, g_allocator);
+  int *dev_lo_revenue = loadToGPU<int>(host_lo_revenue, LO_LEN, g_allocator);
+
+  int *dev_d_datekey = loadToGPU<int>(host_d_datekey, D_LEN, g_allocator);
+  int *dev_d_year = loadToGPU<int>(host_d_year, D_LEN, g_allocator);
+
+  int *dev_p_partkey = loadToGPU<int>(host_p_partkey, P_LEN, g_allocator);
+  int *dev_p_brand1  = loadToGPU<int>(host_p_brand1, P_LEN, g_allocator);
+
+  int *dev_s_suppkey = loadToGPU<int>(host_s_suppkey, S_LEN, g_allocator);
+  int *dev_s_region = loadToGPU<int>(host_s_region, S_LEN, g_allocator);
+
+  for (int trial = 0; trial < num_trials; ++trial) {
+    const float time_query = runQuery(
+        dev_lo_orderdate, dev_lo_partkey, dev_lo_suppkey, dev_lo_revenue, LO_LEN,
+        dev_p_partkey, dev_p_brand1, P_LEN,
+        dev_d_datekey, dev_d_year, D_LEN,
+        dev_s_suppkey, dev_s_region, S_LEN,
+        g_allocator);
+    cout<< "{"
+        << "\"query\":23"
+        << ",\"time_query\":" << time_query
+        << "}" << endl;
+  }
+
+  return 0;
+}
+
diff --git a/crystal/src/ssb/q31.cu b/crystal/src/ssb/q31.cu
new file mode 100644
index 0000000..22d7e5b
--- /dev/null
+++ b/crystal/src/ssb/q31.cu
@@ -0,0 +1,296 @@
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <iostream>
+#include <stdio.h>
+#include <curand.h>
+
+#include <cuda.h>
+#include <cub/util_allocator.cuh>
+#include "cub/test/test_util.h"
+
+#include "crystal.cuh"
+
+#include "gpu_utils.h"
+#include "ssb_utils.h"
+
+using namespace std;
+
+/**
+ * Globals, constants and typedefs
+ */
+// NOTE(review): the usage text in main() advertises --v, but nothing in this
+// file assigns g_verbose after this initialisation — confirm whether it is needed.
+bool                    g_verbose = false;  // Whether to display input/output to console
+// NOTE(review): the `true` argument is presumably CUB's skip_cleanup option —
+// confirm against the vendored CUB version.
+cub::CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory
+
+/**
+ * Probe kernel for SSB Q3.1. One thread block works on one tile of the
+ * fact-table (lo_*) columns. Supplier, customer and date keys are looked up
+ * in ht_s, ht_c and ht_d in turn; rows still flagged after the third lookup
+ * contribute their revenue to the matching (s_nation, c_nation, year) group.
+ *
+ * Layout of res, 6 ints per group: [0] year, [1] c_nation, [2] s_nation,
+ * [3] not written by this kernel, [4..5] 64-bit revenue sum accumulated
+ * with atomicAdd.
+ */
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void probe(int* lo_orderdate, int* lo_custkey, int* lo_suppkey, int* lo_revenue, int lo_len,
+    int* ht_s, int s_len,
+    int* ht_c, int c_len,
+    int* ht_d, int d_len,
+    int* res) {
+  int probe_keys[ITEMS_PER_THREAD];   // reused for every key column
+  int keep[ITEMS_PER_THREAD];
+  int c_nation[ITEMS_PER_THREAD];
+  int s_nation[ITEMS_PER_THREAD];
+  int year[ITEMS_PER_THREAD];
+  int revenue[ITEMS_PER_THREAD];
+
+  const int tile_offset = blockIdx.x * TILE_SIZE;
+  const int num_tiles = (lo_len + TILE_SIZE - 1) / TILE_SIZE;
+  // A short tile can only occur at the very end of the column.
+  const int num_tile_items = (blockIdx.x == num_tiles - 1) ? lo_len - tile_offset : TILE_SIZE;
+
+  InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(keep);
+
+  // Supplier lookup: flags plus the supplier nation payload.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_suppkey + tile_offset, probe_keys, num_tile_items);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(probe_keys, s_nation, keep,
+      ht_s, s_len, num_tile_items);
+
+  // Customer lookup: flags plus the customer nation payload.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_custkey + tile_offset, probe_keys, num_tile_items);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(probe_keys, c_nation, keep,
+      ht_c, c_len, num_tile_items);
+
+  // Date lookup: flags plus the year payload. 19920101 is presumably the
+  // key offset used when ht_d was built — confirm against the join primitives.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_orderdate + tile_offset, probe_keys, num_tile_items);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(probe_keys, year, keep,
+      ht_d, d_len, 19920101, num_tile_items);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_revenue + tile_offset, revenue, num_tile_items);
+
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
+    // Ignore positions beyond the tile end and rows rejected by a lookup.
+    const bool in_tile = (threadIdx.x + (BLOCK_THREADS * ITEM)) < num_tile_items;
+    if (!in_tile || !keep[ITEM]) continue;
+
+    const int group = (s_nation[ITEM] * 25 * 7  + c_nation[ITEM] * 7 +  (year[ITEM] - 1992)) % ((1998-1992+1) * 25 * 25);
+    int* slot = res + group * 6;
+    slot[0] = year[ITEM];
+    slot[1] = c_nation[ITEM];
+    slot[2] = s_nation[ITEM];
+    atomicAdd(reinterpret_cast<unsigned long long*>(slot + 4), (long long)(revenue[ITEM]));
+  }
+}
+
+/**
+ * Build kernel for the supplier hash table. Rows whose filter_col value
+ * equals 2 are flagged and passed, as (dim_key, dim_val) pairs, to
+ * BlockBuildSelectivePHT_2 for insertion into hash_table (num_slots slots).
+ */
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__
+void build_hashtable_s(int *filter_col, int *dim_key, int *dim_val, int num_tuples, int *hash_table, int num_slots) {
+  int col[ITEMS_PER_THREAD];       // filter values first, then the keys
+  int payload[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+
+  const int tile_offset = blockIdx.x * TILE_SIZE;
+  const int num_tiles = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  // A short tile can only occur at the very end of the column.
+  const int num_tile_items = (blockIdx.x == num_tiles - 1) ? num_tuples - tile_offset : TILE_SIZE;
+
+  // Flag the rows that pass the equality filter.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + tile_offset, col, num_tile_items);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(col, 2, keep, num_tile_items);
+
+  // Reuse the buffer for the keys, then load the values stored with them.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_offset, col, num_tile_items);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_offset, payload, num_tile_items);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(col, payload, keep,
+      hash_table, num_slots, num_tile_items);
+}
+
+/**
+ * Build kernel for the customer hash table. Rows whose filter_col value
+ * equals 2 are flagged and passed, as (dim_key, dim_val) pairs, to
+ * BlockBuildSelectivePHT_2 for insertion into hash_table (num_slots slots).
+ */
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__
+void build_hashtable_c(int *filter_col, int *dim_key, int* dim_val, int num_tuples, int *hash_table, int num_slots) {
+  int col[ITEMS_PER_THREAD];       // filter values first, then the keys
+  int payload[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+
+  const int tile_offset = blockIdx.x * TILE_SIZE;
+  const int num_tiles = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  // A short tile can only occur at the very end of the column.
+  const int num_tile_items = (blockIdx.x == num_tiles - 1) ? num_tuples - tile_offset : TILE_SIZE;
+
+  // Flag the rows that pass the equality filter.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + tile_offset, col, num_tile_items);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(col, 2, keep, num_tile_items);
+
+  // Reuse the buffer for the keys, then load the values stored with them.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_offset, col, num_tile_items);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_offset, payload, num_tile_items);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(col, payload, keep,
+      hash_table, num_slots, num_tile_items);
+}
+
+/**
+ * Build kernel for the date hash table. Rows with 1992 <= dim_val <= 1997
+ * are flagged and passed, as (dim_key, dim_val) pairs, to
+ * BlockBuildSelectivePHT_2 for insertion into hash_table (num_slots slots).
+ *
+ * @param val_min  forwarded to BlockBuildSelectivePHT_2; the caller passes
+ *                 19920101, the same constant the probe kernel uses.
+ */
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__
+void build_hashtable_d(int *dim_key, int *dim_val, int num_tuples, int *hash_table, int num_slots, int val_min) {
+  int items[ITEMS_PER_THREAD];
+  int items2[ITEMS_PER_THREAD];
+  int selection_flags[ITEMS_PER_THREAD];
+
+  int tile_offset = blockIdx.x * TILE_SIZE;
+  int num_tiles = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  int num_tile_items = TILE_SIZE;
+
+  // Only the final tile can be partially filled.
+  if (blockIdx.x == num_tiles - 1) {
+    num_tile_items = num_tuples - tile_offset;
+  }
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_offset, items, num_tile_items);
+  BlockPredGTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, 1992, selection_flags, num_tile_items);
+  // Use the And variant so the upper bound is combined with the lower bound.
+  // NOTE(review): the plain BlockPredLTE used previously is understood to
+  // overwrite the flags rather than AND into them — confirm in pred.cuh.
+  BlockPredAndLTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, 1997, selection_flags, num_tile_items);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_offset, items2, num_tile_items);
+  // Honour the val_min argument instead of a hard-coded copy of it.
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(items2, items, selection_flags,
+      hash_table, num_slots, val_min, num_tile_items);
+}
+
+/**
+ * Runs one trial of SSB Q3.1 on columns that already reside on the device.
+ *
+ * Builds the supplier, customer and date hash tables, probes the fact table
+ * and prints every group with a non-zero year as
+ * "year c_nation s_nation revenue" to stdout, followed by the group count
+ * and the host-side wall time.
+ *
+ * @param g_allocator  allocator used for the hash tables and the result
+ *                     buffer; all four are released before returning.
+ * @return time reported by cudaEventElapsedTime between the start event
+ *         (recorded before the allocations) and the stop event (recorded
+ *         after the probe launch).
+ */
+float runQuery(int* lo_orderdate, int* lo_custkey, int* lo_suppkey, int* lo_revenue, int lo_len,
+    int *d_datekey, int* d_year, int d_len,
+    int *s_suppkey, int* s_region, int* s_nation, int s_len,
+    int *c_custkey, int* c_region, int* c_nation, int c_len,
+    cub::CachingDeviceAllocator&  g_allocator) {
+  SETUP_TIMING();
+
+  float time_query;
+  chrono::high_resolution_clock::time_point st, finish;
+  st = chrono::high_resolution_clock::now();
+
+  cudaEventRecord(start, 0);
+
+  // The date table is sized by the datekey range rather than by d_len.
+  int *ht_d, *ht_c, *ht_s;
+  int d_val_len = 19981230 - 19920101 + 1;
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_d, 2 * d_val_len * sizeof(int)));
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_s, 2 * s_len * sizeof(int)));
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_c, 2 * c_len * sizeof(int)));
+
+  // Zero every table, including the customer table, so that no slot is
+  // probed with uninitialised contents.
+  CubDebugExit(cudaMemset(ht_d, 0, 2 * d_val_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_s, 0, 2 * s_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_c, 0, 2 * c_len * sizeof(int)));
+
+  // One block of 128 threads per tile of 128*4 rows.
+  int tile_items = 128*4;
+  build_hashtable_s<128,4><<<(s_len + tile_items - 1)/tile_items, 128>>>(s_region, s_suppkey, s_nation, s_len, ht_s, s_len);
+
+  build_hashtable_c<128,4><<<(c_len + tile_items - 1)/tile_items, 128>>>(c_region, c_custkey, c_nation, c_len, ht_c, c_len);
+
+  int d_val_min = 19920101;
+  build_hashtable_d<128,4><<<(d_len + tile_items - 1)/tile_items, 128>>>(d_datekey, d_year, d_len, ht_d, d_val_len, d_val_min);
+
+  // Result buffer: 6 ints per group (year, c_nation, s_nation, one unused
+  // int, 64-bit revenue sum).
+  int *res;
+  int res_size = ((1998-1992+1) * 25 * 25);
+  int res_array_size = res_size * 6;
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&res, res_array_size * sizeof(int)));
+
+  CubDebugExit(cudaMemset(res, 0, res_array_size * sizeof(int)));
+
+  // Run
+  probe<128,4><<<(lo_len + tile_items - 1)/tile_items, 128>>>(lo_orderdate,
+          lo_custkey, lo_suppkey, lo_revenue, lo_len, ht_s, s_len, ht_c, c_len, ht_d, d_val_len, res);
+
+  cudaEventRecord(stop, 0);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&time_query, start,stop);
+
+  int* h_res = new int[res_array_size];
+  CubDebugExit(cudaMemcpy(h_res, res, res_array_size * sizeof(int), cudaMemcpyDeviceToHost));
+  // The host timer also covers the allocations and the device-to-host copy.
+  finish = chrono::high_resolution_clock::now();
+  std::chrono::duration<double> diff = finish - st;
+
+  cout << "Result:" << endl;
+  int res_count = 0;
+  for (int i=0; i<res_size; i++) {
+    // A zero year marks a group that no row was added to.
+    if (h_res[6*i] != 0) {
+      cout << h_res[6*i] << " " << h_res[6*i + 1] << " " << h_res[6*i + 2] << " " << reinterpret_cast<unsigned long long*>(&h_res[6*i + 4])[0] << endl;
+      res_count += 1;
+    }
+  }
+
+  cout << "Res Count: " << res_count << endl;
+  cout << "Time Taken Total: " << diff.count() * 1000 << endl;
+
+  delete[] h_res;
+
+  // Release the per-trial device buffers; without this every trial keeps
+  // its hash tables and result buffer allocated.
+  CLEANUP(ht_d);
+  CLEANUP(ht_s);
+  CLEANUP(ht_c);
+  CLEANUP(res);
+
+  return time_query;
+}
+
+/**
+ * Entry point for Q3.1: loads the required columns, copies them to the GPU
+ * and runs the query num_trials times, printing one JSON timing line per trial.
+ */
+int main(int argc, char** argv)
+{
+  int num_trials = 1;  // default, overridden by --t=<n>
+
+  CommandLineArgs args(argc, argv);
+  args.GetCmdLineArgument("t", num_trials);
+
+  // Usage request: print and leave before touching the device.
+  if (args.CheckCmdLineFlag("help"))
+  {
+      printf("%s "
+          "[--t=<num trials>] "
+          "[--v] "
+          "\n", argv[0]);
+      exit(0);
+  }
+
+  CubDebugExit(args.DeviceInit());
+
+  // Host-side columns: fact table, then date, supplier and customer.
+  int *host_lo_orderdate = loadColumn<int>("lo_orderdate", LO_LEN);
+  int *host_lo_custkey = loadColumn<int>("lo_custkey", LO_LEN);
+  int *host_lo_suppkey = loadColumn<int>("lo_suppkey", LO_LEN);
+  int *host_lo_revenue = loadColumn<int>("lo_revenue", LO_LEN);
+
+  int *host_d_datekey = loadColumn<int>("d_datekey", D_LEN);
+  int *host_d_year = loadColumn<int>("d_year", D_LEN);
+
+  int *host_s_suppkey = loadColumn<int>("s_suppkey", S_LEN);
+  int *host_s_nation = loadColumn<int>("s_nation", S_LEN);
+  int *host_s_region = loadColumn<int>("s_region", S_LEN);
+
+  int *host_c_custkey = loadColumn<int>("c_custkey", C_LEN);
+  int *host_c_nation = loadColumn<int>("c_nation", C_LEN);
+  int *host_c_region = loadColumn<int>("c_region", C_LEN);
+
+  cout << "** LOADED DATA **" << endl;
+
+  // Device-side copies.
+  int *dev_lo_orderdate = loadToGPU<int>(host_lo_orderdate, LO_LEN, g_allocator);
+  int *dev_lo_custkey = loadToGPU<int>(host_lo_custkey, LO_LEN, g_allocator);
+  int *dev_lo_suppkey = loadToGPU<int>(host_lo_suppkey, LO_LEN, g_allocator);
+  int *dev_lo_revenue = loadToGPU<int>(host_lo_revenue, LO_LEN, g_allocator);
+
+  int *dev_d_datekey = loadToGPU<int>(host_d_datekey, D_LEN, g_allocator);
+  int *dev_d_year = loadToGPU<int>(host_d_year, D_LEN, g_allocator);
+
+  int *dev_s_suppkey = loadToGPU<int>(host_s_suppkey, S_LEN, g_allocator);
+  int *dev_s_region = loadToGPU<int>(host_s_region, S_LEN, g_allocator);
+  int *dev_s_nation = loadToGPU<int>(host_s_nation, S_LEN, g_allocator);
+
+  int *dev_c_custkey = loadToGPU<int>(host_c_custkey, C_LEN, g_allocator);
+  int *dev_c_region = loadToGPU<int>(host_c_region, C_LEN, g_allocator);
+  int *dev_c_nation = loadToGPU<int>(host_c_nation, C_LEN, g_allocator);
+
+  cout << "** LOADED DATA TO GPU **" << endl;
+
+  for (int trial = 0; trial < num_trials; ++trial) {
+    const float time_query = runQuery(
+        dev_lo_orderdate, dev_lo_custkey, dev_lo_suppkey, dev_lo_revenue, LO_LEN,
+        dev_d_datekey, dev_d_year, D_LEN,
+        dev_s_suppkey, dev_s_region, dev_s_nation, S_LEN,
+        dev_c_custkey, dev_c_region, dev_c_nation, C_LEN,
+        g_allocator);
+    cout<< "{"
+        << "\"query\":31"
+        << ",\"time_query\":" << time_query
+        << "}" << endl;
+  }
+
+  return 0;
+}
+
diff --git a/crystal/src/ssb/q32.cu b/crystal/src/ssb/q32.cu
new file mode 100644
index 0000000..7bbe156
--- /dev/null
+++ b/crystal/src/ssb/q32.cu
@@ -0,0 +1,290 @@
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <iostream>
+#include <stdio.h>
+#include <curand.h>
+
+#include <cuda.h>
+#include <cub/util_allocator.cuh>
+#include "cub/test/test_util.h"
+
+#include "crystal.cuh"
+
+#include "gpu_utils.h"
+#include "ssb_utils.h"
+
+using namespace std;
+
+/**
+ * Globals, constants and typedefs
+ */
+bool                    g_verbose = false;  // Whether to display input/output to console
+cub::CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory
+
+// Probe kernel: each block takes one tile of lineorder, pushes it through the supplier,
+// customer and date tables with BlockProbeAndPHT_2, and folds lo_revenue into `res`.
+// A group occupies four consecutive ints of `res`: {year, customer value, supplier value, sum}.
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void probe(int* lo_orderdate, int* lo_custkey, int* lo_suppkey, int* lo_revenue, int lo_len,
+    int* ht_s, int s_len,
+    int* ht_c, int c_len,
+    int* ht_d, int d_len,
+    int* res) {
+  // Per-thread scratch: join keys, row flags and the payload fetched by each probe
+  int keys[ITEMS_PER_THREAD];
+  int alive[ITEMS_PER_THREAD];
+  int c_val[ITEMS_PER_THREAD];
+  int s_val[ITEMS_PER_THREAD];
+  int d_year[ITEMS_PER_THREAD];
+  int rev[ITEMS_PER_THREAD];
+
+  // Only the last tile can be partially filled
+  int first_row = blockIdx.x * TILE_SIZE;
+  int tile_count = (lo_len + TILE_SIZE - 1) / TILE_SIZE;
+  int rows_here = (blockIdx.x == tile_count - 1) ? (lo_len - first_row) : TILE_SIZE;
+
+  InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(alive);
+
+  // lineorder -> supplier
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_suppkey + first_row, keys, rows_here);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, s_val, alive, ht_s, s_len, rows_here);
+
+  // lineorder -> customer
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_custkey + first_row, keys, rows_here);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, c_val, alive, ht_c, c_len, rows_here);
+
+  // lineorder -> date; 19920101 is passed as the key base of the date table
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_orderdate + first_row, keys, rows_here);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, d_year, alive, ht_d, d_len, 19920101, rows_here);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_revenue + first_row, rev, rows_here);
+
+  int group_count = (1998-1992+1) * 250 * 250;
+  #pragma unroll
+  for (int i = 0; i < ITEMS_PER_THREAD; ++i) {
+    if ((threadIdx.x + (BLOCK_THREADS * i)) < rows_here && alive[i]) {
+      int group = (s_val[i] * 250 * 7  + c_val[i] * 7 +  (d_year[i] - 1992)) % group_count;
+      int* slot = res + group * 4;
+      slot[0] = d_year[i];
+      slot[1] = c_val[i];
+      slot[2] = s_val[i];
+      atomicAdd(&slot[3], rev[i]);
+    }
+  }
+}
+
+// Build kernel for the supplier side: rows whose filter_col value equals 24 are handed to
+// BlockBuildSelectivePHT_2 as (dim_key, dim_val) pairs for a table of num_slots slots.
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_s(int *filter_col, int *dim_key, int *dim_val, int num_tuples, int *hash_table, int num_slots) {
+  int keys[ITEMS_PER_THREAD];
+  int vals[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+
+  // Only the last tile can be partially filled
+  int first_row = blockIdx.x * TILE_SIZE;
+  int tile_count = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  int rows_here = (blockIdx.x == tile_count - 1) ? (num_tuples - first_row) : TILE_SIZE;
+
+  // keys[] first serves as scratch for the filter column, then receives the real keys
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + first_row, keys, rows_here);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, 24, keep, rows_here);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + first_row, vals, rows_here);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + first_row, keys, rows_here);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, vals, keep,
+      hash_table, num_slots, rows_here);
+}
+
+// Customer-side twin of build_hashtable_s: rows whose filter_col value equals 24 are handed
+// to BlockBuildSelectivePHT_2 as (dim_key, dim_val) pairs for a table of num_slots slots.
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_c(int *filter_col, int *dim_key, int* dim_val, int num_tuples, int *hash_table, int num_slots) {
+  int cust_keys[ITEMS_PER_THREAD];
+  int cust_vals[ITEMS_PER_THREAD];
+  int selected[ITEMS_PER_THREAD];
+
+  // Clamp the row count of the final tile to what is left of the column
+  int base = blockIdx.x * TILE_SIZE;
+  int total_tiles = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  int valid_rows = (blockIdx.x == total_tiles - 1) ? (num_tuples - base) : TILE_SIZE;
+
+  // The filter column is staged in cust_keys[] and overwritten by the keys afterwards
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + base, cust_keys, valid_rows);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(cust_keys, 24, selected, valid_rows);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + base, cust_vals, valid_rows);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + base, cust_keys, valid_rows);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(cust_keys, cust_vals, selected,
+      hash_table, num_slots, valid_rows);
+}
+
+// Build kernel for the date table: rows with 1992 <= dim_val <= 1997 are handed to
+// BlockBuildSelectivePHT_2 as (dim_key, dim_val) pairs. val_min is forwarded as the key base
+// (presumably subtracted from each key when picking a slot -- confirm in crystal's join.cuh).
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_d(int *dim_key, int *dim_val, int num_tuples, int *hash_table, int num_slots, int val_min) {
+  int items[ITEMS_PER_THREAD];
+  int items2[ITEMS_PER_THREAD];
+  int selection_flags[ITEMS_PER_THREAD];
+
+  int tile_offset = blockIdx.x * TILE_SIZE;
+  int num_tiles = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  int num_tile_items = (blockIdx.x == num_tiles - 1) ? (num_tuples - tile_offset) : TILE_SIZE;
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_offset, items, num_tile_items);
+  BlockPredGTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, 1992, selection_flags, num_tile_items);
+  BlockPredAndLTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, 1997, selection_flags, num_tile_items);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_offset, items2, num_tile_items);
+  // Was a second copy of the literal 19920101; the caller already passes that value as val_min
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(items2, items, selection_flags,
+      hash_table, num_slots, val_min, num_tile_items);
+}
+
+// Runs SSB Q3.2 once on device-resident columns: builds the three dimension hash tables,
+// probes lineorder, prints every non-empty result group and returns the elapsed GPU time
+// reported by cudaEventElapsedTime (milliseconds). Per-run device buffers are freed on exit.
+float runQuery(int* lo_orderdate, int* lo_custkey, int* lo_suppkey, int* lo_revenue, int lo_len,
+    int *d_datekey, int* d_year, int d_len,
+    int *s_suppkey, int* s_nation, int* s_city, int s_len,
+    int *c_custkey, int* c_nation, int* c_city, int c_len,
+    cub::CachingDeviceAllocator&  g_allocator) {
+  SETUP_TIMING();
+  float time_query;
+  cudaEventRecord(start, 0);
+
+  int *ht_d, *ht_c, *ht_s;
+  int d_val_len = 19981230 - 19920101 + 1;
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_d, 2 * d_val_len * sizeof(int)));
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_s, 2 * s_len * sizeof(int)));
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_c, 2 * c_len * sizeof(int)));
+
+  // All three tables must start zeroed. ht_c used to be left uninitialised, so slots the
+  // selective build does not write (presumably every filtered-out row) held stale bytes.
+  CubDebugExit(cudaMemset(ht_d, 0, 2 * d_val_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_s, 0, 2 * s_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_c, 0, 2 * c_len * sizeof(int)));
+
+  int tile_items = 128*4;
+  build_hashtable_s<128,4><<<(s_len + tile_items - 1)/tile_items, 128>>>(s_nation, s_suppkey, s_city, s_len, ht_s, s_len);
+  build_hashtable_c<128,4><<<(c_len + tile_items - 1)/tile_items, 128>>>(c_nation, c_custkey, c_city, c_len, ht_c, c_len);
+
+  int d_val_min = 19920101;
+  build_hashtable_d<128,4><<<(d_len + tile_items - 1)/tile_items, 128>>>(d_datekey, d_year, d_len, ht_d, d_val_len, d_val_min);
+
+  int *res;
+  int res_size = ((1998-1992+1) * 250 * 250);
+  int res_array_size = res_size * 4;
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&res, res_array_size * sizeof(int)));
+  CubDebugExit(cudaMemset(res, 0, res_array_size * sizeof(int)));
+
+  // Run
+  probe<128,4><<<(lo_len + tile_items - 1)/tile_items, 128>>>(lo_orderdate,
+          lo_custkey, lo_suppkey, lo_revenue, lo_len, ht_s, s_len, ht_c, c_len, ht_d, d_val_len, res);
+
+  cudaEventRecord(stop, 0);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&time_query, start,stop);
+
+  int* h_res = new int[res_array_size];
+  CubDebugExit(cudaMemcpy(h_res, res, res_array_size * sizeof(int), cudaMemcpyDeviceToHost));
+
+  // The event sync and the blocking copy guarantee the kernels are done, so hand the per-run
+  // buffers back to the allocator; without this every trial leaked all four of them.
+  CubDebugExit(g_allocator.DeviceFree(ht_d));
+  CubDebugExit(g_allocator.DeviceFree(ht_s));
+  CubDebugExit(g_allocator.DeviceFree(ht_c));
+  CubDebugExit(g_allocator.DeviceFree(res));
+
+  cout << "Result:" << endl;
+  int res_count = 0;
+  for (int i=0; i<res_size; i++) {
+    if (h_res[4*i] != 0) {
+      cout << h_res[4*i] << " " << h_res[4*i + 1] << " " << h_res[4*i + 2] << " " << h_res[4*i + 3] << endl;
+      res_count += 1;
+    }
+  }
+  cout << "Res Count: " << res_count << endl;
+  delete[] h_res;
+  return time_query;
+}
+
+/**
+ * Main
+ */
+int main(int argc, char** argv)
+{
+  // Trial count defaults to 3 and can be overridden with --t=<n>
+  int num_trials = 3;
+  CommandLineArgs args(argc, argv);
+  args.GetCmdLineArgument("t", num_trials);
+
+  // --help: print the usage line and leave before any data is loaded
+  if (args.CheckCmdLineFlag("help")) {
+    printf("%s [--t=<num trials>] [--v] \n", argv[0]);
+    exit(0);
+  }
+
+  // Initialize device
+  CubDebugExit(args.DeviceInit());
+
+  // Host-side columns: lineorder
+  int *host_orderdate = loadColumn<int>("lo_orderdate", LO_LEN);
+  int *host_lo_custkey = loadColumn<int>("lo_custkey", LO_LEN);
+  int *host_lo_suppkey = loadColumn<int>("lo_suppkey", LO_LEN);
+  int *host_revenue = loadColumn<int>("lo_revenue", LO_LEN);
+
+  // date
+  int *host_datekey = loadColumn<int>("d_datekey", D_LEN);
+  int *host_year = loadColumn<int>("d_year", D_LEN);
+
+  // supplier
+  int *host_suppkey = loadColumn<int>("s_suppkey", S_LEN);
+  int *host_s_nation = loadColumn<int>("s_nation", S_LEN);
+  int *host_s_city = loadColumn<int>("s_city", S_LEN);
+
+  // customer
+  int *host_custkey = loadColumn<int>("c_custkey", C_LEN);
+  int *host_c_nation = loadColumn<int>("c_nation", C_LEN);
+  int *host_c_city = loadColumn<int>("c_city", C_LEN);
+
+  cout << "** LOADED DATA **" << endl;
+
+  // Device-side copies: lineorder
+  int *dev_orderdate = loadToGPU<int>(host_orderdate, LO_LEN, g_allocator);
+  int *dev_lo_custkey = loadToGPU<int>(host_lo_custkey, LO_LEN, g_allocator);
+  int *dev_lo_suppkey = loadToGPU<int>(host_lo_suppkey, LO_LEN, g_allocator);
+  int *dev_revenue = loadToGPU<int>(host_revenue, LO_LEN, g_allocator);
+
+  // date
+  int *dev_datekey = loadToGPU<int>(host_datekey, D_LEN, g_allocator);
+  int *dev_year = loadToGPU<int>(host_year, D_LEN, g_allocator);
+
+  // supplier
+  int *dev_suppkey = loadToGPU<int>(host_suppkey, S_LEN, g_allocator);
+  int *dev_s_nation = loadToGPU<int>(host_s_nation, S_LEN, g_allocator);
+  int *dev_s_city = loadToGPU<int>(host_s_city, S_LEN, g_allocator);
+
+  // customer
+  int *dev_custkey = loadToGPU<int>(host_custkey, C_LEN, g_allocator);
+  int *dev_c_nation = loadToGPU<int>(host_c_nation, C_LEN, g_allocator);
+  int *dev_c_city = loadToGPU<int>(host_c_city, C_LEN, g_allocator);
+
+  cout << "** LOADED DATA TO GPU **" << endl;
+
+  // One JSON line per trial
+  for (int remaining = num_trials; remaining > 0; --remaining) {
+    float elapsed = runQuery(
+        dev_orderdate, dev_lo_custkey, dev_lo_suppkey, dev_revenue, LO_LEN,
+        dev_datekey, dev_year, D_LEN,
+        dev_suppkey, dev_s_nation, dev_s_city, S_LEN,
+        dev_custkey, dev_c_nation, dev_c_city, C_LEN,
+        g_allocator);
+    cout << "{\"query\":32,\"time_query\":" << elapsed << "}" << endl;
+  }
+
+  return 0;
+}
+
diff --git a/crystal/src/ssb/q33.cu b/crystal/src/ssb/q33.cu
new file mode 100644
index 0000000..e639f7f
--- /dev/null
+++ b/crystal/src/ssb/q33.cu
@@ -0,0 +1,291 @@
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <iostream>
+#include <stdio.h>
+#include <curand.h>
+
+#include <cuda.h>
+#include <cub/util_allocator.cuh>
+#include "cub/test/test_util.h"
+
+#include "crystal.cuh"
+
+#include "gpu_utils.h"
+#include "ssb_utils.h"
+
+using namespace std;
+
+/**
+ * Globals, constants and typedefs
+ */
+bool                    g_verbose = false;  // Whether to display input/output to console
+cub::CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory
+
+// Probe kernel for Q3.3: one tile of lineorder per block is probed against the supplier,
+// customer and date tables (BlockProbeAndPHT_2) and lo_revenue is summed per group in `res`.
+// Every group uses four consecutive ints: {year, customer value, supplier value, sum}.
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void probe(int* lo_orderdate, int* lo_custkey, int* lo_suppkey, int* lo_revenue, int lo_len,
+    int* ht_s, int s_len,
+    int* ht_c, int c_len,
+    int* ht_d, int d_len,
+    int* res) {
+  // Thread-local buffers for the current join key, the row flags and the probe payloads
+  int join_key[ITEMS_PER_THREAD];
+  int row_ok[ITEMS_PER_THREAD];
+  int cust_val[ITEMS_PER_THREAD];
+  int supp_val[ITEMS_PER_THREAD];
+  int order_year[ITEMS_PER_THREAD];
+  int amount[ITEMS_PER_THREAD];
+
+  // The final tile may hold fewer than TILE_SIZE rows
+  int base = blockIdx.x * TILE_SIZE;
+  int total_tiles = (lo_len + TILE_SIZE - 1) / TILE_SIZE;
+  int valid_rows = (blockIdx.x == total_tiles - 1) ? (lo_len - base) : TILE_SIZE;
+
+  InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(row_ok);
+
+  // supplier probe
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_suppkey + base, join_key, valid_rows);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(join_key, supp_val, row_ok, ht_s, s_len, valid_rows);
+
+  // customer probe
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_custkey + base, join_key, valid_rows);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(join_key, cust_val, row_ok, ht_c, c_len, valid_rows);
+
+  // date probe; 19920101 is handed over as the key base of the date table
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_orderdate + base, join_key, valid_rows);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(join_key, order_year, row_ok, ht_d, d_len, 19920101, valid_rows);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_revenue + base, amount, valid_rows);
+
+  int group_count = (1998-1992+1) * 250 * 250;
+  #pragma unroll
+  for (int n = 0; n < ITEMS_PER_THREAD; ++n) {
+    if ((threadIdx.x + (BLOCK_THREADS * n)) < valid_rows && row_ok[n]) {
+      int group = (supp_val[n] * 250 * 7  + cust_val[n] * 7 +  (order_year[n] - 1992)) % group_count;
+      int* out = res + group * 4;
+      out[0] = order_year[n];
+      out[1] = cust_val[n];
+      out[2] = supp_val[n];
+      atomicAdd(&out[3], amount[n]);
+    }
+  }
+}
+
+// Build kernel for the supplier side: rows whose dim_val is 231 or 235 are handed to
+// BlockBuildSelectivePHT_2 as (dim_key, dim_val) pairs for a table of num_slots slots.
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__
+void build_hashtable_s(int *dim_key, int *dim_val, int num_tuples, int *hash_table, int num_slots) {
+  int vals[ITEMS_PER_THREAD];
+  int keys[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+
+  // Only the last tile can be partially filled
+  int first_row = blockIdx.x * TILE_SIZE;
+  int tile_count = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  int rows_here = (blockIdx.x == tile_count - 1) ? (num_tuples - first_row) : TILE_SIZE;
+
+  // Flag the rows whose value is 231 or 235
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + first_row, vals, rows_here);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(vals, 231, keep, rows_here);
+  BlockPredOrEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(vals, 235, keep, rows_here);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + first_row, keys, rows_here);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, vals, keep,
+      hash_table, num_slots, rows_here);
+}
+
+// Customer-side twin of build_hashtable_s: rows whose dim_val is 231 or 235 are handed to
+// BlockBuildSelectivePHT_2 as (dim_key, dim_val) pairs for a table of num_slots slots.
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__
+void build_hashtable_c(int *dim_key, int* dim_val, int num_tuples, int *hash_table, int num_slots) {
+  int cust_vals[ITEMS_PER_THREAD];
+  int cust_keys[ITEMS_PER_THREAD];
+  int selected[ITEMS_PER_THREAD];
+
+  // Clamp the row count of the final tile to what is left of the column
+  int base = blockIdx.x * TILE_SIZE;
+  int total_tiles = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  int valid_rows = (blockIdx.x == total_tiles - 1) ? (num_tuples - base) : TILE_SIZE;
+
+  // Select the rows whose value is 231 or 235
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + base, cust_vals, valid_rows);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(cust_vals, 231, selected, valid_rows);
+  BlockPredOrEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(cust_vals, 235, selected, valid_rows);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + base, cust_keys, valid_rows);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(cust_keys, cust_vals, selected,
+      hash_table, num_slots, valid_rows);
+}
+
+// Build kernel for the date table: rows with 1992 <= dim_val <= 1997 are handed to
+// BlockBuildSelectivePHT_2 as (dim_key, dim_val) pairs, with val_min forwarded as the key base.
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__
+void build_hashtable_d(int *dim_key, int *dim_val, int num_tuples, int *hash_table, int num_slots, int val_min) {
+  int items[ITEMS_PER_THREAD];
+  int items2[ITEMS_PER_THREAD];
+  int selection_flags[ITEMS_PER_THREAD];
+
+  int tile_offset = blockIdx.x * TILE_SIZE;
+  int num_tiles = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  int num_tile_items = (blockIdx.x == num_tiles - 1) ? (num_tuples - tile_offset) : TILE_SIZE;
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_offset, items, num_tile_items);
+  BlockPredGTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, 1992, selection_flags, num_tile_items);
+  // And-variant keeps the >= 1992 result; plain BlockPredLTE presumably overwrote the flags
+  BlockPredAndLTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, 1997, selection_flags, num_tile_items);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_offset, items2, num_tile_items);
+  // Was a second copy of the literal 19920101; the caller already passes that value as val_min
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(items2, items, selection_flags,
+      hash_table, num_slots, val_min, num_tile_items);
+}
+
+// Runs SSB Q3.3 once on device-resident columns: builds the three dimension hash tables,
+// probes lineorder, prints every non-empty result group and returns the elapsed GPU time
+// reported by cudaEventElapsedTime (milliseconds). Per-run device buffers are freed on exit.
+float runQuery(int* lo_orderdate, int* lo_custkey, int* lo_suppkey, int* lo_revenue, int lo_len,
+    int *d_datekey, int* d_year, int d_len,
+    int *s_suppkey, int* s_city, int s_len,
+    int *c_custkey, int* c_city, int c_len,
+    cub::CachingDeviceAllocator&  g_allocator) {
+  SETUP_TIMING();
+  float time_query;
+  chrono::high_resolution_clock::time_point st, finish;
+  st = chrono::high_resolution_clock::now();
+  cudaEventRecord(start, 0);
+
+  int *ht_d, *ht_c, *ht_s;
+  int d_val_len = 19981230 - 19920101 + 1;
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_d, 2 * d_val_len * sizeof(int)));
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_s, 2 * s_len * sizeof(int)));
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_c, 2 * c_len * sizeof(int)));
+
+  // All three tables must start zeroed. ht_c used to be left uninitialised, so slots the
+  // selective build does not write (presumably every filtered-out row) held stale bytes.
+  CubDebugExit(cudaMemset(ht_d, 0, 2 * d_val_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_s, 0, 2 * s_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_c, 0, 2 * c_len * sizeof(int)));
+
+  int tile_items = 128*4;
+  build_hashtable_s<128,4><<<(s_len + tile_items - 1)/tile_items, 128>>>(s_suppkey, s_city, s_len, ht_s, s_len);
+  build_hashtable_c<128,4><<<(c_len + tile_items - 1)/tile_items, 128>>>(c_custkey, c_city, c_len, ht_c, c_len);
+
+  int d_val_min = 19920101;
+  build_hashtable_d<128,4><<<(d_len + tile_items - 1)/tile_items, 128>>>(d_datekey, d_year, d_len, ht_d, d_val_len, d_val_min);
+  int *res;
+  int res_size = ((1998-1992+1) * 250 * 250);
+  int res_array_size = res_size * 4;
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&res, res_array_size * sizeof(int)));
+  CubDebugExit(cudaMemset(res, 0, res_array_size * sizeof(int)));
+
+  // Run
+  probe<128,4><<<(lo_len + tile_items - 1)/tile_items, 128>>>(lo_orderdate,
+          lo_custkey, lo_suppkey, lo_revenue, lo_len, ht_s, s_len, ht_c, c_len, ht_d, d_val_len, res);
+  cudaEventRecord(stop, 0);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&time_query, start,stop);
+
+  int* h_res = new int[res_array_size];
+  CubDebugExit(cudaMemcpy(h_res, res, res_array_size * sizeof(int), cudaMemcpyDeviceToHost));
+  finish = chrono::high_resolution_clock::now();
+  std::chrono::duration<double> diff = finish - st;
+
+  // Kernels are done (event sync + blocking copy): return the per-run buffers, which used to leak
+  CubDebugExit(g_allocator.DeviceFree(ht_d));
+  CubDebugExit(g_allocator.DeviceFree(ht_s));
+  CubDebugExit(g_allocator.DeviceFree(ht_c));
+  CubDebugExit(g_allocator.DeviceFree(res));
+
+  cout << "Result:" << endl;
+  int res_count = 0;
+  for (int i=0; i<res_size; i++) {
+    if (h_res[4*i] != 0) {
+      cout << h_res[4*i] << " " << h_res[4*i + 1] << " " << h_res[4*i + 2] << " " << h_res[4*i + 3] << endl;
+      res_count += 1;
+    }
+  }
+  cout << "Res Count: " << res_count << endl;
+  cout << "Time Taken Total: " << diff.count() * 1000 << endl;
+  delete[] h_res;
+  return time_query;
+}
+
+/**
+ * Main
+ */
+int main(int argc, char** argv)
+{
+  // Trial count defaults to 3 and can be overridden with --t=<n>
+  int num_trials = 3;
+  CommandLineArgs args(argc, argv);
+  args.GetCmdLineArgument("t", num_trials);
+
+  // --help: print the usage line and leave before any data is loaded
+  if (args.CheckCmdLineFlag("help")) {
+    printf("%s [--t=<num trials>] [--v] \n", argv[0]);
+    exit(0);
+  }
+
+  // Initialize device
+  CubDebugExit(args.DeviceInit());
+
+  // Host-side columns: lineorder
+  int *host_orderdate = loadColumn<int>("lo_orderdate", LO_LEN);
+  int *host_lo_custkey = loadColumn<int>("lo_custkey", LO_LEN);
+  int *host_lo_suppkey = loadColumn<int>("lo_suppkey", LO_LEN);
+  int *host_revenue = loadColumn<int>("lo_revenue", LO_LEN);
+
+  // date
+  int *host_datekey = loadColumn<int>("d_datekey", D_LEN);
+  int *host_year = loadColumn<int>("d_year", D_LEN);
+
+  // supplier
+  int *host_suppkey = loadColumn<int>("s_suppkey", S_LEN);
+  int *host_s_city = loadColumn<int>("s_city", S_LEN);
+
+  // customer
+  int *host_custkey = loadColumn<int>("c_custkey", C_LEN);
+  int *host_c_city = loadColumn<int>("c_city", C_LEN);
+
+  cout << "** LOADED DATA **" << endl;
+
+  // Device-side copies: lineorder
+  int *dev_orderdate = loadToGPU<int>(host_orderdate, LO_LEN, g_allocator);
+  int *dev_lo_custkey = loadToGPU<int>(host_lo_custkey, LO_LEN, g_allocator);
+  int *dev_lo_suppkey = loadToGPU<int>(host_lo_suppkey, LO_LEN, g_allocator);
+  int *dev_revenue = loadToGPU<int>(host_revenue, LO_LEN, g_allocator);
+
+  // date
+  int *dev_datekey = loadToGPU<int>(host_datekey, D_LEN, g_allocator);
+  int *dev_year = loadToGPU<int>(host_year, D_LEN, g_allocator);
+
+  // supplier
+  int *dev_suppkey = loadToGPU<int>(host_suppkey, S_LEN, g_allocator);
+  int *dev_s_city = loadToGPU<int>(host_s_city, S_LEN, g_allocator);
+
+  // customer
+  int *dev_custkey = loadToGPU<int>(host_custkey, C_LEN, g_allocator);
+  int *dev_c_city = loadToGPU<int>(host_c_city, C_LEN, g_allocator);
+
+  cout << "** LOADED DATA TO GPU **" << endl;
+
+  // One JSON line per trial
+  for (int remaining = num_trials; remaining > 0; --remaining) {
+    float elapsed = runQuery(
+        dev_orderdate, dev_lo_custkey, dev_lo_suppkey, dev_revenue, LO_LEN,
+        dev_datekey, dev_year, D_LEN,
+        dev_suppkey, dev_s_city, S_LEN,
+        dev_custkey, dev_c_city, C_LEN,
+        g_allocator);
+    cout << "{\"query\":33,\"time_query\":" << elapsed << "}" << endl;
+  }
+
+  return 0;
+}
+
diff --git a/crystal/src/ssb/q34.cu b/crystal/src/ssb/q34.cu
new file mode 100644
index 0000000..1003dc0
--- /dev/null
+++ b/crystal/src/ssb/q34.cu
@@ -0,0 +1,316 @@
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <iostream>
+#include <stdio.h>
+#include <curand.h>
+
+#include <cuda.h>
+#include <cub/util_allocator.cuh>
+#include "cub/test/test_util.h"
+
+#include "crystal.cuh"
+
+#include "gpu_utils.h"
+#include "ssb_utils.h"
+
+using namespace std;
+
+/**
+ * Globals, constants and typedefs
+ */
+bool                    g_verbose = false;  // Whether to display input/output to console
+cub::CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory
+
+// Probe kernel for Q3.4: a block handles one lineorder tile, probing the supplier, customer
+// and date tables in turn (BlockProbeAndPHT_2) before adding lo_revenue to its group in `res`.
+// Group layout in `res` is four consecutive ints: {year, customer value, supplier value, sum}.
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void probe(int* lo_orderdate, int* lo_custkey, int* lo_suppkey, int* lo_revenue, int lo_len,
+    int* ht_s, int s_len,
+    int* ht_c, int c_len,
+    int* ht_d, int d_len,
+    int* res) {
+  // Register buffers: probe keys, per-row flags, and the values returned by each table
+  int probe_keys[ITEMS_PER_THREAD];
+  int flags[ITEMS_PER_THREAD];
+  int from_cust[ITEMS_PER_THREAD];
+  int from_supp[ITEMS_PER_THREAD];
+  int from_date[ITEMS_PER_THREAD];
+  int lo_rev[ITEMS_PER_THREAD];
+
+  // Work out how many rows of this tile actually exist
+  int tile_start = blockIdx.x * TILE_SIZE;
+  int n_tiles = (lo_len + TILE_SIZE - 1) / TILE_SIZE;
+  int n_rows = (blockIdx.x == n_tiles - 1) ? (lo_len - tile_start) : TILE_SIZE;
+
+  InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(flags);
+
+  // join with supplier
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_suppkey + tile_start, probe_keys, n_rows);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(probe_keys, from_supp, flags, ht_s, s_len, n_rows);
+
+  // join with customer
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_custkey + tile_start, probe_keys, n_rows);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(probe_keys, from_cust, flags, ht_c, c_len, n_rows);
+
+  // join with date; 19920101 is supplied as the key base of the date table
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_orderdate + tile_start, probe_keys, n_rows);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(probe_keys, from_date, flags, ht_d, d_len, 19920101, n_rows);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_revenue + tile_start, lo_rev, n_rows);
+
+  int n_groups = (1998-1992+1) * 250 * 250;
+  #pragma unroll
+  for (int j = 0; j < ITEMS_PER_THREAD; ++j) {
+    if ((threadIdx.x + (BLOCK_THREADS * j)) < n_rows && flags[j]) {
+      int g = (from_supp[j] * 250 * 7  + from_cust[j] * 7 +  (from_date[j] - 1992)) % n_groups;
+      int* cell = res + g * 4;
+      cell[0] = from_date[j];
+      cell[1] = from_cust[j];
+      cell[2] = from_supp[j];
+      atomicAdd(&cell[3], lo_rev[j]);
+    }
+  }
+}
+
+// Build kernel for the supplier side: rows whose dim_val is 231 or 235 are handed to
+// BlockBuildSelectivePHT_2 as (dim_key, dim_val) pairs for a table of num_slots slots.
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__
+void build_hashtable_s(int *dim_key, int *dim_val, int num_tuples, int *hash_table, int num_slots) {
+  int payload[ITEMS_PER_THREAD];
+  int supp_keys[ITEMS_PER_THREAD];
+  int flags[ITEMS_PER_THREAD];
+
+  // Work out how many rows of this tile actually exist
+  int tile_start = blockIdx.x * TILE_SIZE;
+  int n_tiles = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  int n_rows = (blockIdx.x == n_tiles - 1) ? (num_tuples - tile_start) : TILE_SIZE;
+
+  // Keep the rows whose value is 231 or 235
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_start, payload, n_rows);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(payload, 231, flags, n_rows);
+  BlockPredOrEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(payload, 235, flags, n_rows);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_start, supp_keys, n_rows);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(supp_keys, payload, flags,
+      hash_table, num_slots, n_rows);
+}
+
+// Customer-side twin of build_hashtable_s: rows whose dim_val is 231 or 235 are handed to
+// BlockBuildSelectivePHT_2 as (dim_key, dim_val) pairs for a table of num_slots slots.
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__
+void build_hashtable_c(int *dim_key, int* dim_val, int num_tuples, int *hash_table, int num_slots) {
+  int payload[ITEMS_PER_THREAD];
+  int cust_keys[ITEMS_PER_THREAD];
+  int flags[ITEMS_PER_THREAD];
+
+  // Work out how many rows of this tile actually exist
+  int tile_start = blockIdx.x * TILE_SIZE;
+  int n_tiles = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  int n_rows = (blockIdx.x == n_tiles - 1) ? (num_tuples - tile_start) : TILE_SIZE;
+
+  // Keep the rows whose value is 231 or 235
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_start, payload, n_rows);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(payload, 231, flags, n_rows);
+  BlockPredOrEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(payload, 235, flags, n_rows);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_start, cust_keys, n_rows);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(cust_keys, payload, flags,
+      hash_table, num_slots, n_rows);
+}
+
+// Build kernel for the date table: rows whose filter_col value equals 199712 are handed to
+// BlockBuildSelectivePHT_2 as (dim_key, dim_val) pairs, with val_min forwarded as the key base.
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__
+void build_hashtable_d(int* filter_col, int *dim_key, int *dim_val, int num_tuples, int *hash_table, int num_slots, int val_min) {
+  int items[ITEMS_PER_THREAD];
+  int items2[ITEMS_PER_THREAD];
+  int selection_flags[ITEMS_PER_THREAD];
+
+  int tile_offset = blockIdx.x * TILE_SIZE;
+  int num_tiles = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  int num_tile_items = (blockIdx.x == num_tiles - 1) ? (num_tuples - tile_offset) : TILE_SIZE;
+
+  // items[] first stages the filter column and is then overwritten with the keys
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + tile_offset, items, num_tile_items);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, 199712, selection_flags, num_tile_items);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_offset, items, num_tile_items);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_offset, items2, num_tile_items);
+  // Was a second copy of the literal 19920101; the caller already passes that value as val_min
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, items2, selection_flags,
+      hash_table, num_slots, val_min, num_tile_items);
+}
+
+// Runs SSB Q3.4 once on device-resident columns: builds the three dimension hash tables,
+// probes lineorder, prints every non-empty result group and returns the elapsed GPU time
+// reported by cudaEventElapsedTime (milliseconds). Per-run device buffers are freed on exit.
+float runQuery(int* lo_orderdate, int* lo_custkey, int* lo_suppkey, int* lo_revenue, int lo_len,
+    int *d_datekey, int* d_year, int *d_yearmonthnum, int d_len,
+    int *s_suppkey, int* s_city, int s_len,
+    int *c_custkey, int* c_city, int c_len,
+    cub::CachingDeviceAllocator&  g_allocator) {
+  SETUP_TIMING();
+
+  float time_query;
+  chrono::high_resolution_clock::time_point st, finish;
+  st = chrono::high_resolution_clock::now();
+  cudaEventRecord(start, 0);
+
+  int *ht_d, *ht_c, *ht_s;
+  int d_val_len = 19981230 - 19920101 + 1;
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_d, 2 * d_val_len * sizeof(int)));
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_s, 2 * s_len * sizeof(int)));
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_c, 2 * c_len * sizeof(int)));
+
+  // All three tables must start zeroed. ht_c used to be left uninitialised, so slots the
+  // selective build does not write (presumably every filtered-out row) held stale bytes.
+  CubDebugExit(cudaMemset(ht_d, 0, 2 * d_val_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_s, 0, 2 * s_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_c, 0, 2 * c_len * sizeof(int)));
+
+  int tile_items = 128*4;
+  build_hashtable_s<128,4><<<(s_len + tile_items - 1)/tile_items, 128>>>(s_suppkey, s_city, s_len, ht_s, s_len);
+  build_hashtable_c<128,4><<<(c_len + tile_items - 1)/tile_items, 128>>>(c_custkey, c_city, c_len, ht_c, c_len);
+
+  int d_val_min = 19920101;
+  build_hashtable_d<128,4><<<(d_len + tile_items - 1)/tile_items, 128>>>(d_yearmonthnum, d_datekey, d_year, d_len, ht_d, d_val_len, d_val_min);
+
+  // The single-int d_sum buffer that used to be allocated here was never read and never freed
+  int *res;
+  int res_size = ((1998-1992+1) * 250 * 250);
+  int res_array_size = res_size * 4;
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&res, res_array_size * sizeof(int)));
+  CubDebugExit(cudaMemset(res, 0, res_array_size * sizeof(int)));
+
+  // Run
+  probe<128,4><<<(lo_len + tile_items - 1)/tile_items, 128>>>(lo_orderdate,
+          lo_custkey, lo_suppkey, lo_revenue, lo_len, ht_s, s_len, ht_c, c_len, ht_d, d_val_len, res);
+  cudaEventRecord(stop, 0);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&time_query, start,stop);
+
+  int* h_res = new int[res_array_size];
+  CubDebugExit(cudaMemcpy(h_res, res, res_array_size * sizeof(int), cudaMemcpyDeviceToHost));
+  finish = chrono::high_resolution_clock::now();
+  std::chrono::duration<double> diff = finish - st;
+
+  // The event sync and the blocking copy guarantee the kernels are done, so hand the per-run
+  // buffers back to the allocator; without this every trial leaked all four of them.
+  CubDebugExit(g_allocator.DeviceFree(ht_d));
+  CubDebugExit(g_allocator.DeviceFree(ht_s));
+  CubDebugExit(g_allocator.DeviceFree(ht_c));
+  CubDebugExit(g_allocator.DeviceFree(res));
+
+  cout << "Result:" << endl;
+  int res_count = 0;
+  for (int i=0; i<res_size; i++) {
+    if (h_res[4*i] != 0) {
+      cout << h_res[4*i] << " " << h_res[4*i + 1] << " " << h_res[4*i + 2] << " " << h_res[4*i + 3] << endl;
+      res_count += 1;
+    }
+  }
+  cout << "Res Count: " << res_count << endl;
+  cout << "Time Taken Total: " << diff.count() * 1000 << endl;
+  delete[] h_res;
+  return time_query;
+}
+
+// MurmurHash3-style 32-bit mix of one 4-byte key with a fixed seed. All arithmetic is done on
+// unsigned values: the previous signed shifts sign-extended, so the "rotations" smeared the
+// top bit over the result, and left-shifting a negative int is undefined behaviour.
+int murmur(int k) {
+  unsigned int key = (unsigned int)k;
+  unsigned int h = 0xcd2e2c20u;                 // seed
+  key *= 0xcc9e2d51u;
+  key = (key << 15) | (key >> 17);              // rotate left by 15
+  key *= 0x1b873593u;
+  h ^= key;
+  h = (h << 13) | (h >> 19);                    // rotate left by 13
+  h = (h * 5u) + 0xe6546b64u;
+  h ^= 4u;                                      // key length in bytes
+  h ^= h >> 16; h *= 0x85ebca6bu;               // final avalanche
+  h ^= h >> 13; h *= 0xc2b2ae35u;
+  return (int)(h ^ (h >> 16));
+}
+
+/**
+ * Main
+ */
+int main(int argc, char** argv)
+{
+  // Trial count defaults to 3 and can be overridden with --t=<n>
+  int num_trials = 3;
+  CommandLineArgs args(argc, argv);
+  args.GetCmdLineArgument("t", num_trials);
+
+  // --help: print the usage line and leave before any data is loaded
+  if (args.CheckCmdLineFlag("help")) {
+    printf("%s [--t=<num trials>] [--v] \n", argv[0]);
+    exit(0);
+  }
+
+  // Initialize device
+  CubDebugExit(args.DeviceInit());
+
+  // Host-side columns: lineorder
+  int *host_orderdate = loadColumn<int>("lo_orderdate", LO_LEN);
+  int *host_lo_custkey = loadColumn<int>("lo_custkey", LO_LEN);
+  int *host_lo_suppkey = loadColumn<int>("lo_suppkey", LO_LEN);
+  int *host_revenue = loadColumn<int>("lo_revenue", LO_LEN);
+
+  // date
+  int *host_datekey = loadColumn<int>("d_datekey", D_LEN);
+  int *host_year = loadColumn<int>("d_year", D_LEN);
+  int *host_yearmonthnum = loadColumn<int>("d_yearmonthnum", D_LEN);
+
+  // supplier
+  int *host_suppkey = loadColumn<int>("s_suppkey", S_LEN);
+  int *host_s_city = loadColumn<int>("s_city", S_LEN);
+
+  // customer
+  int *host_custkey = loadColumn<int>("c_custkey", C_LEN);
+  int *host_c_city = loadColumn<int>("c_city", C_LEN);
+
+  cout << "** LOADED DATA **" << endl;
+
+  // Device-side copies: lineorder
+  int *dev_orderdate = loadToGPU<int>(host_orderdate, LO_LEN, g_allocator);
+  int *dev_lo_custkey = loadToGPU<int>(host_lo_custkey, LO_LEN, g_allocator);
+  int *dev_lo_suppkey = loadToGPU<int>(host_lo_suppkey, LO_LEN, g_allocator);
+  int *dev_revenue = loadToGPU<int>(host_revenue, LO_LEN, g_allocator);
+
+  // date
+  int *dev_datekey = loadToGPU<int>(host_datekey, D_LEN, g_allocator);
+  int *dev_year = loadToGPU<int>(host_year, D_LEN, g_allocator);
+  int *dev_yearmonthnum = loadToGPU<int>(host_yearmonthnum, D_LEN, g_allocator);
+
+  // supplier
+  int *dev_suppkey = loadToGPU<int>(host_suppkey, S_LEN, g_allocator);
+  int *dev_s_city = loadToGPU<int>(host_s_city, S_LEN, g_allocator);
+
+  // customer
+  int *dev_custkey = loadToGPU<int>(host_custkey, C_LEN, g_allocator);
+  int *dev_c_city = loadToGPU<int>(host_c_city, C_LEN, g_allocator);
+
+  cout << "** LOADED DATA TO GPU **" << endl;
+
+  // One JSON line per trial
+  for (int remaining = num_trials; remaining > 0; --remaining) {
+    float elapsed = runQuery(
+        dev_orderdate, dev_lo_custkey, dev_lo_suppkey, dev_revenue, LO_LEN,
+        dev_datekey, dev_year, dev_yearmonthnum, D_LEN,
+        dev_suppkey, dev_s_city, S_LEN,
+        dev_custkey, dev_c_city, C_LEN,
+        g_allocator);
+    cout << "{\"query\":34,\"time_query\":" << elapsed << "}" << endl;
+  }
+
+  return 0;
+}
+
diff --git a/crystal/src/ssb/q41.cu b/crystal/src/ssb/q41.cu
new file mode 100644
index 0000000..93fcbe0
--- /dev/null
+++ b/crystal/src/ssb/q41.cu
@@ -0,0 +1,371 @@
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <iostream>
+#include <stdio.h>
+#include <curand.h>
+
+#include <cuda.h>
+#include <cub/util_allocator.cuh>
+#include "cub/test/test_util.h"
+
+#include "crystal.cuh"
+
+#include "gpu_utils.h"
+#include "ssb_utils.h"
+
+using namespace std;
+
+/**
+ * Globals, constants and typedefs
+ */
+bool                    g_verbose = false;  // Whether to display input/output to console
+cub::CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory
+
+// SSB Q4.1 probe: one thread block per tile of lineorder rows. Every row is
+// looked up in the supplier, customer, part and date hash tables; each row
+// still flagged afterwards adds (lo_revenue - lo_supplycost) to the `res` slot
+// picked by (c_nation, year). A slot is 4 ints: year, c_nation, 64-bit sum.
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void probe(int* lo_orderdate, int* lo_partkey, int* lo_custkey, int* lo_suppkey, int* lo_revenue, int* lo_supplycost, int lo_len,
+    int* ht_p, int p_len,
+    int* ht_s, int s_len,
+    int* ht_c, int c_len,
+    int* ht_d, int d_len,
+    int* res) {
+  int keys[ITEMS_PER_THREAD];    // join keys; reused for lo_supplycost at the end
+  int keep[ITEMS_PER_THREAD];    // per-item selection flags
+  int nation[ITEMS_PER_THREAD];  // filled by the ht_c probe
+  int yr[ITEMS_PER_THREAD];      // filled by the ht_d probe
+  int rev[ITEMS_PER_THREAD];     // lo_revenue
+
+  const int tile_start = blockIdx.x * TILE_SIZE;
+  const int tile_count = (lo_len + TILE_SIZE - 1) / TILE_SIZE;
+  // Only the final tile can be partially filled.
+  int valid = (blockIdx.x == tile_count - 1) ? (lo_len - tile_start) : TILE_SIZE;
+
+  InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(keep);
+
+  // Supplier: no output array is passed to this probe.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_suppkey + tile_start, keys, valid);
+  BlockProbeAndPHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, keep, ht_s, s_len, valid);
+
+  // Customer: nation[] is passed as the output array.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_custkey + tile_start, keys, valid);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, nation, keep,
+      ht_c, c_len, valid);
+
+  // Part: no output array is passed to this probe.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_partkey + tile_start, keys, valid);
+  BlockProbeAndPHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, keep, ht_p, p_len, valid);
+
+  // Date: yr[] is the output array; 19920101 equals d_val_min used by runQuery.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_orderdate + tile_start, keys, valid);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, yr, keep,
+      ht_d, d_len, 19920101, valid);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_revenue + tile_start, rev, valid);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_supplycost + tile_start, keys, valid);
+
+  #pragma unroll
+  for (int i = 0; i < ITEMS_PER_THREAD; ++i) {
+    if (threadIdx.x + (BLOCK_THREADS * i) < valid && keep[i]) {
+      int slot = (nation[i] * 7 +  (yr[i] - 1992)) % ((1998-1992+1) * 25);
+      res[slot * 4] = yr[i];
+      res[slot * 4 + 1] = nation[i];
+      // Ints 2..3 of the slot are updated together as one 64-bit accumulator.
+      atomicAdd(reinterpret_cast<unsigned long long*>(&res[slot * 4 + 2]), (long long)(rev[i] - keys[i]));
+    }
+  }
+}
+
+// Builds the Q4.1 supplier table: rows whose filter_col value equals 1 are
+// expected to be flagged by BlockPredEQ, then dim_key goes to the PHT builder.
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_s(int *filter_col, int *dim_key, int num_tuples, int *hash_table, int num_slots) {
+  int vals[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+
+  const int tile_start = blockIdx.x * TILE_SIZE;
+  const int tile_count = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  // Only the final tile can be partially filled.
+  int valid = (blockIdx.x == tile_count - 1) ? (num_tuples - tile_start) : TILE_SIZE;
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + tile_start, vals, valid);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(vals, 1, keep, valid);
+
+  // The same buffer is reused for the key column.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_start, vals, valid);
+  BlockBuildSelectivePHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(vals, keep,
+      hash_table, num_slots, valid);
+}
+
+// Builds the Q4.1 part table: rows whose filter_col value is 0 or 1 are expected
+// to be flagged by BlockPredEQ / BlockPredOrEQ before dim_key is inserted.
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_p(int *filter_col, int *dim_key, int num_tuples, int *hash_table, int num_slots) {
+  int vals[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+
+  const int tile_start = blockIdx.x * TILE_SIZE;
+  const int tile_count = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  // Only the final tile can be partially filled.
+  int valid = (blockIdx.x == tile_count - 1) ? (num_tuples - tile_start) : TILE_SIZE;
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + tile_start, vals, valid);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(vals, 0, keep, valid);
+  BlockPredOrEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(vals, 1, keep, valid);
+
+  // The same buffer is reused for the key column.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_start, vals, valid);
+  BlockBuildSelectivePHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(vals, keep,
+      hash_table, num_slots, valid);
+}
+
+// Builds the Q4.1 customer table: rows whose filter_col value equals 1 are
+// expected to be flagged, then (dim_key, dim_val) pairs go to the PHT builder.
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_c(int* filter_col, int *dim_key, int* dim_val, int num_tuples, int *hash_table, int num_slots) {
+  int keys[ITEMS_PER_THREAD];
+  int payload[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+
+  const int tile_start = blockIdx.x * TILE_SIZE;
+  const int tile_count = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  // Only the final tile can be partially filled.
+  int valid = (blockIdx.x == tile_count - 1) ? (num_tuples - tile_start) : TILE_SIZE;
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + tile_start, keys, valid);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, 1, keep, valid);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_start, keys, valid);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_start, payload, valid);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, payload, keep,
+      hash_table, num_slots, valid);
+}
+
+// Builds the Q4.1 date table: flags come straight from InitFlags (no predicate
+// is applied) and (dim_key, dim_val) pairs are inserted with val_min forwarded.
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_d(int *dim_key, int *dim_val, int num_tuples, int *hash_table, int num_slots, int val_min) {
+  int keys[ITEMS_PER_THREAD];
+  int payload[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+
+  const int tile_start = blockIdx.x * TILE_SIZE;
+  const int tile_count = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  // Only the final tile can be partially filled.
+  int valid = (blockIdx.x == tile_count - 1) ? (num_tuples - tile_start) : TILE_SIZE;
+
+  InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(keep);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_start, keys, valid);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_start, payload, valid);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, payload, keep,
+      hash_table, num_slots, val_min, valid);
+}
+
+/**
+ * Runs SSB Q4.1 once on the GPU: builds the four dimension hash tables,
+ * probes lineorder against them and prints the aggregated rows to stdout.
+ *
+ * Every pointer argument is a device array (main passes loadToGPU results).
+ * The scratch hash tables and result buffer are handed back to g_allocator
+ * before returning, so repeated trials do not accumulate device memory.
+ * Returns the elapsed time reported by cudaEventElapsedTime (milliseconds).
+ */
+float runQuery(int* lo_orderdate, int* lo_custkey, int* lo_partkey, int* lo_suppkey, int* lo_revenue, int* lo_supplycost, int lo_len,
+    int *d_datekey, int* d_year, int d_len,
+    int *p_partkey, int* p_mfgr, int p_len,
+    int *s_suppkey, int* s_region, int s_len,
+    int *c_custkey, int* c_region, int* c_nation, int c_len,
+    cub::CachingDeviceAllocator&  g_allocator) {
+  SETUP_TIMING();
+
+  float time_query;
+  chrono::high_resolution_clock::time_point st, finish;
+  st = chrono::high_resolution_clock::now();
+
+  cudaEventRecord(start, 0);
+
+  int *ht_d, *ht_c, *ht_s, *ht_p;
+  int d_val_len = 19981230 - 19920101 + 1;
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_d, 2 * d_val_len * sizeof(int)));
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_s, 2 * s_len * sizeof(int)));
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_c, 2 * c_len * sizeof(int)));
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_p, 2 * p_len * sizeof(int)));
+
+  CubDebugExit(cudaMemset(ht_d, 0, 2 * d_val_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_s, 0, 2 * s_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_c, 0, 2 * c_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_p, 0, 2 * p_len * sizeof(int)));
+
+  int tile_items = 128*4;
+  build_hashtable_s<128,4><<<(s_len + tile_items - 1)/tile_items, 128>>>(s_region, s_suppkey, s_len, ht_s, s_len);
+  build_hashtable_c<128,4><<<(c_len + tile_items - 1)/tile_items, 128>>>(c_region, c_custkey, c_nation, c_len, ht_c, c_len);
+  build_hashtable_p<128,4><<<(p_len + tile_items - 1)/tile_items, 128>>>(p_mfgr, p_partkey, p_len, ht_p, p_len);
+
+  int d_val_min = 19920101;
+  build_hashtable_d<128,4><<<(d_len + tile_items - 1)/tile_items, 128>>>(d_datekey, d_year, d_len, ht_d, d_val_len, d_val_min);
+
+#if 0
+  int *h_ht_s = new int[s_len * 2];
+  int *h_ht_c = new int[c_len * 2];
+  int *h_ht_p = new int[p_len * 2];
+  int *h_ht_d = new int[d_val_len * 2];
+
+  int num_s = 0 , num_c = 0, num_d = 0, num_p = 0;
+
+  CubDebugExit(cudaMemcpy(h_ht_s, ht_s, 2 * s_len * sizeof(int), cudaMemcpyDeviceToHost));
+  for (int i=0; i<s_len; i++) if (h_ht_s[i*2] != 0) num_s += 1;
+
+  cout << "Num Matched" << " " << num_s << " " << s_len << endl;
+
+  CubDebugExit(cudaMemcpy(h_ht_d, ht_d, 2 * d_val_len * sizeof(int), cudaMemcpyDeviceToHost));
+  for (int i=0; i<d_val_len; i++) if (h_ht_d[i*2] != 0) num_d += 1;
+
+  cout << "Num Matched" << " " << num_d << " " << d_len << endl;
+
+  CubDebugExit(cudaMemcpy(h_ht_c, ht_c, 2 * c_len * sizeof(int), cudaMemcpyDeviceToHost));
+  for (int i=0; i<c_len; i++) if (h_ht_c[i*2] != 0) num_c += 1;
+
+  cout << "Num Matched" << " " << num_c << " " << c_len << endl;
+
+  CubDebugExit(cudaMemcpy(h_ht_p, ht_p, 2 * p_len * sizeof(int), cudaMemcpyDeviceToHost));
+  for (int i=0; i<p_len; i++) if (h_ht_p[i*2] != 0) num_p += 1;
+
+  cout << "Num Matched" << " " << num_p << " " << p_len << endl;
+#endif
+
+  int *res;
+  int res_size = ((1998-1992+1) * 25);
+  int ht_entries = 4; // int,int,long long
+  int res_array_size = res_size * ht_entries;
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&res, res_array_size * sizeof(int)));
+
+  CubDebugExit(cudaMemset(res, 0, res_array_size * sizeof(int)));
+
+  // Run
+  probe<128,4><<<(lo_len + tile_items - 1)/tile_items, 128>>>(lo_orderdate, lo_partkey,
+          lo_custkey, lo_suppkey, lo_revenue, lo_supplycost, lo_len, ht_p, p_len, ht_s, s_len, ht_c, c_len, ht_d, d_val_len, res);
+
+  cudaEventRecord(stop, 0);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&time_query, start,stop);
+
+  int* h_res = new int[res_array_size];
+  CubDebugExit(cudaMemcpy(h_res, res, res_array_size * sizeof(int), cudaMemcpyDeviceToHost));
+  finish = chrono::high_resolution_clock::now();
+  std::chrono::duration<double> diff = finish - st;
+
+  cout << "Result:" << endl;
+  int res_count = 0;
+  for (int i=0; i<res_size; i++) {
+    if (h_res[4*i] != 0) {
+      cout << h_res[4*i] << " " << h_res[4*i + 1] << " " << reinterpret_cast<unsigned long long*>(&h_res[4*i + 2])[0]  << endl;
+      res_count += 1;
+    }
+  }
+
+  cout << "Res Count: " << res_count << endl;
+  cout << "Time Taken Total: " << diff.count() * 1000 << endl;
+
+  // Hand the per-query scratch buffers back to the caching allocator.
+  CubDebugExit(g_allocator.DeviceFree(ht_d));
+  CubDebugExit(g_allocator.DeviceFree(ht_s));
+  CubDebugExit(g_allocator.DeviceFree(ht_c));
+  CubDebugExit(g_allocator.DeviceFree(ht_p));
+  CubDebugExit(g_allocator.DeviceFree(res));
+  delete[] h_res;
+
+  return time_query;
+}
+
+/**
+ * Entry point for SSB Q4.1: loads columns, copies them to the GPU, times runQuery.
+ */
+int main(int argc, char** argv)
+{
+  int num_trials          = 1;
+
+  // Initialize command line
+  CommandLineArgs args(argc, argv);
+  args.GetCmdLineArgument("t", num_trials);
+
+  // Print usage
+  if (args.CheckCmdLineFlag("help"))
+  {
+      printf("%s "
+          "[--t=<num trials>] "
+          "[--v] "
+          "\n", argv[0]);
+      exit(0);
+  }
+
+  // Initialize device
+  CubDebugExit(args.DeviceInit());
+
+  // Host copies of the columns Q4.1 reads (d_yearmonthnum is not needed here).
+  int *h_lo_orderdate = loadColumn<int>("lo_orderdate", LO_LEN);
+  int *h_lo_suppkey = loadColumn<int>("lo_suppkey", LO_LEN);
+  int *h_lo_custkey = loadColumn<int>("lo_custkey", LO_LEN);
+  int *h_lo_partkey = loadColumn<int>("lo_partkey", LO_LEN);
+  int *h_lo_revenue = loadColumn<int>("lo_revenue", LO_LEN);
+  int *h_lo_supplycost = loadColumn<int>("lo_supplycost", LO_LEN);
+
+  int *h_d_datekey = loadColumn<int>("d_datekey", D_LEN);
+  int *h_d_year = loadColumn<int>("d_year", D_LEN);
+
+  int *h_s_suppkey = loadColumn<int>("s_suppkey", S_LEN);
+  int *h_s_region = loadColumn<int>("s_region", S_LEN);
+
+  int *h_p_partkey = loadColumn<int>("p_partkey", P_LEN);
+  int *h_p_mfgr = loadColumn<int>("p_mfgr", P_LEN);
+
+  int *h_c_custkey = loadColumn<int>("c_custkey", C_LEN);
+  int *h_c_region = loadColumn<int>("c_region", C_LEN);
+  int *h_c_nation = loadColumn<int>("c_nation", C_LEN);
+
+  cout << "** LOADED DATA **" << endl;
+
+  int *d_lo_orderdate = loadToGPU<int>(h_lo_orderdate, LO_LEN, g_allocator);
+  int *d_lo_custkey = loadToGPU<int>(h_lo_custkey, LO_LEN, g_allocator);
+  int *d_lo_suppkey = loadToGPU<int>(h_lo_suppkey, LO_LEN, g_allocator);
+  int *d_lo_partkey = loadToGPU<int>(h_lo_partkey, LO_LEN, g_allocator);
+  int *d_lo_revenue = loadToGPU<int>(h_lo_revenue, LO_LEN, g_allocator);
+  int *d_lo_supplycost = loadToGPU<int>(h_lo_supplycost, LO_LEN, g_allocator);
+
+  int *d_d_datekey = loadToGPU<int>(h_d_datekey, D_LEN, g_allocator);
+  int *d_d_year = loadToGPU<int>(h_d_year, D_LEN, g_allocator);
+
+  int *d_p_partkey = loadToGPU<int>(h_p_partkey, P_LEN, g_allocator);
+  int *d_p_mfgr = loadToGPU<int>(h_p_mfgr, P_LEN, g_allocator);
+
+  int *d_s_suppkey = loadToGPU<int>(h_s_suppkey, S_LEN, g_allocator);
+  int *d_s_region = loadToGPU<int>(h_s_region, S_LEN, g_allocator);
+
+  int *d_c_custkey = loadToGPU<int>(h_c_custkey, C_LEN, g_allocator);
+  int *d_c_region = loadToGPU<int>(h_c_region, C_LEN, g_allocator);
+  int *d_c_nation = loadToGPU<int>(h_c_nation, C_LEN, g_allocator);
+
+  cout << "** LOADED DATA TO GPU **" << endl;
+
+  for (int t = 0; t < num_trials; t++) {
+    float time_query;
+    time_query = runQuery(
+        d_lo_orderdate, d_lo_custkey, d_lo_partkey, d_lo_suppkey, d_lo_revenue, d_lo_supplycost, LO_LEN,
+        d_d_datekey, d_d_year, D_LEN,
+        d_p_partkey, d_p_mfgr, P_LEN,
+        d_s_suppkey, d_s_region, S_LEN,
+        d_c_custkey, d_c_region, d_c_nation, C_LEN,
+        g_allocator);
+    cout<< "{"
+        << "\"query\":41"
+        << ",\"time_query\":" << time_query
+        << "}" << endl;
+  }
+
+  return 0;
+}
+
diff --git a/crystal/src/ssb/q42.cu b/crystal/src/ssb/q42.cu
new file mode 100644
index 0000000..ce8f956
--- /dev/null
+++ b/crystal/src/ssb/q42.cu
@@ -0,0 +1,346 @@
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <iostream>
+#include <stdio.h>
+#include <curand.h>
+
+#include <cuda.h>
+#include <cub/util_allocator.cuh>
+#include "cub/test/test_util.h"
+
+#include "crystal.cuh"
+
+#include "gpu_utils.h"
+#include "ssb_utils.h"
+
+using namespace std;
+
+/**
+ * Globals, constants and typedefs
+ */
+bool                    g_verbose = false;  // Whether to display input/output to console
+cub::CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory
+
+// SSB Q4.2 probe: one thread block per tile of lineorder rows. Every row is
+// looked up in the supplier, customer, part and date hash tables; each row
+// still flagged afterwards adds (lo_revenue - lo_supplycost) to a `res` slot.
+// A slot is 6 ints: year, s_nation, category, one unused, 64-bit sum in 4..5.
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void probe(int* lo_orderdate, int* lo_partkey, int* lo_custkey, int* lo_suppkey, int* lo_revenue, int* lo_supplycost, int lo_len,
+    int* ht_p, int p_len,
+    int* ht_s, int s_len,
+    int* ht_c, int c_len,
+    int* ht_d, int d_len,
+    int* res) {
+  int keys[ITEMS_PER_THREAD];    // join keys; reused for lo_supplycost at the end
+  int keep[ITEMS_PER_THREAD];    // per-item selection flags
+  int cat[ITEMS_PER_THREAD];     // filled by the ht_p probe
+  int nation[ITEMS_PER_THREAD];  // filled by the ht_s probe
+  int yr[ITEMS_PER_THREAD];      // filled by the ht_d probe
+  int rev[ITEMS_PER_THREAD];     // lo_revenue
+
+  const int tile_start = blockIdx.x * TILE_SIZE;
+  const int tile_count = (lo_len + TILE_SIZE - 1) / TILE_SIZE;
+  int valid = (blockIdx.x == tile_count - 1) ? (lo_len - tile_start) : TILE_SIZE;
+
+  InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(keep);
+
+  // Supplier: nation[] is passed as the output array.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_suppkey + tile_start, keys, valid);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, nation, keep,
+      ht_s, s_len, valid);
+
+  // Customer: no output array is passed to this probe.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_custkey + tile_start, keys, valid);
+  BlockProbeAndPHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, keep, ht_c, c_len, valid);
+
+  // Part: cat[] is passed as the output array.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_partkey + tile_start, keys, valid);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, cat, keep,
+      ht_p, p_len, valid);
+
+  // Date: yr[] is the output array; 19920101 equals d_val_min used by runQuery.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_orderdate + tile_start, keys, valid);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, yr, keep,
+      ht_d, d_len, 19920101, valid);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_revenue + tile_start, rev, valid);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_supplycost + tile_start, keys, valid);
+
+  #pragma unroll
+  for (int i = 0; i < ITEMS_PER_THREAD; ++i) {
+    if (threadIdx.x + (BLOCK_THREADS * i) < valid && keep[i]) {
+      int slot = ((yr[i] - 1992) * 25 * 25 + nation[i] * 25 + cat[i]) % ((1998-1992+1) * 25 * 25);
+      res[slot * 6] = yr[i];
+      res[slot * 6 + 1] = nation[i];
+      res[slot * 6 + 2] = cat[i];
+      // Ints 4..5 of the slot are updated together as one 64-bit accumulator.
+      atomicAdd(reinterpret_cast<unsigned long long*>(&res[slot * 6 + 4]), (long long)(rev[i] - keys[i]));
+    }
+  }
+}
+
+// Builds the Q4.2 customer table: rows whose filter_col value equals 1 are
+// expected to be flagged by BlockPredEQ, then dim_key goes to the PHT builder.
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__
+void build_hashtable_c(int *filter_col, int *dim_key, int num_tuples, int *hash_table, int num_slots) {
+  int vals[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+
+  const int tile_start = blockIdx.x * TILE_SIZE;
+  const int tile_count = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  // Only the final tile can be partially filled.
+  int valid = (blockIdx.x == tile_count - 1) ? (num_tuples - tile_start) : TILE_SIZE;
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + tile_start, vals, valid);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(vals, 1, keep, valid);
+
+  // The same buffer is reused for the key column.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_start, vals, valid);
+  BlockBuildSelectivePHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(vals, keep,
+      hash_table, num_slots, valid);
+}
+
+// Builds the Q4.2 part table: rows whose filter_col value is 0 or 1 are
+// expected to be flagged by BlockPredEQ / BlockPredOrEQ; the flagged
+// (dim_key, dim_val) pairs then go to BlockBuildSelectivePHT_2.
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__
+void build_hashtable_p(int *filter_col, int *dim_key, int* dim_val, int num_tuples, int *hash_table, int num_slots) {
+  int keys[ITEMS_PER_THREAD];
+  int payload[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+
+  const int tile_start = blockIdx.x * TILE_SIZE;
+  const int tile_count = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  // Only the final tile can be partially filled.
+  int valid = (blockIdx.x == tile_count - 1) ? (num_tuples - tile_start) : TILE_SIZE;
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + tile_start, keys, valid);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, 0, keep, valid);
+  BlockPredOrEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, 1, keep, valid);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_start, keys, valid);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_start, payload, valid);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, payload, keep,
+      hash_table, num_slots, valid);
+}
+
+// Builds the Q4.2 supplier table: rows whose filter_col value equals 1 are
+// expected to be flagged by BlockPredEQ; the flagged (dim_key, dim_val) pairs
+// then go to BlockBuildSelectivePHT_2.
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__
+void build_hashtable_s(int* filter_col, int *dim_key, int* dim_val, int num_tuples, int *hash_table, int num_slots) {
+  int keys[ITEMS_PER_THREAD];
+  int payload[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+
+  const int tile_start = blockIdx.x * TILE_SIZE;
+  const int tile_count = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  // Only the final tile can be partially filled.
+  int valid = (blockIdx.x == tile_count - 1) ? (num_tuples - tile_start) : TILE_SIZE;
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + tile_start, keys, valid);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, 1, keep, valid);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_start, keys, valid);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_start, payload, valid);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, payload, keep,
+      hash_table, num_slots, valid);
+}
+
+// Builds the Q4.2 date table. dim_val is loaded first and rows whose value is
+// 1997 or 1998 are expected to be flagged by BlockPredEQ / BlockPredOrEQ; the
+// flagged (dim_key, dim_val) pairs then go to BlockBuildSelectivePHT_2.
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__
+void build_hashtable_d(int *dim_key, int *dim_val, int num_tuples, int *hash_table, int num_slots, int val_min) {
+  int yrs[ITEMS_PER_THREAD];
+  int keys[ITEMS_PER_THREAD];
+  int keep[ITEMS_PER_THREAD];
+
+  const int tile_start = blockIdx.x * TILE_SIZE;
+  const int tile_count = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+  // Only the final tile can be partially filled.
+  int valid = (blockIdx.x == tile_count - 1) ? (num_tuples - tile_start) : TILE_SIZE;
+
+  InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(keep);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_start, yrs, valid);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(yrs, 1997, keep, valid);
+  BlockPredOrEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(yrs, 1998, keep, valid);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_start, keys, valid);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, yrs, keep,
+      hash_table, num_slots, val_min, valid);
+}
+
+// Runs SSB Q4.2 once on the GPU: builds the four dimension hash tables, probes
+// lineorder against them and prints the aggregated rows. All pointers are device
+// arrays. Scratch buffers are handed back to g_allocator before returning so that
+// repeated trials do not accumulate device memory. Returns the cudaEvent time (ms).
+float runQuery(int* lo_orderdate, int* lo_custkey, int* lo_partkey, int* lo_suppkey, int* lo_revenue, int* lo_supplycost, int lo_len,
+    int *d_datekey, int* d_year, int d_len,
+    int *p_partkey, int* p_mfgr, int* p_category, int p_len,
+    int *s_suppkey, int* s_region, int* s_nation, int s_len,
+    int *c_custkey, int* c_region, int c_len,
+    cub::CachingDeviceAllocator&  g_allocator) {
+  SETUP_TIMING();
+
+  float time_query;
+  chrono::high_resolution_clock::time_point st, finish;
+  st = chrono::high_resolution_clock::now();
+
+  cudaEventRecord(start, 0);
+
+  int *ht_d, *ht_c, *ht_s, *ht_p;
+  int d_val_len = 19981230 - 19920101 + 1;
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_d, 2 * d_val_len * sizeof(int)));
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_s, 2 * s_len * sizeof(int)));
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_c, 2 * c_len * sizeof(int)));
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_p, 2 * p_len * sizeof(int)));
+
+  CubDebugExit(cudaMemset(ht_d, 0, 2 * d_val_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_s, 0, 2 * s_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_c, 0, 2 * c_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_p, 0, 2 * p_len * sizeof(int)));
+
+  int tile_items = 128*4;
+  build_hashtable_s<128,4><<<(s_len + tile_items - 1)/tile_items, 128>>>(s_region, s_suppkey, s_nation, s_len, ht_s, s_len);
+  build_hashtable_c<128,4><<<(c_len + tile_items - 1)/tile_items, 128>>>(c_region, c_custkey, c_len, ht_c, c_len);
+  build_hashtable_p<128,4><<<(p_len + tile_items - 1)/tile_items, 128>>>(p_mfgr, p_partkey, p_category, p_len, ht_p, p_len);
+  int d_val_min = 19920101;
+  build_hashtable_d<128,4><<<(d_len + tile_items - 1)/tile_items, 128>>>(d_datekey, d_year, d_len, ht_d, d_val_len, d_val_min);
+
+  int *res;
+  int res_size = ((1998-1992+1) * 25 * 25);
+  int ht_entries = 6;
+  int res_array_size = res_size * ht_entries;
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&res, res_array_size * sizeof(int)));
+  CubDebugExit(cudaMemset(res, 0, res_array_size * sizeof(int)));
+
+  // Run
+  probe<128,4><<<(lo_len + tile_items - 1)/tile_items, 128>>>(lo_orderdate, lo_partkey,
+          lo_custkey, lo_suppkey, lo_revenue, lo_supplycost, lo_len, ht_p, p_len, ht_s, s_len, ht_c, c_len, ht_d, d_val_len, res);
+
+  cudaEventRecord(stop, 0);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&time_query, start,stop);
+
+  int* h_res = new int[res_array_size];
+  CubDebugExit(cudaMemcpy(h_res, res, res_array_size * sizeof(int), cudaMemcpyDeviceToHost));
+  finish = chrono::high_resolution_clock::now();
+  std::chrono::duration<double> diff = finish - st;
+
+  cout << "Result:" << endl;
+  int res_count = 0;
+  for (int i=0; i<res_size; i++) {
+    if (h_res[6*i] != 0) {
+      cout << h_res[6*i] << " " << h_res[6*i + 1] << " " << h_res[6*i + 2] << " " << reinterpret_cast<unsigned long long*>(&h_res[6*i + 4])[0] << endl;
+      res_count += 1;
+    }
+  }
+
+  cout << "Res Count: " << res_count << endl;
+  cout << "Time Taken Total: " << diff.count() * 1000 << endl;
+
+  CubDebugExit(g_allocator.DeviceFree(ht_d));
+  CubDebugExit(g_allocator.DeviceFree(ht_s));
+  CubDebugExit(g_allocator.DeviceFree(ht_c));
+  CubDebugExit(g_allocator.DeviceFree(ht_p));
+  CubDebugExit(g_allocator.DeviceFree(res));
+  delete[] h_res;
+  return time_query;
+}
+
+/**
+ * Entry point for SSB Q4.2: loads columns, copies them to the GPU, times runQuery.
+ */
+int main(int argc, char** argv)
+{
+  int num_trials          = 3;
+
+  // Initialize command line
+  CommandLineArgs args(argc, argv);
+  args.GetCmdLineArgument("t", num_trials);
+
+  // Print usage
+  if (args.CheckCmdLineFlag("help"))
+  {
+      printf("%s "
+          "[--t=<num trials>] "
+          "[--v] "
+          "\n", argv[0]);
+      exit(0);
+  }
+
+  // Initialize device
+  CubDebugExit(args.DeviceInit());
+
+  // Host copies of the columns Q4.2 reads (d_yearmonthnum is not needed here).
+  int *h_lo_orderdate = loadColumn<int>("lo_orderdate", LO_LEN);
+  int *h_lo_suppkey = loadColumn<int>("lo_suppkey", LO_LEN);
+  int *h_lo_custkey = loadColumn<int>("lo_custkey", LO_LEN);
+  int *h_lo_partkey = loadColumn<int>("lo_partkey", LO_LEN);
+  int *h_lo_revenue = loadColumn<int>("lo_revenue", LO_LEN);
+  int *h_lo_supplycost = loadColumn<int>("lo_supplycost", LO_LEN);
+
+  int *h_d_datekey = loadColumn<int>("d_datekey", D_LEN);
+  int *h_d_year = loadColumn<int>("d_year", D_LEN);
+
+  int *h_s_suppkey = loadColumn<int>("s_suppkey", S_LEN);
+  int *h_s_region = loadColumn<int>("s_region", S_LEN);
+  int *h_s_nation = loadColumn<int>("s_nation", S_LEN);
+
+  int *h_p_partkey = loadColumn<int>("p_partkey", P_LEN);
+  int *h_p_mfgr = loadColumn<int>("p_mfgr", P_LEN);
+  int *h_p_category = loadColumn<int>("p_category", P_LEN);
+
+  int *h_c_custkey = loadColumn<int>("c_custkey", C_LEN);
+  int *h_c_region = loadColumn<int>("c_region", C_LEN);
+
+  cout << "** LOADED DATA **" << endl;
+
+  int *d_lo_orderdate = loadToGPU<int>(h_lo_orderdate, LO_LEN, g_allocator);
+  int *d_lo_custkey = loadToGPU<int>(h_lo_custkey, LO_LEN, g_allocator);
+  int *d_lo_suppkey = loadToGPU<int>(h_lo_suppkey, LO_LEN, g_allocator);
+  int *d_lo_partkey = loadToGPU<int>(h_lo_partkey, LO_LEN, g_allocator);
+  int *d_lo_revenue = loadToGPU<int>(h_lo_revenue, LO_LEN, g_allocator);
+  int *d_lo_supplycost = loadToGPU<int>(h_lo_supplycost, LO_LEN, g_allocator);
+
+  int *d_d_datekey = loadToGPU<int>(h_d_datekey, D_LEN, g_allocator);
+  int *d_d_year = loadToGPU<int>(h_d_year, D_LEN, g_allocator);
+
+  int *d_p_partkey = loadToGPU<int>(h_p_partkey, P_LEN, g_allocator);
+  int *d_p_mfgr = loadToGPU<int>(h_p_mfgr, P_LEN, g_allocator);
+  int *d_p_category = loadToGPU<int>(h_p_category, P_LEN, g_allocator);
+
+  int *d_s_suppkey = loadToGPU<int>(h_s_suppkey, S_LEN, g_allocator);
+  int *d_s_region = loadToGPU<int>(h_s_region, S_LEN, g_allocator);
+  int *d_s_nation = loadToGPU<int>(h_s_nation, S_LEN, g_allocator);
+
+  int *d_c_custkey = loadToGPU<int>(h_c_custkey, C_LEN, g_allocator);
+  int *d_c_region = loadToGPU<int>(h_c_region, C_LEN, g_allocator);
+
+  cout << "** LOADED DATA TO GPU **" << endl;
+
+  for (int t = 0; t < num_trials; t++) {
+    float time_query;
+    time_query = runQuery(
+        d_lo_orderdate, d_lo_custkey, d_lo_partkey, d_lo_suppkey, d_lo_revenue, d_lo_supplycost, LO_LEN,
+        d_d_datekey, d_d_year, D_LEN,
+        d_p_partkey, d_p_mfgr, d_p_category, P_LEN,
+        d_s_suppkey, d_s_region, d_s_nation, S_LEN,
+        d_c_custkey, d_c_region, C_LEN,
+        g_allocator);
+    cout<< "{"
+        << "\"query\":42"
+        << ",\"time_query\":" << time_query
+        << "}" << endl;
+  }
+
+  return 0;
+}
+
diff --git a/crystal/src/ssb/q43.cu b/crystal/src/ssb/q43.cu
new file mode 100644
index 0000000..6b2367d
--- /dev/null
+++ b/crystal/src/ssb/q43.cu
@@ -0,0 +1,343 @@
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <iostream>
+#include <stdio.h>
+#include <curand.h>
+
+#include <cuda.h>
+#include <cub/util_allocator.cuh>
+#include "cub/test/test_util.h"
+
+#include "crystal.cuh"
+
+#include "gpu_utils.h"
+#include "ssb_utils.h"
+
+using namespace std;
+
+/**
+ * Globals, constants and typedefs
+ */
+bool                    g_verbose = false;  // Whether to display input/output to console
+cub::CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory
+
+// SSB Q4.3 probe kernel; each thread block handles one TILE_SIZE tile of lineorder.
+// Keys are probed against ht_s, ht_c, ht_p and ht_d through BlockProbeAndPHT_*, which fetch
+// s_city, brand and year and maintain the selection flags (presumably cleared on a miss --
+// confirm in join.cuh). Selected rows add lo_revenue - lo_supplycost into res, a table of
+// 4-int slots {year, s_city, brand, sum} indexed by a mixed-radix hash of the group columns.
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void probe(int* lo_orderdate, int* lo_partkey, int* lo_custkey, int* lo_suppkey, int* lo_revenue, int* lo_supplycost, int lo_len,
+    int* ht_p, int p_len,
+    int* ht_s, int s_len,
+    int* ht_c, int c_len,
+    int* ht_d, int d_len,
+    int* res) {
+  // Radices of the group hash; kYears spans 1992..1998.
+  const int kYears = 1998 - 1992 + 1, kCities = 250, kBrands = 1000;
+  int keys[ITEMS_PER_THREAD];   // scratch: join keys, and finally lo_supplycost
+  int flags[ITEMS_PER_THREAD];
+  int brand[ITEMS_PER_THREAD];
+  int s_city[ITEMS_PER_THREAD];
+  int year[ITEMS_PER_THREAD];
+  int revenue[ITEMS_PER_THREAD];
+
+  const int tile_base = blockIdx.x * TILE_SIZE;
+  const int last_tile = (lo_len + TILE_SIZE - 1) / TILE_SIZE - 1;
+  const int valid = (blockIdx.x == last_tile) ? lo_len - tile_base : TILE_SIZE;  // only the last tile can be partial
+
+  InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(flags);
+
+  // Supplier: fetches s_city.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_suppkey + tile_base, keys, valid);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, s_city, flags, ht_s, s_len, valid);
+  // Customer: no payload.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_custkey + tile_base, keys, valid);
+  BlockProbeAndPHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, flags, ht_c, c_len, valid);
+  // Part: fetches brand.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_partkey + tile_base, keys, valid);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, brand, flags, ht_p, p_len, valid);
+  // Date: fetches year; 19920101 is forwarded to the probe together with ht_d.
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_orderdate + tile_base, keys, valid);
+  BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, year, flags, ht_d, d_len, 19920101, valid);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_revenue + tile_base, revenue, valid);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_supplycost + tile_base, keys, valid);
+
+  #pragma unroll
+  for (int i = 0; i < ITEMS_PER_THREAD; ++i) {
+    const bool in_tile = threadIdx.x + (BLOCK_THREADS * i) < valid;
+    if (in_tile && flags[i]) {
+      const int hash = ((year[i] - 1992) * kCities * kBrands + s_city[i] * kBrands + brand[i]) % (kYears * kCities * kBrands);
+      int* slot = res + hash * 4;
+      slot[0] = year[i];    // plain stores: rows hashing to the same slot overwrite these fields
+      slot[1] = s_city[i];
+      slot[2] = brand[i];
+      atomicAdd(&slot[3], revenue[i] - keys[i]);
+    }
+  }
+}
+
+// Builds the customer hash table: one thread block per TILE_SIZE tile of the dimension.
+// Rows whose filter_col equals 1 are selected and their dim_key is handed to
+// BlockBuildSelectivePHT_1 together with hash_table and num_slots.
+// NOTE(review): 1 is presumably the dictionary code of c_region = 'AMERICA' -- confirm.
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__
+void build_hashtable_c(int *filter_col, int *dim_key, int num_tuples, int *hash_table, int num_slots) {
+  const int kRegionCode = 1;
+  int column[ITEMS_PER_THREAD];     // filter values first, then the keys
+  int selected[ITEMS_PER_THREAD];
+
+  const int tile_base = blockIdx.x * TILE_SIZE;
+  const int last_tile = (num_tuples + TILE_SIZE - 1) / TILE_SIZE - 1;
+  const int valid = (blockIdx.x == last_tile) ? num_tuples - tile_base : TILE_SIZE;  // last tile may be partial
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + tile_base, column, valid);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(column, kRegionCode, selected, valid);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_base, column, valid);
+  BlockBuildSelectivePHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(column, selected, hash_table, num_slots, valid);
+}
+
+// Builds the part hash table: one thread block per TILE_SIZE tile of the dimension.
+// Rows whose filter_col equals 3 are selected; their (dim_key, dim_val) pairs go to
+// BlockBuildSelectivePHT_2 together with hash_table and num_slots.
+// NOTE(review): 3 is presumably the dictionary code of the Q4.3 p_category literal -- confirm.
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__
+void build_hashtable_p(int *filter_col, int *dim_key, int* dim_val, int num_tuples, int *hash_table, int num_slots) {
+  const int kCategoryCode = 3;
+  int keys[ITEMS_PER_THREAD];       // filter values first, then dim_key
+  int vals[ITEMS_PER_THREAD];
+  int selected[ITEMS_PER_THREAD];
+
+  const int tile_base = blockIdx.x * TILE_SIZE;
+  const int last_tile = (num_tuples + TILE_SIZE - 1) / TILE_SIZE - 1;
+  const int valid = (blockIdx.x == last_tile) ? num_tuples - tile_base : TILE_SIZE;  // last tile may be partial
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + tile_base, keys, valid);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, kCategoryCode, selected, valid);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_base, keys, valid);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_base, vals, valid);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, vals, selected, hash_table, num_slots, valid);
+}
+
+// Builds the supplier hash table: one thread block per TILE_SIZE tile of the dimension.
+// Rows whose filter_col equals 24 are selected; their (dim_key, dim_val) pairs go to
+// BlockBuildSelectivePHT_2 together with hash_table and num_slots.
+// NOTE(review): 24 is presumably the dictionary code of the Q4.3 s_nation literal -- confirm.
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__
+void build_hashtable_s(int* filter_col, int *dim_key, int* dim_val, int num_tuples, int *hash_table, int num_slots) {
+  const int kNationCode = 24;
+  int keys[ITEMS_PER_THREAD];       // filter values first, then dim_key
+  int vals[ITEMS_PER_THREAD];
+  int selected[ITEMS_PER_THREAD];
+
+  const int tile_base = blockIdx.x * TILE_SIZE;
+  const int last_tile = (num_tuples + TILE_SIZE - 1) / TILE_SIZE - 1;
+  const int valid = (blockIdx.x == last_tile) ? num_tuples - tile_base : TILE_SIZE;  // last tile may be partial
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + tile_base, keys, valid);
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, kNationCode, selected, valid);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_base, keys, valid);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_base, vals, valid);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, vals, selected, hash_table, num_slots, valid);
+}
+
+// Builds the date hash table: one thread block per TILE_SIZE tile of the dimension.
+// Rows whose dim_val (d_year at the call site) is 1997 or 1998 are selected; their
+// (dim_key, dim_val) pairs go to BlockBuildSelectivePHT_2, which also receives val_min
+// (presumably the smallest key, used as the slot offset -- confirm in join.cuh).
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__
+void build_hashtable_d(int *dim_key, int *dim_val, int num_tuples, int *hash_table, int num_slots, int val_min) {
+  int years[ITEMS_PER_THREAD];
+  int keys[ITEMS_PER_THREAD];
+  int selected[ITEMS_PER_THREAD];
+
+  const int tile_base = blockIdx.x * TILE_SIZE;
+  const int last_tile = (num_tuples + TILE_SIZE - 1) / TILE_SIZE - 1;
+  const int valid = (blockIdx.x == last_tile) ? num_tuples - tile_base : TILE_SIZE;  // last tile may be partial
+
+  InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(selected);
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_base, years, valid);
+  // Keep rows with year == 1997 or year == 1998.
+  BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(years, 1997, selected, valid);
+  BlockPredOrEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(years, 1998, selected, valid);
+
+  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_base, keys, valid);
+  BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, years, selected, hash_table, num_slots, val_min, valid);
+}
+
+// Runs SSB Q4.3 once over columns already resident on the GPU: builds the four dimension hash
+// tables, probes lineorder, prints every non-empty result slot and returns the time between the
+// two CUDA events (allocation through the probe kernel), in milliseconds. Fix: the hash tables
+// and the result buffer are now handed back to g_allocator; before, every trial left them allocated.
+float runQuery(int* lo_orderdate, int* lo_custkey, int* lo_partkey, int* lo_suppkey, int* lo_revenue, int* lo_supplycost, int lo_len,
+    int *d_datekey, int* d_year, int d_len,
+    int *p_partkey, int* p_category, int* p_brand1, int p_len,
+    int *s_suppkey, int* s_nation, int* s_city, int s_len,
+    int *c_custkey, int* c_region, int c_len,
+    cub::CachingDeviceAllocator&  g_allocator) {
+  SETUP_TIMING();
+  float time_query;
+  chrono::high_resolution_clock::time_point st, finish;
+  st = chrono::high_resolution_clock::now();
+  cudaEventRecord(start, 0);
+
+  // ht_d is sized by the d_datekey value range, the other tables by row count; each is 2 * len ints, zero-filled.
+  int *ht_d, *ht_c, *ht_s, *ht_p;
+  int d_val_len = 19981230 - 19920101 + 1;
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_d, 2 * d_val_len * sizeof(int)));
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_s, 2 * s_len * sizeof(int)));
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_c, 2 * c_len * sizeof(int)));
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_p, 2 * p_len * sizeof(int)));
+
+  CubDebugExit(cudaMemset(ht_d, 0, 2 * d_val_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_s, 0, 2 * s_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_c, 0, 2 * c_len * sizeof(int)));
+  CubDebugExit(cudaMemset(ht_p, 0, 2 * p_len * sizeof(int)));
+
+  int tile_items = 128*4;
+  build_hashtable_s<128,4><<<(s_len + tile_items - 1)/tile_items, 128>>>(s_nation, s_suppkey, s_city, s_len, ht_s, s_len);
+  build_hashtable_c<128,4><<<(c_len + tile_items - 1)/tile_items, 128>>>(c_region, c_custkey, c_len, ht_c, c_len);
+  build_hashtable_p<128,4><<<(p_len + tile_items - 1)/tile_items, 128>>>(p_category, p_partkey, p_brand1, p_len, ht_p, p_len);
+
+  int d_val_min = 19920101;
+  build_hashtable_d<128,4><<<(d_len + tile_items - 1)/tile_items, 128>>>(d_datekey, d_year, d_len, ht_d, d_val_len, d_val_min);
+
+  // Result table: one 4-int slot per hash value, zeroed so that untouched slots keep year == 0.
+  int *res;
+  int res_size = ((1998-1992+1) * 250 * 1000);
+  int ht_entries = 4;
+  int res_array_size = res_size * ht_entries;
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&res, res_array_size * sizeof(int)));
+  CubDebugExit(cudaMemset(res, 0, res_array_size * sizeof(int)));
+
+  // Run
+  probe<128,4><<<(lo_len + tile_items - 1)/tile_items, 128>>>(lo_orderdate, lo_partkey,
+          lo_custkey, lo_suppkey, lo_revenue, lo_supplycost, lo_len, ht_p, p_len, ht_s, s_len, ht_c, c_len, ht_d, d_val_len, res);
+
+  cudaEventRecord(stop, 0);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&time_query, start,stop);
+
+  cout << "Result:" << endl;
+  int* h_res = new int[res_array_size];
+  CubDebugExit(cudaMemcpy(h_res, res, res_array_size * sizeof(int), cudaMemcpyDeviceToHost));
+  finish = chrono::high_resolution_clock::now();
+  std::chrono::duration<double> diff = finish - st;
+
+  // Both timestamps are taken and the results are on the host: give the device buffers back to the allocator.
+  CubDebugExit(g_allocator.DeviceFree(ht_d));
+  CubDebugExit(g_allocator.DeviceFree(ht_s));
+  CubDebugExit(g_allocator.DeviceFree(ht_c));
+  CubDebugExit(g_allocator.DeviceFree(ht_p));
+  CubDebugExit(g_allocator.DeviceFree(res));
+
+  int res_count = 0;
+  for (int i=0; i<res_size; i++) {
+    if (h_res[4*i] != 0) {
+      cout << h_res[4*i] << " " << h_res[4*i + 1] << " " << h_res[4*i + 2] << " " <<  h_res[4*i + 3] << endl;
+      res_count += 1;
+    }
+  }
+  cout << "Res Count: " << res_count << endl;
+  cout << "Time Taken Total: " << diff.count() * 1000 << endl;
+  delete[] h_res;
+  return time_query;
+}
+
+/**
+ * Main: loads the columns Q4.3 needs, copies them to the GPU and runs the query num_trials
+ * times (--t=<n>, default 3), printing one JSON timing line per trial.
+ */
+int main(int argc, char** argv)
+{
+  int num_trials          = 3;
+
+  // Initialize command line
+  CommandLineArgs args(argc, argv);
+  args.GetCmdLineArgument("t", num_trials);
+  g_verbose = args.CheckCmdLineFlag("v");  // the usage text advertises --v, so actually read it
+
+  // Print usage
+  if (args.CheckCmdLineFlag("help"))
+  {
+      printf("%s [--t=<num trials>] [--v] \n", argv[0]);
+      exit(0);
+  }
+
+  // Initialize device
+  CubDebugExit(args.DeviceInit());
+
+  // loadColumn returns NULL when it cannot read a column file; fail fast with the column name
+  // rather than passing a null host pointer on to loadToGPU.
+  auto loadOrDie = [](const char* col_name, int num_entries) -> int* {
+    int* h_col = loadColumn<int>(col_name, num_entries);
+    if (h_col == NULL) {
+      cerr << "Failed to load column " << col_name << " from " << DATA_DIR << endl;
+      exit(1);
+    }
+    return h_col;
+  };
+
+  int *h_lo_orderdate = loadOrDie("lo_orderdate", LO_LEN);
+  int *h_lo_suppkey = loadOrDie("lo_suppkey", LO_LEN);
+  int *h_lo_custkey = loadOrDie("lo_custkey", LO_LEN);
+  int *h_lo_partkey = loadOrDie("lo_partkey", LO_LEN);
+  int *h_lo_revenue = loadOrDie("lo_revenue", LO_LEN);
+  int *h_lo_supplycost = loadOrDie("lo_supplycost", LO_LEN);
+  int *h_d_datekey = loadOrDie("d_datekey", D_LEN);
+  int *h_d_year = loadOrDie("d_year", D_LEN);
+  int *h_s_suppkey = loadOrDie("s_suppkey", S_LEN);
+  int *h_s_nation = loadOrDie("s_nation", S_LEN);
+  int *h_s_city = loadOrDie("s_city", S_LEN);
+  int *h_p_partkey = loadOrDie("p_partkey", P_LEN);
+  int *h_p_category = loadOrDie("p_category", P_LEN);
+  int *h_p_brand1 = loadOrDie("p_brand1", P_LEN);
+  int *h_c_custkey = loadOrDie("c_custkey", C_LEN);
+  int *h_c_region = loadOrDie("c_region", C_LEN);
+
+  cout << "** LOADED DATA **" << endl;
+
+  int *d_lo_orderdate = loadToGPU<int>(h_lo_orderdate, LO_LEN, g_allocator);
+  int *d_lo_custkey = loadToGPU<int>(h_lo_custkey, LO_LEN, g_allocator);
+  int *d_lo_suppkey = loadToGPU<int>(h_lo_suppkey, LO_LEN, g_allocator);
+  int *d_lo_partkey = loadToGPU<int>(h_lo_partkey, LO_LEN, g_allocator);
+  int *d_lo_revenue = loadToGPU<int>(h_lo_revenue, LO_LEN, g_allocator);
+  int *d_lo_supplycost = loadToGPU<int>(h_lo_supplycost, LO_LEN, g_allocator);
+  int *d_d_datekey = loadToGPU<int>(h_d_datekey, D_LEN, g_allocator);
+  int *d_d_year = loadToGPU<int>(h_d_year, D_LEN, g_allocator);
+  int *d_p_partkey = loadToGPU<int>(h_p_partkey, P_LEN, g_allocator);
+  int *d_p_category = loadToGPU<int>(h_p_category, P_LEN, g_allocator);
+  int *d_p_brand1 = loadToGPU<int>(h_p_brand1, P_LEN, g_allocator);
+  int *d_s_suppkey = loadToGPU<int>(h_s_suppkey, S_LEN, g_allocator);
+  int *d_s_nation = loadToGPU<int>(h_s_nation, S_LEN, g_allocator);
+  int *d_s_city = loadToGPU<int>(h_s_city, S_LEN, g_allocator);
+  int *d_c_custkey = loadToGPU<int>(h_c_custkey, C_LEN, g_allocator);
+  int *d_c_region = loadToGPU<int>(h_c_region, C_LEN, g_allocator);
+
+  cout << "** LOADED DATA TO GPU **" << endl;
+
+  for (int t = 0; t < num_trials; t++) {
+    float time_query = runQuery(
+        d_lo_orderdate, d_lo_custkey, d_lo_partkey, d_lo_suppkey, d_lo_revenue, d_lo_supplycost, LO_LEN,
+        d_d_datekey, d_d_year, D_LEN,
+        d_p_partkey, d_p_category, d_p_brand1, P_LEN,
+        d_s_suppkey, d_s_nation, d_s_city, S_LEN,
+        d_c_custkey, d_c_region, C_LEN,
+        g_allocator);
+    cout<< "{"
+        << "\"query\":43"
+        << ",\"time_query\":" << time_query
+        << "}" << endl;
+  }
+  return 0;
+}
+
diff --git a/crystal/src/ssb/ssb_utils.h b/crystal/src/ssb/ssb_utils.h
new file mode 100644
index 0000000..77f24ef
--- /dev/null
+++ b/crystal/src/ssb/ssb_utils.h
@@ -0,0 +1,107 @@
+#include <iostream>
+#include <fstream>
+#include <string>
+
+/*#include <cuda.h>*/
+/*#include <cub/util_allocator.cuh>*/
+
+using namespace std;
+
+#define SF 10
+
+#define BASE_PATH ""
+
+#if SF == 1
+#define DATA_DIR BASE_PATH "/home/ubuntu/fff/gpu/data/ssb/data/s1_columnar/"
+#define LO_LEN 6001171
+#define P_LEN 200000
+#define S_LEN 2000
+#define C_LEN 30000
+#define D_LEN 2556
+#elif SF == 10
+#define DATA_DIR BASE_PATH "/home/ubuntu/fff/gpu/data/ssb/data/s10_columnar/"
+#define LO_LEN 59986214
+#define P_LEN 800000
+#define S_LEN 20000
+#define C_LEN 300000
+#define D_LEN 2556
+#else // 20
+#define DATA_DIR BASE_PATH "s20_columnar/"
+#define LO_LEN 119994746
+#define P_LEN 1000000
+#define S_LEN 40000
+#define C_LEN 600000
+#define D_LEN 2556
+#endif
+
+
+
+// Linear search: position of the first element of arr[0..len) equal to val, or -1 if absent.
+int index_of(string* arr, int len, string val) {
+  int pos = 0;
+  while (pos < len && arr[pos] != val)
+    ++pos;
+  return pos < len ? pos : -1;
+}
+
+// Maps an SSB column name to its column-file name: the upper-case table name followed by the
+// column's position in that table's list below (e.g. "lo_custkey" -> "LINEORDER2"). The table is
+// picked from the first character of col_name. Returns "" for an unknown table prefix and, as a
+// fix, also for a column missing from its table (which used to yield names such as "PART-1").
+string lookup(string col_name) {
+  string lineorder[] = { "lo_orderkey", "lo_linenumber", "lo_custkey", "lo_partkey", "lo_suppkey", "lo_orderdate", "lo_orderpriority", "lo_shippriority", "lo_quantity", "lo_extendedprice", "lo_ordtotalprice", "lo_discount", "lo_revenue", "lo_supplycost", "lo_tax", "lo_commitdate", "lo_shipmode"};
+  string part[] = {"p_partkey", "p_name", "p_mfgr", "p_category", "p_brand1", "p_color", "p_type", "p_size", "p_container"};
+  string supplier[] = {"s_suppkey", "s_name", "s_address", "s_city", "s_nation", "s_region", "s_phone"};
+  string customer[] = {"c_custkey", "c_name", "c_address", "c_city", "c_nation", "c_region", "c_phone", "c_mktsegment"};
+  string date[] = {"d_datekey", "d_date", "d_dayofweek", "d_month", "d_year", "d_yearmonthnum", "d_yearmonth", "d_daynuminweek", "d_daynuminmonth", "d_daynuminyear", "d_sellingseason", "d_lastdayinweekfl", "d_lastdayinmonthfl", "d_holidayfl", "d_weekdayfl"};
+
+  string* columns = NULL;
+  int num_columns = 0;
+  string table;
+  switch (col_name.empty() ? '\0' : col_name[0]) {
+    case 'l': columns = lineorder; num_columns = sizeof(lineorder) / sizeof(lineorder[0]); table = "LINEORDER"; break;
+    case 's': columns = supplier;  num_columns = sizeof(supplier) / sizeof(supplier[0]);   table = "SUPPLIER";  break;
+    case 'c': columns = customer;  num_columns = sizeof(customer) / sizeof(customer[0]);   table = "CUSTOMER";  break;
+    case 'p': columns = part;      num_columns = sizeof(part) / sizeof(part[0]);           table = "PART";      break;
+    case 'd': columns = date;      num_columns = sizeof(date) / sizeof(date[0]);           table = "DDATE";     break;
+    default:  return "";
+  }
+  // Element counts are derived from the arrays so they cannot drift from the lists above.
+  int index = index_of(columns, num_columns, col_name);
+  return index < 0 ? string() : table + to_string(index);
+}
+
+// Loads num_entries values of T from DATA_DIR + lookup(col_name) into a new[] array owned by the
+// caller; returns NULL (nothing left allocated) if the file cannot be opened or the read falls short.
+template<typename T>
+T* loadColumn(string col_name, int num_entries) {
+  string filename = DATA_DIR + lookup(col_name);
+  ifstream colData (filename.c_str(), ios::in | ios::binary);
+  if (!colData) return NULL;  // open before allocating, so a missing file no longer leaks the buffer
+  T* h_col = new T[num_entries];
+  if (colData.read((char*)h_col, num_entries * sizeof(T))) return h_col;
+  delete[] h_col;  // short or failed read: do not hand back a partially filled column
+  return NULL;
+}
+
+// Writes num_entries * sizeof(T) bytes from h_col to DATA_DIR + lookup(col_name); returns 0 on
+// success, -1 on open/write failure. NOTE(review): h_col is int*, so sizeof(T) must equal sizeof(int).
+template<typename T>
+int storeColumn(string col_name, int num_entries, int* h_col) {
+  string filename = DATA_DIR + lookup(col_name);
+  ofstream colData (filename.c_str(), ios::out | ios::binary);
+  if (!colData) return -1;
+  colData.write((char*)h_col, num_entries * sizeof(T));
+  colData.close();  // flush now: errors surfacing in the destructor would otherwise go unreported
+  return colData ? 0 : -1;
+}
+
+/*int main() {*/
+  //int *h_col = new int[10];
+  //for (int i=0; i<10; i++) h_col[i] = i;
+  //storeColumn<int>("test", 10, h_col);
+  //int *l_col = loadColumn<int>("test", 10);
+  //for (int i=0; i<10; i++) cout << l_col[i] << " ";
+  //cout << endl;
+  //return 0;
+/*}*/
diff --git a/data/README.md b/data/README.md
new file mode 100644
index 0000000..bad12d5
--- /dev/null
+++ b/data/README.md
@@ -0,0 +1,17 @@
+- how to generate data for `sf=10`
+    ```shell  
+     pip3 install duckdb
+     make -C ssb/dbgen/
+     make -C ssb/loader/
+     python3 util.py ssb 10 gen 
+     chmod 777 ssb/data/s10/date.tbl
+     python3 util.py ssb 10 transform
+     python3 util.py ssb 10 sort
+     echo "change BASE_PATH in crystal_ssb_utils.h and ssb_utils.h to the right path"
+    ```
+  
+- temp
+  ```shell
+     python3 util.py ssb 10 sort_other_way # experimental
+
+ ```
\ No newline at end of file
diff --git a/data/result_of_queries/q11 b/data/result_of_queries/q11
new file mode 100644
index 0000000..ae202bf
--- /dev/null
+++ b/data/result_of_queries/q11
@@ -0,0 +1 @@
+4468236714181
\ No newline at end of file
diff --git a/data/result_of_queries/q21 b/data/result_of_queries/q21
new file mode 100644
index 0000000..1cc6850
--- /dev/null
+++ b/data/result_of_queries/q21
@@ -0,0 +1,281 @@
+1992 40 6418103797
+1993 40 6382034658
+1994 40 6525665927
+1995 40 6563646251
+1996 40 6578456138
+1997 40 6379340748
+1998 40 3698210388
+1992 41 7215774624
+1993 41 7235770546
+1994 41 7093880636
+1995 41 7156912379
+1996 41 7158257544
+1997 41 6890416180
+1998 41 4204327203
+1992 42 6601010229
+1993 42 6661380481
+1994 42 6805618335
+1995 42 6539366376
+1996 42 6832348876
+1997 42 6571540214
+1998 42 3978744872
+1992 43 7197665271
+1993 43 6945853876
+1994 43 6916324003
+1995 43 6871377174
+1996 43 6832025455
+1997 43 7045010342
+1998 43 4258248122
+1992 44 6727625998
+1993 44 6312893492
+1994 44 6562962487
+1995 44 6229499393
+1996 44 6564940527
+1997 44 6510029432
+1998 44 3893760987
+1992 45 6461380401
+1993 45 6558772163
+1994 45 6456432352
+1995 45 6239560871
+1996 45 6519111073
+1997 45 6390521469
+1998 45 3755042585
+1992 46 7012618906
+1993 46 6783420789
+1994 46 6677778187
+1995 46 7063055884
+1996 46 6717830860
+1997 46 6825894334
+1998 46 4019728654
+1992 47 6730883299
+1993 47 6673551973
+1994 47 6889765819
+1995 47 6873007289
+1996 47 6691029370
+1997 47 6656963454
+1998 47 3860509210
+1992 48 6125452526
+1993 48 6387782986
+1994 48 6220999101
+1995 48 6266773366
+1996 48 6086559287
+1997 48 6096975918
+1998 48 3424123719
+1992 49 6656732408
+1993 49 6439163794
+1994 49 7086198960
+1995 49 6801494826
+1996 49 6380057064
+1997 49 6624824324
+1998 49 4162459164
+1992 50 6960570696
+1993 50 7237748553
+1994 50 6871669861
+1995 50 7141121473
+1996 50 7137131058
+1997 50 6771705645
+1998 50 4259523518
+1992 51 6574132099
+1993 51 6358522943
+1994 51 6572086846
+1995 51 6401898562
+1996 51 6410252673
+1997 51 6426732319
+1998 51 3553327368
+1992 52 7047940337
+1993 52 6944255619
+1994 52 6773948949
+1995 52 6714537523
+1996 52 6831573122
+1997 52 6734349931
+1998 52 3804080515
+1992 53 6783046496
+1993 53 6764674340
+1994 53 6740138960
+1995 53 7013631699
+1996 53 6488850668
+1997 53 6762927970
+1998 53 3958631518
+1992 54 7105405127
+1993 54 7036373470
+1994 54 7019061940
+1995 54 6646909733
+1996 54 6546458610
+1997 54 6747693662
+1998 54 3959735030
+1992 55 7292018911
+1993 55 6919737436
+1994 55 6782433544
+1995 55 6603309817
+1996 55 6796414799
+1997 55 6930467621
+1998 55 3936336506
+1992 56 7416913901
+1993 56 7023034684
+1994 56 6886859642
+1995 56 7204223670
+1996 56 7307030629
+1997 56 7278012359
+1998 56 4195358018
+1992 57 7072285707
+1993 57 6769724436
+1994 57 7157349757
+1995 57 6786320672
+1996 57 7083167031
+1997 57 6867387556
+1998 57 3959867848
+1992 58 6668044014
+1993 58 6985920856
+1994 58 6596737151
+1995 58 6659827925
+1996 58 6454616521
+1997 58 6778311943
+1998 58 3972101307
+1992 59 6326657255
+1993 59 6484432568
+1994 59 6408868609
+1995 59 6616633932
+1996 59 6679260631
+1997 59 6372927264
+1998 59 3846617983
+1992 60 7136768663
+1993 60 6878576876
+1994 60 6741527996
+1995 60 6769490915
+1996 60 6903254888
+1997 60 6960241189
+1998 60 4130144930
+1992 61 6738756530
+1993 61 6636715879
+1994 61 6763648338
+1995 61 6617746150
+1996 61 6892805375
+1997 61 6536932124
+1998 61 3835627586
+1992 62 6164211920
+1993 62 6342711015
+1994 62 6300859287
+1995 62 6732584121
+1996 62 6226736904
+1997 62 6431206336
+1998 62 3948296506
+1992 63 7049023811
+1993 63 7257074782
+1994 63 7201434704
+1995 63 7143627518
+1996 63 7218551955
+1997 63 7122881926
+1998 63 4229058658
+1992 64 6918049898
+1993 64 6501113968
+1994 64 7009023813
+1995 64 6718571799
+1996 64 6763605438
+1997 64 7104613185
+1998 64 3979838975
+1992 65 7152976211
+1993 65 7458190031
+1994 65 6937644159
+1995 65 6997669629
+1996 65 6758440512
+1997 65 6673821228
+1998 65 4322914592
+1992 66 6395458610
+1993 66 6409727300
+1994 66 6143698484
+1995 66 6290853513
+1996 66 6562894079
+1997 66 6580940135
+1998 66 3835167462
+1992 67 6844550759
+1993 67 7336259130
+1994 67 7471700197
+1995 67 7040096938
+1996 67 7077296627
+1997 67 7573027340
+1998 67 4445654176
+1992 68 6586538461
+1993 68 6329817914
+1994 68 6495180880
+1995 68 6424478604
+1996 68 6541879729
+1997 68 6614661298
+1998 68 3725370328
+1992 69 6437149944
+1993 69 6642939280
+1994 69 6493295161
+1995 69 6642164323
+1996 69 6502125649
+1997 69 6745438347
+1998 69 3683114400
+1992 70 6956560451
+1993 70 7000267344
+1994 70 6510307841
+1995 70 6337688211
+1996 70 7034861207
+1997 70 6184992923
+1998 70 3792560046
+1992 71 7503528393
+1993 71 7311857458
+1994 71 7877750677
+1995 71 7548275489
+1996 71 7299486342
+1997 71 7130260446
+1998 71 4364145775
+1992 72 6773151840
+1993 72 6705723103
+1994 72 6576032819
+1995 72 6874053112
+1996 72 6405666522
+1997 72 6755654898
+1998 72 4000181003
+1992 73 6642879253
+1993 73 6637877324
+1994 73 6954631030
+1995 73 6454765835
+1996 73 6598056575
+1997 73 6785666627
+1998 73 3680466597
+1992 74 7338251519
+1993 74 7059280620
+1994 74 7061358044
+1995 74 7044215380
+1996 74 7332374720
+1997 74 7010120097
+1998 74 4382878336
+1992 75 6993738734
+1993 75 7079353328
+1994 75 7136558061
+1995 75 7044955465
+1996 75 6832731514
+1997 75 6839863219
+1998 75 4144643700
+1992 76 7216305524
+1993 76 7243703041
+1994 76 7274388343
+1995 76 7233806943
+1996 76 6971700893
+1997 76 7041104465
+1998 76 3945668122
+1992 77 5945520673
+1993 77 6149061528
+1994 77 5791875920
+1995 77 5953806237
+1996 77 6094857618
+1997 77 6161765944
+1998 77 3589143954
+1992 78 6082662735
+1993 78 6384605378
+1994 78 6458393784
+1995 78 6165718089
+1996 78 6742644418
+1997 78 6420560847
+1998 78 3695789321
+1992 79 7126061027
+1993 79 7055353878
+1994 79 7101978837
+1995 79 6781640340
+1996 79 7257992096
+1997 79 6791270791
+1998 79 4070644777
+Res Count: 280
\ No newline at end of file
diff --git a/data/result_of_queries/q31 b/data/result_of_queries/q31
new file mode 100644
index 0000000..0f82ec5
--- /dev/null
+++ b/data/result_of_queries/q31
@@ -0,0 +1,151 @@
+1992 8 8 53840255574
+1993 8 8 53166216941
+1994 8 8 53437240310
+1995 8 8 53396799768
+1996 8 8 54110132821
+1997 8 8 53398173290
+1992 9 8 56083363742
+1993 9 8 55223660082
+1994 9 8 55339397030
+1995 9 8 54949301113
+1996 9 8 55903082845
+1997 9 8 54769022116
+1992 12 8 52837317579
+1993 12 8 53383468103
+1994 12 8 52207914158
+1995 12 8 52862670951
+1996 12 8 52829409093
+1997 12 8 53020471016
+1992 18 8 53363391476
+1993 18 8 52946193531
+1994 18 8 52997321941
+1995 18 8 53672123936
+1996 18 8 53436907487
+1997 18 8 54352229494
+1992 21 8 53796356168
+1993 21 8 54304168176
+1994 21 8 53974392943
+1995 21 8 53857720297
+1996 21 8 54093512752
+1997 21 8 53598437998
+1992 8 9 55334149561
+1993 8 9 55669527348
+1994 8 9 54838930433
+1995 8 9 55981258937
+1996 8 9 56549465183
+1997 8 9 55451474341
+1992 9 9 57493556858
+1993 9 9 58025342779
+1994 9 9 57308767649
+1995 9 9 57866394299
+1996 9 9 58676834632
+1997 9 9 57151657961
+1992 12 9 55470271862
+1993 12 9 55450422145
+1994 12 9 55166732599
+1995 12 9 55756628069
+1996 12 9 55295862862
+1997 12 9 53377511976
+1992 18 9 56294215648
+1993 18 9 56167494867
+1994 18 9 55456868802
+1995 18 9 55888788272
+1996 18 9 56240855720
+1997 18 9 55624174081
+1992 21 9 56528084092
+1993 21 9 57031719413
+1994 21 9 56459028335
+1995 21 9 57672132145
+1996 21 9 56293030145
+1997 21 9 56215096026
+1992 8 12 51104583944
+1993 8 12 52291194128
+1994 8 12 52149700327
+1995 8 12 51756734585
+1996 8 12 52743929158
+1997 8 12 53618521846
+1992 9 12 54393567369
+1993 9 12 53410059754
+1994 9 12 53932060476
+1995 9 12 54260687958
+1996 9 12 55113622290
+1997 9 12 53884139975
+1992 12 12 52781570092
+1993 12 12 52683527061
+1994 12 12 50283319443
+1995 12 12 51809888688
+1996 12 12 52500376734
+1997 12 12 50815598125
+1992 18 12 52836119396
+1993 18 12 52943818670
+1994 18 12 52153400982
+1995 18 12 51839229204
+1996 18 12 53030051819
+1997 18 12 53281809182
+1992 21 12 53377618064
+1993 21 12 53970340911
+1994 21 12 54078621677
+1995 21 12 53174393671
+1996 21 12 52256511400
+1997 21 12 53064919288
+1992 8 18 51758985311
+1993 8 18 52173652875
+1994 8 18 52604990324
+1995 8 18 52587898615
+1996 8 18 51780518836
+1997 8 18 51906203038
+1992 9 18 53887104795
+1993 9 18 53920040836
+1994 9 18 53898996978
+1995 9 18 54122679431
+1996 9 18 54303106396
+1997 9 18 54430180840
+1992 12 18 51465172557
+1993 12 18 51406709327
+1994 12 18 52099528581
+1995 12 18 51320895827
+1996 12 18 51326040782
+1997 12 18 52052860907
+1992 18 18 53202304966
+1993 18 18 52197097507
+1994 18 18 52525946124
+1995 18 18 52421548431
+1996 18 18 53671108592
+1997 18 18 52788981021
+1992 21 18 53635069027
+1993 21 18 53806768582
+1994 21 18 52744648993
+1995 21 18 52784240366
+1996 21 18 53641429016
+1997 21 18 52632344235
+1992 8 21 49640993819
+1993 8 21 50661144654
+1994 8 21 50362372598
+1995 8 21 50516483322
+1996 8 21 51123449982
+1997 8 21 51125299004
+1992 9 21 51006397394
+1993 9 21 51824859693
+1994 9 21 51996233504
+1995 9 21 51968286051
+1996 9 21 53096102262
+1997 9 21 51759284236
+1992 12 21 49650941206
+1993 12 21 50057832135
+1994 12 21 50097922236
+1995 12 21 48627408805
+1996 12 21 50197634124
+1997 12 21 49149703784
+1992 18 21 50875757574
+1993 18 21 50618892442
+1994 18 21 50420152423
+1995 18 21 50255208143
+1996 18 21 50798876344
+1997 18 21 50981837552
+1992 21 21 49991192802
+1993 21 21 51428673225
+1994 21 21 49946254861
+1995 21 21 51328670072
+1996 21 21 50456326144
+1997 21 21 50401588878
+Res Count: 150
diff --git a/data/result_of_queries/q41 b/data/result_of_queries/q41
new file mode 100644
index 0000000..4d0188b
--- /dev/null
+++ b/data/result_of_queries/q41
@@ -0,0 +1,36 @@
+1992 1 103719745491
+1993 1 104804149905
+1994 1 102680809322
+1995 1 104521470391
+1996 1 105409529511
+1997 1 103520208117
+1998 1 60245313373
+1992 2 106246161239
+1993 2 106198050501
+1994 2 106093079488
+1995 2 107568611750
+1996 2 106880639017
+1997 2 106690124662
+1998 2 61912349455
+1992 3 106647931375
+1993 3 107048690889
+1994 3 104514167652
+1995 3 105315997395
+1996 3 105586646448
+1997 3 106924659923
+1998 3 62738136949
+1992 17 104134609838
+1993 17 104651610426
+1994 17 104257308810
+1995 17 104390879969
+1996 17 105890415529
+1997 17 104161057567
+1998 17 62706700969
+1992 24 105245006839
+1993 24 104166556157
+1994 24 107595107297
+1995 24 104996502880
+1996 24 104859848521
+1997 24 105030361725
+1998 24 62169336083
+Res Count: 35
\ No newline at end of file
diff --git a/data/ssb/.gitignore b/data/ssb/.gitignore
new file mode 100644
index 0000000..5d04e56
--- /dev/null
+++ b/data/ssb/.gitignore
@@ -0,0 +1,4 @@
+data/
+dbgen/*.o
+dbgen/dbgen
+loader/loader
diff --git a/data/ssb/SSB.md b/data/ssb/SSB.md
new file mode 100644
index 0000000..2b58538
--- /dev/null
+++ b/data/ssb/SSB.md
@@ -0,0 +1,372 @@
+Star Schema Benchmark Queries
+=============================
+
+
+Queries
+-------
+
+q11
+
+select sum(lo_extendedprice * lo_discount) as revenue
+from lineorder,date
+where lo_orderdate = d_datekey
+and d_year = 1993 and lo_discount>=1
+and lo_discount<=3
+and lo_quantity<25;
+
+q11.m
+
+select sum(lo_extendedprice * lo_discount) as revenue
+from lineorder
+where lo_orderdate >= 19930101 and lo_orderdate <= 19931231 and lo_discount>=1
+and lo_discount<=3
+and lo_quantity<25;
+
+q12
+
+select sum(lo_extendedprice * lo_discount) as revenue
+from lineorder,date
+where lo_orderdate = d_datekey
+and d_yearmonthnum = 199401
+and lo_discount>=4
+and lo_discount<=6
+and lo_quantity>=26
+and lo_quantity<=35;
+
+q12.m
+
+select sum(lo_extendedprice * lo_discount) as revenue
+from lineorder
+where lo_orderdate >= 19940101 and lo_orderdate <= 19940131 
+and lo_discount>=4 and lo_discount<=6
+and lo_quantity>=26
+and lo_quantity<=35;
+
+q13
+
+select sum(lo_extendedprice * lo_discount) as revenue
+from lineorder,date
+where lo_orderdate = d_datekey
+and d_weeknuminyear = 6
+and d_year = 1994
+and lo_discount>=5
+and lo_discount<=7
+and lo_quantity>=26
+and lo_quantity<=35;
+
+q13.m
+
+select sum(lo_extendedprice * lo_discount) as revenue
+from lineorder
+where lo_orderdate >= 19940204
+and lo_orderdate <= 19940210
+and lo_discount>=5
+and lo_discount<=7
+and lo_quantity>=26
+and lo_quantity<=35;
+
+q21
+
+select sum(lo_revenue),d_year,p_brand1
+from lineorder,part,supplier,date
+where lo_orderdate = d_datekey
+and lo_partkey = p_partkey
+and lo_suppkey = s_suppkey
+and p_category = 'MFGR#12'
+and s_region = 'AMERICA'
+group by d_year,p_brand1
+order by d_year,p_brand1;
+
+q21.m
+
+select sum(lo_revenue),d_year,p_brand1
+from lineorder,part,supplier,ddate
+where lo_orderdate = d_datekey
+and lo_partkey = p_partkey
+and lo_suppkey = s_suppkey
+and p_category = 1
+and s_region = 1
+group by d_year,p_brand1
+order by d_year,p_brand1;
+
+q22
+
+select sum(lo_revenue),d_year,p_brand1
+from lineorder, part, supplier,date
+where lo_orderdate = d_datekey
+and lo_partkey = p_partkey
+and lo_suppkey = s_suppkey
+and p_brand1 >= 'MFGR#2221'
+and p_brand1 <= 'MFGR#2228'
+and s_region = 'ASIA'
+group by d_year,p_brand1
+order by d_year,p_brand1;
+
+q22.m
+
+select sum(lo_revenue),d_year,p_brand1
+from lineorder, part, supplier,ddate
+where lo_orderdate = d_datekey
+and lo_partkey = p_partkey
+and lo_suppkey = s_suppkey
+and p_brand1 >= 260
+and p_brand1 <= 267
+and s_region = 2
+group by d_year,p_brand1
+order by d_year,p_brand1;
+
+q23
+
+select sum(lo_revenue),d_year,p_brand1
+from lineorder,part,supplier,date
+where lo_orderdate = d_datekey
+and lo_partkey = p_partkey
+and lo_suppkey = s_suppkey
+and p_brand1 = 'MFGR#2239'
+and s_region = 'EUROPE'
+group by d_year,p_brand1
+order by d_year,p_brand1;
+
+q23.m
+
+select sum(lo_revenue),d_year,p_brand1
+from lineorder,part,supplier,ddate
+where lo_orderdate = d_datekey
+and lo_partkey = p_partkey
+and lo_suppkey = s_suppkey
+and p_brand1 = 278
+and s_region = 3
+group by d_year,p_brand1
+order by d_year,p_brand1;
+
+Dictionary Encoding
+America => 1
+Asia => 2
+Europe => 3
+
+q31 [Aggregates greater than int]
+
+select c_nation,s_nation,d_year,sum(lo_revenue) as revenue
+from lineorder,customer, supplier,date
+where lo_custkey = c_custkey
+and lo_suppkey = s_suppkey
+and lo_orderdate = d_datekey
+and c_region = 'ASIA'
+and s_region = 'ASIA'
+and d_year >= 1992 and d_year <= 1997
+group by c_nation,s_nation,d_year
+order by d_year asc,revenue desc;
+
+q31.m
+
+select c_nation,s_nation,d_year,sum(lo_revenue) as revenue
+from lineorder,customer, supplier,ddate
+where lo_custkey = c_custkey
+and lo_suppkey = s_suppkey
+and lo_orderdate = d_datekey
+and c_region = 2
+and s_region = 2
+and d_year >= 1992 and d_year <= 1997
+group by c_nation,s_nation,d_year
+order by d_year asc,revenue desc;
+
+q32
+
+select c_city,s_city,d_year,sum(lo_revenue) as revenue
+from lineorder,customer,supplier,date
+where lo_custkey = c_custkey
+and lo_suppkey = s_suppkey
+and lo_orderdate = d_datekey
+and c_nation = 'UNITED STATES'
+and s_nation = 'UNITED STATES'
+and d_year >=1992 and d_year <= 1997
+group by c_city,s_city,d_year
+order by d_year asc,revenue desc;
+
+q32.m
+
+select c_city,s_city,d_year,sum(lo_revenue) as revenue
+from lineorder,customer,supplier,ddate
+where lo_custkey = c_custkey
+and lo_suppkey = s_suppkey
+and lo_orderdate = d_datekey
+and c_nation = 24
+and s_nation = 24
+and d_year >=1992 and d_year <= 1997
+group by c_city,s_city,d_year
+order by d_year asc,revenue desc;
+
+q33
+
+select c_city,s_city,d_year,sum(lo_revenue) as revenue
+from lineorder,customer,supplier,date
+where lo_custkey = c_custkey
+and lo_suppkey = s_suppkey
+and lo_orderdate = d_datekey
+and (c_city = 'UNITED KI1' or c_city = 'UNITED KI5')
+and (s_city = 'UNITED KI1' or s_city = 'UNITED KI5')
+and d_year >=1992 and d_year <= 1997
+group by c_city,s_city,d_year
+order by d_year asc,revenue desc;
+
+q33.m
+
+select c_city,s_city,d_year,sum(lo_revenue) as revenue
+from lineorder,customer,supplier,ddate
+where lo_custkey = c_custkey
+and lo_suppkey = s_suppkey
+and lo_orderdate = d_datekey
+and (c_city = 231 or c_city = 235)
+and (s_city = 231 or s_city = 235)
+and d_year >=1992 and d_year <= 1997
+group by c_city,s_city,d_year
+order by d_year asc,revenue desc;
+
+q34
+
+select c_city,s_city,d_year,sum(lo_revenue) as revenue
+from lineorder,customer,supplier,date
+where lo_custkey = c_custkey
+and lo_suppkey = s_suppkey
+and lo_orderdate = d_datekey
+and (c_city = 'UNITED KI1' or c_city = 'UNITED KI5')
+and (s_city = 'UNITED KI1' or s_city = 'UNITED KI5')
+and d_yearmonth = 'Dec1997'
+group by c_city,s_city,d_year
+order by d_year asc,revenue desc;
+
+q34.m
+
+select c_city,s_city,d_year,sum(lo_revenue) as revenue
+from lineorder,customer,supplier,ddate
+where lo_custkey = c_custkey
+and lo_suppkey = s_suppkey
+and lo_orderdate = d_datekey
+and (c_city = 231 or c_city = 235)
+and (s_city = 231 or s_city = 235)
+and d_yearmonthnum = 199712
+group by c_city,s_city,d_year
+order by d_year asc,revenue desc;
+
+ASIA => 2
+UNITED STATES => 24
+UNITED KI1 => 231
+UNITED KI5 => 235
+
+q41 [Aggregates greater than int]
+
+select d_year,c_nation,sum(lo_revenue-lo_supplycost) as profit
+from lineorder,supplier,customer,part, date
+where lo_custkey = c_custkey
+and lo_suppkey = s_suppkey
+and lo_partkey = p_partkey
+and lo_orderdate = d_datekey
+and c_region = 'AMERICA'
+and s_region = 'AMERICA'
+and (p_mfgr = 'MFGR#1' or p_mfgr = 'MFGR#2')
+group by d_year,c_nation
+order by d_year,c_nation;
+
+q41.m
+
+select d_year,c_nation,sum(lo_revenue-lo_supplycost) as profit
+from lineorder,supplier,customer,part,ddate
+where lo_custkey = c_custkey
+and lo_suppkey = s_suppkey
+and lo_partkey = p_partkey
+and lo_orderdate = d_datekey
+and c_region = 1
+and s_region = 1
+and (p_mfgr = 0 or p_mfgr = 1)
+group by d_year,c_nation
+order by d_year,c_nation;
+
+q42
+
+select d_year,s_nation,p_category,sum(lo_revenue-lo_supplycost) as profit
+from lineorder,customer,supplier,part,date
+where lo_custkey = c_custkey
+and lo_suppkey = s_suppkey
+and lo_partkey = p_partkey
+and lo_orderdate = d_datekey
+and c_region = 'AMERICA'
+and s_region = 'AMERICA'
+and (d_year = 1997 or d_year = 1998)
+and (p_mfgr = 'MFGR#1' or p_mfgr = 'MFGR#2')
+group by d_year,s_nation, p_category
+order by d_year,s_nation, p_category;
+
+q42.m
+
+select d_year,s_nation,p_category,sum(lo_revenue-lo_supplycost) as profit
+from lineorder,customer,supplier,part,ddate
+where lo_custkey = c_custkey
+and lo_suppkey = s_suppkey
+and lo_partkey = p_partkey
+and lo_orderdate = d_datekey
+and c_region = 1
+and s_region = 1
+and (d_year = 1997 or d_year = 1998)
+and (p_mfgr = 0 or p_mfgr = 1)
+group by d_year,s_nation, p_category
+order by d_year,s_nation, p_category;
+
+q43
+
+select d_year,s_city,p_brand1,sum(lo_revenue-lo_supplycost) as profit
+from lineorder,supplier,customer,part,date
+where lo_custkey = c_custkey
+and lo_suppkey = s_suppkey
+and lo_partkey = p_partkey
+and lo_orderdate = d_datekey
+and c_region = 'AMERICA'
+and s_nation = 'UNITED STATES'
+and (d_year = 1997 or d_year = 1998)
+and p_category = 'MFGR#14'
+group by d_year,s_city,p_brand1
+order by d_year,s_city,p_brand1;
+
+q43.m
+
+select d_year,s_city,p_brand1,sum(lo_revenue-lo_supplycost) as profit
+from lineorder,supplier,customer,part,ddate
+where lo_custkey = c_custkey
+and lo_suppkey = s_suppkey
+and lo_partkey = p_partkey
+and lo_orderdate = d_datekey
+and c_region = 1
+and s_nation = 24
+and (d_year = 1997 or d_year = 1998)
+and p_category = 3
+group by d_year,s_city,p_brand1
+order by d_year,s_city,p_brand1;
+
+AMERICA => 1
+MFGR#1 => 0
+MFGR#2 => 1
+UNITED STATES => 24
+
+Data Generation
+----------------
+
+SF 1:
+./gpuDBLoaderM --lineorder ../../test/dbgen/lineorder.tbl --ddate ../../test/dbgen/date.tbl --customer ../../test/dbgen/customer.tbl.p --supplier ../../test/dbgen/supplier.tbl.p --part ../../test/dbgen/part.tbl.p --datadir ../../dataM1/
+
+SF 10:
+./gpuDBLoaderM --customer ../../data-raw10/customer.tbl.p --supplier ../../data-raw10/supplier.tbl.p --part ../../data-raw10/part.tbl.p --datadir ../../dataM10/
+
+SF 20:
+./gpuDBLoaderM --lineorder ../../data-raw20/lineorder.tbl --ddate ../../data-raw20/date.tbl --customer ../../data-raw20/customer.tbl.p --supplier ../../data-raw20/supplier.tbl.p --part ../../data-raw20/part.tbl.p --datadir ../../dataM20/
+
+
+python convert.py
+
+Inefficiencies
+-------------
+
+* Hash function (eg: for q23)
+
+Hyper
+---
+
+./bin/driver /big_fast_drive/anil/dbops/test/ssb/schema.sql /big_fast_drive/anil/dbops/test/ssb/load.sql --store ssb_transformed.dump
diff --git a/data/ssb/dbgen/.gitignore b/data/ssb/dbgen/.gitignore
new file mode 100644
index 0000000..665a9df
--- /dev/null
+++ b/data/ssb/dbgen/.gitignore
@@ -0,0 +1,6 @@
+*.o
+*.tbl
+qgen
+dbgen
+
+.vscode
diff --git a/data/ssb/dbgen/BUGS b/data/ssb/dbgen/BUGS
new file mode 100644
index 0000000..1f1b2ab
--- /dev/null
+++ b/data/ssb/dbgen/BUGS
@@ -0,0 +1,987 @@
+# @(#) BUGS 2.1.8.20@(#)
+# The following is a list of the various DBGEN/QGEN bugs that have been
+# and are being fixed. Each entry is of the form:
+# 
+#	Problem #xx: STATUS   -- MR ID and OPEN/closed
+#                        followed by a detailed explanation
+#    TYPE:            -- classification of the bug or issue
+#    SPEC FIX:        -- details of any change to the spec
+#    DBGEN FIX:       -- details of any change needed to QGEN/DBGEN
+#    ANSWER SETS:     -- any effect on answer sets
+#    WORKAROUND:      -- temporary fix, if available
+#    HELP NEEDED:     -- any work/assistance required
+#    AUDITORS NOTIFIED:  -- date auditors were notified, if appropriate
+#    OPENED AGAINST:     -- date and affected versions
+#    CLOSED IN:       -- date and fixed version
+#
+#	OPEN BUGS
+#	==========
+#	Problem #33: Parallel load doesn't work under NT
+#
+#	OPEN Feature Requests
+#	=================
+#	Problem #9:  would like to include answer set formatting in query templates
+#	Problem #37: need way to validate DBGEN without large storage requirement
+#	Problem #58: Need way to track changes from one release to the next
+#	
+#	OPEN Documentation Errors
+#	=================
+#	None
+#---------------------------------------------------------------------
+#Complete Bug List
+#==================
+Problem #1: closed
+Summary: Q10 returns no rows
+	Since orders can only be returned (l_returnflag = 'R') after they 
+	have been received, and can't be received in the future, the
+	number of permissible orders for query 10 tails off early in
+	1995. If you are lucky enough to get a parameter substitution
+	after February '95 (allowed in 2.12.3), things can go "quickly".
+SEVERITY:
+SPEC FIX: replace 2.12.3 (1) with "DATE is the first day in a
+	randomly selected month between the first month of 1993 and the
+	last month of 1994"
+DBGEN FIX: change permissible substitution range for query 10, 
+	parameter 1
+ANSWER SETS: not effected.
+WORKAROUND: use a different seed for qgen parameter substitution
+HELP NEEDED:
+AUDITORS NOTIFIED:	
+OPENED AGAINST: 1.0
+CLOSED IN:      1.0.1 (dbgen and qgen)
+
+Problem #2: closed
+Summary: parallelism in load to gen differing data sets
+the parallel load code was based on extensible data sets; since
+	 each "extension" made an assumption of scale factor, the data
+	 could end up clustered. Further, since the RNG is
+	 self-modifying, different numbers of extension led to different
+	 final data sets.
+SEVERITY:
+SPEC FIX: none.
+DBGEN FIX: remove -E(xtensible) option and implement pure parallel
+load with a known scale factor; rebuild seed files
+ANSWER SETS: not effected. (parallelism not implemented for SF <= 1)
+WORKAROUND: don't use the parallel load (-C) option to DBGEN
+HELP NEEDED: testers needed.
+AUDITORS NOTIFIED:	yes.
+OPENED AGAINST:     1.0
+CLOSED IN:          1.0.1
+
+Problem #3: closed
+Summary: some arithmetic tends to overflow at large SF
+retailprice tends to SF/10 as SF increases. this can lead to
+	 data corruption in extendedprice and aggregate calculations
+SEVERITY:
+SPEC FIX: will need rework of 1.3 wrt retailprice calculation
+DBGEN FIX: modification to second term of rpb_routine() calculation
+to limit contribution of second term to the maximum seen at 
+	       SF=.1
+ANSWER SETS: not effected
+WORKAROUND: code retail/extended price calculations as long long;
+build smaller data sets
+HELP NEEDED:
+AUDITORS NOTIFIED:	
+OPENED AGAINST: 1.0
+CLOSED IN:      1.0.1
+
+Problem #4: closed
+Summary: dbgen not ported to NT
+SEVERITY:
+SPEC FIX:  none
+DBGEN FIX: need to roll in changes supplied by IBM
+ANSWER SETS: not effected
+WORKAROUND: N/A
+HELP NEEDED: N/A
+AUDITORS NOTIFIED:	N/A
+OPENED AGAINST: 1.0
+CLOSED IN: 1.1.0
+
+Problem #5: closed
+Summary: QGEN seed init inconsistent
+A prior fix assured that parameter values were query order
+	 independent when a seed was provided on the command line. need
+	 to make this true when no seed is provided
+SEVERITY:
+SPEC FIX: none
+DBGEN FIX: rework seed init loop in qgen.c
+ANSWER SETS: not effected
+WORKAROUND: supply seeds on command line
+HELP NEEDED: none
+AUDITORS NOTIFIED:	N/A
+OPENED AGAINST: 1.0
+CLOSED IN: 1.0.1
+
+Problem #6: closed
+Summary: command line options with abutting arguments mishandled
+SEVERITY:
+SPEC FIX:  none
+DBGEN FIX: minor fix to getopt routine in bm_utils.c
+ANSWER SETS: not effected
+WORKAROUND: separate options and arguments with a space
+HELP NEEDED: none
+AUDITORS NOTIFIED:	N/A
+OPENED AGAINST: 1.0
+CLOSED IN: 1.0.1
+
+Problem #7: closed
+Summary: '-O f' asking for new file names twice
+SEVERITY:
+SPEC FIX:  none
+DBGEN FIX: rework of set_files() in driver.c
+ANSWER SETS: not effected
+WORKAROUND: none
+HELP NEEDED: none
+AUDITORS NOTIFIED:	N/A
+OPENED AGAINST: 1.0
+CLOSED IN: 1.0.1
+
+Problem #8: closed
+Summary: Seed generation taking too long
+SEVERITY:
+SPEC FIX: N/A
+DBGEN FIX: implement "skip and trudge" as discussed
+ANSWER SETS:  not effected
+WORKAROUND: none
+HELP NEEDED: 
+AUDITORS NOTIFIED:	N/A
+OPENED AGAINST: 1.0
+CLOSED IN: 1.0.1
+
+Problem #9: OPEN
+Summary: would like to include answer set formatting in query templates
+SEVERITY: feature request
+SPEC FIX: none
+DBGEN FIX: additional flag in qgen()
+ANSWER SETS: not effected
+WORKAROUND: N/A
+HELP NEEDED: asked for reproduction info 25 Oct 95
+AUDITORS NOTIFIED:	 N/A
+OPENED AGAINST: 1.0
+CLOSED IN:
+
+Problem #10: closed
+Summary: need to re-introduce ability to do incremental, flat file builds
+SEVERITY: feature request
+SPEC FIX: none
+DBGEN FIX: add -S(tep) option to build one of many partial data sets
+ANSWER SETS: not effected
+WORKAROUND: N/A
+HELP NEEDED:
+AUDITORS NOTIFIED:	 N/A
+OPENED AGAINST: 1.0
+CLOSED IN: 1.0.1
+
+Problem #11: closed
+Summary: Row count for first delete at 10/100 is incorrect
+SEVERITY: Error
+SPEC FIX: None
+DBGEN FIX: 
+ANSWER SETS: No Effect
+WORKAROUND: hand edit of first delete file
+HELP NEEDED:
+AUDITORS NOTIFIED:	No
+OPENED AGAINST: 1.0.1
+CLOSED IN:		2.0.0 (not sure of precise release)
+CLOSED BY:      jms@gradientsystems.com
+
+Problem #12: closed
+Summary: Bad default rowcount generated for query 17
+SEVERITY: Error
+SPEC FIX: None
+DBGEN FIX: corrected rowcnt[] entries to be 1-based
+ANSWER SETS: N/A
+WORKAROUND: hand edit query or add explicit row count to template
+HELP NEEDED:
+AUDITORS NOTIFIED:	N/A
+OPENED AGAINST: 1.0.0
+CLOSED IN: 1.1.0
+
+Problem #13: closed
+Summary: Bad expansion of SET_OUTPUT for Teradata
+SEVERITY: Error
+SPEC FIX: N/A
+DBGEN FIX: new macro in tpcd.h
+ANSWER SETS:  N/A
+WORKAROUND: Hand edit query or hardcode output directive in templates
+HELP NEEDED:
+AUDITORS NOTIFIED:	 N/A
+OPENED AGAINST: 1.0.1
+CLOSED IN: 1.1.0
+
+Problem #14: closed
+Summary: Badly formed range deletes
+SEVERITY: Error
+SPEC FIX: N/A
+DBGEN FIX: TBD
+ANSWER SETS:  N/A
+WORKAROUND: hand edit delete files
+HELP NEEDED: asked for reproduction info 25 Oct 95
+AUDITORS NOTIFIED:	N/A
+OPENED AGAINST: 1.0.1
+CLOSED IN:		2.0.0 (not sure of precise release)
+CLOSED BY:      jms@gradientsystems.com
+
+Problem #15: closed
+Summary: in a multi-stage load, parent tables are not properly named 
+when parent and child are built simultaneously
+SEVERITY: Error
+SPEC FIX: N/A
+DBGEN FIX: reworked tdef[].name in pr_X_Y routines for master/detail
+tables
+ANSWER SETS: N/A
+WORKAROUND: Build master/detail tables separately
+HELP NEEDED:
+AUDITORS NOTIFIED:	 N/A
+OPENED AGAINST: 1.0.1
+CLOSED IN: 1.1.0
+
+Problem #16: closed
+Summary: update generation at large scale factors produced the wrong number
+of rows due to overflow of 32-bit integer
+SEVERITY: BUG
+SPEC FIX: N/A
+DBGEN FIX: corrected order of operations in row count calculation in 
+driver.c
+ANSWER SETS:  N/A
+WORKAROUND:  use 64 bit integers
+HELP NEEDED: None
+AUDITORS NOTIFIED:	N/A
+OPENED AGAINST: 1.0.1
+CLOSED IN: 1.1.0
+
+Problem #17: closed
+Summary: comment fields may be truncated when using columnar output, due to
+rounding/truncation in the length calculation
+SEVERITY: BUG
+SPEC FIX: N/A
+DBGEN FIX: add ceil() calls around all PR_VSTR() calls in print.c
+ANSWER SETS: N/A
+WORKAROUND: N/A
+HELP NEEDED: None
+AUDITORS NOTIFIED:	N/A
+OPENED AGAINST: 1.0.1
+CLOSED IN: 1.1.0
+
+Problem #18: closed
+Summary: the output format for identifier fields in columnar output is
+unnecessarily large, and is inconsistent
+SEVERITY: minor
+SPEC FIX: N/A
+DBGEN FIX: revised PR_BCD2 macro
+ANSWER SETS: N/A
+WORKAROUND: avoid columnar output, or rework macro
+HELP NEEDED: none
+AUDITORS NOTIFIED:	no
+OPENED AGAINST: 1.1.0
+OPENED BY:      jenn@torolab2.vnet.ibm.com
+CLOSED IN:      1.1.0A
+CLOSED BY:      jms@informix.com
+
+Problem #19: closed
+Summary: the case statement used to decipher substitution points in the 
+query template allowed extraneous :'s to re-initialize the 
+parameter substitution
+SEVERITY:     bug
+SPEC FIX:     N/A
+DBGEN FIX:    rework flag switch in qgen.c to explicitly call out numerics
+ANSWER SETS:  N/A
+WORKAROUND:   be sure that there are no "unknown" flags in the template
+HELP NEEDED:  none
+AUDITORS NOTIFIED:	yes
+OPENED AGAINST: 1.0.1
+OPENED BY:     jenn@torolab2.vnet.ibm.com
+CLOSED IN:     1.1.0A
+CLOSED BY:     jms@informix.com 
+
+Problem #20: closed
+Summary: parameter substitution values were not affected by small changes 
+in seed values
+SEVERITY:     bug
+SPEC FIX:     N/A
+DBGEN FIX:    add UnifInt() calls to RNG init in qgen.c
+ANSWER SETS:  N/A
+WORKAROUND:   be sure seed values provide sufficient randomness in EQT
+HELP NEEDED:  none
+AUDITORS NOTIFIED:	yes
+OPENED AGAINST: 1.1.0
+OPENED BY:     alain_crolotte@elsegundoca.attgis.com
+CLOSED IN:     1.1.0B
+CLOSED BY:     jms@informix.com 
+
+Problem #21: closed
+Summary: parameter logging doesn't properly handle the variable length of
+the substitution list
+SEVERITY:     bug
+SPEC FIX:     N/A
+DBGEN FIX:    assure null termination of param list and bound the output
+loop that logs parameter usage
+ANSWER SETS:  N/A
+WORKAROUND:   none
+HELP NEEDED:  none
+AUDITORS NOTIFIED:	N/A
+OPENED AGAINST: 1.1.0B
+OPENED BY:     
+CLOSED IN: 1.1.0C
+CLOSED BY:     jms@informix.com 
+
+Problem #22: closed
+Summary: parameter output for Q11 can overflow default formatting at very
+large volumes
+SEVERITY:     bug
+SPEC FIX:     N/A
+DBGEN FIX:    expand format string to %11.10f
+ANSWER SETS:  N/A
+WORKAROUND:   hand code queries for large volumes
+HELP NEEDED:  none
+AUDITORS NOTIFIED:	N/A
+OPENED AGAINST: 1.1.0B
+OPENED BY:     francois@ip.com
+CLOSED IN:     1.1.0C
+CLOSED BY:      jms@informix.com
+
+Problem #23: closed
+Summary: typos in variant 14c
+SEVERITY:     
+SPEC FIX:    N/A
+DBGEN FIX:   corrected query template
+ANSWER SETS: N/A
+WORKAROUND:  none
+HELP NEEDED: none
+AUDITORS NOTIFIED:	no
+OPENED AGAINST: 1.1.0B
+OPENED BY:    francois@ip.com 
+CLOSED IN:    1.1.0C
+CLOSED BY:    jms@informix.com  
+
+Problem #24: closed
+Summary: macro PR_DATE was hard-coded to print t->alpha even though a 
+target was passed in as a parameter
+SEVERITY:     minor
+SPEC FIX:     N/A
+SOURCE FIX:   re-worked macro to properly use its arguments 
+ANSWER SETS:  N/A
+WORKAROUND:   none
+HELP NEEDED:  none
+AUDITORS NOTIFIED:	no
+OPENED AGAINST:  1.1.0A
+OPENED BY:       Robert.Lane@eng.sun.com
+CLOSED IN:       dbgen 1.1.0B
+CLOSED BY:       jms@informix.com
+
+Problem #25: closed
+Summary: typos in variant 10a
+SEVERITY:     
+SPEC FIX:    N/A
+DBGEN FIX:   corrected query template
+ANSWER SETS: N/A
+WORKAROUND:  none
+HELP NEEDED: none
+AUDITORS NOTIFIED:	no
+OPENED AGAINST: 1.1.0B
+OPENED BY:    francois@ip.com 
+CLOSED IN:    1.1.0C
+CLOSED BY:    jms@informix.com  
+
+Problem #26: closed
+Summary: the version numbers for QGEN and DBGEN do not match
+SEVERITY:     minor
+SPEC FIX:     N/A
+SOURCE FIX:   unified version numbers starting with 1.1.0C
+ANSWER SETS:  N/A
+WORKAROUND:   N/A
+HELP NEEDED:  none
+AUDITORS NOTIFIED:	no
+OPENED AGAINST:  1.1.0B (or 1.1.0C, depending)
+OPENED BY:       Robert.Lane@eng.sun.com
+CLOSED IN:       1.1.0C
+CLOSED BY:       jms@informix.com
+
+Problem #27: closed
+Summary: correcting typos in 7, 9, 13
+SEVERITY:     minor
+SPEC FIX:     N/A
+SOURCE FIX:   fixed them
+ANSWER SETS:  N/A
+WORKAROUND:   N/A
+HELP NEEDED:  none
+AUDITORS NOTIFIED:	N/A
+OPENED AGAINST:  1.1.0C (pre-release)
+OPENED BY:       tblank@vnet.ibm.com
+CLOSED IN:       1.1.0C
+CLOSED BY:       jms@informix.com
+
+Problem #28: closed
+Summary: Seed generation fails with SF > 1000 due to 32 bit integer
+arithmetic used to verify "divisible-ness" of data set
+SEVERITY:     bug
+SPEC FIX:     N/A
+SOURCE FIX:   TBD
+ANSWER SETS:  N/A
+WORKAROUND:   N/A
+HELP NEEDED:  none
+AUDITORS NOTIFIED:	N/A
+OPENED AGAINST:  1.1.0C
+OPENED BY:       alain_colotte@elsegundoca.ncr.com
+CLOSED IN:      1.3.0 
+CLOSED BY:       jms@gradientsystems.com
+
+Problem #29: closed
+Summary: Compile time errors on Solaris 2.5.1 and SunOS
+SEVERITY:    bug 
+SPEC FIX:     N/A
+SOURCE FIX:   Solaris fixed by renaming lineitem field from extended to
+				  eprice; SunOS problem documented in Porting.Notes
+ANSWER SETS:  N/A
+WORKAROUND:   N/A
+HELP NEEDED:  N/A
+AUDITORS NOTIFIED:	N/A
+OPENED AGAINST:  1.1.0D
+OPENED BY:       jms@informix.com
+CLOSED IN:       1.2.0
+CLOSED BY:       jms@informix.com
+
+Problem #30: closed
+Summary: Cryptic comments in dists.dss
+SEVERITY:     flaw
+SPEC FIX:     N/A
+SOURCE FIX:   Cleaned up the comments in the file
+ANSWER SETS:  N/A
+WORKAROUND:   N/A
+HELP NEEDED:  N/A
+AUDITORS NOTIFIED:	N/A
+OPENED AGAINST:  1.2.0
+OPENED BY:       francois@sizing.com
+CLOSED IN:       1.2.3 ALPHA 1
+CLOSED BY:       jms@informix.com
+
+Problem #31: closed
+Summary: Inconsistent handling of fopen() failures
+SEVERITY:     bug
+SPEC FIX:     N/A
+SOURCE FIX:   introduced OPEN_CHECK macro (defined in dss.h)
+ANSWER SETS:  N/A
+WORKAROUND:   N/A
+HELP NEEDED:  none
+AUDITORS NOTIFIED:	N/A
+OPENED AGAINST:  1.2.0
+OPENED BY:       schiefer@ca.ibm.com
+CLOSED IN:       1.3.0
+CLOSED BY:       jms@gradientsystems.com
+
+Problem #32: closed
+Summary: Path separators were hard-coded
+SEVERITY:     bug
+SPEC FIX:     N/A
+SOURCE FIX:   introduced PATH_SEP in config.h
+ANSWER SETS:  N/A
+WORKAROUND:   N/A
+HELP NEEDED:  none
+AUDITORS NOTIFIED:	N/A
+OPENED AGAINST:  1.2.0
+OPENED BY:       
+CLOSED IN:       1.3.0
+CLOSED BY:       jms@gradientsystems.com
+
+Problem #33: OPEN
+Summary: Parallel load doesn't work under NT
+SEVERITY:     bug
+SPEC FIX:     N/A
+SOURCE FIX:   
+ANSWER SETS:  N/A
+WORKAROUND:   use -S option to build each step independently
+HELP NEEDED:  none
+AUDITORS NOTIFIED:	N/A
+OPENED AGAINST:  1.1.0
+OPENED BY:       
+CLOSED IN:       
+CLOSED BY:       
+
+Problem #34: closed
+Summary: P_NAME not properly populated
+SEVERITY:     bug
+SPEC FIX:     N/A
+SOURCE FIX:   Corrected color selection logic in agg_str()
+ANSWER SETS:  NFI for 1.x since it affects answer sets
+WORKAROUND:   N/A
+HELP NEEDED:  None
+AUDITORS NOTIFIED: N/A	
+OPENED AGAINST:  1.2.3
+OPENED BY:       schiefer@ca.ibm.com
+CLOSED IN:       2.0.0
+CLOSED BY:       jms@gradientsystems.com
+
+Problem #35: closed
+Summary: mk_sparse() returning bad orderkeys
+SEVERITY:     bug
+SPEC FIX:     N/A
+SOURCE FIX:   corrected logic in mk_sparse() and bcd2_bin()
+ANSWER SETS:  N/A
+WORKAROUND:   N/A
+HELP NEEDED:  None
+AUDITORS NOTIFIED: N/A	
+OPENED AGAINST:  1.3.0    
+OPENED BY:       jennc@ca.ibm.com
+CLOSED IN:       1.3.1
+CLOSED BY:       jms@gradientsystems.com
+
+Problem #36: closed
+Summary: a_rnd() doesn't mask properly, uses small 'alphabet'
+SEVERITY:     bug
+SPEC FIX:     Corrected 4.2.2.6 to reflect 64 character set
+SOURCE FIX:   changed mask in a_rnd() from 067 to 077
+ANSWER SETS:  NFI for 1.x since answers would be affected
+WORKAROUND:   N/A
+HELP NEEDED:  None
+AUDITORS NOTIFIED: N/A	
+OPENED AGAINST:  1.2.3
+OPENED BY:       pek@elsegundoca.ncr.com
+CLOSED IN:       2.0.0
+CLOSED BY:       jms@gradientsystems.com
+
+Problem #37: OPEN
+Summary: need way to validate DBGEN without large storage requirement
+SEVERITY:     Feature Request
+SPEC FIX:     N/A
+SOURCE FIX:   Provide vrf_xxx routine to generate checksums
+ANSWER SETS:  N/A
+WORKAROUND:   N/A
+HELP NEEDED:  None
+AUDITORS NOTIFIED: N/A	
+OPENED AGAINST:  1.2.3
+OPENED BY:       jms@gradientsystems.com
+CLOSED IN:       
+CLOSED BY:       
+
+Problem #38: closed
+Summary: need to be able to generate specific update set
+SEVERITY:     Feature Request
+SPEC FIX:     N/A
+SOURCE FIX:   Update update generation to use -S <n> option
+ANSWER SETS:  N/A
+WORKAROUND:   N/A
+HELP NEEDED:  None
+AUDITORS NOTIFIED: N/A	
+OPENED AGAINST:  1.3.1
+OPENED BY:       jennc@ca.ibm.com
+CLOSED IN:       2.0.0 (not certain of fix version)
+CLOSED BY:       jms@gradientsystems.com
+
+Problem #39: closed
+Summary: README for dbgen is out of date
+SEVERITY:     Documentation error
+SPEC FIX:     N/A
+SOURCE FIX:   Rewrite of README
+ANSWER SETS:  N/A
+WORKAROUND:   N/A
+HELP NEEDED:  None
+AUDITORS NOTIFIED: N/A	
+OPENED AGAINST:  2.0.0.6b
+OPENED BY:       jennc@ca.ibm.com
+CLOSED IN:       2.0.0
+CLOSED BY:       jms@gradientsystems.com
+
+Problem #40: closed
+Summary: O_CUSTKEY is generated out of range at 10GB
+SEVERITY:     Bug
+SPEC FIX:     N/A
+SOURCE FIX:   Correction of CUST_MORTALITY calculation
+ANSWER SETS:  Unknown
+WORKAROUND:   N/A
+HELP NEEDED:  None
+AUDITORS NOTIFIED: N/A	
+OPENED AGAINST:  2.0.0.7
+OPENED BY:       wayne.smith@intel.com
+CLOSED IN:       2.0.0.8
+CLOSED BY:       jms@gradientsystems.com
+
+Problem #41: closed
+Summary: V2 appears slower than V1
+SEVERITY:     Bug
+SPEC FIX:    
+SOURCE FIX:   Used NthElement() in row_stop()
+ANSWER SETS:  N/A
+WORKAROUND:   N/A
+HELP NEEDED:  None
+AUDITORS NOTIFIED: N/A	
+OPENED AGAINST:  2.0.0.8
+OPENED BY:       jennc@ca.ibm.com
+CLOSED IN:       2.01a
+CLOSED BY:       jms@gradientsystems.com
+
+Problem #42: closed
+Summary: Dual declaration of articles causes C++ compilation error
+SEVERITY:     Bug
+SPEC FIX:     N/A
+SOURCE FIX:   Duplicate declaration removed
+ANSWER SETS:  N/A
+WORKAROUND:   N/A
+HELP NEEDED:  None
+AUDITORS NOTIFIED: N/A	
+OPENED AGAINST:  2.0.0
+OPENED BY:       jpm@informix.com
+CLOSED IN:       2.0.0a
+CLOSED BY:       jms@gradientsystems.com
+
+Problem #43: closed
+Summary: Subselect wild card not consistent with spec
+SEVERITY:     Bug
+SPEC FIX:     N/A
+SOURCE FIX:   Query templates corrected
+ANSWER SETS:  N/A
+WORKAROUND:   N/A
+HELP NEEDED:  None
+AUDITORS NOTIFIED: N/A	
+OPENED AGAINST:  2.0.0
+OPENED BY:       jpm@informix.com
+CLOSED IN:       2.0.0a
+CLOSED BY:       jms@gradientsystems.com
+
+Problem #44: closed
+Summary: small money values incorrect
+SEVERITY:     Bug
+SPEC FIX:     N/A
+SOURCE FIX:   reworked PR_xxx macros
+ANSWER SETS:  new answer included for Q22
+WORKAROUND:   N/A
+HELP NEEDED:  None
+AUDITORS NOTIFIED: N/A	
+OPENED AGAINST:  2.0.0
+OPENED BY:       ac4@elsegundoca.ncr.com
+CLOSED IN:       1.0.1
+CLOSED BY:       jms@gradientsystems.com
+
+Problem #45: closed
+Summary: L_ORDERKEY/O_ORDERKEY incorrect
+SEVERITY:     Bug
+SPEC FIX:     N/A
+SOURCE FIX:   corrected pointer arithmetic in print.c
+ANSWER SETS:  N/A
+WORKAROUND:   N/A
+HELP NEEDED:  None
+AUDITORS NOTIFIED: N/A	
+OPENED AGAINST:  1.0.1
+OPENED BY:       jpm@informix.com
+CLOSED IN:       1.0.1a
+CLOSED BY:       jms@gradientsystems.com
+
+Problem #46: closed
+Summary: L_ORDERKEY/O_ORDERKEY incorrect
+SEVERITY:     Dup (see #45)
+SPEC FIX:     N/A
+SOURCE FIX:   N/A
+ANSWER SETS:  N/A
+WORKAROUND:   N/A
+HELP NEEDED:  None
+AUDITORS NOTIFIED: N/A	
+OPENED AGAINST:  1.0.1
+OPENED BY:       jennc@ca.ibm.com
+CLOSED IN:       1.0.1a
+CLOSED BY:       jms@gradientsystems.com
+
+Problem #47: closed
+Summary: QGEN parameter substitution not random
+SEVERITY:     BUG
+SPEC FIX:     N/A
+SOURCE FIX:   corrected varsub RANDOM usage to reflect seed file removal
+ANSWER SETS:  N/A
+WORKAROUND:   N/A
+HELP NEEDED:  None
+AUDITORS NOTIFIED: N/A	
+OPENED AGAINST:  1.0.1
+OPENED BY:       wayne.smith@intel.com
+CLOSED IN:       1.0.1a
+CLOSED BY:       jms@gradientsystems.com
+
+Problem #48: closed
+Summary: QGEN parameter substitution not random for Q21
+SEVERITY:     BUG
+SPEC FIX:     N/A
+SOURCE FIX:   corrected varsub to only reference nations2 distribution
+ANSWER SETS:  N/A
+WORKAROUND:   N/A
+HELP NEEDED:  None
+AUDITORS NOTIFIED: N/A	
+OPENED AGAINST:  1.0.1a
+OPENED BY:       wayne.smith@intel.com
+CLOSED IN:       1.0.1b
+CLOSED BY:       jms@gradientsystems.com
+
+Problem #49: closed
+Summary: Extraneous trailing separator in delete files 
+SEVERITY:     BUG
+SPEC FIX:     N/A
+SOURCE FIX:   special-cased the handling of deletes using PR_KEY
+ANSWER SETS:  N/A
+WORKAROUND:   N/A
+HELP NEEDED:  None
+AUDITORS NOTIFIED: N/A	
+OPENED AGAINST:  1.0.1b
+OPENED BY:       wayne.smith@intel.com
+CLOSED IN:       1.0.1c
+CLOSED BY:       jms@gradientsystems.com
+
+Problem #50: closed
+Summary: qgen not generating valid parameter log files for defaults
+SEVERITY:     BUG
+SPEC FIX:     N/A
+SOURCE FIX:   corrected params/default reference
+ANSWER SETS:  N/A
+WORKAROUND:   N/A
+HELP NEEDED:  None
+AUDITORS NOTIFIED: N/A	
+OPENED AGAINST:  1.0.1
+OPENED BY:       cta@elsegundoca.ncr.com
+CLOSED IN:       1.0.1d
+CLOSED BY:       jms@gradientsystems.com
+
+Problem #51: closed
+Summary: inconsistent/invariant substitutions in Q16, Q17, Q19
+SEVERITY:     BUG
+SPEC FIX:     N/A
+SOURCE FIX:   corrected "brand" selection to make order irrelevant
+ANSWER SETS:  N/A
+WORKAROUND:   N/A
+HELP NEEDED:  None
+AUDITORS NOTIFIED: N/A	
+OPENED AGAINST:  1.0.1
+OPENED BY:       jennc@ca.ibm.com
+CLOSED IN:       1.0.1d
+CLOSED BY:       jms@gradientsystems.com
+
+Problem #52: closed
+Summary: qgen seeds make parameter substitutions position dependent
+	The current scheme uses an individual RNG stream for each query, and seeds
+	all streams identically. Accordingly, two queries that use the same domain
+	for the same parameter will always have the same value (e.g., q9 and q20).
+SEVERITY:     BUG
+SPEC FIX:     N/A
+SOURCE FIX:   seed the individual streams with the sequence of random 
+				  numbers produced by the global seed value
+ANSWER SETS:  N/A
+WORKAROUND:   N/A
+HELP NEEDED:  None
+AUDITORS NOTIFIED: N/A	
+OPENED AGAINST:  990708
+OPENED BY:       jennc@ca.ibm.com
+CLOSED IN:       1.0.1a/1.1.0a (990727)
+CLOSED BY:       jms@gradientsystems.com
+CHECKED BY:      qa52
+
+Problem #53: closed
+Summary: number of lineitems in update files no longer varies
+	The RNG is not being set at the start of update generation; accordingly
+	the original data (including rowcounts) is being "regenerated"
+SEVERITY:     
+SPEC FIX:     N/A
+SOURCE FIX:   
+ANSWER SETS:  N/A
+WORKAROUND:   N/A
+HELP NEEDED:  None
+AUDITORS NOTIFIED: N/A	
+OPENED AGAINST:  990708
+OPENED BY:       jennc@ca.ibm.com
+CLOSED IN:       990810
+CLOSED BY:       jms@gradientsystems.com
+CHECKED BY:      qa53
+
+Problem #54: closed
+Summary: segmented update files fail when rows per file is small
+	A round off error could cause the wrong number of rows to be output to a
+	given update file
+SEVERITY:     BUG
+SPEC FIX:     N/A
+SOURCE FIX:   correction to driver.c and print.c to use division and modulo
+				  to produce comparably sized files regardless of divisor
+ANSWER SETS:  N/A
+WORKAROUND:   N/A
+HELP NEEDED:  None
+AUDITORS NOTIFIED: N/A	
+OPENED AGAINST:  990708
+OPENED BY:       v-larryk@microsoft.com
+CLOSED IN:       1.0.1a/1.1.0a (990727)
+CLOSED BY:       jms@gradientsystems.com (using code from larry)
+CHECKED BY:      
+
+Problem #55: closed
+Summary: -S <n> generates bad data when used with updates
+	The RNG is not being properly set
+SEVERITY:     BUG
+SPEC FIX:     N/A
+SOURCE FIX:   added the appropriate offset to the RNG, and simplified the
+				  update generation code
+ANSWER SETS:  N/A
+WORKAROUND:   N/A
+HELP NEEDED:  None
+AUDITORS NOTIFIED: N/A	
+OPENED AGAINST:  990708
+OPENED BY:       jms@gradientsystems.com
+CLOSED IN:       990816
+CLOSED BY:       jms@gradientsystems.com (using code from larry)
+CHECKED BY:      qa55
+
+Problem #56: closed
+Summary: Need way to specify dists.dss location on the command line
+SEVERITY:     FEATURE
+SPEC FIX:     N/A
+SOURCE FIX:   added -b switch to driver.c and qgen.c
+ANSWER SETS:  N/A
+WORKAROUND:   N/A
+HELP NEEDED:  None
+AUDITORS NOTIFIED: N/A	
+OPENED AGAINST:  990708
+OPENED BY:       clevine@microsoft.com
+CLOSED IN:       990830
+CLOSED BY:       jms@gradientsystems.com
+CHECKED BY:      N/A
+
+Problem #57: closed
+Summary: Need way to remove all DBGEN output unless there is an error 
+SEVERITY:     FEATURE
+SPEC FIX:     N/A
+SOURCE FIX:   added -q switch to driver.c and changed verbose if's
+ANSWER SETS:  N/A
+WORKAROUND:   N/A
+HELP NEEDED:  None
+AUDITORS NOTIFIED: N/A	
+OPENED AGAINST:  990708
+OPENED BY:       clevine@microsoft.com
+CLOSED IN:       990830
+CLOSED BY:       jms@gradientsystems.com
+CHECKED BY:      N/A
+
+Problem #00058: OPEN
+Summary: Need way to track changes from one release to the next
+SEVERITY:     FEATURE
+SPEC FIX:     N/A
+SOURCE FIX:   reintroduce and automate the CHANGES file. Require MRs for
+			  all source code changes
+ANSWER SETS:  N/A
+WORKAROUND:   N/A
+HELP NEEDED:  None
+AUDITORS NOTIFIED: N/A	
+OPENED AGAINST:  990708
+OPENED BY:       mpoess@us.oracle.com
+CLOSED IN:       
+CLOSED BY:       
+CHECKED BY:     
+
+Problem #00059: closed
+Summary:      extra comma in Q2 template
+SEVERITY:     BUG
+SPEC FIX:     N/A
+SOURCE FIX:   Template corrected
+ANSWER SETS:  N/A
+WORKAROUND:   None.
+HELP NEEDED:  None.
+AUDITORS NOTIFIED: N/A	
+OPENED AGAINST:  990830
+OPENED BY:    jpm@informix.com
+CLOSED ON:    990908
+CLOSED BY:    jms@gradientsystems.com
+CHECKED BY:   N/A
+
+
+Problem #00060: closed
+Summary:      segmented inserts/deletes creating an extra file
+SEVERITY:     BUG
+SPEC FIX:     N/A
+SOURCE FIX:   Adding in missed change from original roll-in
+ANSWER SETS:  N/A
+WORKAROUND:   N/A
+HELP NEEDED:  N/A
+AUDITORS NOTIFIED: N/A	
+OPENED AGAINST:  990830
+OPENED BY:    larryk@microsoft.com
+CLOSED ON:    990111
+CLOSED BY:    jms@gradientsystems.com
+CHECKED BY:   N/A
+
+Problem #00061: closed
+Summary:      64-bit support under DigUnix leads to math errors
+SEVERITY:     BUG
+SPEC FIX:     N/A
+SOURCE FIX:   Calculation of dRange in rnd.c now uses double cast
+ANSWER SETS:  N/A
+WORKAROUND:   N/A
+HELP NEEDED:  N/A
+AUDITORS NOTIFIED: N/A	
+OPENED AGAINST:  990830
+OPENED BY:    nramesh@us.oracle.com
+CLOSED ON:    000131
+CLOSED BY:    jms@gradientsystems.com
+CHECKED BY:   N/A
+
+Problem #00062: closed
+Summary:      bad update rollover after 1000 refreshes
+    This test uses tpcH scale 0.01. We've encountered
+	a situation in which dbgen doesn't generate
+	the correct data for delete files delete.1000 and
+	above. In particular, file delete.1000 contains
+	keys to be deleted that have never been loaded.
+	Because of this problem, keys that should have been
+	deleted never are causing duplicate unique values
+	to appear in the incremental loads after we cycle
+	from the 4000th incremental update back around starting
+	again with the 1st one.
+SEVERITY:     BUG
+SPEC FIX:     N/A
+SOURCE FIX:   N/A
+ANSWER SETS:  N/A
+WORKAROUND:   N/A
+HELP NEEDED:  N/A
+AUDITORS NOTIFIED: N/A	
+OPENED AGAINST:  19991101
+OPENED BY:    Roger.McNicol@sybase.com
+CLOSED ON:    20000509
+CLOSED BY:    jms
+CHECKED BY:   N/A
+
+Problem #00063: closed
+Summary:      update copyright notice
+    N/A
+SEVERITY:     BUG
+SPEC FIX:     N/A
+SOURCE FIX:   N/A
+ANSWER SETS:  N/A
+WORKAROUND:   N/A
+HELP NEEDED:  N/A
+AUDITORS NOTIFIED: N/A	
+OPENED AGAINST:  N/A
+OPENED BY:    jms@gradientsystems.com
+CLOSED ON:    20000131
+CLOSED BY:    jms@gradientsystems.com
+CHECKED BY:   N/A
+
+Problem #00064: closed
+Summary:      permute() introduce 0 selection in [1..50] for q16
+    N/A
+SEVERITY:     BUG
+SPEC FIX:     N/A
+SOURCE FIX:   rework permute() to be 1-based
+ANSWER SETS:  N/A
+WORKAROUND:   N/A
+HELP NEEDED:  N/A
+AUDITORS NOTIFIED: N/A	
+OPENED AGAINST:  20000413
+OPENED BY:    lorna@permetrics.com
+CLOSED ON:    20000414
+CLOSED BY:    jms@gradientsystems.com
+CHECKED BY:   N/A
+
+Problem #00065: OPEN
+Summary:      permute correction caused dataset changes
+    initial fix for #64 caused qa failures due to data set changes. New fix
+	is limited to query parameter substitution changes and has passed qa
+SEVERITY:     BUG
+SPEC FIX:     N/A
+SOURCE FIX:   N/A
+ANSWER SETS:  N/A
+WORKAROUND:   N/A
+HELP NEEDED:  N/A
+AUDITORS NOTIFIED: N/A	
+OPENED AGAINST:  20000511
+OPENED BY:    jms
+CLOSED ON:    N/A
+CLOSED BY:    N/A
+CHECKED BY:   N/A
diff --git a/data/ssb/dbgen/CHANGES b/data/ssb/dbgen/CHANGES
new file mode 100644
index 0000000..e7d1247
--- /dev/null
+++ b/data/ssb/dbgen/CHANGES
@@ -0,0 +1,33 @@
+# @(#)CHANGES	2.1.8.18
+08 Dec 1998	Release 2.0.0 
+15 Mar 1998	Release 2.0.0 pre-release
+06 Feb 1998	Release 1.3.1
+15 Dec 1996	Release 1.2.0
+08 Aug 1996	Release 1.1.0D
+01 May 1996	Release 1.1.0C
+29 Jan 1996	Release 1.1.0B
+23 Jan 1996	Release 1.1.0A
+19 Dec 1995	Release 1.1.0
+11 Sep 1995	Release 1.0.1
+13 Mar 1995	Release 1.0
+
+
+Changes between 990830 and 991011
+File    Bug ID
+----	------
+s.2.sql			00059	Removing extra comma
+s.dss.h			00061
+s.config.h		00061
+s.driver.c		00060	adding missed change from Larry
+s.makefile		00058
+s.rnd.c			00061
+s.HISTORY		00061
+s.history.html	00061
+s.mr.sh			00058	miscellaneous corrections
+s.bug.template	00058	removing extraneous spaces
+s.bug.template			changed titles
+
+
+Changes between 199910 and 000511
+File     Bug ID
+----     ------
diff --git a/data/ssb/dbgen/HISTORY b/data/ssb/dbgen/HISTORY
new file mode 100644
index 0000000..8258af8
--- /dev/null
+++ b/data/ssb/dbgen/HISTORY
@@ -0,0 +1,535 @@
+# @(#)HISTORY	2.1.8.3
+Changes as of 10/11/99
+   -- versions: TPCH 1.2.0a, TPCR 1.1.0a
+   -- Correction to segmented updates that was causing extra file to be 
+	  generated
+   -- Porting changes for DigUnix
+Changes as of 08/28/99
+   -- versions: TPCH 1.2.0, TPCR 1.1.0
+   -- reduced parameter substitution range for Q18
+   -- added new option to specify location of dists file (-b)
+   -- added DBGEN option to suppress all output (-q)
+Changes as of 08/16/99
+   -- versions: TPCH 1.1.0a, TPCR 1.0.1e
+   -- prevent "reuse" of original data in update files
+   -- correction to lint target in makefile.suite
+   -- removal of vestigial l_partkey predicate from 21.sql
+   -- reorder lineitem/order join in q5
+   -- removal of table aliases from 2.sql
+   -- randomize seeding of qgen RNG to close bug 52
+   -- correct possible round off error in segmented update files
+   -- corrected soft copy answer set for Q22
+   -- corrected precision of answer set for Q19
+Changes as of 07/08/99
+   -- versions: TPCH 1.1.0, TPCR 1.0.1
+   -- WORKLOAD must be set to either TPCH or TPCR in the makefile
+   -- unneeded reference to part table removed from q21 template
+Changes as of 06/04/99
+   -- version 1.0.1d
+   -- Restarted version numbering to match specification revisions for
+	  TPC-H and TPC-R
+   -- Corrected answer set for Q13
+   -- Corrected parameter substitutions for Q16, Q17, Q19, Q20, Q21, Q22
+   -- Corrected RNG initialization in qgen.c
+   -- added adhoc.c adhoc.h to code base to support randomized data sets;
+	  currently disabled
+   -- replaced calls to UnifInt() row_stop with call to NthElement()
+   -- Corrected a problem that caused small negative money values to print as 
+      a positive value
+   -- Simplification of PR_xxx macros
+   -- QGEN building correct parameter logs again
+
+******************
+* NOTE NOTE NOTE *
+******************
+Below this line the file refers to TPC-D which was retired in favor of 
+TPC-H and TPC-R. Since the new specifications are numbered from 1.0.0
+the program version was reset.
+******************
+* NOTE NOTE NOTE *
+******************
+
+Changes as of 01/05/99
+   -- version 2.0.1
+   -- added 1999 to the copyright notice
+   -- corrected C++ compilation problem
+   -- sub-select phrasing corrected in Q4, Q21, Q22
+   -- added support for segmenting update files (contributed by Larry Kemp, HP)
+Changes as of 12/08/98
+   -- version 2.0.0
+   -- removed permute.h from clean target in makefile
+Changes as of 11/17/98
+   -- version 2.0.0 Alpha 8
+   -- corrected o_custkey overrun bug
+   -- removed upper bound on -C command option
+   -- added static permute.h to distribution to match the specification
+Changes as of 10/23/98
+   -- version 2.0.0 Alpha 7
+   -- removed references to DSS_SEED and SEED_TAG
+   -- minor query template cleanup
+   -- V2 answer sets added
+   -- correction to hd_sparse for SF > 300
+   -- added static declaration to row types in gen_tbl to fix update problem
+   -- permuted params to Q22
+Changes as of 5/19/98
+   -- version 2.0.0 Alpha6b
+   -- removed trailing apostrophe from dists.dss nouns for Tandem loader
+   -- corrected mk_sparse() problem with alpha6
+   -- added 64b support for NCR/Metaware
+   -- corrected revision problem with 2.0.0.6
+Changes as of 5/7/98
+   -- version 2.0.0 Alpha6
+   -- corrected generation of parent/child tables in parallel
+   -- renamed ORDER table to ORDERS table
+   -- revision of DBGEN synced with revision of 2.0 specification
+   -- portability changes to process termination provided by John Matzka
+   -- portability changes for Watcom C provided by Andrew Eisenberg
+   -- indentation of specifications/templates now matches
+   -- queries now include a consistent header format
+Changes as of 4/28/98
+   -- version 2.0.0 Alpha5
+   -- NO RELEASE OF ALPHA 5 ; skipped to sync spec/DBGEN revision levels
+Changes as of 4/6/98
+   -- version 2.0.0 Alpha4
+   -- corrected parallel table generation
+   -- minor corrections to query templates
+   -- portability changes for HP
+Changes as of 3/24/98
+   -- version 2.0.0 Alpha3
+   -- include substitution parameters for Q22
+   -- correct substitution parameters for Q16 under AIX
+   -- include permute.h until unix/NT makefile fix
+   -- correct orderkey generation
+Changes as of 3/20/98
+   -- version 2.0.0 Alpha2
+   -- correct runtime malloc error from bad INIT_HUGE macro
+   -- improve pseudo text distribution in comments
+   -- fix problem with parallelism of data gen
+   -- re-enable generation of parent/child tables
+   -- remove recombination code for parallel flat files
+Changes as of 3/11/98
+   -- version 2.0.0 Alpha1
+   -- removed the TIME table
+   -- removed the need for seed files
+   -- made 1GB the validation database size
+   -- add pseudo text support in comments
+   -- correct character selection in a_rnd()
+   -- correct population of P_NAME
+   -- removed unclaimed variants
+   -- added new queries 18-22, replaced Q13
+Changes as of 2/6/98
+   -- version 1.3.1
+   -- Revised 64 bit support to clean up bcd2_bin() and mk_sparse()
+   -- Add 64b support for NT
+Changes as of 12/31/97
+   -- version 1.3.0
+   -- support for seed generation > 1TB (data gen still to be tested)
+   -- rework of 64b support
+   -- added bcd support for subtraction, comparison, modulo
+   -- added 1998 to the copyright notice
+   -- clarified comments in dists.dss
+   -- corrected substitution problem in Q11
+   -- standardized fopen() error messages with OPEN_CHECK()
+   -- introduced PATH_SEP in config.h to allow changes in path separators
+Changes as of 12/15/96
+   -- version 1.2.0
+   -- corrected typos in queries 8a, 8c, 8d, 11a, 12F and 14F, 17a
+   -- added variant 15c
+   -- defined MAX_SCALE and MIN_SCALE; issued error messages for SF > 1000
+         since implementation is incomplete
+   -- seed file generation can now be resumed with dbgen -R <n> ...
+   -- corrected slight compile bug under Solaris 2.5.1
+   -- documented compile problems under SunOS
+Changes as of 8/1/96
+   -- version 1.1.0D
+   -- included new variants for queries 8 and 15
+   -- re-introduced answer sets in the source tree
+Changes as of 5/1/96
+   -- version 1.1.0C
+   -- unified version numbering of DBGEN and QGEN
+   -- updated BUGS list
+   -- removed FAQ from soft appendix; web site will keep the current 
+           version of the FAQ
+   -- added 1996 to the copyright notice
+   -- corrected bug in PR_DATE macro; NO CHANGE TO DATA SET
+   -- properly initialize param values for cleaner logging
+   -- adjusted output format of Q11 param to allow scaling to 1TB
+   -- corrected typos in variant 14c
+   -- corrected data type for YEAR in variant 8c
+   -- corrected typos in variant 10a
+   -- added variant 8d
+Changes as of 1/23/96
+   -- qgen version 1.1.0B
+   -- include support for ANSI semantics
+   -- improved patch for seed sensitivity
+Changes as of 1/23/96
+   -- updated BUGS list
+   -- dbgen version 1.1.0A
+   -- patch to limit BCD2 fields to 12 characters for columnar output
+   -- qgen version 1.1.0A
+   -- patch to fix the "unknown flag" problem
+   -- patch to fix the seed sensitivity problem
+Changes as of 12/19/95
+   -- updated BUGS list
+   -- dbgen version 1.1.0
+   -- upped default value of MAX_CHILDREN to 1000
+   -- corrected naming of detail tables in incremental load
+   -- corrected range delete output
+   -- forced delete files to truncate existing files
+   -- removed fixed size tables from seed generation
+   -- corrected overflow problem with large scale seed generation
+   -- allow date generation as MM-DD-YY based on config.h #define
+   -- correct truncation problem with columnar output in PR_VSTR()
+   -- added support for Windows NT
+   -- added PLATFORM macro to makefile, removed platform defines from
+           config.h
+   -- removed MAX_CHILDREN define from config.h (set to 1000 in dss.h)
+   -- qgen version 1.1.0
+   -- correct SET_OUTPUT macro to TDAT
+   -- use %ld in output for q17; portability
+   -- add support for SQLSERVER database dialect
+   -- add support for SYBASE database dialect
+   -- adjust parameter ranges for Q1, Q3, Q6
+   -- add -T/-t option to usage summary
+   -- added support for Windows NT
+Changes as of 09/01/95
+   -- qgen version 1.0.1
+   -- formalized version numbering 
+   -- -p now generates correct query permutations
+   -- added separate version number for qgen
+   -- corrected Q3 substitution problem
+   -- updated permissible range for Q10 
+   -- corrected rowcount_dflt and the MAX row indicator (-1)
+   -- expanded param logging to include all possible parameters
+   -- allowed qgen's -d option to be used at all scale factors
+   -- made parameter substitution permutation-independent
+   -- added qgen support for END_TRAN (-E) and DFLT_NUM (-N)
+   -- correct handling of :n directive
+   -- added more complete explanation of QGEN to README
+   -- rename of random to rndm, for portability
+   -- dbgen version 1.0.1
+   -- formalized version numbering 
+   -- inclusion of SF=1 seed file
+   -- correct typo in usage() update example
+   -- patch to driver.c to allow correct updates
+   -- documentation change to README to clarify seed/stage/update
+           interaction
+   -- corrected minor glitch in "open failed" error msg in print.c
+   -- added missing line continuation to makefile.suite
+   -- seed files are now based on scale factor and number of generators
+   -- seed files now hold seeds for one "step" of a given build
+   -- clean up of parallel load routines
+   -- inclusion of faster seed generation routines from Susanne Englert
+   -- removed the -E(xisting) option
+   -- assure proper scaling of O_CUSTKEY
+   -- corrected default update percentage
+   -- proper handling of child tables with '-O f'
+   -- removed seed files from the distribution
+   -- modified rpb_routine() to limit contribution of partkey in 
+           retailprice
+   -- added '-S(tep)' option to allow multi-stage loads
+   -- roll in of 32 bit speed_seed routines from Dick Shelton
+   -- miscellaneous typo corrections in the documentation
+   -- cleanup of usage output
+Changes as of 05/08/95
+   -- version 1.0
+   -- add Teradata defines to tpcd.h for QGEN
+   -- add :c to query templates for database CONNECT syntax
+   -- add examples of DBGEN and QGEN usage to README
+   -- add -T option to qgen to allow time table usage
+   -- query template names only require .sql suffix, rest is arbitrary
+Changes as of 03/13/95
+   -- version 9.1
+   -- surround DBNAME with ifndef in config.h
+   -- remove -DDBNAME from makefile.suite
+   -- sync varchar handling with 9.1 draft
+Changes as of 02/21/95
+   -- version 9.0a
+   -- fixed bug in qgen that incorrectly included rnd.h
+   -- included revised DDL with changes for char/varchar and l_quantity
+   -- updated DBGEN help message to include new single table options for
+           order/lineitem and part/partsupp
+   -- included handling for multi-set seed files TPCDSEED.xxx
+   -- generated seeds up through 400GB; headed to 1TB!
+   -- ANSI lint cleanup; more needed
+   -- UF2 now defaults to key lists; use "-O r" to generate key ranges
+           also note, this routine does NOT use the BCD2_* 
+           routines. As a result, it WILL fail if the keys being deleted 
+           exceed 32 bits. Since this would require ~660 update iterations, 
+           this seems an acceptable oversight
+Changes as of 01/19/95
+   -- version 9.0
+   -- allowed command line seeding of RNG for QGEN
+   -- order and number of params in QGEN now matches 
+         presentation in spec
+   -- fixed bug in time table format of O_ORDERDATE
+   -- changed l_QUANTITY to FLOAT in dss.ddl
+   -- reworked QGEN options to be more useful
+   -- allowed creation of sparse keys beyond 32 bits (for 1TB)
+   -- removed unused '#ifdef' and associated code
+   -- allowed independent generation of master/detail tables 
+           (eg, order/lineitem)
+Changes as of 12/06/94
+   -- version 8.6
+   -- fixed renaming of flat files for child tables
+   -- various documentation fixes
+   -- added naming convention section to Porting.Notes
+   -- added -DIBM flag to config.h
+   -- synced up QGEN with draft 8.1
+Changes as of 10/25/94
+   -- version 8.5a
+   -- corrected bug in columnar output of pr_supp
+   -- added pr_drange to generate a list of order keys to be 
+           deleted instead of generating SQL
+   -- added '-O d' to generate range delete as SQL
+   -- updated default values for QGEN to sync with spec 8.1
+   -- corrected MK_SPARSE to reflect groups of 8
+   -- corrected a bug in o_orderstatus
+   -- regenerated seed files for SF in [1,10]
+   -- ANSI cleanup (primarily function declarations)
+Changes as of 10/11/94
+   -- version 8.5
+   -- remove deletes/inserts to other than order/lineitem
+   -- increased cardinality for part.type part.container
+   -- '-r' argument is now integer; percentage in basis points
+   -- initial roll-in of new update scheme
+   -- added BBB comments to supplier table
+Changes as of 9/27/94
+   -- version 8.4
+   -- all money calculations now use integer math. This should 
+           bring everyone's data sets into exact agreement.
+Changes as of 9/21/94
+   -- version 8.3b
+   -- fixed handling of MAX_STREAM
+   -- added floor function to RPRICE bridge
+   -- misc lint cleanup (type fixes, new prototypes, etc.)
+   -- MONEY format becomes lf for DOS
+   -- further cleanup of PR_VSTR and its length argument
+   -- change to parameter generation for Q6 to allow for float 
+           discount
+Changes as of 9/15/94
+   -- version 8.3a
+   -- isolated MONEY format for Unisys (Lf) using DOS
+   -- make sure all arguments to MAKE_MONEY were double's
+   -- rolled in NEW_PTEXT to allow Berni to experiment
+Changes as of 9/12/94
+   -- version 8.3
+   -- added -T n and -T r to usage to match getopt() and README
+   -- changed PR_MONEY to remove leading blanks
+   -- included revised DDL from Berni
+   -- included some MVS portability fixes in re malloc.h
+   -- cleaned up error messages in qgen and made #define ofp usage
+           universal
+   -- additional DOS portability changes
+   -- added {c,a}len to provide specific length for columnar 
+           output of varchar
+   -- added PR_VSTR to handle varchar printing under MVS
+   -- fixed bit masking in a_rnd and cleaned up prototype match 
+           with V_STR
+   -- PR_MONEY now used %Lf
+   -- added revised pseudo text under NEW_PTEXT ifdef for 
+           experiments
+Changes as of 9/09/94
+   -- version 8.2
+   -- l_discount and l_tax are now fractional (per teleconference)
+   -- money calculations moved to scaled integer math to clean up 
+           answer sets
+   -- changed PR_FLT() to PR_MONEY to clarify usage
+   -- portability changes for SYBASE: dbname --> db_name
+           STATUS --> DBGEN_STATUS
+   -- added nations2 to dists.dss to handle qgen needs for now
+   -- reintroduced #ifndef DOS
+   -- reintroduced U2200 define to control kill_load()
+   -- broke out nation and region separately in -T option
+   -- updated dss.ddl based on mail from Berni
+Changes as of 8/31/94
+   -- version 8.1
+   -- scaling for clerks needed to be 1000 (was 100)
+   -- added qgen parameter for scale
+   -- changed qgen parameter from s)tream to p)ermutation
+   -- synced qgen parameter values with 8.0 spec
+   -- corrected duplications in dists.dss
+Changes as of 8/24/94
+   -- version 8.0
+   -- added sparse keys to lineitem/order
+   -- added varchar generation for comments/addresses
+   -- added variable lineitems/orders
+   -- removed ifdef for normalized code_tables
+   -- included code for parameter generation and template->EQT 
+           routines
+   -- updated README and Porting.Notes to reflect QGEN
+   -- included DDL and RI examples from Berni
+Changes as of 6/15/94
+   -- version 7.0b (numbers now match spec revision)
+   -- rework of code tables to properly map nation/region; when 
+           compiled with -DCODE_TABLES distributions are taken from 
+           code.dss and two additional fields are generated for 
+           customers and suppliers, [cs]_ncode and [cs]_rcode, 
+           immediately following [cs]_region
+   -- replaced ifdef's around DEAD_DATA with opposites. DEAD_DATA 
+           is now the default
+   -- worked through code to see that it conformed to 7.0 
+           specification
+   -- adjusted scale factors/rowcounts for 1 GB == sf1
+   -- brought help message in line with current code
+   -- fixed order per customer at 10
+   -- make suppkey scalable in lineitem/partsupp
+Changes as of 4/25/94
+   -- version 1.5
+   -- added the customers with no orders; Compile with -DDEAD_DATA 
+           to activate the change.
+   -- added the code table for nation and region; 
+           Compile with -DCODE_TABLES to activate the change.
+Changes as of 3/17/94
+   -- version 1.41
+   -- completed implementation of JULIAN_DAY after talks with Berni
+   -- misc cleanup in usage/README files
+   -- removed all tabs and capped line length at 75
+   -- added -n option to allow naming of inline-loaded database
+Changes as of 3/16/94
+   -- version 1.4
+   -- prototyped julian day/month for query re-write work. Compile 
+           with -DJULIAN_DAY to enable
+   -- removed gen_times() from driver.c
+   -- added VMS ifdef to config.h to clean up fork/signal issues
+   -- added ICL ifdef to config.h to clean up getopt() issues
+   -- changed header file references to config.h from machine.h
+Changes as of 3/2/94
+   -- version 1.31
+   -- corrected format of C_NAME to match S_NAME and O_CLERK
+   -- re-allowed fractional scale factors < 1 (updates not 
+           contiguous)
+   -- added DSS_CONFIG environment variable
+   -- reworked read_dist() to look for DSS_DIST in DSS_CONFIG
+   -- updated the README file
+Changes as of 2/16/94
+   -- version 1.3
+   -- added command line options for parallel load and data set 
+           expansion
+   -- changed dists.dss delimiter to | for portability
+   -- limited scale factors to integer values
+   -- added command line option for seed file generation
+   -- added all seed files to distribution for SFs 1 - 10
+   -- moved machine.h to config.h and added MAX_CHILDREN define
+   -- added 'f' flag to options to allow renaming of output files
+   -- added generation of SQL delete statements to match updates
+           (Note: updates are still single-threaded; -C is cleared 
+           by -U)
+   -- corrected field sizing in dsstypes.h typedefs to match v 6.4
+   -- update percentage default set to 1%
+Changes as of 12/3/93
+   -- version 1.2
+   -- added command line option to adjust update percentage
+   -- fixed update generation for proper primary key ordering
+   -- renamed UUSR/PRC to RUSSIA/CHINA in dists.dss
+   -- cleaned up phone number generation to be consistent regard-
+           less of order of evaluation
+   -- adjusted size of lineitem comment to bring data in line with 
+           100 MB == SF=1
+Changes as of 10/15/93
+   -- added command line option for update data creation
+   -- miscellaneous porting and cleanup changes
+   -- reworked table generation to allow reuse for updates
+   -- added comment field to tdefs structure
+   -- added load_state and store_state to sync data gen and 
+           update gen
+Changes as of 7/26/93
+   -- combined loader and header stubs in load_stubs.c
+   -- separated Revision History (this file) from README
+   -- simplified makefile
+   -- removed redundancies from colors distribution
+   -- added getopt() for portability
+   -- created Porting.Notes
+   -- adjusted scaling rules
+   -- added help option to the command line
+Changes as of 2/26/93
+   -- combined all typedefs in one header: dsstypes.h
+   -- combined flat file generation in print.ec
+   -- combined typedef population in build.ec
+   -- added -P to control rowcnt scaling (P for percentage)
+   -- added -D option for Direct data generation and added 
+           appropriate hooks in tdefs[] structure
+   -- added -F option for flat file generation
+   -- reused -T option (use -P 0.1 to build test size database)
+           now accepts suboptions c,o,p,s for single table builds.
+   -- dropped -M option (scaling is now by rowcount)
+   -- added -O option for optional controls. Currently defined:
+           -O t -- generate optional time table and join fields in 
+                   order/lineitem
+           -O h -- generate headers for flat file output
+           -O m -- generate fixed column-length output
+   -- removed dynamic memory allocation, redundant calls to 
+           UnifInt, etc to improve performance
+Changes as of 1/12/92
+   -- julian() changed to handle orders->orderdate correctly
+   -- rflag distributions corrected in dists.dss
+   -- sea, gold removed from color distribution to clean up substring 
+      problems
+   -- part->number and supplier-> adjusted for 1-based indexing
+   -- time->day changed to be day of month, not day of year
+   -- t.week changed to be week in year, not day of week
+Changes as of 11/18/92
+   -- checked line length and tab for transmission
+   -- another chapter in the portability wars. added #include 
+      "machine.h" to dss.h (which is included by everyone else). Any 
+      machine particular porting changes should go here.
+   -- fixed fixed-field formats to prevent double printing
+   -- expanded PR_FLT formats to %010.2
+Changes as of 10/21/92
+   -- added fixed format and column header handling; users of headers 
+      will have to define the header functions to be called in 
+      int (*tdefs.header)()
+Changes as of 10/09/92:
+   -- added ansi prototypes and recompiled with gcc -ansi. users may 
+      need to change the CC definition in the makefile and the contents 
+      of CFLAGS to reflect their particular ansi compiler.
+   -- replaced all int references with long
+   -- replaced all float references with double
+   -- found and fixed odate/julian problem TS mentioned in 10/09 phone 
+      call
+
+Changes as of 9/09/92:
+   -- Park/Miller random number generator included
+   -- clerk scaling changed to 100 * scale
+   -- parts.name always built from 5 selections from colors set
+   -- test scaling changed to ~60MB (TEST_SCALING == 10)
+   -- logarithmic scaling removed
+   -- mfgcost removed and retail/supplier cost bounds adjusted
+   -- agg_str memory leak fixed
+   -- independent RNG streams on a per column basis
+
+This is the revised data generator for DSS. 
+
+The rewrite tried to accomplish four things: (1) identify and isolate 
+all the implicit assumptions about limits, bounds, ranges, distribu-
+tions, etc.; (2) standardize the way any given table was generated/
+printed to ease understanding and maintenance; (3) bring the generator 
+in line with the current work of the committee and the excellent spec 
+that Indira put together; (4) provide an easy way to adjust distribu-
+tions, string contents and to facilitate experimentation to get a 
+better idea of the impact of data population changes.
+
+The files included are:
+
+driver.c       ------- main and the calling routines for the generators
+dist.c         ------- should really be named dss_util.c; misc routines
+customer.c     ------- generation and print routines for customer table
+orders.c       -------            ""             ""      order table
+parts.c        -------            ""             ""      parts/partsupp 
+suppliers.c    -------            ""             ""      suppliers table
+time.c         -------            ""             ""      time table
+customer.h     ------- associate header files; contain structure 
+                       definitions
+dss.h                  dss.h holds the large number of assumptions and
+orders.h               values that have been used as IFDEFs.
+parts.h  
+suppliers.h
+time.h   
+dists.dss   ------- string selections and weights; used to build 
+                    distributions
+
+Running make will create an executable (using the compiler flags in 
+CFLAGS, the ld flags in LDFLAGS and the libraries in LIBS [-O, -s, 
+and -lm by default]) which will create flat files suitable for dbload.
+
+
diff --git a/data/ssb/dbgen/PORTING.NOTES b/data/ssb/dbgen/PORTING.NOTES
new file mode 100644
index 0000000..2916d20
--- /dev/null
+++ b/data/ssb/dbgen/PORTING.NOTES
@@ -0,0 +1,220 @@
+# @(#)PORTING.NOTES	2.1.8.1
+
+Table of Contents
+==================
+1. General Program Structure
+2. Naming Conventions and Variable Usage
+3. Porting Procedures
+4. Compilation Options
+5. Customizing QGEN
+6. Further Enhancements
+7. Known Porting Problems
+8. Reporting Problems
+
+1. General Program Structure
+
+The code provided with TPC-H and TPC-R benchmarks includes a database 
+population generator (DBGEN) and a query template translator(QGEN).  It 
+is written in ANSI-C, and is meant to be easily portable to a broad variety 
+of platforms. The program is composed of five source files and some 
+support and header files.  The main modules are:
+
+        build.c: each table in the database schema is represented by a 
+                 routine mk_XXXX, which populates a structure 
+                 representing one row in table XXXX.
+                 See Also: dsstypes.h, bm_utils.c, rnd.*
+        print.c: each table in the database schema is represented by a 
+                 routine pr_XXXX, which prints the contents of a 
+                 structure representing one row in table XXXX.
+                 See Also: dsstypes.h, dss.h
+        driver.c: this module contains the main control functions for 
+                 DBGEN, including command line parsing, distribution 
+                 management, database scaling and the calls to mk_XXXX 
+                 and pr_XXXX for each table generated.
+        qgen.c:  this module contains the main control functions for 
+                 QGEN, including query template parsing.
+        varsub.c: each query template includes one or more parameter
+                  substitution points; this routine handles the 
+                  parameter generation for the TPC-H/TPC-R benchmark.
+
+The support utilities provide a generalized set of functions for data 
+generation and include:
+
+        bm_utils.c: data type generators, string management and 
+                 portability routines.
+
+        rnd.*:  a general purpose random number generator used 
+                throughout the code.
+
+        dss.h:
+        shared.h: a set of '#defines' for limits, formats and fixed 
+                values 
+        dsstypes.h: structure definitions for each table definition
+
+2. Naming Conventions and Variable Usage
+
+Since DBGEN will be maintained by a large number of people, it is 
+particularly important to observe the coding, variable naming and usage
+conventions detailed here.
+
+  #define
+  --------
+  All #define directives are found in header files (*.h).  In general,
+  the header files segregate variables and macros as follows:
+	rnd.h -- anything exclusively referenced by rnd.c
+	dss.h -- general defines for the benchmark, including *all*
+	    extern declarations (see below).
+	shared.h -- defines related to the tuple definitions in
+	    dsstypes.h. Isolated to ease automatic processing needed by many
+	    direct load routines (see below).
+	dsstypes.h -- structure definitions and typedef directives to
+	    detail the contents of each table's tuples.
+	config.h -- any porting and configuration related defines should
+	    go here, to localize the changes necessary to move the suite
+	    from one machine to another.
+	tpcd.h -- defines related to QGEN, rather than DBGEN
+
+    extern
+    ------
+    DBGEN and QGEN make extensive use of extern declarations. This could
+    probably stand to be changed at some point, but has made the rapid
+    turnaround of prototypes easier. In order to be sure that each
+    declaration was matched by exactly one definition per executable,
+    they are all declared as EXTERN, a macro dependent on DECLARER. In
+    any module that defines DECLARER, all variables declared EXTERN will
+    be defined as globals. DECLARER should be declared only in modules
+    containing a main() routine.
+
+    Naming Conventions
+    ------------------
+    defines
+	o All defines use upper case
+	o All defines use a table prefix, if appropriate:
+		O_*	relates to orders table
+		L_*	relates to lineitem table
+		P_*	relates to part table
+		PS_*	relates to partsupplier table
+		C_*	relates to customer table
+		S_*	relates to supplier table
+		N_*	relates to nation table
+		R_*	relates to region table
+		T_*	relates to time table
+	o All defines have a usage prefix, if appropriate:
+		*_TAG	environment variable name
+		*_DFLT	environment variable default
+		*_MAX	upper bound
+		*_MIN	lower bound
+		*_LEN	average length
+		*_SD	random number seed (see rnd.*)
+		*_FMT	printf format string
+		*_SCL	divisor (for scaled arithmetic)
+		*_SIZE	tuple length
+
+3. Porting Procedures
+
+The code provided should be easily portable to any machine providing an 
+ANSI C compiler. 
+        -- Copy makefile.suite to makefile
+        -- Edit the makefile to match the name of your C compiler
+           and to include appropriate compilation options in the CFLAGS
+           definition
+        -- make. 
+
+Special care should be taken in modifying any of the monetary calcu-
+lations in DBGEN. These have proven to be particularly sensitive to 
+portability problems. If you decide to create the routines for inline 
+data load (see below), be sure to compare the resulting data to that 
+generated by a flat file data generation to be sure that all numeric 
+conversions have been correct.
+
+If the compile generates errors, refer to "Compilation Options", below.
+The problem you are encountering may already have been addressed in the
+code.
+
+If the compile is successful, but QGEN is not generating the appropriate
+query syntax for your environment, refer to "Customizing QGEN", below.
+
+For other problems, refer to "Reporting Problems" at the end of this
+document.
+
+4. Compilation Options
+
+config.h and makefile.suite contain a number of compile time options intended 
+to make the process of porting the code provided with TPC-H/TPC-R as easy as 
+possible  on a broad range of platforms. Most ports should consist of reviewing 
+the possible settings described in config.h and modifying the makefile
+to employ them appropriately.
+
+5. Customizing QGEN
+
+QGEN relies on a number of vendor-specific conventions to generate
+appropriate query syntax. These are controlled by #defines in tpcd.h,
+and enabled by a #define in config.h. If you find that the syntax
+generated by QGEN is not sufficient for your environment you will need
+to modify these two files. It is strongly recommended that you not change
+the general organization of the files. 
+
+Currently defined options are:
+
+VTAG            -- marks a variable substitution point [:]
+QDIR_TAG        -- environment variable which points to query templates
+                   [DSS_QUERY]
+GEN_QUERY_PLAN  -- syntax to generate a query plan ["Set Explain On;"]
+START_TRAN      -- syntax to begin a transaction ["Begin Work;"]
+END_TRAN        -- syntax to end a transaction ["Commit Work;"]
+SET_OUTPUT      -- syntax to redirect query output ["Output to"]
+SET_ROWCOUNT    -- syntax to set the number of rows returned 
+                   ["{return %d rows}"]
+SET_DBASE       -- syntax to connect to a database
+
+6. Further Enhancements
+
+load_stub.c provides entry points for two likely enhancements.
+
+The ld_XXXX routines make it possible to load the
+database directly from DBGEN without first writing the database
+population out to the filesystem.  This may prove particularly useful
+when loading larger database populations. Be particularly careful about
+monetary amounts. To assure portability, all monetary calculations are
+done using long integers (which hold money amounts as a number of
+pennies). These will need to be scaled to dollars and cents (by dividing
+by 100), before the values are presented to the DBMS.
+
+The hd_XXXX routines allow header information to be written before the
+creation of the flat files. This should allow systems which require 
+formatting information in database load files to use DBGEN with only
+a small amount of custom code.
+
+qgen.c defines the translation table for query templates in the
+routine qsub().
+
+varsub.c defines the parameter substitutions in the routine varsub().
+
+If you are porting DBGEN to a machine that supports a native word
+size larger than 32 bits, you may wish to modify the default values for
+BITS_PER_LONG and MAX_LONG. These values are used in the generation of
+the sparse primary keys in the order and lineitem tables. The code has
+been structured to run on any machine supporting a 32 bit long, but
+may be slightly more efficient on machines that are able to make use of
+a larger native type.
+
+7. Known Porting Problems
+
+The current codeline will not compile under SunOS 4.1. Solaris 2.4 and later 
+are supported, and anyone wishing to use DBGEN on a Sun platform is 
+encouraged to use one of these OS releases.
+
+
+8. Reporting Problems
+
+The code provided with TPC-H/TPC-R has been written to be easily portable, 
+and has been tested on a wide variety of platforms. If you have any 
+trouble porting the code to your platform, please help us to correct 
+the problem in a later release by sending the following information 
+to the TPC D subcommittee:
+        
+        Computer Make and Model
+        Compiler Type and Revision Number
+        Brief Description of the problem
+        Suggested modification to correct the problem
+
diff --git a/data/ssb/dbgen/README b/data/ssb/dbgen/README
new file mode 100644
index 0000000..477bfb7
--- /dev/null
+++ b/data/ssb/dbgen/README
@@ -0,0 +1,88 @@
+Note: In our research paper we use the SSB instead of SSBM
+Version of 2/28/10: 
+Cardinality of supplier fixed to follow benchmark spec: now 2000*SF
+ (previously was 10000*SF, in error): line 226, driver.c
+Type of time value changed from long to time_t (now 64 bits on Windows):
+ line 688, build.c
+Building in Visual Studio 2008:
+  Use Win32 console project, not using precompiled headers,
+  in Properties>C/C++>CommandLine, additional options:
+   /D "SSBM" /D "DBNAME" /D "DB2"   (for DB2)
+Building using makefile_win:  set for DB2 build:
+  nmake -f makefile_win
+  (Change DATABASE symbol for other database)
+
+SSBM dbgen readme:
+
+SSBM is based on TPC-H dbgen source. The coding style and architecture
+follows the TPCH dbgen. The original TPCH dbgen code stays untouched and
+all new code related to SSBM dbgen follow the "#ifdef SSBM" statements.
+
+For original detailed TPC-H documentation, please refer to the TPCH_README 
+document under the same directory. Here we just list few things that 
+are specific to SSBM.
+
+  
+1. How is SSBM DBGEN built?
+
+Same idea as TPCH dbgen setup, which requires user to create an 
+appropriate makefile, using makefile.suite as a basis. Make sure to
+use "SSBM" for the workload variable. 
+
+Type "make" to compile and to generate the SSBM dbgen executable. 
+Please refer to Porting.Notes for more details and for
+suggested compile time options.
+
+Note: If you want to generate the data files to a different directory, you should
+copy the dbgen executable as well as the dists.dss file to that directory.
+ 
+2. How to generate SSBM data files?
+To generate the dimension tables:
+
+(customer.tbl)
+dbgen -s 1 -T c
+
+(part.tbl)
+dbgen -s 1 -T p
+
+(supplier.tbl)
+dbgen -s 1 -T s
+
+(date.tbl)
+dbgen -s 1 -T d
+
+(fact table lineorder.tbl)
+dbgen -s 1 -T l
+
+(for all SSBM tables)
+dbgen -s 1 -T a
+
+To generate the refresh (insert/delete) data set:
+(create delete.[1-4] and lineorder.tbl.u[1-4] with refreshing fact 0.05%)
+dbgen -s 1 -r 5 -U 4
+
+   where "-r 5" specifies refreshing factor n/10000
+         "-U 4" specifies 4 segments for deletes and inserts
+  
+At this moment there is no QGEN for SSBM. So
+the command line options related to those features won't apply.
+
+3. What are the changes upon TPC-H dbgen
+
+changes made upon original TPC-H dbgen
+
+1. removed snowflake tables such as nation and region (done)
+2. removed the partsupply table (done)
+3. removed the order table (done)
+4. renamed the fact table as Lineorder and added/removed many fields
+( done)
+5. added the date dimension table (done)
+6. adding and removing fields in dimension tables (done)
+7. have data cross reference for supplycost, revenue in lineorder (done)
+8. apply the refreshing only to lineorder table (done)
+
+The command line option keeps the same as TPC-H dbgen (The -T options
+are changed to reflect different set of tables)
+
+===================== End of README ========================================
+
diff --git a/data/ssb/dbgen/TPCH_README b/data/ssb/dbgen/TPCH_README
new file mode 100644
index 0000000..9c8225f
--- /dev/null
+++ b/data/ssb/dbgen/TPCH_README
@@ -0,0 +1,425 @@
+# @(#)README	2.1.8.1
+
+Table of Contents
+===================
+ 0. What is this document?
+ 1. What is DBGEN?
+ 2. What will DBGEN create?
+ 3. How is DBGEN built?
+ 4. Command Line Options for DBGEN
+ 5. Building Large Data Sets with DBGEN
+ 6. DBGEN limitations and compliant usage
+ 7. Sample DBGEN executions
+ 8. What is QGEN?
+ 9. What will QGEN create?
+10. How is QGEN built?
+11. Command Line Options for QGEN
+12. Query Template Syntax
+13. Sample QGEN executions and Query Templates
+14. Environment variable
+15. Version Numbering in DBGEN and QGEN
+
+0. What is this document?
+
+This is the general README file for DBGEN and QGEN, the data-
+base population and executable query text generation programs 
+used in the TPC-H and TPC-R benchmarks. It covers the proper use 
+of DBGEN and QGEN. For information on porting the utility to your 
+particular platform see Porting.Notes.
+
+1. What is DBGEN?
+
+DBGEN is a database population program for use with the TPC-H and 
+TPC-R benchmarks.  It is written in ANSI 'C' for portability, and has 
+been successfully ported to over a dozen different systems. While the 
+TPC-H and TPC-R specifications allow an implementor to use any utility 
+to populate the benchmark database, the resultant population must exactly 
+match the output of DBGEN. The source code has been provided to make the 
+process of building a compliant database population as simple as possible.
+
+2. What will DBGEN create?
+
+Without any command line options, DBGEN will generate 8 separate ascii
+files. Each file will contain pipe-delimited load data for one of the
+tables defined in the TPC-H and TPC-R database schemas. The default tables 
+will contain the load data required for a scale factor 1 database. By 
+default the file will be created in the current directory and be 
+named <table>.tbl. As an example, customer.tbl will contain the 
+load data for the customer table.
+
+When invoked with the '-U' flag, DBGEN will create the data sets to be 
+used in the update functions and the SQL syntax required to delete the 
+data sets. The update files will be created in the same directory as 
+the load data files and will be named "u_<table>.set". The delete 
+syntax will be written to "delete.set". For instance, the data set to 
+be used in the third query set to update the lineitem table will be 
+named "u_lineitem.tbl.3", and the SQL to remove those rows will be 
+found in "delete.3". The size of the update files can be controlled 
+with the '-r' flag.
+
+3. How is DBGEN built?
+
+Create an appropriate makefile, using makefile.suite as a basis, 
+and type make.  Refer to Porting.Notes for more details and for 
+suggested compile time options.
+
+4. Command Line Options for DBGEN
+
+DBGEN's output is controlled by a combination of command line options
+and environment variables. Command line options are assumed to be single
+letter flags preceded by a minus sign. They may be followed by an
+optional argument.
+
+option  argument    default     action
+------  --------    -------     ------
+-h                              Display a usage summary
+
+-f      none                    Force. Existing data files will be
+                                overwritten.
+
+-F      none        yes         Flat file output.
+
+-D      none                    Direct database load. ld_XXXX() routines
+                                must be defined in load_stub.c
+
+-s      <scale>     1           Scale of the database population. Scale
+                                1.0 represents ~1 GB of data
+
+-T      <table>                 Generate the data for a particular table
+                                ONLY. Arguments: p -- part/partsupp, 
+                                c -- customer, s -- supplier, 
+                                o -- orders/lineitem, n -- nation, r -- region,
+                                l -- code (same as n and r),
+                                O -- orders, L -- lineitem, P -- part, 
+                                S -- partsupp
+
+-O      d                       Generate SQL for delete function 
+                                instead of key ranges
+
+-O      f                       Allow over-ride of default output file 
+                                names
+
+-O      h                       Generate headers in flat ascii files.
+                                hd_XXX routines must be defined in 
+                                load_stub.c
+
+-O      m                       Flat files generate fixed length records
+
+-O      r                       Generate key ranges for the UF2 update 
+                                function
+
+-O      v                       Verify data set without generating it.
+
+-r      <percentage>     10     Scale each update file to the given 
+                                percentage (expressed in basis points)
+                                of the data set
+
+-v      none                    Verbose. Progress messages are 
+                                displayed as data is generated.
+
+-n      <name>                  Use database <name> for in-line load
+
+-C      <children>              Use <children> separate processes to 
+                                generate data
+
+-S      <n>                     Generate the <n>th part of a multi-part load
+                                or update set
+
+-U      <updates>               Create a specified number of data sets
+                                in flat files for the update/delete 
+                                functions
+
+-i      <n>                     Split the inserted rows in a refresh pair 
+								between <n> files
+
+-d      <n>                     Split the deleted rows in a refresh pair
+								between <n> files
+
+5. DBGEN limitations and compliant usage
+
+DBGEN is meant to be a robust population generator for use with the 
+TPC-H and TPC-R benchmarks. It is hoped that DBGEN will make it easier 
+to experiment with and become proficient in the execution of TPC decision 
+support benchmarks.  As a result, it includes a number of command line 
+options which are not, strictly speaking, necessary to generate a compliant 
+data set for a TPC-D run. In addition, some command line options will accept 
+arguments which result in the generation of NON-COMPLIANT data sets. Options 
+which should be used with care include:
+
+-s -- scale factor. TPC-H/TPC-R runs are only compliant when run against SF's 
+      of 1, 10, 30, 100, 300, 1000 ....
+-r -- refresh percentage. TPC-H/TPC-R runs are only compliant when run with 
+      -r 10, the default.
+
+6. Sample DBGEN executions
+
+DBGEN has been built to allow as much flexibility as possible, but is
+fundamentally intended to generate two things: a database population 
+against which the queries in TPC-H and TPC-R can be run, and the updates 
+that are used during the update functions in TPC-H and TPC-R. Here are 
+some sample uses of DBGEN.
+
+  1. To generate the database population for the qualification database
+	dbgen -s 1
+  2. To generate the lineitem table only, for a scale factor 10 database,
+     and over-write any existing flat files:
+	dbgen -s 10 -f -T L
+  4. To generate a 100GB data set in 1GB pieces, generate only the part and 
+     partsupplier tables, and include some progress reports along the way:
+	dbgen -s 100 -S 1 -C 100 -T p -v (to generate the first 1GB file)
+	dbgen -s 100 -S 2 -C 100 -T p -v (to generate the second 1GB file)
+        (and so on, incrementing the argument to -S each time)
+  5. To generate the update files needed for a 4 stream run of the throughput
+     test at 100 GB, using an existing set of seed files from an 8 process 
+     load:
+	dbgen -s 100 -U 4 -C 8
+     
+
+7. What is QGEN?
+
+QGEN is a query generation program for use with the TPC-H and TPC-R benchmarks.
+It is written in ANSI 'C' for portability, and has been successfully
+ported to over a dozen different systems. While the benchmark specifications
+allow an implementor to use any utility to create the benchmark query
+sets, QGEN has been provided to make the process of building
+a benchmark implementation as simple as possible.
+
+8. What will QGEN create?
+
+QGEN is a filter, triggered by :'s. It does line-at-a-time reads of its
+input (more on that later), scanning for :foo, where foo determines the
+substitution that occurs. Including:
+
+:<int>          replace with the appropriate value for parameter <int>
+:b              replace with START_TRAN (from tpcd.h)
+:c              replace with SET_DBASE (from tpcd.h)
+:n<int>         replace with SET_ROWCOUNT(<int>) (from tpcd.h)
+:o              replace with SET_OUTPUT (from tpcd.h)
+:q              replace with query number
+:s              replace with stream number
+:x              replace with GEN_QUERY_PLAN (from tpcd.h)
+
+Qgen takes an assortment of command line options, controlling which of these
+options should be active during the translation from template to EQT, and a
+list of query "names". It then translates the template found in
+$DSS_QUERY/<name>.sql and puts the result on stdout.
+
+Here is a sample query template:
+
+{  Sccsid:     @(#)1.sql        9.1.1.1     1/25/95  10:51:56  }
+:n 0
+:o
+select
+ l_returnflag,
+ l_linestatus,
+ sum(l_quantity) as sum_qty,
+ sum(l_extendedprice) as sum_base_price,
+ sum(l_extendedprice * (1 - l_discount)) as sum_disc_price,
+ sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge,
+ avg(l_quantity) as avg_qty,
+ avg(l_extendedprice) as avg_price,
+ avg(l_discount) as avg_disc,
+ count(*) as count_order
+from lineitem
+where l_shipdate <= date '1998-12-01' - interval :1 day
+group by l_returnflag, l_linestatus
+order by l_returnflag, l_linestatus;
+
+And here is what is generated:
+$ qgen -d 1
+
+{return 0 rows}
+
+select
+ l_returnflag,
+ l_linestatus,
+ sum(l_quantity) as sum_qty,
+ sum(l_extendedprice) as sum_base_price,
+ sum(l_extendedprice * (1 - l_discount)) as sum_disc_price,
+ sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge,
+ avg(l_quantity) as avg_qty,
+ avg(l_extendedprice) as avg_price,
+ avg(l_discount) as avg_disc,
+ count(*) as count_order
+from lineitem
+where l_shipdate <= date('1998-12-01') - interval (90)  day to day
+group by l_returnflag, l_linestatus
+order by l_returnflag, l_linestatus;
+
+See "Query Template Syntax" below for more detail on converting your preferred query
+phrasing for use with QGEN.
+
+9. How is QGEN built?
+
+QGEN is built by the same makefile that creates DBGEN. If the makefile
+is successfully creating DBGEN, no further compilation modifications
+should be necessary. You may need to modify some of the options which
+allow QGEN to integrate with your preferred query tool. Refer to
+Porting.Notes for more detail.
+
+10. Command Line Options for QGEN
+
+Like DBGEN, QGEN is controlled by a combination of command line options
+and environment variables (See "Environment Variables", below for more
+detail).  Command line options are assumed to be single
+letter flags preceded by a minus sign. They may be followed by an
+optional argument.
+
+option  argument    default     action
+------  --------    -------     ------
+-c      none                    Retain comments in translation of template to
+                                EQT
+
+-d      none                    Default. Use the parameter substitutions
+                                required for query validation
+
+-h                              Display a usage summary
+
+-i      <file>                  Use contents of <file> to init a query stream
+
+-l      <file>                  Save query parameters to <file>
+
+-n      <name>                  Use database <name> for queries
+
+-N                              Always use default rowcount, and ignore :n directives
+
+-o      <path>                  Save query n's output in <path>/n.<stream>
+                                Uses -p option, and uses :o tag
+
+-p      <stream>                Use the query permutation defined for
+                                stream <stream>. If this option is
+                                omitted, EQT will be generated for the
+                                queries named on the command line.
+
+-r      <n>                     Seed the random number generator with <n>
+
+-s      <n>                     Set scale to <n> for parameter 
+                                substitutions.
+
+-t      <file>                  Use contents of <file> to complete a query 
+                                stream
+
+-T      none                    Use time table format for date substitution
+
+-v      none                    Verbose. Progress messages are 
+                                displayed as data is generated.
+
+-x      none                    Generate a query plan as part of query
+                                execution.
+
+11. Query Template Syntax
+
+QGEN is a simple ASCII text filter, meant to translate generalized
+query syntax("query template") into the executable query text(EQT) re-
+quired by the benchmarks. It provides a number of shorthands and syntactic 
+extensions that allow the automatic generation of query parameters and some 
+control over the operation of the benchmark implementation.
+
+QGEN first strips all comments from the query template, recognizing both
+{comment} and --comment styles. Next it traverses the query template
+one line at a time, locating required substitution points, called
+parameter tags. The values substituted for a given tag are summarized
+below.  QGEN does not support nested substitutions. That is, if
+the text substituted for tag itself contains a valid tag the second tag
+will not be expanded.
+
+Tag             Converted To            Based on
+===             ============            ========
+:c		database <dbname>;(1)   -n from the command line
+:x              set explain on;(1)      -x from the command line
+:<number>       parameter <number>
+:s              stream number
+:o              output to outpath/qnum.stream;(1)
+					-o from command line, -s from 
+                                        command line
+:b              BEGIN WORK;(1)          -a from command line
+:e              COMMIT WORK(1)          -a from command line
+:q              query number
+:n <number>                             sets rowcount to be returned 
+                                        to <number>, unless -N appears on the command line
+
+Notes:
+   (1)  This is Informix-specific syntax. Refer to Porting.Notes for
+   tailoring the generated text to your database environment.
+   
+12. Sample QGEN executions and Query Templates
+
+QGEN translates generic query templates into valid SQL. In addition, it 
+allows conditional inclusion of the commands necessary to connect to a 
+database, produce diagnostic output, etc. Here are some samples of QGEN
+usage, and the way that command line parameters and the query templates 
+interact to produce valid SQL.
+
+  Template, in $DSS_QUERY/1.sql:
+            :c
+            :o
+            select count(*) from foo;
+            :x
+            select count(*) from lineitem
+              where l_orderdate < ':1';
+
+  1. "qgen 1", would produce:
+      select count(*) from foo;
+      select count(*) from lineitem 
+        where l_orderdate < '1997-01-01'; 
+   Assuming that 1 January 1997 was a valid substitution for parameter 1.
+
+  2. "qgen -d -c dss1 1", would produce:
+      database dss1;
+      select count(*) from foo;
+      select count(*) from lineitem 
+        where l_orderdate < '1995-07-18'; 
+   Assuming that 18 July 1995 was the default substitution for parameter 1,
+    and using Informix syntax.
+
+  3. "qgen -d -c dss1 -x -o somepath 1", would produce:
+      database dss1;
+      output to "somepath/1.0"
+      select count(*) from foo;
+      set explain on;
+      select count(*) from lineitem 
+        where l_orderdate < '1995-07-18'; 
+   Assuming that 18 July 1995 was the default substitution for parameter 1,
+    and using Informix syntax.
+ 
+
+13. Environment Variables
+
+Environment variables are used to control features of DBGEN and QGEN 
+which are unlikely to change from one execution to another.
+
+Variable    Default     Action
+-------     -------     ------
+DSS_PATH    .           Directory in which to build flat files
+DSS_CONFIG  .           Directory in which to find configuration files
+DSS_DIST    dists.dss   Name of distribution definition file
+DSS_QUERY   .           Directory in which to find query templates
+
+14. Version Numbering in DBGEN and QGEN
+
+DBGEN and QGEN use a common version numbering algorithm. Each executable
+is stamped with a version number which is displayed in the usage messages
+available with the '-h' option. A version number is of the form:
+
+   V.R.P.M
+   | | | |
+   | | | |
+   | | | |
+   | | |  -- modification: alphabetic, incremented for any trivial changes 
+   | | |                   to the source (e.g, porting ifdef's)
+   | |  ---- patch level:  numeric, incremented for any minor bug fix
+   | |                     (e.g, qgen parameter range)
+   | ------- release:      numeric, incremented for each minor revision of the
+   |                       specification
+   |-------- version:      numeric, incremented for each major revision of the 
+                           specification
+
+An implementation of TPC-H or TPC-R is valid only if it conforms to the 
+following version usage rules:
+
+  -- The Version of DBGEN and QGEN must match the integer portion of the 
+     current specification revision
+
+The current revisions are:
+  DBGEN: 1.0.1
+  QGEN:  1.0.1
diff --git a/data/ssb/dbgen/bcd2.c b/data/ssb/dbgen/bcd2.c
new file mode 100644
index 0000000..30038ba
--- /dev/null
+++ b/data/ssb/dbgen/bcd2.c
@@ -0,0 +1,237 @@
+/* @(#)bcd2.c	2.1.8.1 */
+/*
+ * bcd2.c: conversion routines for multi-byte arithmetic
+ *
+ * defined routines:
+ * bin_bcd2(long binary, long *low_res, long *high_res)
+ * bcd2_bin(long *dest, long bcd)
+ * bcd2_add(long *bcd_low, long *bcd_high, long addend)
+ * bcd2_sub(long *bcd_low, long *bcd_high, long subend)
+ * bcd2_mul(long *bcd_low, long *bcd_high, long multiplier)
+ * bcd2_div(long *bcd_low, long *bcd_high, long divisor)
+ * long bcd2_mod(long *bcd_low, long *bcd_high, long modulo)
+ * long bcd2_cmp(long *bcd_low, long *bcd_high, long compare)
+ */
+#include <stdio.h>
+#include "bcd2.h"	/* for function prototypes */
+
+#define DIGITS_PER_LONG 7
+#define WORD_DIVISOR    10000000
+#define GET_DIGIT(num, low, high) \
+    ((num) >= DIGITS_PER_LONG)? \
+        (high & (0xF << (4 * ((num) - DIGITS_PER_LONG)))) \
+            >> (((num) - DIGITS_PER_LONG) * 4): \
+        (low & (0xF << (4 * (num)))) >> ((num) * 4)
+#define SET_DIGIT(value, num, low, high) \
+    if ((num) >= DIGITS_PER_LONG) \
+        { \
+        *high &= \
+            (0xFFFFFFF ^ (0xF << (4 * ((num) - DIGITS_PER_LONG)))); \
+        *high |= (value << (4 * ((num) - DIGITS_PER_LONG))); \
+        } \
+    else \
+        { \
+        *low = (*low & (0xFFFFFFF ^ (0xF << (4 * (num))))); \
+        *low |= (value << (4 * (num))); \
+        }
+/* bin_bcd2: pack the low 14 decimal digits of `binary` as 4-bit BCD digits,
+ * digits 0-6 into *low_res and digits 7-13 into *high_res.  Digits are
+ * peeled off arithmetically instead of via sprintf() into a 15-byte
+ * buffer, which overflowed for values wider than 14 digits.
+ * NOTE(review): assumes binary >= 0 -- confirm against callers.
+ * Always returns 0. */
+int 
+bin_bcd2(long binary, long *low_res, long *high_res)
+{
+    int digit;
+
+    *low_res = *high_res = 0;
+    for (digit = 0; digit < 2 * DIGITS_PER_LONG; digit++, binary /= 10) {
+        long *dest = (digit < DIGITS_PER_LONG) ? low_res : high_res;
+        *dest |= (binary % 10) << (4 * (digit % DIGITS_PER_LONG));
+    }
+    return(0);
+}
+
+/*
+ * bcd2_bin: decode one packed-BCD word of DIGITS_PER_LONG 4-bit digits
+ * (most significant digit in the highest nibble) into binary.  *dest is
+ * overwritten with the result, not added to.  Always returns 0.
+ */
+int
+bcd2_bin(long *dest, long bcd)
+{
+    int nibble;
+    long value = 0;
+
+    /* walk the nibbles from most to least significant */
+    for (nibble = DIGITS_PER_LONG - 1; nibble >= 0; nibble--)
+        value = value * 10 + ((bcd >> (4 * nibble)) & 0xF);
+
+    *dest = value;
+    return(0);
+}
+
+/* bcd2_add: add binary `addend` to the 14-digit BCD value in *bcd_low and
+ * *bcd_high, digit by digit from the least significant end.  Returns the
+ * carry out of the top digit. */
+int
+bcd2_add(long *bcd_low, long *bcd_high, long addend)
+{
+    long add_lo, add_hi, sum, carry = 0;
+    int pos;
+    bin_bcd2(addend, &add_lo, &add_hi);
+    for (pos = 0; pos < 14; pos++) {
+        sum = GET_DIGIT(pos, *bcd_low, *bcd_high);
+        sum += GET_DIGIT(pos, add_lo, add_hi);
+        sum += carry;
+        carry = sum / 10;
+        sum %= 10;
+        SET_DIGIT(sum, pos, bcd_low, bcd_high);
+    }
+    return(carry);
+}
+
+/* bcd2_sub: subtract binary `subend` from the 14-digit BCD value in
+ * *bcd_low / *bcd_high, least significant digit first.  The borrow is
+ * recomputed per digit (it used to stay set once raised, corrupting every
+ * higher digit).  Returns the borrow out of the top digit. */
+int
+bcd2_sub(long *bcd_low, long *bcd_high, long subend)
+{
+    long tmp_lo, tmp_hi, borrow = 0, res;
+    int digit;
+
+    bin_bcd2(subend, &tmp_lo, &tmp_hi);
+    for (digit = 0; digit < 14; digit++) {
+        res = GET_DIGIT(digit, *bcd_low, *bcd_high);
+        res -= GET_DIGIT(digit, tmp_lo, tmp_hi);
+        res -= borrow;
+        borrow = (res < 0);     /* cleared when this digit needs no borrow */
+        if (borrow)
+            res += 10;
+        SET_DIGIT(res, digit, bcd_low, bcd_high);
+    }
+    return(borrow);
+}
+
+/* bcd2_mul: multiply the 14-digit BCD value in *bcd_low / *bcd_high by
+ * binary `multiplier` using digit-by-digit long multiplication.  Digit
+ * products that would land above digit 13 are not stored.  Returns the
+ * carry left after the last digit product. */
+int
+bcd2_mul(long *bcd_low, long *bcd_high, long multiplier)
+{
+    long src_lo = *bcd_low, src_hi = *bcd_high;   /* saved multiplicand */
+    long mul_lo, mul_hi, carry = 0, mdigit, sdigit;
+    int i, j, pos, res;
+
+    bin_bcd2(multiplier, &mul_lo, &mul_hi);
+    *bcd_low = *bcd_high = 0;
+    for (i = 0; i < 14; i++) {          /* each multiplier digit */
+        mdigit = GET_DIGIT(i, mul_lo, mul_hi);
+        carry = 0;
+        for (j = 0; j < 14; j++) {      /* each multiplicand digit */
+            pos = i + j;
+            sdigit = GET_DIGIT(j, src_lo, src_hi);
+            res = mdigit * sdigit;
+            res += carry;
+            if (pos < 14) {
+                carry = GET_DIGIT(pos, *bcd_low, *bcd_high);
+                res += carry;
+            }
+            carry = res / 10;
+            res %= 10;
+            if (pos < 14) {
+                SET_DIGIT(res, pos, bcd_low, bcd_high);
+            }
+        }
+    }
+    return(carry);
+}
+
+/* bcd2_div: divide the 14-digit BCD value in *bcd_low / *bcd_high by binary
+ * `divisor`, working from the most significant digit down.  The quotient
+ * replaces the operand and the remainder is returned.
+ * NOTE(review): divisor == 0 is not checked here. */
+int
+bcd2_div(long *bcd_low, long *bcd_high, long divisor)
+{
+    long num_lo = *bcd_low, num_hi = *bcd_high;
+    long rem = 0, cur, pos;
+
+    *bcd_low = *bcd_high = 0;
+    for (pos = 13; pos >= 0; pos--) {
+        cur = GET_DIGIT(pos, num_lo, num_hi);
+        cur += 10 * rem;
+        rem = cur % divisor;
+        cur /= divisor;
+        SET_DIGIT(cur, pos, bcd_low, bcd_high);
+    }
+    return(rem);
+}
+
+/* bcd2_mod: repeatedly subtract `modulo` from a copy of the BCD pair until
+ * the high word is 0 and the low word is <= modulo; return that low word.
+ * NOTE(review): packed low word is compared with binary modulo -- confirm. */
+long
+bcd2_mod(long *bcd_low, long *bcd_high, long modulo)
+{
+    long rem_lo = *bcd_low, rem_hi = *bcd_high;
+    while (rem_hi != 0 || rem_lo > modulo)
+        bcd2_sub(&rem_lo, &rem_hi, modulo);
+    return(rem_lo);
+}
+
+/* bcd2_cmp: returns 1 when the decoded high word exceeds 214 (presumably
+ * the leading digits of 2^31 - 1 -- confirm); otherwise returns the decoded
+ * low word minus `comp`. */
+long
+bcd2_cmp(long *low1, long *high1, long comp)
+{
+    long high_val = 0, low_val = 0;
+    bcd2_bin(&high_val, *high1);
+    bcd2_bin(&low_val, *low1);
+    return((high_val > 214) ? 1 : low_val - comp);
+}
+
+#ifdef TEST_BCD
+#include <limits.h>
+/* Stand-alone smoke test for the BCD routines; built only with TEST_BCD. */
+int main(void)
+{
+long bin, low_bcd, high_bcd;
+int i;
+bin = INT_MAX;
+printf("%ld\n", bin);
+bin_bcd2(bin, &low_bcd, &high_bcd);
+printf("%ld  %ld\n", high_bcd, low_bcd);
+bin = 0;
+bcd2_bin(&bin, high_bcd);
+bcd2_bin(&bin, low_bcd);
+printf( "%ld\n", bin);
+for (i=9; i >= 0; i--)
+    printf("%dth digit in %ld is %ld\n", 
+        i, bin, GET_DIGIT(i, low_bcd, high_bcd));
+bcd2_add(&low_bcd, &high_bcd, INT_MAX);
+bin = 0;
+bcd2_bin(&bin, high_bcd);
+high_bcd = bin;
+bin = 0;
+bcd2_bin(&bin, low_bcd);
+low_bcd = bin;
+printf( "%ld%07ld\n", high_bcd, low_bcd);
+bin_bcd2(14, &low_bcd, &high_bcd);
+bcd2_mul(&low_bcd, &high_bcd, 23L);
+bin = 0;
+bcd2_bin(&bin, high_bcd);
+bcd2_bin(&bin, low_bcd);
+printf( "%ld\n", bin);
+bcd2_div(&low_bcd, &high_bcd, 10L);
+bin = 0;
+bcd2_bin(&bin, high_bcd);
+bcd2_bin(&bin, low_bcd);
+printf( "%ld\n", bin);
+return(0);
+}
+#endif /* TEST_BCD */
diff --git a/data/ssb/dbgen/bcd2.h b/data/ssb/dbgen/bcd2.h
new file mode 100644
index 0000000..6ea92a1
--- /dev/null
+++ b/data/ssb/dbgen/bcd2.h
@@ -0,0 +1,11 @@
+/* 
+ *  Sccsid:     @(#)bcd2.h	2.1.8.1 -- prototypes for the BCD routines in bcd2.c
+ */
+int bin_bcd2(long binary, long *low_res, long *high_res);	/* binary -> BCD low/high words */
+int bcd2_bin(long *dest, long bcd);	/* one BCD word -> binary, stored in *dest */
+int bcd2_add(long *bcd_low, long *bcd_high, long addend);	/* add binary addend; returns final carry */
+int bcd2_sub(long *bcd_low, long *bcd_high, long subend);	/* subtract binary subend; returns final carry flag */
+int bcd2_mul(long *bcd_low, long *bcd_high, long multiplier);	/* multiply by binary multiplier; returns final carry */
+int bcd2_div(long *bcd_low, long *bcd_high, long divisor);	/* divide by binary divisor; returns remainder */
+long bcd2_mod(long *bcd_low, long *bcd_high, long modulo);	/* operands untouched; returns reduced low word */
+long bcd2_cmp(long *bcd_low, long *bcd_high, long compare);	/* 1 if high word large, else low - compare */
diff --git a/data/ssb/dbgen/bcd2.o b/data/ssb/dbgen/bcd2.o
new file mode 100644
index 0000000000000000000000000000000000000000..469699c95d4bc145902e4505694f744398e5787d
GIT binary patch
literal 4536
zcmbtWeQX>@6`%E8>~m`G9S|j!646V1Ew)0Bb8NsVsJ*rC>UU)WhT5g54#vK7Vk`a#
z_8oFc<S5!CFuGhCRs0cDsp=o)OKQa*h~Na8+&R=Hp&;WUB?74m{YY%Nq)nryXSen7
z-t4Ts-nybHBhAj6-@LDR@6F5}is}!PC<?Qh!tP@$GYe&`QC*?Kf($d2RkB;&&n~t<
z1jZXRF<t<E3;5*(q}_MbAP?|JpsFqlQzaw-Q>9|a91N60rjqEhUi_RRbF(z{o{j{{
zk7ScyhKwF4N4|}GvG$U&SO)wCc=cP5-Uzu@+}?9pV>Se*pnMv5DHu7$ds;b?P0c_W
z?ImElX+2djPD;V}W6h(xF9-wV?3pSA&(exU>t5h*>07gqFF<C)6WCj0xq{o6a~liq
zr(OlVNhrBMlG?*syVfQ)>j(aU*z7+(wAoBszz3ORz#}Z`=|sSzr#k|^^RE?)#dD@z
z+Og?^h(PSj-aTfU-y7wZ`AseVdMy4zj8E#-DHh8w>07U&gRx9SDJXvf{-@ZxbC6ps
zK?-t(`=NG1JNyo6_o5I~N12{!cw(1wTd%zt<A32dWBJ$h@>$?frWskqm~<OcZY>io
z8imralVoqfX0KkDyYlhPxmH_Su&lW>qfj}P;nR?A6It0HY9dvM3GqL=TkFtvYu{Z-
z%O?wNKLs}t@M+@|-SZ;4XDPbc8KOn*wBJ*eX`;;LABymo`9l60#BT$C8>-tG<QMg=
zHz2<RwQpl2y+3{mxh2=(+Z0ve#XLa%6)1lRcnZUqQ#4~jp*R{@><gu1zd(@)&#h?w
zj8J@SS@CGt;;?D0x0Kdv#$x5z&mFPawklR&Osv=WTl2q@#N5N^-Ev)P-?oMDk+tg+
z@Gz#M;#_(lT}tN@Yqc#9rh;u0s$eQgk;BvK5(6HoM&hfX!xbjVuZcKMMFLwW=-MBM
zPkUZ`MLEy?4CyG(N2st~-4*|rUfmI>iRIs$dkz;e6~86c2%o`t#%G|qE#SdW&27T8
zVKGMdMY9^Ikg*Dt35Z{UzKe81Ah)oMPv;7E*5)9dlQq99{vW-10M&BkV(ldq7iweW
zmvuhC<}$*ScQoS#%-9=l1D(cvQQkgKlV8DasrSS;aTn+=?r#gvt$6H5k0boeC|`)s
zk~q4eprZW7{5L35fOm~q)!w5N6U^2lR%L6Q7a%?XeOc4-!nWMvowd_&%9un)sAh6_
z3>9NZ^&aU!`^FMF^6#Z$u{l%Wx5vhZvC+K0&MOl}vGUNfMiKcgBcBKrdP^4tRJ%TQ
zq_!^yDz_4-?Q=b=N=aOEG5#K2aWOulr@x@5@3yY*sWM8RcBX;8<5l=QUx6QSojp${
zZN8b`pfunao7A_?qG#sK>TZRf#@=0AuKrjxR{vEfpN#Q(I9eQi5R5l5HnSL;IX7fl
z6rc<0H>6A)D^y0e;M&T-QU<(!{xqL9(;%*G^K}BA|Ik&P#)l<>)rZdtr(65zOllqM
zyfXGV$n?{}IEZWfpZNHgCj+a%1IEFCkKx0x^U9%@9jCR-`P2x@RpKbFHtwNE(H+2G
z;aR*6=`RTb<8*Ps;jac>!f|X9Y4<EwSZWJuufm5+2&Bvh6VmPg)LxnYyO6&tRM*g-
zV63IN`Cflbdv|Iunex}w1sj5)d+JhRS~s?1M=;b-D`B+b#_q$u4tvYj+ZjI9S;EBW
z$8iI8YRY5ZV#=sb*<4ZPIf;(eAht!UFh<9kMOELi5>2f#ToE-~^5lB8N>hEB>e22H
z`a$yEE5|U79g^Nn8-GCcohWHit1_-8)qmWrsWr#eX==zQji{}z`<3--2q8`NBTJL&
zYa+#@<7w>FwEw9?T~mjSvabmZjD3y%Q_zpc$&VA{N5<Wx)*L54j+JU^qfr)7Pb#hh
zsHqV#bO?ED4xxAmdjt7`?XUDfeHqes+=b2`qkS3cO3tZ%43)VLGnTzl<}s349m`&O
zEI*ISKasp6PTo+7Zs(=tk7}gC&2ARgT6eAS2j~%uR3)n${l|UU#xH6Cv5nZNuAD7^
zz*_hh5hp#Cyj3UnN&LE0Bx(ie7Kum1zopDSVyzNCDhpWT6tV9}d|GZn<OZ>Bi9aQN
zP8k#NM{J+OtHgE3n20rEtUI3A5sb5--W+1xeS@7eXPuoR`-l4mlY2Vjz58gE?27Lr
zx^qugU%!y)L}rudN+e8TB-L#a11ZUq==*_0h9pmXU_V)9JH?mL`4ngKeY~)#S?7w*
zEN#uge<k?|r>$A|W?4IgQ@8N2jH8o()P~#n56k#E`Cqi*cK#P-+@1W_WZVfaxB0U{
z#@)%^V#A5F{6Fv!_>XM3Jq~`9rAFJ6TNHD;52#tVRgd2j=n8&a&YyAMRz1!-aH}30
zW&ADuRz3DRaH}3qIdH2US#adQK(ebF$K<d%_FB|%Pk&c1*)y7C!9-WGiv_z!Mp$ro
zh$h5d7p!B!_|U*W&mgi44kdenc=g{SidN3|4yJ<phllp}3?~nuC@Jje?c6imHP9nB
z_J0n3l&7u9^e<}*ODZk%W-Z7tSW~tcDXNX|GGY?Hyu%z@r2tKdH199Q=!b>c-Pj4Y
z-Z{dk{n=sHL``0crjhPp()mf{>rj^L@4#+vmVKuiayvqX_`}!<3t_jn87WBq*_%^7
zhKm+v=UZ|bmWxhax=)Ff>UdR?f%dNgSK@EM-$#OUI_3RRKI<@O<<FAuM`Eq~7b;AS
ztYoD8)4gKJTlrthHi=v%W^7jdkUFyClqWW4?h#@kS$|2I*qUQI>{Gb5mi#z9sMvmJ
K4$EJs{C@#d@HHI(

literal 0
HcmV?d00001

diff --git a/data/ssb/dbgen/bm_utils.c b/data/ssb/dbgen/bm_utils.c
new file mode 100644
index 0000000..5da29a0
--- /dev/null
+++ b/data/ssb/dbgen/bm_utils.c
@@ -0,0 +1,589 @@
+/* @(#)bm_utils.c	2.1.8.2 */
+/*
+ *
+ * Various routines that handle distributions, value selections and
+ * seed value management for the DSS benchmark. Current functions:
+ * env_config -- set config vars with optional environment override
+ * yes_no -- ask simple yes/no question and return boolean result
+ * a_rnd(min, max) -- random alphanumeric within length range
+ * pick_str(size, set) -- select a string from the set of size
+ * read_dist(file, name, distribution *) -- read named dist from file
+ * tbl_open(tbl, mode) -- std fopen with life noise
+ * julian(date) -- julian date correction
+ * rowcnt(tbl) -- proper scaling of given table
+ * e_str(set, min, max) -- build an embedded str
+ * agg_str() -- build a string from the named set
+ * dsscasecmp() -- version of strcasecmp()
+ * dssncasecmp() -- version of strncasecmp()
+ * getopt()
+ * set_state() -- initialize the RNG
+ */
+
+/*this has to be put on top...*/
+#ifdef LINUX
+/* turn on GNU extensions, incl O_DIRECT */
+/* O_LARGEFILE is defined in fcntl.h*/
+#define _GNU_SOURCE
+#endif
+
+#include "dss.h"
+#include <stdio.h>
+#include <time.h>
+#include <errno.h>
+#include <string.h>
+
+#ifdef HP
+#include <strings.h>
+#endif            /* HP */
+#include <ctype.h>
+#include <math.h>
+#ifndef _POSIX_SOURCE
+#include <malloc.h>
+#endif /* POSIX_SOURCE */
+
+#include <fcntl.h>
+
+#ifdef IBM
+#include <sys/mode.h>
+#endif /* IBM */
+#include <sys/types.h>
+#include <sys/stat.h>
+/* Lines added by Chuck McDevitt for WIN32 support */
+#if	(defined(WIN32)||defined(DOS))
+#ifndef _POSIX_
+#include <io.h>
+#ifndef S_ISREG
+
+#define S_ISREG(m) ( ((m) & _S_IFMT) == _S_IFREG )
+#define S_ISFIFO(m) ( ((m) & _S_IFMT) == _S_IFIFO )
+
+#endif 
+#endif
+#ifndef stat
+#define stat _stat
+#endif
+#ifndef fdopen
+#define fdopen _fdopen
+#endif
+#ifndef open
+#define open _open
+#endif
+#ifndef O_RDONLY
+#define O_RDONLY _O_RDONLY
+#endif
+#ifndef O_WRONLY
+#define O_WRONLY _O_WRONLY
+#endif
+#ifndef O_CREAT
+#define O_CREAT _O_CREAT
+#endif
+#endif
+/* End of lines added by Chuck McDevitt for WIN32 support */
+#include "dsstypes.h"
+
+
+static char alpha_num[65] =	/* 64 characters plus the terminating NUL; a_rnd indexes it with 6-bit values */
+"0123456789abcdefghijklmnopqrstuvwxyz ABCDEFGHIJKLMNOPQRSTUVWXYZ,";
+/* PROTO(s): keeps the parameter list when prototypes are available, otherwise expands to () */
+#if defined(__STDC__) || defined(__cplusplus)
+#define PROTO(s) s
+#else
+#define PROTO(s) ()
+#endif
+/* declarations for routines and data that are not defined in this part of the file */
+char     *getenv PROTO((const char *name));
+void usage();
+long *permute_dist(distribution *d, long stream);	/* agg_str passes the set first, then NULL */
+extern long Seed[];
+
+/*
+ * env_config: return the value of the environment variable named `var`,
+ * or the caller-supplied `dflt` when getenv() reports it as unset.
+ * The returned pointer is either getenv()'s result or `dflt` itself;
+ * nothing is copied.
+ */
+char     *
+env_config(char *var, char *dflt)
+{
+   static char *evar;
+
+   evar = getenv(var);
+   return (evar == NULL) ? dflt : evar;
+}
+
+/*
+ * yes_no: print `prompt` plus " [Y/N]: " and read replies from stdin until
+ * one starts with y/Y (returns 1) or n/N (returns 0).  Replies are read with
+ * a bounded fgets() rather than gets(); end of input or a read error counts as "no".
+ */
+long
+yes_no(char *prompt)
+{
+    char      reply[128];
+
+#ifdef WIN32
+/* Disable warning about conditional expression is constant */ 
+#pragma warning(disable:4127)
+#endif 
+    while (1)
+        {
+#ifdef WIN32
+#pragma warning(default:4127)
+#endif 
+        printf("%s [Y/N]: ", prompt);
+        if (fgets(reply, sizeof(reply), stdin) == NULL)
+            return (0);     /* no more input: do not loop on a stale buffer */
+        switch (*reply)
+            {
+            case 'y': case 'Y':
+                return (1);
+            case 'n': case 'N':
+                return (0);
+            default:
+                printf("Please answer 'yes' or 'no'.\n");
+            }
+        }
+}
+
+/*
+ * a_rnd: generate a random string in `dest`.  The length is drawn with
+ * RANDOM from [min, max] and the characters come from alpha_num (which
+ * currently includes a space and a comma).  One RANDOM draw supplies five
+ * characters: each character uses the low 6 bits of the draw, which is
+ * then shifted right by 6.  `column` is passed to RANDOM on every draw.
+ * The string is NUL-terminated; the length is returned.
+ */
+int
+a_rnd(int min, int max, int column, char *dest)
+{
+   long      pos, len, bits;
+
+   RANDOM(len, min, max, column);
+   for (pos = 0; pos < len; pos++, bits >>= 6)
+      {
+      if (pos % 5 == 0)
+        RANDOM(bits, 0, MAX_LONG, column);
+      dest[pos] = alpha_num[bits & 077];
+      }
+   dest[len] = '\0';
+   return (len);
+}
+
+/*
+ * e_str: write alphanumeric noise of random length in [min, max] into
+ * `dest`, then overwrite part of it, at a randomly selected offset, with a
+ * member picked from distribution `d`.  The member is copied without its NUL.
+ * NOTE(review): the offset range assumes the noise is longer than the
+ * picked string; otherwise the unsigned bound wraps -- confirm callers.
+ */
+void
+e_str(distribution *d, int min, int max, int stream, char *dest)
+{
+    char picked[MAXAGG_LEN + 1];
+    long offset;
+    int picked_len;
+
+    a_rnd(min, max, stream, dest);
+    pick_str(d, stream, picked);
+    picked_len = strlen(picked);
+    RANDOM(offset, 0, (strlen(dest) - 1 - picked_len), stream);
+    strncpy(dest + offset, picked, picked_len);
+}
+
+
+/*
+ * pick_str: draw a value with RANDOM between 1 and the weight of the last
+ * entry of `s` (passing `c` to RANDOM), find the first entry whose weight
+ * is not below it, and copy that entry's text into `target`.  Returns the index.
+ */
+int
+pick_str(distribution *s, int c, char *target)
+{
+    long      idx;
+    long      roll;
+
+    RANDOM(roll, 1, s->list[s->count - 1].weight, c);
+    for (idx = 0; s->list[idx].weight < roll; idx++)
+        ;
+    strcpy(target, s->list[idx].text);
+    return(idx);
+}
+
+/*
+ * unjulian: `date` is encoded as year * 1000 + day-of-year.  Returns the
+ * number of days from the first day of STARTDATE's year to `date`: 365
+ * (plus LEAP) for each whole year in between, plus day-of-year - 1.
+ */
+long
+unjulian(long date)
+{
+    long days = date % 1000 - 1;
+    int year;
+
+    for (year = STARTDATE / 1000; year < date / 1000; year++)
+        days += 365 + LEAP(year);
+    return(days);
+}
+
+/*
+ * julian: normalise a date in the year * 1000 + day-of-year encoding.
+ * `date` - STARTDATE is taken as an offset from STARTDATE; while it runs
+ * past the end of the current year (day 365, plus LEAP of that year) the
+ * base moves forward one year and the offset shrinks accordingly.
+ * Returns base + remaining offset.
+ */
+long
+julian(long date)
+{
+    long      remaining = date - STARTDATE;
+    long      base = STARTDATE;
+    long      year;
+    long      year_end;
+
+#ifdef WIN32
+/* Disable warning about conditional expression is constant */ 
+#pragma warning(disable:4127)
+#endif 
+
+    while (1)
+        {
+#ifdef WIN32 
+#pragma warning(default:4127)
+#endif 
+        year = base / 1000;
+        year_end = year * 1000 + 365 + LEAP(year);
+        if (base + remaining <= year_end)
+            break;
+        remaining -= year_end - base + 1;   /* overflow into next year */
+        base += 1000;
+        }
+    return (base + remaining);
+}
+
+/*
+* read_dist: load the distribution called `name` from a flat file into `target`
+* (the entries after its BEGIN line); should be rewritten to allow multiple dists in a file
+*/
+void
+read_dist(char *path, char *name, distribution *target)
+{
+FILE     *fp;
+char      line[256],
+         token[256],
+        *c;
+long      weight,
+         count = 0,
+         name_set = 0;
+
+    if (d_path == NULL)	/* no explicit path: open `path` under the configured directory */
+		{
+		sprintf(line, "%s%c%s", 
+			env_config(CONFIG_TAG, CONFIG_DFLT), PATH_SEP, path);
+		fp = fopen(line, "r");
+		OPEN_CHECK(fp, line);
+		}
+	else
+		{
+		fp = fopen(d_path, "r");
+		OPEN_CHECK(fp, d_path);
+		}
+    while (fgets(line, sizeof(line), fp) != NULL)
+        {
+        if ((c = strchr(line, '\n')) != NULL)
+            *c = '\0';
+        if ((c = strchr(line, '#')) != NULL)
+            *c = '\0';	/* everything from '#' on is dropped */
+        if (*line == '\0')
+            continue;
+        /* skip lines until "BEGIN <name>" has been seen (compared ignoring case) */
+        if (!name_set)
+            {
+            /* NOTE(review): strtok() can return NULL (e.g. blank-only line); confirm dsscasecmp copes */
+            if (dsscasecmp(strtok(line, "\n\t "), "BEGIN"))
+                continue;
+            if (dsscasecmp(strtok(NULL, "\n\t "), name))
+                continue;
+            name_set = 1;
+            continue;
+            }
+        else
+            {
+            if (!dssncasecmp(line, "END", 3))
+                {
+                /* NOTE(review): this path skips the count check and the permute reset below -- confirm */
+                fclose(fp);
+                return;
+                }
+            }
+        /* entries have the form text|weight; other lines are ignored */
+        if (sscanf(line, "%[^|]|%ld", token, &weight) != 2)
+            continue;
+        /* a "count|N" entry sizes the member list */
+        if (!dsscasecmp(token, "count"))
+            {
+            target->count = weight;
+            target->list =
+                (set_member *)
+                    malloc((size_t)(weight * sizeof(set_member)));
+            MALLOC_CHECK(target->list);
+            target->max = 0;
+            continue;
+            }
+        target->list[count].text =	/* NOTE(review): assumes the count entry came first and count < target->count */
+            (char *) malloc((size_t)(strlen(token) + 1));
+        MALLOC_CHECK(target->list[count].text);
+        strcpy(target->list[count].text, token);
+        target->max += weight;
+        target->list[count].weight = target->max;	/* weights are stored as running totals */
+
+        count += 1;
+        } /* while fgets() */
+    /* reached only when the file ended without an END line */
+    if (count != target->count)
+        {
+        fprintf(stderr, "Read error on dist '%s'\n", name);
+        fclose(fp);
+        exit(1);
+        }
+	target->permute = (long *)NULL;
+    fclose(fp);
+    return;
+}
+
+/*
+ * tbl_open: open the flat file of table `tbl` with stdio mode `mode`.
+ * The path is tdefs[tbl].name itself when it starts with PATH_SEP, else
+ * that name under the directory from env_config(PATH_TAG, PATH_DFLT).
+ * An existing regular file is opened for writing only after the user
+ * confirms (unless `force` is set).  Failures go through OPEN_CHECK.
+ */
+FILE     *
+tbl_open(int tbl, char *mode)
+{
+    char      prompt[256];
+    char      fullpath[256];
+    FILE     *f;
+    struct stat fstats;
+    int      retcode;
+
+    if (*tdefs[tbl].name == PATH_SEP)
+        strcpy(fullpath, tdefs[tbl].name);
+    else
+        sprintf(fullpath, "%s%c%s",
+            env_config(PATH_TAG, PATH_DFLT), PATH_SEP, tdefs[tbl].name);
+
+    /* stat() fails with ENOENT for a file that does not exist yet; clear
+     * fstats first so the S_ISREG/S_ISFIFO tests never read stack garbage */
+    memset(&fstats, 0, sizeof(fstats));
+    retcode = stat(fullpath, &fstats);
+    if (retcode && (errno != ENOENT))
+        {
+        fprintf(stderr, "stat(%s) failed.\n", fullpath);
+        exit(-1);
+        }
+    if (S_ISREG(fstats.st_mode) && !force && *mode != 'r' )
+        {
+        sprintf(prompt, "Do you want to overwrite %s ?", fullpath);
+        if (!yes_no(prompt))
+            exit(0);
+        }
+
+    if (S_ISFIFO(fstats.st_mode))
+        {
+        retcode =
+            open(fullpath, ((*mode == 'r')?O_RDONLY:O_WRONLY)|O_CREAT);
+        f = fdopen(retcode, mode);
+        }
+    else{
+#ifdef LINUX
+      /* allow large files on Linux: open() with O_LARGEFILE first, then
+       * wrap the descriptor with fdopen(); new files are created 0644 */
+      retcode =
+		  open(fullpath, ((*mode == 'r')?O_RDONLY:O_WRONLY)|O_CREAT|O_LARGEFILE,0644);
+        f = fdopen(retcode, mode);
+#else
+        f = fopen(fullpath, mode);
+#endif
+    }
+    OPEN_CHECK(f, fullpath);
+    if (header && columnar && tdefs[tbl].header != NULL)
+        tdefs[tbl].header(f);       /* table has a header routine: run it */
+    return (f);
+}
+
+
+/*
+ * agg_str(set, count, col, dest): build in `dest` a space-separated string
+ * of `count` members of `set`, chosen through permute_dist(), which gets
+ * `set` on the first call and NULL afterwards.  The trailing space is trimmed;
+ * with count <= 0 `dest` stays empty (the old code then wrote to dest[-1]).
+ */
+void
+agg_str(distribution *set, long count, long col, char *dest)
+{
+	distribution *d = set;
+	size_t len;
+	int i;
+
+	*dest = '\0';
+	for (i=0; i < count; i++)
+		{
+		strcat(dest, DIST_MEMBER(set,*permute_dist(d, col)));
+		strcat(dest, " ");
+		d = (distribution *)NULL;
+		}
+	len = strlen(dest);
+	if (len > 0)
+		*(dest + len - 1) = '\0';
+}
+/* dssncasecmp: compare at most n chars ignoring case; returns -1, 0 or 1.
+ * Bytes are widened via unsigned char so tolower() never sees a negative. */
+long
+dssncasecmp(char *s1, char *s2, int n)
+{
+    for (; n > 0; ++s1, ++s2, --n)
+        if (tolower((unsigned char) *s1) != tolower((unsigned char) *s2))
+            return ((tolower((unsigned char) *s1) < tolower((unsigned char) *s2)) ? -1 : 1);
+        else if (*s1 == '\0')
+            return (0);
+    return (0);
+}
+/* dsscasecmp: case-insensitive string compare; returns 0 when equal, else
+ * -1 or 1.  Bytes are widened via unsigned char before tolower(). */
+long
+dsscasecmp(char *s1, char *s2)
+{
+    for (; tolower((unsigned char) *s1) == tolower((unsigned char) *s2); ++s1, ++s2)
+        if (*s1 == '\0')
+            return (0);
+    return ((tolower((unsigned char) *s1) < tolower((unsigned char) *s2)) ? -1 : 1);
+}
+#ifndef STDLIB_HAS_GETOPT
+int optind = 0;
+int opterr = 0;
+char *optarg = NULL;
+
+/*
+ * getopt: stand-in for the library routine, compiled only when
+ * STDLIB_HAS_GETOPT is not defined.  Walks av[] looking for "-x" options
+ * listed in `opt`; a ':' after a letter in `opt` means the option takes an
+ * argument, which is copied into the BUFSIZ-byte buffer `optarg` from the
+ * rest of the same word or from the next word.  Returns the option letter,
+ * '?' for an unlisted option or a missing argument, -1 when options end.
+ * Fix: an attached argument ("-xVALUE") is now left NUL-terminated; the
+ * old code overwrote the terminator with the option letter.
+ */
+int
+getopt(int ac, char **av, char *opt)
+{
+    static char *nextchar = NULL;   /* scan position inside av[optind] */
+    char *cp;
+    char *out;
+
+    if (optarg == NULL) {
+        optarg = (char *)malloc(BUFSIZ);
+        MALLOC_CHECK(optarg);
+    }
+
+    if (!nextchar || *nextchar == '\0') {   /* move on to the next word */
+        optind++;
+        if (optind == ac)
+            return(-1);
+        nextchar = av[optind];
+        if (*nextchar != '-')
+            return(-1);
+        nextchar +=1;
+    }
+
+    if (nextchar && *nextchar == '-') {     /* -- termination */
+        optind++;
+        return(-1);
+    }
+
+    /* found an option */
+    cp = strchr(opt, *nextchar);
+    nextchar += 1;
+    if (cp == NULL) /* not defined for this run */
+        return('?');
+    if (*(cp + 1) == ':') {     /* option takes an argument */
+        if (*nextchar) {        /* argument attached to the letter */
+            out = optarg;
+            while (*nextchar)
+                *out++ = *nextchar++;
+            *out = '\0';
+        } else {                /* white space separated, use next arg */
+            if (++optind == ac)
+                return('?');
+            strcpy(optarg, av[optind]);
+        }
+        nextchar = NULL;
+    }
+    return(*cp);
+}
+#endif /* STDLIB_HAS_GETOPT */
+
+/* mk_ascdate: allocate TOTDATE strings of DATE_LEN bytes and fill entry i
+ * with the `alpha` text mk_time() yields for index i + 1.  Returns the
+ * table; every allocation goes through MALLOC_CHECK. */
+char **
+mk_ascdate(void)
+{
+    char **dates = (char**) malloc((size_t)(TOTDATE * sizeof (char *)));
+    dss_time_t when;
+    int day;
+
+    MALLOC_CHECK(dates);
+    for (day = 0; day < TOTDATE; day++) {
+        dates[day] = (char *)malloc(DATE_LEN * sizeof(char));
+        MALLOC_CHECK(dates[day]);
+        mk_time((long)(day + 1), &when);
+        strcpy(dates[day], when.alpha);
+    }
+    return(dates);
+}
+
+/*
+ * set_state() -- initialize the RNG so that appropriate data sets can be
+ * generated.  Computes how many rows of `table` each of `procs` children
+ * builds at scale factor `sf`, then calls the table's gen_seed routine
+ * (and its child table's, if it has one) once for each of the `step - 1`
+ * preceding steps.  *extra_rows receives the rows left over after the
+ * even split.  Note: assumes that tables are completely independent.
+ * Returns the number of rows to be generated by the named step, or 0
+ * when sf or step is 0.
+ */
+long
+set_state(int table, long sf, long procs, long step, long *extra_rows)
+{
+    long per_child, leftover;
+    int shared = (table == LINE) ? 1 : 0;  /* special case for shared seeds */
+    int done;
+
+    if (sf == 0 || step == 0)
+        return(0);
+
+    per_child = tdefs[table].base / procs;
+    if ((sf / procs) > (int)MAX_32B_SCALE)
+        INTERNAL_ERROR("SCALE OVERFLOW. RE-RUN WITH MORE CHILDREN.");
+    per_child *= sf;
+    leftover = (tdefs[table].base % procs) * sf;
+    per_child += leftover / procs;
+    for (done = 0; done < step - 1; done++) {
+        tdefs[table].gen_seed(shared, per_child);
+        /* need to set seeds of child in case there's a dependency */
+        /* NOTE: this assumes that the parent and child have the same base row count */
+        if (tdefs[table].child != NONE)
+            tdefs[tdefs[table].child].gen_seed(0, per_child);
+    }
+    *extra_rows = leftover % procs;
+    if (step > procs)   /* moving to the end to generate updates */
+        tdefs[table].gen_seed(*extra_rows);
+        /* NOTE(review): one-argument call, unlike the calls above -- confirm */
+
+    return(per_child);
+}
+
+
+
+
+
+
+
+
diff --git a/data/ssb/dbgen/bm_utils.o b/data/ssb/dbgen/bm_utils.o
new file mode 100644
index 0000000000000000000000000000000000000000..fccd1821b3a60a2bf2debaf0a5b010ce40c3db48
GIT binary patch
literal 12856
zcmbuE4|rSEb-=G|nHUo!(;DiL6%>$(O%q{-1Z;2`KU=o1Kz7pDNq{6ZiY&(_w&Y1q
zBxcUPPj;3!MZg%aY=e)jbX{B80UcjR0!?h3zx)ju2{2X)loo<RNJ~kaK;xZr-@VeY
zVr65y->3KP`@M6|J@?#u&%O6)U7fE!$6{eBS=e`3`bkm7MrWk+)goKXE@6f2bNdx%
znz`}$$H@4=Ah&ylmkcA-7HTaEoB4)gTkwtXWgAaeOG-t)-?<6y?m=I&1n%(ix4xv%
z&yD}|8Nb_w25iGOrdH##^n-n?)matNtE)omTvbi_+^XAS*KR|3o_J7bZR6i564k`q
ze8X<6d&x3wskUsms#h=e8J`K2BM%FWBQKh_VWDB<dGSuzzF1o%?T;TTHAan7nsJ01
zuRt42<=kj-7P}Aq!Csh$cDCDH;(kvs7VrjM<$)tSaDpfMocU%85*}y%b13Nj<%)JQ
zH3m-lK%h|hU|?`vWUmLb@C2w%wZcBI8H(}v?ozWE+V{2AVdHe!P<+V30|&fjO@T4#
zDX3@UZ#Yy}$c-1`AI#P4TZ@V|wS1fLq0tMtfjmBJL4kZ;`PSNdx$&0w)TjLQGmrt1
zxxDNJp0qx|D~Ds-ea7#1jE#-0+lhwz46}-nAEcZFof|(^3_Z(@!=!UTwR`X!$8^V5
za&*ijKJ=cI=QiYX<0GD!e_|*0ope%jK2J<`^2&lWZ9G|1C>$otHD2KH!9rfiPxO7p
z6A>tDb|csf&dN_YD|S6Ns(8t=VIIge>!}5mt50Bm7^g{9hSfQ8pU^+@WU4)>4zKVU
zUm2&2BgQK}V|ZN4N@2_zL&hhfj4#;*-DiAa9QPR?nMB&<HFo<FVnDZh?V(p-bX<%+
zyk}9Yg&V&%TN`-A8+e149puURV`8KTH;Z@pBB?`H<t~M8QU##TN};7zb8uHeODp8a
zRw7cNF~kFhd2;m_R=y{#7P~*6^q<!_&3C+Zs`)qn@{hY6+!*DjKH)=WCSgvFgbia2
zD=lnoK<LoKT-mi08e-0xD>n>vL+{}zwCs=Xv+!h-g~#_U<CRC&+`x@jjiKt3`*LG%
z^T2Do^T3q@Bj4jgr%N>Bj5c)UN>ADAD-X0)zGi=D$iDT!$TOz)v^_p!zqAK)(w@#k
zALqgxDtrCbq$ht&L>XW5uO8cSc5Ih_g@t)jt<kolzrD|0^ycF|a|+&k9NOd^-#X)U
zUN*!#hfpC;K7|c|icSvY#-?*)zxUJ#x4a16fH{+@9p^>Tx*i5r4zC=tKQw6HI>;?2
zyvd>pUird0D>n`f^RiJGsupkAAm6YLb&vFmcw$Dvf68IZ_!C4+GN}5UTY$O;d0?N^
zZso~K(R^RRnWL?*a$aZezYhJJoQ;$x=itp8D8?X9KJUCA-^P#SYc1OL>x76HZaXlL
zi$O2?%N%y`-4Gdmr%DMnFsR!^CSH+aA9xoOz)&kLVJ8@y!`3rq?>{}B6(YL@l&`-z
zQf+@|Z?%2v$@t!CrF6#+;Z?dyV;?u(H7hrU`M^8&{(51rXa>{WJT7z7c;Hi>^!yxR
z6@&=W2Qa^Liuu4Hy#NNe&^dO9CrqEECWC#T95q5T*svR2jMHK`Wes`(78PN^Q3e5x
zCpVieGpi6D;t63y+Ui{(D1)lB&pvQ5pq<s=3uw2!AD?{5A3E;`;7j&Pl^LFg$dJ~l
z!UQan_JN<B9UI#PMa`c4`kp-cG$>F_(YC`p1uMiNBwd%KT_9&xDI9XnVh8aYA}Tjt
z+G~O3{N&n3uW{CEeDO=PR1@|#CfXsW>ZzqobeA`wThpORhOG8|)?5dS%sm!ZJ-n5l
z#&)Ynwbzq7hKm}+YFwMeqK2jn99p|X>>6-U`9d?^Fvm@z++%zo{Ak}gIhWgap7tcH
zJK^4!#}ZXLwaU?0QU3w`8k~B(@?h+3Av<!B{ZTPTlO8rE#)QDb=F8()pIE}cZk&^{
zs(e{bOCEy?{*b788g8?46SJPKJQcHJ4gH5=@10u#v?d=`o(zX>IL2W>iZv1fx_8_{
z+3&0{XTCZ1_`sNclbqzHEyf-kDg79Iwdb(|_&$i>nDLphz-PvSqa`@}vxJ8(XF0}(
z<}9HBmQv9cvqq^yE@K=Fz<7bf^)F$x^Q6yNu5Hi#+Cxp-egP?y5Od^zIZAP9EFbR`
zh?61`nkEkn#>yl}PbUsR2Zm^&64C4r?yj?M{g5Y@f;Qumofs6<U=&XN2LL+@(8p@M
zg|L{r7vR+s!1Z7h)3-Jbd}!~#LwE+j4hZ)UX-=w2^-gqHDNjD=6!8gL;xo>s-5no=
zc-tw8KQBebHw>1EK4+<BtbsZ9kyYsR8g=<#uztC=ug(hfv7@D^K=#<t<3O}+=6J~F
zW}!#yu}w>11ws4FhXigphJFK!+1uXSMwl8tXEE$0Eb=Wr8Ct$K0ahgXioujI-Nlp7
zq%~-XI%_U$RWzf{>P;-n=Ucd8y`0Bk<XVpQLCn&&U5n07W-~ACpR|qMKpo>ZgNoQ}
z@4h-%f3Tm-Js_?IIFC}RL98B@0i1Otw(x`xwo~Ve_ex#5PSIj9opJGwAFC#dw#QlF
z9^CcuJmod$kmWF2o`Z%lukqn7Y@6vZ!}F_A?5|a@1f_-+Z0^u~zZx}Va7Vu3%d;0U
z7Tbec^NMs^!rb^6=3uo5U)r|AU)yuqwztk(Mp!UWqsN_qxKZT9p^)zrM~H!w_Wl-7
zt6|{$4?b|f-d~5coU?@izvsYyV?Y0UehMNw>;z7j**Um3TGztuPoBlsxa|j#)9?&C
z^@HMcWrF1N3iGrQAGO%;{b%s24|cX<>n?+!H?YT-bguUfeQ0g8{BcC%W_;OkEy$W{
zAlWg=PB|`GEJm6(_V6IsJ~9<g8otUe^=d8tr#$@~ZX6f!L<+S)<b$rMj*pJ%hha=7
zi(n)80ZfzeQkRvFJd5@g^F)<s(ilF03*htur@^(?GzLDsHR;1W&ihdKO`%;5%(0e&
z@{v5T(we<ak9;Lj$CBHxYg%@bgU#;>`lCUIKNMXRj5wzC2BXs)VaSBS(_GBOJk8Cm
z^R*VvN}?r!k|>Mne!a9LTIOi?cXkEaTm{S%cJzj0j#d7U?$E=I@ZG`4sz|3EbU-C@
z+4AnznBLhHbp_bGo?u88bF@Ph@ZPV(^`??G&;s?;&TFdoHn0dQxX{6B>KeRFtggw!
zN|t=TZ&_bSR~rk2V<DY2`n$Tq0i`LeVnNW~<_Jb2pgJ6Kv~@;x$F!2@Gz`PdwVJQa
zF>g`bf_mS)#V*H!x)}==HaQl1Tezcf-hw(uE%*973+kF&Abg%w?w)!5tl2l*SW)R;
z9%u`;ujuH!Yh_n=DBSZGk*FTKd)4aRdmLI#t*5TOfqQSc)z{cGZ~pfdG`B2VwD`8$
z@0fm$>W13dn;fMJmqWW^4!7Gi%T+$Z9TSi4wKHeB%4d~HEQ(@`#(0H0vxcUH4C2xt
zGF&S#6`40?xn}{htS+=nnLH^U_w&WT|6IPi4DWTe!g$VnTRvnf<PWjf7u4GdlQ}iE
z;?235&9Tv{*-GPi4Yol`PG5no6p&^suCW!?*z&cDIF!+`AS7HS7?iya%ifz?V{;^}
zHMY{td77<!;{}?nB0kAuyEA8Ffvo}vXaxz7RH|4h6d!~irh|9^8#m@aW$|2(tvu&}
z0$Z_WE7Y)BVe>JRfph{d6WNTl{rDV{h5t&?R!Doyn_yLv{YbKExX+jDQE)WYu^H{z
zh<=G%!6kV)OKkh|YHSDdg81-$fp893DVzgEJn(x^>PSHTfAr7%oX|w>nFU4vOX}%C
zJ-3XvC8t?92df?4`*J;+w=s8f4s(KN5!w!E5niZI9XVd*(%axwm6Ur82h;icvB9=J
zFDGu>1YLW7-aYV;yDcYMx3Y3jhlf&3U*ILfkJwExvT#_XJm&+kmt`5@zw_GzDsop@
zCUP9scb<eUwX)xjO(c?wySj`t8J;a{Qi^J+9L~WsS6Iw7F<TWe?^DDrk6kn#GlzJ_
zSvfwHh{rrJN9Y?|l`tBZ_w*Y)w<EtkjYJN|5%QbFJ$<PZ**u1Rr|G-mA&=cfA>?Z%
zk7aO-2dMwi4BjO9PcnFm<Uh~gcS!y~2ESAC$1-@k<d0|YF3F$G;1S8i@<`sTmi#bb
zm|ZKLFUX*b^#l0^87F3cGj<>F!gHh<dsxaVIT3Gxp}!`QKMF%|BL34X@=s>rgTT?w
zLvY6=Vl~|N!fzt|FJ<9JvT*#&gZfX&DRU5>F?|HTiS*;a0_FF~Ier3#A^j146Ul#>
zh2!B7^*k#VKo!sPB=5t;5t54OlO?|j7f?tduEYIO;1kt5B?~Xh!d+Q-VHQ3&3&(F-
z6WP<4g)hp&16lZ8S@`NKd`%V}2af*VB}1QB<G`PZEb?2j@W-T{TV&|#!Ons7L>776
zaAUnYr2HaNoV_S{A)J0NiM0&wZ)DN)P8R-t7XEP-emo2RA`8!haX(T2PR_!=2^`z|
zYg~XJiFFk2S7edDM#{e-j>YLcNO>0d8?$h_efM|ubog6Cv2Nz_)t2MMjhC5txgIaG
z@G=`OH{j()yj0+&5-)Bc?G{3AA>}re2nBAT!7U29MM<|P>aJwL(A}+paHzd=1zQo+
z;fcW+r8N|0t*t$g&XC^T8t7QbdSZGM2pmrWD-pEf`GomfBcV1HT^U@(+M?0ch(FX8
z?q<#K?1h6wPbbJm^#~klB7vS>7E}*i!4R?#!dU38SXZY%#PsD|tzkIAFda7jqApTB
z9*dxQIDLe|t#CT=>z&~cYi|<;I^bXujKJxoE7l$IM}(PJp)foO>LSS6iMrtGPMyIS
zDG+4-6)S{UJ;6wKOb@o=xkeZa6+)Xr0XV${x_dxDK=17dw(8-oFr4Jj2z;+zN5BTN
zY+GxOU+)0z*cND|9$v}X!LxX_iv&8*l+I{4P+8fEr9<tkTb#F8d!Q>E4YKZ)t^R1B
z&995B-q{^w(V&iw2huK1e$2l-9MN%{%4rCDEXFQ}-&gQ;2~#!XF~P8d-9nn*f)EmN
zTy2$nD?&)92hTBzo9nyDzfJfrAdd;<uYjMDe-$Al<X6H^@f?_^n6Tcd@Kd~ia13#Z
zPn8_?Tt&Et$m2UDZxH#b2|q?S)$=*w)XphrAf!w?uO^(@Suc5}oxOzP-cs55G~v|F
zVZy0BrwDftJr~OL80&QsehJ~z2%jQ3s+&r9DdDtTv$E)!Pxw_tPmu6Z!Z#64?fF;2
zuO{;UPB`_$V!59{`==B6Zo+Y?QhwG6$2p>SujJ^rKO_9_h<pLzPZ0STgzq98KWQla
z`w7Q0g5p=mtu5ADPIxKdZo+St9NRk+eqWl?daRCcJbNg<gmCJgpA!9eHc|HMB=UGh
zQv5~2XA}Mg(Q^ag?-G6^;lCw%@cW0d^Aa3nkg#32)T(+*B}e~IKbI4ZXF4TcOE|5!
zgK+BS2;tPv_eqZS;8L#iKTNod@E;IP{qr-TpZaGj;ndFO2&d!xE5h*%sq86~`!Mv+
z&4d>dK8^61S@@lV&mr<JW#RR5fkFRN6Zr*%+X-JRIohca-b>_hU#8l%p72Wu-%L1~
zrsV&Y@NW>lMRKf{&g)+gc{;BT5P3S@4ib6VuD6IhZPythPsfi1D}#i7qvPjd!s+<=
zmgHzZ9Y0eEr{f1c3yYLFZf6r-4HC+4RfJP}IN{Wu1(Ijlvy^aZPY2QSO;T?+k*^_q
z6_GC@ypPD^w-IIkCL&MuKS1QE{>KQXdUg?BOY|Qm+(Y;$S$L8Bu!R1pBJvf4*AYHn
za_p~5313L$shwSf*Aw}_CcJ_0M~Hqp4j(7{G9v#Z;k4hMA^Pb!+(|g?_m>E#@#iSv
z)c%Vx=tDxkmB3H+@6D2Dj^`S}sr^laQ~Pft`l<cP38(h2A)Na2--!Ni5j%GfPV3!G
z^w4@=CY<U&LO8YO9m46lV8Kp>gnpy`oRWo?6YhmF%FlBNr~Z#gp6UNS!m0l^5l;Q^
z0MSqV@Cf0wy+0v(Fstl*if~%*^MuoSha^Y;(0X4YoYs4gaO&sFF*rg(KTL<8vS)1;
z{t)5Ro*xljO!Pb<Iof$E;ZG6nBfJ3@AxN3_I0Q90zEcTf5hi5#L*HRCzq_Wn`cApq
ze4Ub4-xp70aP=K=x;!^2{p!15bp}`81@UiZOiEsT7sT&Wl&kMs?`3fHy{Z@$KvH_t
zcd6<OuD(k>lEKw?or4)%eZOI@Xm7XfUk-OTu#0<#%0+@*ewQ9xtuq&#%Kgl>JQ`)L
zNEjbb-0gxdFZ2;UteB7L?g~V8fPrv#cMv{{xI$q)=z_l&XNa#U@_9ul=IV*SM}vso
z3k9*#U`K0v#19_;{@=*~8ql?JTb|NaEpr`moT(SG8`LAiy*G~8>J%|Eg?RXfG%Dv4
z4r44Ux0B+3QG9=m<nnljNTeTU)T?-A{Wup<=l``l?)gzCo~39~^&h7Y@~VDyua+#c
z{1)8ILi&z*OI=m_Rrz`F2J>?GrR3g|<+T(sGmebs#qf;fanDOrDQ=n}srt;P^t0k^
z@B+)1D*<^gm*vsFG@&deb?+v5@j_mf%JP(9UX@dP4P++4FSGr3%JL2+DDTQ{g&%?p
z>dY*^UY0M;7#Nx5?}xGzjo*CPaP>blj^D}fQ{^#sW16UL<Xdp_3aJ7c2MLEWo<B0n
n<1rz_L?G}#JWeb~b21v1=|61dMCEUrA&Rrd%v<W3S^mEP0%tD-

literal 0
HcmV?d00001

diff --git a/data/ssb/dbgen/build.c b/data/ssb/dbgen/build.c
new file mode 100644
index 0000000..fd6cc87
--- /dev/null
+++ b/data/ssb/dbgen/build.c
@@ -0,0 +1,800 @@
+/* @(#)build.c	2.1.8.1 */
+/* Sccsid:     @(#)build.c	9.1.1.17     11/15/95  12:52:28 */
+/* stuff related to the customer table */
+#include <stdio.h>
+#include <string.h>
+#ifdef SSBM
+#include <time.h>
+#endif
+#ifndef VMS
+#include <sys/types.h>
+#endif
+#if defined(SUN)
+#include <unistd.h>
+#endif
+
+#if defined(LINUX)
+#include <unistd.h>
+#endif
+
+#include <math.h>
+
+#include "dss.h"
+#include "dsstypes.h"
+#include "bcd2.h"
+#ifdef ADHOC
+#include "adhoc.h"
+extern adhoc_t adhocs[];
+#endif /* ADHOC */
+
+#define LEAP_ADJ(yr, mnth)      \
+((LEAP(yr) && (mnth) >= 2) ? 1 : 0) /* 1 when LEAP(yr) holds and mnth >= 2, otherwise 0 */
+#define JDAY_BASE       8035    /* start from 1/1/70 a la unix */
+#define JMNTH_BASE      (-70 * 12) /* start from 1/1/70 a la unix */
+#define JDAY(date) ((date) - STARTDATE + JDAY_BASE + 1) /* offset of date from STARTDATE, rebased by JDAY_BASE + 1 */
+#define PART_SUPP_BRIDGE(tgt, p, s) \
+    { \
+    long tot_scnt = tdefs[SUPP].base * scale; \
+    tgt = (p + s *  (tot_scnt / SUPP_PER_PART +  \
+	(long) ((p - 1) / tot_scnt))) % tot_scnt + 1; \
+    } /* statement block: stores (expression % tot_scnt) + 1 into tgt; p and s are expanded without parentheses */
+#define RPRICE_BRIDGE(tgt, p) tgt = rpb_routine(p) /* stores rpb_routine(p) into tgt */
+#define V_STR(avg, sd, tgt)  a_rnd((int)(avg * V_STR_LOW), \
+(int)(avg * V_STR_HGH), sd, tgt) /* a_rnd() call bounded by avg*V_STR_LOW and avg*V_STR_HGH */
+#define TEXT(avg, sd, tgt)  \
+dbg_text(tgt, (int)(avg * V_STR_LOW),(int)(avg * V_STR_HGH), sd) /* dbg_text() call using the same two bounds */
+static void gen_phone PROTO((long ind, char *target, long seed));
+
+#ifdef SSBM
+static void gen_category PROTO((char *target, long seed));
+int gen_city PROTO((char *cityName, char *nationName));
+int gen_season PROTO((char * dest,int month,int day));
+int is_last_day_in_month PROTO((int year,int month,int day));
+int gen_holiday_fl PROTO((char * dest, int month, int day));
+int gen_city PROTO((char *cityName, char *nationName));
+int gen_color PROTO((char * source, char * dest));
+#endif
+
+
+/* Derive a part's retail price from its key p as the sum of three integer terms. */
+long
+rpb_routine(long p)
+{
+	const long base_price = 90000;
+	const long capped_part = (p / 10) % 20001;	/* limit contribution to $200 */
+	const long key_part = (p % 1000) * 100;
+
+	return base_price + capped_part + key_part;
+}
+
+/* Write a phone number of the form "CC-AAA-EEE-NNNN" into target (NUL terminated). */
+static void
+gen_phone(long ind, char *target, long seed)
+{
+	long	acode,		/* drawn from 100..999 */
+		exchg,		/* drawn from 100..999 */
+		number;		/* drawn from 1000..9999 */
+	RANDOM(acode, 100, 999, seed);
+	RANDOM(exchg, 100, 999, seed);
+	RANDOM(number, 1000, 9999, seed);
+	/* every value printed below is a long, so the conversions must be %ld rather than %d */
+	sprintf(target, "%02ld", 10 + (ind % NATIONS_MAX));
+	sprintf(target + 3, "%03ld", acode);
+	sprintf(target + 7, "%03ld", exchg);
+	sprintf(target + 11, "%04ld", number);
+	target[2] = target[6] = target[10] = '-';	/* replace the NULs left between the groups */
+}
+
+/* Write "MFGR" followed by two random numbers, each drawn from 1..5, into target. */
+static void
+gen_category(char *target, long seed){
+  long num1,num2;
+  RANDOM(num1,1,5,seed);
+  RANDOM(num2,1,5,seed);
+  strcpy(target,"MFGR");
+  sprintf(target + 4, "%01ld", num1);	/* num1 and num2 are long: %ld, not %d */
+  sprintf(target + 5, "%01ld", num2);
+}
+
+#ifdef SSBM
+long mk_cust(long n_cust, customer_t *c)	/* SSBM build: fill *c for customer n_cust; always returns 0 */
+{
+	long nation_idx;	/* random index into nations.list */
+
+	c->custkey = n_cust;
+	sprintf(c->name, C_NAME_FMT, C_NAME_TAG, n_cust);
+	c->alen = V_STR(C_ADDR_LEN, C_ADDR_SD, c->address);
+	RANDOM(nation_idx, 0, nations.count-1, C_NTRG_SD);
+	strcpy(c->nation_name, nations.list[nation_idx].text);
+	strcpy(c->region_name, regions.list[nations.list[nation_idx].weight].text);
+	gen_city(c->city, c->nation_name);
+	gen_phone(nation_idx, c->phone, (long)C_PHNE_SD);
+	pick_str(&c_mseg_set, C_MSEG_SD, c->mktsegment);
+	return 0; }
+
+#else
+/* Non-SSBM build: fill *c for customer number n_cust; always returns 0. */
+long
+mk_cust(long n_cust, customer_t *c)
+{
+	long nation_idx;	/* random index in 0..nations.count-1 */
+
+	c->custkey = n_cust;
+	sprintf(c->name, C_NAME_FMT, C_NAME_TAG, n_cust);
+	c->alen = V_STR(C_ADDR_LEN, C_ADDR_SD, c->address);
+	RANDOM(nation_idx, 0, (nations.count - 1), C_NTRG_SD);
+	c->nation_code = nation_idx;
+	gen_phone(nation_idx, c->phone, (long)C_PHNE_SD);
+	RANDOM(c->acctbal, C_ABAL_MIN, C_ABAL_MAX, C_ABAL_SD);
+	pick_str(&c_mseg_set, C_MSEG_SD, c->mktsegment);
+	c->clen = TEXT(C_CMNT_LEN, C_CMNT_SD, c->comment);
+	return 0;
+}
+#endif
+
+	/*
+	* build the sparse key *ok from index i and sequence number seq: calls ez_sparse(), or hd_sparse() when SUPPORT_64BITS is undefined and scale >= MAX_32B_SCALE
+*/
+void
+mk_sparse (long i, DSS_HUGE *ok, long seq)
+	{
+#ifndef SUPPORT_64BITS
+	if (scale < MAX_32B_SCALE)	/* this test only exists in builds without SUPPORT_64BITS */
+#endif
+		ez_sparse(i, ok, seq);	/* unconditional when SUPPORT_64BITS is defined */
+#ifndef SUPPORT_64BITS
+	else
+		hd_sparse(i, ok, seq);
+#endif
+	return;
+	}
+
+	/*
+	* Simple variant of mk_sparse: widen i into *ok, then splice seq into the
+	* key just above its low SPARSE_KEEP bits, in a gap SPARSE_BITS wide.
+*/
+void
+ez_sparse(long i, DSS_HUGE *ok, long seq)
+{
+	const long keep_mask = (1 << SPARSE_KEEP) - 1;	/* selects the low SPARSE_KEEP bits */
+	long kept;
+
+	LONG2HUGE(i, ok);
+	kept = (long)(i & keep_mask);
+	*ok >>= SPARSE_KEEP;
+	*ok <<= SPARSE_BITS;
+	*ok += seq;
+	*ok <<= SPARSE_KEEP;
+	*ok += kept;
+
+	return;
+}
+
+#ifndef SUPPORT_64BITS
+void
+hd_sparse(long i, DSS_HUGE *ok, long seq)	/* mk_sparse variant compiled only without SUPPORT_64BITS; stores into ok[0] and ok[1] */
+	{
+	long low_mask, seq_mask;
+	static int init = 0;
+	static DSS_HUGE *base, *res;	/* scratch values kept across calls */
+	
+	if (init == 0)	/* one-time setup of the two scratch values */
+		{
+		INIT_HUGE(base);
+		INIT_HUGE(res);
+		init = 1;
+		}
+	
+	low_mask = (1 << SPARSE_KEEP) - 1;
+	seq_mask = (1 << SPARSE_BITS) - 1;
+	bin_bcd2(i, base, base + 1);
+	HUGE_SET (base, res);
+	HUGE_DIV (res, 1 << SPARSE_KEEP);	/* same step order as ez_sparse(): divide, multiply, add seq, multiply, add low bits */
+	HUGE_MUL (res, 1 << SPARSE_BITS);
+	HUGE_ADD (res, seq, res);
+	HUGE_MUL (res, 1 << SPARSE_KEEP);
+	HUGE_ADD (res, *base & low_mask, res);
+	bcd2_bin (&low_mask, *res);	/* low_mask and seq_mask are passed by address here; presumably reused as output slots - confirm against bcd2_bin() */
+	bcd2_bin (&seq_mask, *(res + 1));
+	*ok = low_mask;
+	*(ok + 1) = seq_mask;
+	return;
+	}
+#endif
+
+#ifdef SSBM
+long
+mk_order(long index, order_t *o, long upd_num)	/* SSBM build: fill order *o and its o->lines lineorders; always returns 0 */
+	{
+	long      lcnt;
+	long      rprice;
+	long      ocnt;
+	long      tmp_date;
+	long      c_date;
+	long      clk_num;
+	long      supp_num;
+	static char **asc_date = NULL;
+	char tmp_str[2];
+	char **mk_ascdate PROTO((void));
+	int delta = 1;
+
+	if (asc_date == NULL)	/* the date-string table is built once and cached in a static */
+	    asc_date = mk_ascdate();
+
+	RANDOM(tmp_date, O_ODATE_MIN, O_ODATE_MAX, O_ODATE_SD);
+	strcpy(o->odate, asc_date[tmp_date - STARTDATE]);
+
+	mk_sparse (index, o->okey,
+		(upd_num == 0) ? 0 : 1 + upd_num / (10000 / refresh));
+	RANDOM(o->custkey, O_CKEY_MIN, O_CKEY_MAX, O_CKEY_SD);
+	while (o->custkey % CUST_MORTALITY == 0)	/* nudge the key by +1, then -1, ... until it is not a multiple of CUST_MORTALITY */
+	    {
+		o->custkey += delta;
+		o->custkey = MIN(o->custkey, O_CKEY_MAX);
+		delta *= -1;
+	    }
+	pick_str(&o_priority_set, O_PRIO_SD, o->opriority);
+	RANDOM(clk_num, 1, MAX((scale * O_CLRK_SCL), O_CLRK_SCL), O_CLRK_SD);	/* clk_num is drawn but not stored anywhere in this variant */
+	o->spriority = 0;
+	
+	o->totalprice = 0;
+	ocnt = 0;
+	
+	RANDOM(o->lines, O_LCNT_MIN, O_LCNT_MAX, O_LCNT_SD);
+	for (lcnt = 0; lcnt < o->lines; lcnt++)
+	    {
+		/* per-line fields: keys first, then the random measures */
+		HUGE_SET(o->okey, o->lineorders[lcnt].okey);
+		o->lineorders[lcnt].linenumber = lcnt + 1;
+		o->lineorders[lcnt].custkey = o->custkey;
+		RANDOM(o->lineorders[lcnt].partkey, L_PKEY_MIN, L_PKEY_MAX, L_PKEY_SD);
+		RANDOM(o->lineorders[lcnt].suppkey, L_SKEY_MIN, L_SKEY_MAX, L_SKEY_SD);
+				
+		RANDOM(o->lineorders[lcnt].quantity, L_QTY_MIN, L_QTY_MAX, L_QTY_SD);
+		RANDOM(o->lineorders[lcnt].discount, L_DCNT_MIN, L_DCNT_MAX, L_DCNT_SD);
+		RANDOM(o->lineorders[lcnt].tax, L_TAX_MIN, L_TAX_MAX, L_TAX_SD);
+
+		strcpy(o->lineorders[lcnt].orderdate,o->odate);
+
+		strcpy(o->lineorders[lcnt].opriority,o->opriority);
+		o->lineorders[lcnt].ship_priority = o->spriority;
+
+		RANDOM(c_date, L_CDTE_MIN, L_CDTE_MAX, L_CDTE_SD);
+		c_date += tmp_date;        /* commit date = order date plus a random offset */
+		strcpy(o->lineorders[lcnt].commit_date, asc_date[c_date - STARTDATE]);
+
+		pick_str(&l_smode_set, L_SMODE_SD, o->lineorders[lcnt].shipmode);
+		
+		RPRICE_BRIDGE( rprice, o->lineorders[lcnt].partkey);
+		o->lineorders[lcnt].extended_price = rprice * o->lineorders[lcnt].quantity;
+		o->lineorders[lcnt].revenue = o->lineorders[lcnt].extended_price * ((long)100-o->lineorders[lcnt].discount)/(long)PENNIES;
+		
+		//round off problem with linux if use 0.6
+		o->lineorders[lcnt].supp_cost = 6 * rprice /10;
+		
+		o->totalprice +=
+		    ((o->lineorders[lcnt].extended_price * 
+		      ((long)100 - o->lineorders[lcnt].discount)) / (long)PENNIES ) *
+		    ((long)100 + o->lineorders[lcnt].tax)
+		    / (long)PENNIES;
+	    }
+	/* second pass: the order total is only known once every line has been priced */
+	for (lcnt = 0; lcnt < o->lines; lcnt++)
+	    {
+		o->lineorders[lcnt].order_totalprice = o->totalprice;
+	    }
+	return (0);
+	}
+#else
+long
+mk_order(long index, order_t *o, long upd_num)	/* non-SSBM build: fill order *o and its o->lines line items; always returns 0 */
+	{
+	long      lcnt;
+	long      rprice;
+	long      ocnt;
+	long      tmp_date;
+	long      s_date;
+	long      r_date;
+	long      c_date;
+	long      clk_num;
+	long      supp_num;
+	static char **asc_date = NULL;
+	char tmp_str[2];
+	char **mk_ascdate PROTO((void));
+	int delta = 1;
+	
+	if (asc_date == NULL)	/* the date-string table is built once and cached in a static */
+        asc_date = mk_ascdate();
+	mk_sparse (index, o->okey,
+		(upd_num == 0) ? 0 : 1 + upd_num / (10000 / refresh));
+    RANDOM(o->custkey, O_CKEY_MIN, O_CKEY_MAX, O_CKEY_SD);
+    while (o->custkey % CUST_MORTALITY == 0)	/* nudge the key by +1, then -1, ... until it is not a multiple of CUST_MORTALITY */
+		{
+		o->custkey += delta;
+		o->custkey = MIN(o->custkey, O_CKEY_MAX);
+		delta *= -1;
+		}
+	
+	
+    RANDOM(tmp_date, O_ODATE_MIN, O_ODATE_MAX, O_ODATE_SD);
+    strcpy(o->odate, asc_date[tmp_date - STARTDATE]);
+	
+    pick_str(&o_priority_set, O_PRIO_SD, o->opriority);
+	RANDOM(clk_num, 1, MAX((scale * O_CLRK_SCL), O_CLRK_SCL), O_CLRK_SD);
+    sprintf(o->clerk, O_CLRK_FMT,
+        O_CLRK_TAG,
+        clk_num);
+    o->clen = TEXT(O_CMNT_LEN, O_CMNT_SD, o->comment);
+#ifdef DEBUG
+	if (o->clen > O_CMNT_MAX) fprintf(stderr, "comment error: O%ld\n", index);	/* index is a long, hence %ld */
+#endif /* DEBUG */
+    o->spriority = 0;
+	
+    o->totalprice = 0;
+    o->orderstatus = 'O';
+    ocnt = 0;	/* counts the lines whose lstatus is set to 'F' */
+	
+	RANDOM(o->lines, O_LCNT_MIN, O_LCNT_MAX, O_LCNT_SD);
+    for (lcnt = 0; lcnt < o->lines; lcnt++)
+		{
+        HUGE_SET(o->okey, o->l[lcnt].okey);
+        o->l[lcnt].lcnt = lcnt + 1;
+	RANDOM(o->l[lcnt].quantity, L_QTY_MIN, L_QTY_MAX, L_QTY_SD);
+	RANDOM(o->l[lcnt].discount, L_DCNT_MIN, L_DCNT_MAX, L_DCNT_SD);
+        RANDOM(o->l[lcnt].tax, L_TAX_MIN, L_TAX_MAX, L_TAX_SD);
+        pick_str(&l_instruct_set, L_SHIP_SD, o->l[lcnt].shipinstruct);
+        pick_str(&l_smode_set, L_SMODE_SD, o->l[lcnt].shipmode);
+        o->l[lcnt].clen = TEXT(L_CMNT_LEN, L_CMNT_SD, o->l[lcnt].comment);
+        RANDOM(o->l[lcnt].partkey, L_PKEY_MIN, L_PKEY_MAX, L_PKEY_SD);
+        RPRICE_BRIDGE( rprice, o->l[lcnt].partkey);
+        RANDOM(supp_num, 0, 3, L_SKEY_SD);
+        PART_SUPP_BRIDGE( o->l[lcnt].suppkey, o->l[lcnt].partkey, supp_num);
+        o->l[lcnt].eprice = rprice * o->l[lcnt].quantity;
+		
+        o->totalprice +=
+            ((o->l[lcnt].eprice * 
+            ((long)100 - o->l[lcnt].discount)) / (long)PENNIES ) *
+            ((long)100 + o->l[lcnt].tax)
+            / (long)PENNIES;
+		
+		RANDOM(s_date, L_SDTE_MIN, L_SDTE_MAX, L_SDTE_SD);
+        s_date += tmp_date;	/* ship date is offset from the order date */
+		RANDOM(c_date, L_CDTE_MIN, L_CDTE_MAX, L_CDTE_SD);
+        c_date += tmp_date;	/* commit date is offset from the order date */
+		RANDOM(r_date, L_RDTE_MIN, L_RDTE_MAX, L_RDTE_SD);
+        r_date += s_date;	/* receipt date is offset from the ship date */
+		
+        
+        strcpy(o->l[lcnt].sdate, asc_date[s_date - STARTDATE]);
+        strcpy(o->l[lcnt].cdate, asc_date[c_date - STARTDATE]);
+        strcpy(o->l[lcnt].rdate, asc_date[r_date - STARTDATE]);
+		
+		/* return flag: picked from l_rflag_set when the receipt date is not after CURRENTDATE, else 'N' */
+        if (julian(r_date) <= CURRENTDATE) 
+			{
+            pick_str(&l_rflag_set, L_RFLG_SD, tmp_str);
+            o->l[lcnt].rflag[0] = *tmp_str;
+			}
+        else 
+            o->l[lcnt].rflag[0] = 'N';
+		/* line status: 'F' when the ship date is not after CURRENTDATE, else 'O' */
+        if (julian(s_date) <= CURRENTDATE) 
+			{
+            ocnt++;
+            o->l[lcnt].lstatus[0] = 'F';
+			}
+        else 
+            o->l[lcnt].lstatus[0] = 'O';
+		}
+	/* order status stays 'O' with no 'F' lines, becomes 'P' with some, 'F' when all lines are 'F' */
+    if (ocnt > 0)
+        o->orderstatus = 'P';
+    if (ocnt == o->lines)
+        o->orderstatus = 'F';
+	
+	return (0);
+}
+#endif
+
+#ifdef SSBM
+long mk_part(long index, part_t *p)	/* SSBM build: fill *p for part number index; always returns 0 */
+{
+	long      mfgr,cat,brnd;
+
+	p->partkey = index;
+
+	agg_str(&colors, (long)P_NAME_SCL, (long)P_NAME_SD, p->name);
+
+	/* move the first color word of p->name into p->color; the remainder stays in p->name */
+	p->clen = gen_color(p->name,p->color);
+
+	/* mfgr, cat and brnd are long, so they are printed with %ld rather than %d */
+	RANDOM(mfgr, P_MFG_MIN, P_MFG_MAX, P_MFG_SD);
+	sprintf(p->mfgr, "%s%ld", "MFGR#", mfgr);
+
+	RANDOM(cat, P_CAT_MIN, P_CAT_MAX, P_CAT_SD);
+	sprintf(p->category, "%s%ld", p->mfgr,cat);
+
+	/* the brand text is the category text followed by the brand number */
+	RANDOM(brnd, P_BRND_MIN, P_BRND_MAX, P_BRND_SD);
+	sprintf(p->brand,"%s%ld",p->category,brnd);
+
+	p->tlen = pick_str(&p_types_set, P_TYPE_SD, p->type);	/* first holds the picked list index */
+	p->tlen = strlen(p_types_set.list[p->tlen].text);	/* then the length of that entry's text */
+	RANDOM(p->size, P_SIZE_MIN, P_SIZE_MAX, P_SIZE_SD);
+
+	pick_str(&p_cntr_set, P_CNTR_SD, p->container);
+
+
+	return (0);
+}
+#else
+long
+mk_part(long index, part_t *p)	/* non-SSBM build: fill *p and its SUPP_PER_PART supplier entries; always returns 0 */
+	{
+	long      temp;
+	long      snum;
+	long      brnd;
+	
+	p->partkey = index;
+	agg_str(&colors, (long)P_NAME_SCL, (long)P_NAME_SD, p->name); 
+	RANDOM(temp, P_MFG_MIN, P_MFG_MAX, P_MFG_SD);	/* temp holds the manufacturer number */
+	sprintf(p->mfgr, P_MFG_FMT, P_MFG_TAG, temp);
+	RANDOM(brnd, P_BRND_MIN, P_BRND_MAX, P_BRND_SD);
+	sprintf(p->brand, P_BRND_FMT,
+		P_BRND_TAG,
+		(temp * 10 + brnd));	/* brand number combines the manufacturer number and brnd */
+	p->tlen = pick_str(&p_types_set, P_TYPE_SD, p->type);	/* first holds the picked list index */
+	p->tlen = strlen(p_types_set.list[p->tlen].text);	/* then the length of that entry's text */
+	RANDOM(p->size, P_SIZE_MIN, P_SIZE_MAX, P_SIZE_SD);
+	pick_str(&p_cntr_set, P_CNTR_SD, p->container);
+	RPRICE_BRIDGE( p->retailprice, index);
+	p->clen = TEXT(P_CMNT_LEN, P_CMNT_SD, p->comment);
+	/* one entry of p->s per supplier slot of this part */
+	for (snum = 0; snum < SUPP_PER_PART; snum++)
+		{
+		p->s[snum].partkey = p->partkey;
+		PART_SUPP_BRIDGE( p->s[snum].suppkey, index, snum);
+		RANDOM(p->s[snum].qty, PS_QTY_MIN, PS_QTY_MAX, PS_QTY_SD);
+		RANDOM(p->s[snum].scost, PS_SCST_MIN, PS_SCST_MAX, PS_SCST_SD);
+		p->s[snum].clen = TEXT(PS_CMNT_LEN, PS_CMNT_SD, p->s[snum].comment);
+		}
+	return (0);
+	}
+#endif
+
+
+#ifdef SSBM
+/* SSBM build: fill *s for supplier number index; always returns 0. */
+long
+mk_supp(long index, supplier_t *s)
+{
+	long nation_idx;	/* random index into nations.list */
+
+	s->suppkey = index;
+	sprintf(s->name, S_NAME_FMT, S_NAME_TAG, index);
+	s->alen = V_STR(S_ADDR_LEN, S_ADDR_SD, s->address);
+	RANDOM(nation_idx, 0, nations.count-1, S_NTRG_SD);
+	strcpy(s->nation_name, nations.list[nation_idx].text);
+	strcpy(s->region_name, regions.list[nations.list[nation_idx].weight].text);
+	gen_city(s->city, s->nation_name);
+	/* NOTE(review): the customer phone seed C_PHNE_SD is used here while the non-SSBM mk_supp uses S_PHNE_SD; kept as is so generated data does not change - confirm intent */
+	gen_phone(nation_idx, s->phone, (long)C_PHNE_SD);
+
+	return 0;
+}
+#else
+long
+mk_supp(long index, supplier_t *s)	/* non-SSBM build: fill *s for supplier number index; always returns 0 */
+	{
+	long     i,
+		bad_press,
+		noise,
+		offset,
+		type;
+	
+	s->suppkey = index;
+	sprintf(s->name, S_NAME_FMT, S_NAME_TAG, index); 
+	s->alen = V_STR(S_ADDR_LEN, S_ADDR_SD, s->address);
+	RANDOM(i, 0, nations.count - 1, S_NTRG_SD);	/* i is the random nation index */
+	s->nation_code= i;
+	gen_phone(i, s->phone, S_PHNE_SD);
+	RANDOM(s->acctbal, S_ABAL_MIN, S_ABAL_MAX, S_ABAL_SD);
+	
+	s->clen = TEXT(S_CMNT_LEN, S_CMNT_SD, s->comment);
+	/* these calls should really move inside the if stmt below, 
+	* but this will simplify seedless parallel load 
+	*/
+	RANDOM(bad_press, 1, 10000, BBB_CMNT_SD);
+	RANDOM(type, 0, 100, BBB_TYPE_SD);
+	RANDOM(noise, 0, (s->clen - BBB_CMNT_LEN), BBB_JNK_SD);
+	RANDOM(offset, 0, (s->clen - (BBB_CMNT_LEN + noise)),
+		BBB_OFFSET_SD);
+	if (bad_press <= S_CMNT_BBB)	/* only in this case is the generated comment overwritten */
+		{
+		type = (type < BBB_DEADBEATS) ?0:1;	/* 0 selects BBB_COMPLAIN, 1 selects BBB_COMMEND */
+        memcpy(s->comment + offset, BBB_BASE, BBB_BASE_LEN);
+        if (type == 0)
+  			memcpy(s->comment + BBB_BASE_LEN + offset + noise, 
+			BBB_COMPLAIN, BBB_TYPE_LEN); 
+        else
+			memcpy(s->comment + BBB_BASE_LEN + offset + noise, 
+			BBB_COMMEND, BBB_TYPE_LEN); 
+		}
+	
+	return (0);
+	}
+#endif
+
+struct
+	{	
+	char     *mdes;	/* three-letter month abbreviation */
+	long      days;	/* number of days in the month (February listed as 28) */
+	long      dcnt;	/* running total of days through the end of this month */
+	}         
+months[] =
+		/* entry 0 is a placeholder so that months[1] is January */
+	{
+		{NULL, 0, 0},
+		{"JAN", 31, 31},
+		{"FEB", 28, 59},
+		{"MAR", 31, 90},
+		{"APR", 30, 120},
+		{"MAY", 31, 151},
+		{"JUN", 30, 181},
+		{"JUL", 31, 212},
+		{"AUG", 31, 243},
+		{"SEP", 30, 273},
+		{"OCT", 31, 304},
+		{"NOV", 30, 334},
+		{"DEC", 31, 365}
+		};
+	
+long
+mk_time(long index, dss_time_t *t)	/* fill *t for day number index; always returns 0 */
+	{
+		long      m = 0;
+		long      y;
+		long      d;
+		/* y and d are the quotient and remainder of julian() by 1000; presumably year and day-of-year - confirm against julian() */
+		t->timekey = index + JDAY_BASE;
+		y = julian(index + STARTDATE - 1) / 1000;
+		d = julian(index + STARTDATE - 1) % 1000;
+		while (d > months[m].dcnt + LEAP_ADJ(y, m))	/* advance m to the first month whose adjusted running total reaches d */
+			m++;
+		PR_DATE(t->alpha, y, m,
+			d - months[m - 1].dcnt - ((LEAP(y) && m > 2) ? 1 : 0));
+		t->year = 1900 + y;
+		t->month = m + 12 * y + JMNTH_BASE;
+		t->week = (d + T_START_DAY - 1) / 7 + 1;
+		t->day = d - months[m - 1].dcnt - LEAP_ADJ(y, m-1);	/* day within month: subtract the days of all earlier months */
+		
+		return (0);
+		}
+	
+	int mk_nation(long index, code_t *c)	/* fill *c from nations.list entry index-1; always returns 0 */
+	{
+		const long slot = index - 1;	/* position inside nations.list */
+		c->code = slot;
+		c->text = nations.list[slot].text;
+		c->join = nations.list[slot].weight;
+		c->clen = TEXT(N_CMNT_LEN, N_CMNT_SD, c->comment);
+		return 0;
+	}
+	
+	/* Fill *c from regions.list entry index-1; always returns 0. */
+	int mk_region(long index, code_t *c)
+	{
+		const long slot = index - 1;	/* position inside regions.list */
+		c->code = slot;
+		c->text = regions.list[slot].text;
+		c->join = 0;        /* for completeness */
+		c->clen = TEXT(R_CMNT_LEN, R_CMNT_SD, c->comment);
+		return 0;
+	}
+
+
+#ifdef SSBM
+		/* gen_city: cityName = first CITY_FIX-1 chars of nationName (blank padded) + one random digit; returns 0 */
+int gen_city(char *cityName, char *nationName){
+    int i;
+    long randomPick;
+    int nlen = strlen(nationName);
+
+    /* cityName is an output buffer that may be uninitialized on entry, so its contents are never read */
+    strncpy(cityName,nationName,CITY_FIX-1);
+
+    /* strncpy pads a short name with NULs; turn that padding into blanks */
+    for(i = nlen ; i< CITY_FIX-1;i++)
+        cityName[i] = ' ';
+
+    RANDOM(randomPick, 0, 9, 98);
+    /* randomPick is a long, so the conversion must be %ld rather than %d */
+    sprintf(cityName+CITY_FIX-1,"%ld",randomPick);
+    cityName[CITY_FIX] = '\0';
+    return 0;
+}
+
+
+/*
+P_NAME is as long as 55 bytes in TPC-H, which is unreasonably large. 
+We reduce it to 22 by limiting to a concatenation of two colors (see [TPC-H], pg 94). 
+We also add a new column named P_COLOR that could be used in queries where currently a 
+color must be chosen by substring from P_NAME.
+*/
+/* Split "first rest" at the first blank: dest receives "first", source is shifted down to "rest"; returns strlen(dest). */
+int gen_color(char * source, char * dest){
+  int i = 0,j=0;
+
+  /* also stop at the terminator: a source without any blank must not be read past its end */
+  while(source[i] != ' ' && source[i] != '\0'){
+      dest[i]=source[i];
+      i++;
+  }
+  dest[i]='\0';
+
+  if(source[i] == ' ')
+      i++;	/* step over the separating blank only when there is one */
+  while(source[i] != '\0'){
+	source[j] = source[i];
+	j++;
+	i++;
+  }
+  source[j] = '\0';
+
+  return (int)strlen(dest);
+}
+
+
+
+/* Lookup tables and helper functions used for date table generation */
+int days_in_a_month[12]={31,28,31,30,31,30,31,31,30,31,30,31}; /* month lengths with a 28-day February; indexed by month-1 */
+int days_in_a_month_l[12]={31,29,31,30,31,30,31,31,30,31,30,31}; /* same table with a 29-day February */
+season seasons[]={ /* read by gen_season(); initializer order is presumably {name, start_day, start_month, end_day, end_month} - confirm against the season typedef */
+  {"Christmas",1,11,31,12},
+  {"Summer",1,5,31,8},
+  {"Winter",1,1,31,3},
+  {"Spring",1,4,30,4},
+  {"Fall",1,9,31,10}
+};
+holiday holidays[]={ /* read by gen_holiday_fl(); initializer order is presumably {name, month, day} - confirm against the holiday typedef */
+  {"Christmas",12,24},
+  {"New Years Day", 1,1},
+  {"holiday1", 2,20},
+  {"Easter Day",4,20},
+  {"holiday2", 5,20},
+  {"holiday3",7,20},
+  {"holiday4",8,20},
+  {"holiday5",9,20},
+  {"holiday6",10,20},
+  {"holiday7",11,20}
+};
+
+char * month_names[]={"January","February","March","April",	/* indexed by monthnuminyear-1 */
+                 "May","June","July","August",
+                 "September","October","November","December"};
+/* indexed by daynuminweek-1 */
+char * weekday_names[]={"Sunday","Monday","Tuesday","Wednesday","Thursday","Friday","Saturday"};
+
+/* Fill *d with the date-dimension row for day number index: add
+ * (index-1)*60*60*24 seconds to D_STARTDATE (the numeric form of 1/1/1992 01:01:01),
+ * convert the result with localtime(), and derive the date_t fields from the
+ * resulting struct tm.  Always returns 0. */
+long
+mk_date(long index,date_t *d)
+{
+    long espan = (index-1)*60*60*24;
+    
+    time_t numDateTime = D_STARTDATE + espan; 
+
+    struct tm *localTime = localtime(&numDateTime); 
+    /* NOTE(review): localtime() makes the calendar day depend on the process time zone - confirm the zone D_STARTDATE assumes */
+    /* tm_wday is 0 on Sunday, so adding 1 numbers the week Sunday=1 .. Saturday=7 */
+    d->daynuminweek=(long)localTime->tm_wday+1;
+    d->monthnuminyear=(long)localTime->tm_mon+1;
+    strncpy(d->dayofweek, weekday_names[d->daynuminweek-1],D_DAYWEEK_LEN+1);
+    strncpy(d->month,month_names[d->monthnuminyear-1],D_MONTH_LEN+1);
+    d->year=(long)localTime->tm_year + 1900;
+    d->daynuminmonth=(long)localTime->tm_mday;
+    d->yearmonthnum=d->year * 100 + d->monthnuminyear;
+
+    sprintf(d->yearmonth,"%.3s%d",d->month,d->year);
+    sprintf(d->date,"%s %d, %d",d->month,d->daynuminmonth,d->year);
+    
+    d->datekey = d->year*10000+d->monthnuminyear*100+ d->daynuminmonth; 
+
+    d->daynuminyear=(int)localTime->tm_yday+1;
+    d->weeknuminyear = d->daynuminyear/7 + 1;
+    /* day 7 (Saturday) closes the week */
+    if(d->daynuminweek ==7){
+	d->lastdayinweekfl[0]='1';
+    }
+    else{
+	d->lastdayinweekfl[0]='0';
+    }
+    d->lastdayinweekfl[1]='\0';
+    /* the flag is '1' only on the last day of the month */
+    if(is_last_day_in_month(d->year,d->monthnuminyear,d->daynuminmonth)==1){
+	d->lastdayinmonthfl[0]= '1';
+    }else{
+	d->lastdayinmonthfl[0]= '0';
+    }
+    d->lastdayinmonthfl[1]='\0';
+    /* weekdays are days 2..6, i.e. everything except Sunday (1) and Saturday (7) */
+    if(d->daynuminweek!=1 && d->daynuminweek!=7){
+	d->weekdayfl[0]='1';
+    }
+    else{
+	d->weekdayfl[0]='0';
+    }
+    
+    d->weekdayfl[1]='\0';
+
+    gen_season(d->sellingseason,d->monthnuminyear,d->daynuminmonth);
+    d->slen = strlen(d->sellingseason);
+    gen_holiday_fl(d->holidayfl,d->monthnuminyear,d->daynuminmonth);    
+    return (0);
+}
+
+/* Set dest to "1" when month/day matches an entry of holidays[], otherwise to "0"; returns 0. */
+int gen_holiday_fl(char * dest, int month, int day){
+  const char *flag = "0";
+  int idx;
+  for(idx = 0; idx < NUM_HOLIDAYS && flag[0] == '0'; idx++){
+    if(holidays[idx].month == month && holidays[idx].day == day)
+      flag = "1";
+  }
+  strcpy(dest, flag);
+  return 0;
+}
+
+
+/* Return 1 when day equals the length of month (1..12) in year, otherwise 0. */
+int
+is_last_day_in_month(int year,int month,int day){
+    const int *month_lengths;
+
+    /* the leap-year table is used whenever LEAP(year) is non-zero */
+    month_lengths = (LEAP(year)) ? days_in_a_month_l : days_in_a_month;
+
+    return (day == month_lengths[month-1]) ? 1 : 0;
+}
+
+/* Copy into dest the name of the first seasons[] entry whose month range and
+ * day range both contain month/day, or "" when no entry does; returns 0. */
+int gen_season(char * dest,int month,int day)
+{
+    season *hit = NULL;
+    int idx;
+
+    for(idx = 0; idx < NUM_SEASONS && hit == NULL; idx++){
+	season *cand = &seasons[idx];
+	int month_ok = (month >= cand->start_month && month <= cand->end_month);
+	int day_ok = (day >= cand->start_day && day <= cand->end_day);
+	if(month_ok && day_ok)
+	    hit = cand;
+    }
+    strcpy(dest, hit != NULL ? hit->name : "");
+    return 0;
+}
+
+#endif
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/data/ssb/dbgen/build.o b/data/ssb/dbgen/build.o
new file mode 100644
index 0000000000000000000000000000000000000000..f33ad4a89afbbae63d8f617f72fe59d7c8a73fd5
GIT binary patch
literal 23320
zcmdU13w%_?xj&l(vI5Pnh?@FbU{e!;xCw}fMa{Aa%*ui(NqG25mfa1Bd3GNNf*QLC
za$FW|eU#p6>#h2$R$Hx?TC%o~2Tf308?nCcRcckT;)9~{s>uDmIWxOoPI9XE@%wr2
z{5WT3{_~%2zWL@od&12Xo&^~ulOml-xl~EVB&sM|m!-pHJXoeoR!&x&)iqTt`hkt9
z$C>(xTm6fl#j^79fx*;rdu~zBL5p=9^6M>y+_&831NoM&Yv9e)eN25Ju`g3mcKCra
zjKt9yR`Lv0m?6HwZuY2u{n9K;cortISTT#`JSb%tisGfPM0D1x-@|`Q9cG8$XT8U=
zFw}eGGFH&Xn)Y1D3SRQWD<-h`H&81!*mnZl4v+|HzqK7PO;CJ&rYDwVXYpBxF*Ql4
zBY4@KSh?N&&lnStVI8B$w44FggjoZw@$h8KV!+Pzvl*JfEnOX`Ky;Vws%xBA_jh+f
zT|MgC+_F2-#@kVOP3f&n&9W!I|HX+DUq%CsllLMof4U|PO}G<{IZ_<xo%}iQxcpBn
z=Nx0|7c8;5UUN)6q8T}Z^`5v48l4p!Wr<4~9LL7R=bpjT-CF8I42dRa+%t6vRHxtt
zmiY7Gs=qz-<6SnU9?*E%tl$`H+=HsHf|pr5>nV6TSiJL=J8DfzKQDf8ff=d`4RQn|
zW<vV+qWiMJ8)WKu&X8T#t$q$&m#Hu5yKeLn6re;d-Dw3+Gn~phG3)BFbWQ_vj-7H5
zqzrN*TAU4n4YNhS3Mwlsb|Ji>H!p+EyiXGx(*(zOnJi{y>gg<22EApWnZ>U+C$KVt
zsZkKj8UtkN`=fZHGRxAeY<e|xNsy)G!STgpqC?0LnzaLOgWW1X-jg%22ZE|D$V9th
zWOb?=I=4T_tE!En_^A+4v{9s0m5bY%>Y2{%5B*o@JU4Id=m+L{%f~$jndN-0FH9r6
z$a>2KyVp$6l#cE)_w&&HPce^kxpSGbMs$O<FoU_(Pnr7Sh;GnO)WbwSuylqp#+sg5
z0kw2Bz#Hwuy0O^AK)XrV;azRg+RP0JQ<%z)TWx143vU}bvg^|m3Cy8s*!4#Eob6T^
zTt7OjwX$zFQ(t92Vuz11wVy@%O-y~=9h+lk>c>3C=-KiVuFf}5ofx+^vpmjyRKnby
zFYV8>yJH?`@Wiqj&541S9)$6%rL43!a-}E6jmaC%6W2P#&Qvbn05kQvRS@M-ce^c*
zci<AjN_R8MlHD9W{3e(DF&W;P<Z!<`UT&X?YXx+pz8&A<jnAuYP`!E(ZRx7R0vdU>
za1z+gXM)l`X7LPRBdqA2IHH<P7=jEC1p_#kQ$x`5_=P;$`(B1L<xU46>B&v)V_T!g
z##^=&VVR(49Rs7)u|%Hx2X_1?jP?S|c&W)teGz7wm$>ADAy&}Kz&Ir=dep?CZ=7K2
z)piH#dc(5i_3@yRHX1-N8}dO5Uj2iXlQ-D&lP$pDmSEb<liYCp#0jRg1~e>h72bb%
zXWNP0F0DF)X&x$`ypiY8I+W#c%~y{m@}W3hIL+aP9OmRZB!_xb7Q`A)5Ni*DwT~oH
zhNeQW*cHi*upGclnE<;f%u?Q`84sGLu6qVR37ivo`MiS3hmanH)a>vi^c^Nk=P#hE
zxxqA*$W5XNaH1(eB`a|fzI_cq$zpgNw5)(P`YW^SFbm9tj<EiY?o&pTA)7e88Tk$L
z9!uBbxR8t-Es%R__mnTmy^ZJo73PkwAL`BldZPGC^8A+P`L&j3xZ8qxCSV?Tw5A@d
zWrF9y(z>zt^2LDj<um}r-_N@Ni-qlSE?@u41db4dv{~EjgWkZ@iru42xSf}va;-Ml
zE4xp_Vy=YOj&tCp^+2#ZW0`1qZhm2}*tR~L#6SZxnaJW*r|y`Hg~!u_nZ<6rgG+L&
zJ<z2phCr29`Syg(hztXAfOSuN<$?A~CcN?hU+R-AKCc)y4C7hv7qBfm%+wnXEQ-%5
zW2QrNYauoTu(d$eL8jImVX+A8ga-T|6mkIpL`?!wM}?@rJ2O3Ys}n}0)gAqV$qgo|
zWViG`3>te%KeBAuj?<_5A1oGK3%fpdY>J)5CNtGK0$2*GbEVl8&ni@19d57=#9f=f
zwzEJCbi|Wq+F?mL1fA_8%WWG0(spQoy@ShCo?zLLm0q~Gh3YN&*hnw~K<JfvOZ5=M
z&c+NN?RCq>92lS2$$KE{9uzhNyuO2$zTWX^Vs-yP%hixja}*K>>n+Rrk}rRbvj-n2
zZgtQjCIffxz6{h91@Y++bPwBerCL2yC80z<$gO@6%Hp*ErQTjM+gKq%s@^3OdJU9$
zQPVgzYGMC?0*-jpqaZfRo=6Sdkuyuh9mxxME#y3wnw@(8ngL-BCsHF<mw9V)F3-#-
zPMR};gO|@nJ8=Z-vJE9?8)U=r%ZK7l9K^b8!%2rhHXO!UHuLCgN^+(_HXN3GMDE0~
zIkGeun8_8Guj|lF)0yuXcl<7U53=Ba&OCuddra(>KJ4SHbRhB}i^7s)!uizZj1FY8
z(sH{aya;Drn_}q<9>bmj5Q`3**tdaX>FfY3H5<F*_uBV@Ea&gxr6ix{8k4#C<~K2W
zL+W@EnW8BY-L)3aC(dh}*E(yp?bxWznAxp<#^Sg7&PBz?v-sn6u#-y7b!snrI&~yC
zydREk*)XzU=syeRPs)NcScrT7k)2wZ`&nZ@-z%ukv4Vb<kkgGv-3uoRkLhg?HXbA5
z5E9S&0Y6#v?#*zQ?s8i09$?xQmRH)Nz88Hjb0^k+zm-M%a$y_9N?*J290qd=i%r0@
zN7tdqyDWO#Wcg{2<*6Re)C2op_QY>Iu_!*}W+paL>RW8ah9TJhAluS)G43;>$1`p^
zsiG62_knbG{6;gJvdxulHMtnH)6S&Yx{s@C`grJ$ciNj_v2?2yxr^dieirZS+73+v
zaT|*-${1_gqS=nc=bM0)Pc@ryHK-Nc1<&ye%a*B76spvj3jyBulBYmODu*5A>rznD
zrtANJskBaB(C=2SamWsDCLJF8dKI2G;|pOU2+PsFtkiL!yyOT|7o-nh<s}Co4)*~!
z<#rc9-4Y>q?Klfwd`4EI+aSmbxr(1?^>Q=$T*zVzHL-&Ww5>0SJO3+)Bc~(iod0mE
zhv1OxR{uESfMkXPl6siy<5vHWI0@u>koXpdoYCXvo2R>dFTp4}g~h8+F?E}PQC^T_
zre_Bf8=sR(ha#WCls^s<QrAQ`tcuj}>!BnU$)`##5%w7;Y4<JK6i)7C(xU(_N%%(5
zoSuE=$i}eB#^{GzAvi2+voPvCayrej@4u6m$Y&?*n#<~j&9M5(XaigdZDf`QBQZM9
z0FmkKEeyW=1c!0~-wE4#(TJ{cecdg$y#T3}uIJ#bt<~|$6U?^<?9qKDV8j@)nReFu
zZWhbrTMkd$w*q!P<^;BL`eva*VZD2S!wt-0c#dD+O!jIcPZN;H5_7nK_k_hm>k`=M
zSZtnsc;W}JF2wItQIB&F9I?TJJy6=@3rCo|Uw3L@KCO2~KQ<*&wbD01@(kF1{qMHI
zw1f?H*SEsF(XG%8JWJQV!CT)7Q&9puS`)HStx)+DTfzU&TA?g~dofsJWGj3(-3ndr
zYpe5r%i1aJM_Id=S{oM}hW-7p<*Ac9j@iQ3a{i|3>=RDdz^el|cr8y=n@&JtWOyL!
zBE=amGqJ6fr>2-X_k`zh`eI*?+q8QVu(S+LNL4!b!1lv;XdtUdSTG7VGfv)JxLNl<
zsd;bku_v!+O7uf7ALkwzdaa;mGLeUk0tc9pT{0fJRBj1WmxsmpK&fYz)(>IR$|E<}
zBIod%Eo^TkfukfnY?Dty6gI*i;@+6kVQ<`QA9>fr8wj?uxMe>Zg+TbL?2n&?4f}4~
zu%qgbsrrwwKYjr=?5~RbF%VXK1JmGC3%4t{KYj*v|3d;nZMTa0j%-M{;_$?q>>jwL
zfB>e0Nw}Ewk=vtw%+0O!t+Z7>1FP^OfV4_(cM;?_u!`=)@U>e7RW-IsF3gWJ&?>o$
z;ig|$#4NMGd^x5?j)O(`OeHO1>xNB_lD<9RCY1eT#L3S8_4gB0RkKJwg!KIcwuI|v
zqeX0FSDC)ifgECd#;YoKz>#>w&{AE^aKS`ZmbKc=116+<RC=798#x=wYwyM}s=Fph
z{u;2+N6@dmL!qjg`9>cKUF|GhwRQ*I!Rr^n{PZGd<(&#P(IwixBYonL+yD5c`$5G1
zlVCcv!lh~0TIz^=Qd*f6`KW#{yGYqT4m^jp&C`C_(1vu`p~1|^Oy(P8jXmdf?f(uO
zS@P7Qy+_Y2*xl6QFC9#Y=Odq4DtckYEv<gybIV<L5}~5_#J7J^v=^=?dvc>k$AwEv
z+b7-pacTRM4eys$ns0u$w9>j^FJBOR&#?Fuw-g?mGsL^UtfaYrAe*Q<atAqj-vnzv
z+$nZ^2M%|)44%ExTfo$LaJ8-dBm&z>=3}`G)%vHGts`z)<4%+6DyxVEtj>6|-J~{o
znC}ImF7&`N`F$JL+Y=AvE%v=5(l3wA*~`;BVn^$YRZJ<5Z^8prxw^;!(MhVS+}YIY
zHw~+<IUuWKG2AZcrxG%cz^xdXE#`7mmd6&A6L$J~16p4jX0i2-Sd*tb<|}U;#2*2g
zUh<oU%1wLm9yik&t2ry$XHu&e^YuZS-wBt`tasRE8tOfg(R=LNf+1LrRTmV~REB3Z
z*m`F}Q%8Fon_LY8u12WA+v-QW*3ly-%iUVabpx&@1*iz{9C!Ck1woLfb->l8z&cXR
z^TB$d8BF-93yzlSo6|D!uupa)lVy)cZ?}1VtSfbJ!dGy~L?5@4)Td#Y29yyFIk-0+
zi#ryU=P33od5+omn&VeiMH-v^g+67mx4F5^XRG%%HV6DRZ`hU>x;W22K`AQ54DwO(
z{E90Q3b(Zcg430}P@bdIuT(|a+nXCxK(k+2ykOx{o(zQ)&JIDQypS!=KNB8cxB9*7
zl*Mh_tBwRh7+oIlw`!s4hDeY@3xbW9QsoUtf*82m+Zyo(*C`7EtAaeR*c<dUD9-j^
zV>74^gh*=uUd`(iXJmCC6jrJN?cqSnsz6Y=!WV7}Vqi(z8VxBA__TnlA=nrSw|GO!
zlEC%0l>u)sWGe@04Q<U>evwk)4TS?iiWSSm*)lLk1}>C=xiWAO)VKxAr7Ul31vyF;
z=-awlSpfD{E_W_b7F5hv7CV<J&dR0WuT(CtSpttoan>wUswyg#D_qsek}H-e<rS_G
z?f(BQG6@vUB}(JEjE{7<xZ<6qbhxU}IPT(T9Dfg?akDOt=UJMLmy<M(i#&~^E{Nfn
zL0zRo9Lq)=%aBg$k2o%p(jktz<9J11I>b>Y<fATM)gd39^tG5a<}v(j2?$^E@OqRw
z)Cb*K%}c!{|A#3=3rPNNbhzaI#Q^_Ahf6spqXCGMavVBb^6>)}>LkB~;$^p&X&i?b
z90x9g101@WQ+XG<To>E&YgR>C!x39iQQ@3I$E>0Vj~3ljTwLgwQy^fT1=|d753tP_
zE~!zVU-lp-+gSD)(+x`%(^{+P%t_hiJ3uD~aBv<*cN%u)`8n3j8P1&CjhW6ITQsXY
zrz|sWnvi1y$eClEpJR4T#BwSz130NLVt$S&$~57lY}syq!ul^5EqkR-HjcH&ME*JG
zvmC2`0+tF#YKZO@X>sh7An*2!`8l~;GUw;m;#u=^@;8rj<~TNvcjlBtv&(ayFl9EK
zG$E%1FlSCaaDWT-<CTEysBf-lmk+S+UnZ|C^K?@VbLLcLT%S`G$YB*)XlV+o8LKQl
ze{_g)RzNXIMUFL^k+C!f1}4_`UXgZ1I?at~Wf12(fAA*?`)$Gc#<9Mev!G5JvA)sq
z&`KH4P2jbJR-?7VHuzk~-za2^Y+py_MY=Y?JPx!6xK@Ka=j!rcJ^UiiCd>%!meHg&
zTnR)wMB1y^DZ$QhtjA{TV;iv^(Q)NDb(uS|CgkJ;1bt056ZWg$iM%KW9b!WQTf`A<
zPJBmTb%?FWvDSeRvCn)iaNfV6Y-1Z9&tc`zrfp!(O=Fr9C6DRQOj*j`PmGnAIby{~
zGcrfav=rxO|1<z0?Cu#v4zCMfxEnQr9JD_=G%q^5u|yngg$}wCcT?ewxQt65cC8%O
zSZO{UpRjbvlj#=;T*jq8j|WEU?-aN`Wr4u;Ym3&d;}ravj=D-78K<01K?v9IxL!*|
z(bf}l$__GHt>Ei(Xmi*&<pN0*{&n0R2VYr^bnODy>X%MiBXHS2Wy%JDFO`7szbEjk
zb@(QMYkH*<w+LL*D~;bJ@U}F>L;oUhO|Nv~{ScIWB#kP{Lk9R02KWvGe82#I1@Otx
z<77XU>&V{(ezzF2(;$xSLwJz?TY>ZS1hB*KjHUk<26#3M)Uo(h1N@r?IDSMpmYf*|
zc(DOqW`I`!j`i&n1DdaeP^T*l_%(vRUGVw(2>fde_)P|Q$N*nwfNwIuzi)v5!~lQ9
z0RN2vjvptXeJb$he{g*L2EWgM{|w+KE8~?O@X?hBe+Yb(Q*^S1uh2Z@b->4J-?t6$
z{X&i^<nXl|WPN79KQ8#M3O--ofj=Jl_gLlPcY$N!rvi@p|826SIA7x-c(wt5x&dA&
z<TQ$nB477G&Rl`Na;hd@n~z{`BJf)LvH=`l*TH87@~<$!mmA<d0~|lv#QI)@KRtny
zdx9GQ$M(ISqe{x(DEMuHFX!VQbAF~$Cio8FKPqq$<@;j{|6a)9KY2mC4nc6cIe)xT
z2JMM%t>*Fl$yn|6ijZ@L*mTSJ?=8T`;wOcCAvRUt!?IkFP$voUW@L4swYI&X4ek~i
zTN}fLvz1leP@u3_2?j!iMT$4%tMz-sfkKB;=yCB^35JUBRgACM_?m;S3-L7<U#0lE
z2wz1!OA)6RaeficP{eZ-@hn9=Pm#vgq?an8roi=zKNPABdRzT%ElO=|2yYg`^|ii+
zCZ!5e-K}9n!(Lw#Vzqd;qXgSm)dt%l;YO%W;0DM8_Z<PH!7oEC{2oYI)!16Q%I7au
z@Kx(?T%!eABF$RB>xZi&Oah`33i|`WAe2>4GzhG13@a^7Qf|oSZ4O|(EeK?;SJ2nK
zjw>4s)Ppe_6cAG%Qrc=kN?WioyskDB2rKQ4pnfO}7bjYkTy1cWLqTA9G`!MW8)|9u
z2ROO8ZFMoGg&SJ}%5{<EMsKUq($*So2qD_)4L7#6Dw-Ek{Hs>ih68K4uwY;{CTU(s
z!8Z4S22izNa{y}?Yz1vmh!3v66mM;?)vx$!TS9@=ykJNO!8Mqcv8}l+h*Yp`Sn**j
zq<B{Y5o*z18(!BQ2x(QT^|gkBJctE~8>ia(W_foMQW`_G&2aw&U2I(~GzqVZ0+A5L
zyko~}o4FdHfHwqH(!3B>LDWFAS{pP7H`DcjKog2@^|l1SOlU0;<|83bj0YGd{7wTt
zrQvQC!Ew(i@g)Ms<?~#EFB7<4&b0)ma&8g0UXDum=RjDN`+b5_Iok~6yhHeS-y`K5
zCwv>hCy99p^~WC+BtK8!sHctK3k~q)L=G;wQjWaW(93y}$eBv`uNmMbR1h4>q5R1N
zp9X2N+&2Y|9^P@_ISAc;!v7Y;CI6s~kG4ZMPF#>+xs*Ri;8H%OqdQZF%Q?G5hfDe8
zL_VIeq&|xb_)B%TEVq@&$tQ9m2K*@D7ZCn!2K;*n|9rx~&w&3Z;Zylf8u0m#hagwl
zQ`+ZQB8QgyvVoj834aFB^IZe}CkFiUg<(-2JY&jsDIz$wt;8=O_)LP&7dU!2w@AK+
z;DrQVN#xLS>j+NEZ6-J^_q#+sEq6V^X}KK)pGNe#o5*(%{62!?7FE{kVIs#)@Fxk5
zbFbvTN915r{6oycpAnqShsOy{=fi2^ATu~jrt@=&z-7OZ<z7zsc!weJs|bHK!8Z{8
z1qAOTe5&VWf>S+zN^m^0eGU)0pA($g`Duac>-D|?{}92sd|}UJ<F#_o4)`5Ns=fk8
z`Ev>GBYaw~CW6y?-9&KI|A;8}XGA`2m!}CI^^|v<{RF3W_%p$&K5r8Fc;_VRRf9jI
zfW!Jugh%25fn$B~Y#?#`DHxr;pR6S~)w6@(RL>s~`E;FnjPPl@ZzDLB^PGX4Hx2kn
zf>S-M;sOinOZCzIh;sCJi3>j30gsH*KEEJxXgfYh__SU4pRx#h(th$M!bd$Hg9qJ9
z1gCZwHYoQ1;nQ-D5S*5K7G98o(~l#&z_Gqm&P;++Ih6)-RuDdwQ%7(rCu$()9)i<(
z=@A2*e?<$r>-Fz5D0d%`L)-B<;p03k+x-+&8XVe<>N8#7SZ*milAllbbUrB}_;kWQ
z4F@MUz5eqEj{QXL2bK_=_LC}sqn;PTBlT$^eB1*{{2qeSe*P4}sr+7o)A6-Ohf6)*
z6S!WVgM^PksZWN;h<2cQo=0%1r^^7pPT*KB9q$_mpVsSEg4254sly)wdFXyFaJ~F)
z9beApy#%M_zG)!m0O3=8ju4#6ITa5r;Pm>OCvdbUm2&~XsT_}ioNEc6%Bd$fl@m3P
z<HSP|IMj0*JhI)}2tJSCcN^fJ3LNW&UoQ%?D3{@85uC)O{Ivw10_hUpEO6AvM)21N
zALUE_H?0r>hw`bMSpwI~xq<MhoCgR_<^0w_PL{a;)Yoev!7l@yuzYmO4e+%BM_w5`
zvR*$W{P}=OIfDj#GY&$*D+gc7Ig{WO1osFW^{4AXHNkOelyX{#96CR5Abh-PmHZzN
zzK!5}2u|mLw+KFs@GatijC$ggt(1Qn!SOdWiRTF%?PVkQB?Mnca7N@%J5&;!+F=#J
zv3+H^HxT*MKB|r{+wmuaUrglulJIG}?=;{K5I)WWa^L<u!5u{YTSShF;91z|!J(ZQ
zJhEOV2^{sN>t-S0({kq$oR)hD!KvM91dj4)e+Uqq_J_NPoLr*ky#)Uz!5<}ZYy|Ho
zIPDM56FIa$yi9P~ABG7|>!pa_9k5=sKUf5gc1FJJ4|c+*{b4rY)BaFl!1oaT9FYGQ
zJm{(kPTM6w<k0?bFX2-;4-%Zpd6eMP4*f(vZQmCOPTO}r4svj4mvVSyKm4J<(QY1s
z-%I$53H~s_FDLkuL=J7=odl=t`|m^!ZO4xYAMGR8I}=Vu;80I$pYa06deMF0WWuNW
zu4x3vAuH{$kl;9FO5A6FZx%R~OYQ#v;h#a|{DR>4)033nOXS!HzMJr=JzpX?wdX-1
zhuSj>2QxUd8@1=@1gG{qOW=BY<`X`(X9>ZnJ+C1+wdXnm{AU8!+w*sXPwn<1;oG2Y
z(w?sqKDFCEf>XO?;iLu*%cXWZP2hUF6%d@-t&qr}{qSPKr*^xX;M8ty1gCb38Q|Ll
zuD9Fsgir1EHsRZ#&eCoN34aN}4-@`ef@fpr28Z@RK9-NpDsYUK!z1ONLHM*?&LcQ&
z7Z;I3+ogu^X@71YIBk~}BIjbF&pN`V?V=K#w#%;wPTS>K1N;+#>)Yid@w+qH?Fyn#
zj=+&m?L3>{(+K}kBIi4RV?K0i2wnwoY5)5S@CONAP58Y8Uq<j_26(3U{TlUHPWa0S
zzLMaN2psKjCBdI2e7diGhVZW<{MQZm*=GX@9Lh(Ew8NPKM?Gx>zmV`TDEV#!{;vtX
zl<+?wcn!fnC3rc($KhZAhx$~)Bg>shaN6$3MQ4Hs^K`(2PF#bf@LM%F^$mgKZxOhd
z4paPl1a1?)<lisi`8xa&5y!EHPV%47qDq<YCEhLKxCcfj@m?*O`hHmA&kB5n@FhMZ
z;&nRwWf5=C;jfE$yAFR##MkQZcSO8Hhwm5hO*;HD5x+x+e<9*qbvS-+hVFj;PTK*$
zH$x}wC+mBLz<YRVnx8LlX%ETA|J(}QUj9z==L!6P4#)4xxD!5l*stVw>ryX?%in!%
zI)0sqJ9K!Xh?nVbDR+eq4~lrZ4#)4!(Bb!G=+NW+U&Qa%;kSzTt2%tMh#%15cZj$d
zCk${HMfrCL+@`~~3cN&z%X_d&9WM35|6UHAl>eZR)1kv37Wh^j{+Pg@(BV%A9KS!J
z^0x_muMXcXaGOTw^Q53~1lIy|h~Xo@OOoSD;_|!94M+wjars@Q?9URH-!J}Aqo?@t
zJH@kvUXm}r=j+tr^7}k#SIL*(+x<?*m*3U#0*RjTJGnV1fZ+1GH`!iNzWna3Q^%Lz
zz4hpD`CZyybh!NfY*>fO@5$sJYG5)x^7}BQFtn~E>|F(ZIH>szG87CndkgueK!xye
zj8`dK6$&YZ!8VK_y$C+^D&!=NK&TM@SkBv=;`l;g_%2IifCzrE<!ggK`+`rE3R~OY
z^Dy{Rl3DyyED>Mb8YyfKwzUU>@RgA!JJ3*DAB2x^(3<~qX9E$~l;UT<l)y3(a)@<O
zCWvO2Q3bavIF8Fwh!(KT*8JUK|A5z^nAfbqsqg2%C2;(Wgt`t<ZaW1bY}3VOLL6o4
z^B)lTMe(UzvU~{?K>+nCmVoeOe%a4uvaG-KaSw%Iefd2$Eyoej|Fqa>{mUR>Ec*@P
z<`rBeCW1rzVH-*P<^BTqKv+KRnW@`8U5iPcOq2PgzX}q@%0DFXTV+P!%eeHheK5a6
z$J;CNWBX8tTy)Z3N5VWoye8RcEFG5lB%(qf8y=mR6|2Z^lSJXmdQ13UA%HUV`5huZ
z(>1I<|GkhGo2U#P>Po}}FV_uhKl~{}=9ld<R@o?L?RofyeT>L}_Qo!)&yPPd>Qn>-
g{?17>NTx1PZ$G@2*HL)>VeDAo9@V_mTc7_w0f-_I?EnA(

literal 0
HcmV?d00001

diff --git a/data/ssb/dbgen/config.h b/data/ssb/dbgen/config.h
new file mode 100644
index 0000000..fa505ec
--- /dev/null
+++ b/data/ssb/dbgen/config.h
@@ -0,0 +1,179 @@
+/* 
+ * Sccsid:     @(#)config.h	2.1.8.2
+ * 
+ * this file allows the compilation of DBGEN to be tailored to specific
+ * architectures and operating systems. Some options are grouped 
+ * together to allow easier compilation on a given vendor's hardware.
+ * 
+ * The following #defines will affect the code:
+ *   TPCH              -- make will create TPCH (set in makefile)
+ *   TPCR              -- make will create TPCR (set in makefile)
+ *   KILL(pid)         -- how to terminate a process in a parallel load
+ *   SPAWN             -- name of system call to clone an existing process
+ *   SET_HANDLER(proc) -- name of routine to handle signals in parallel load
+ *   WAIT(res, pid)    -- how to await the termination of a child
+ *   SEPARATOR         -- character used to separate fields in flat files
+ *   DBNAME            -- default name of database to be loaded
+ *   STDLIB_HAS_GETOPT -- to prevent conflicts with global getopt()
+ *   MDY_DATE          -- generate dates as MM-DD-YY
+ *   WIN32             -- support for WindowsNT
+ *   SUPPORT_64BITS    -- compiler defines a 64 bit datatype
+ *   DSS_HUGE          -- 64 bit data type
+ *   HUGE_FORMAT       -- printf string for 64 bit data type
+ *   HUGE_COUNT        -- number of objects in DSS_HUGE
+ *   EOL_HANDLING      -- flat files don't need final column separator
+ *
+ *   OS defines
+ *   ==========
+ *   ATT        -- getopt() handling
+ *   DIGITAL    -- changes for DigUnix 64-bit support
+ *   DOS        -- disable all multi-user functionality/dependency
+ *   HP         -- posix source inclusion differences
+ *   IBM        -- posix source inclusion differences
+ *   ICL        -- getopt() handling
+ *   MVS        -- special handling of varchar format
+ *   SGI        -- getopt() handling
+ *   SUN        -- getopt() handling
+ *   LINUX      -- getopt() handling
+ *   TANDEM     -- EOL handling
+ *   U2200      -- death of parent kills children automatically
+ *   VMS        -- signal/fork handling differences
+ *
+ *   Database defines
+ *   ================
+ *   DB2        -- use DB2 dialect in QGEN
+ *   INFORMIX   -- use Informix dialect in QGEN
+ *   SQLSERVER  -- use SQLSERVER dialect in QGEN
+ *   SYBASE     -- use Sybase dialect in QGEN
+ *   TDAT       -- use Teradata dialect in QGEN
+ */
+
+#ifdef DOS
+#define DSS_PROC        1
+#define PATH_SEP	'\\'
+#else
+
+
+#ifdef ATT
+#define STDLIB_HAS_GETOPT
+#ifdef SQLSERVER
+#define WIN32
+#else
+/* the 64 bit defines are for the Metaware compiler */
+#define SUPPORT_64BITS
+#define DSS_HUGE long long
+#define HUGE_COUNT	1
+#define HUGE_FORMAT "%LLd"
+#endif /* SQLSERVER or MP/RAS */
+#endif /* ATT */
+
+#ifdef DIGITAL
+#define DOUBLE_CAST	(double)(int)
+#endif
+
+#ifdef HP
+#define _INCLUDE_POSIX_SOURCE
+#define STDLIB_HAS_GETOPT
+#endif /* HP */
+
+#ifdef IBM
+#define _POSIX_SOURCE
+/*
+ * if the C compiler is 3.1 or later, then uncomment the
+ * lines for 64 bit seed generation
+ */
+/* #define SUPPORT_64BITS*/ 
+/* #define DSS_HUGE long long*/ 
+/* #define HUGE_COUNT	1 */
+#define STDLIB_HAS_GETOPT
+#endif /* IBM */
+
+#ifdef ICL
+#define STDLIB_HAS_GETOPT
+#endif /* ICL */
+
+#ifdef SUN
+#define STDLIB_HAS_GETOPT
+#endif /* SUN */
+
+#ifdef LINUX
+#define STDLIB_HAS_GETOPT
+#endif /* LINUX */
+
+#ifdef SGI
+#define STDLIB_HAS_GETOPT
+#define SUPPORT_64BITS
+#define DSS_HUGE __uint64_t
+#define HUGE_COUNT	1
+#endif /* SGI */
+
+#ifdef TANDEM
+#define EOL_HANDLING
+#endif /* TANDEM */
+
+#ifdef VMS
+#define SPAWN   vfork	/* process-creation call used for parallel loads */
+#define KILL(pid) kill((pid), SIGQUIT)	/* kill() takes (pid, sig); the old expansion had them swapped */
+#define SET_HANDLER(proc) signal(SIGQUIT, proc)	/* children are stopped with SIGQUIT here */
+#define WAIT(res, pid) wait(res)	/* pid is accepted but ignored */
+#define SIGS_DEFINED	/* suppresses the default definitions further down */
+#endif /* VMS */
+
+#if (defined(WIN32)&&!defined(_POSIX_))
+#define pid_t int
+#define SET_HANDLER(proc) signal(SIGINT, proc)
+#define KILL(pid) \
+     TerminateProcess(OpenProcess(PROCESS_TERMINATE,FALSE,pid),3)
+#if (defined (__WATCOMC__))
+#define SPAWN()   spawnv(P_NOWAIT, spawn_args[0], spawn_args)
+#define WAIT(res, pid) cwait(res, pid, WAIT_CHILD)
+#else
+#define SPAWN()   _spawnv(_P_NOWAIT, spawn_args[0], spawn_args)
+#define WAIT(res, pid) _cwait(res, pid, _WAIT_CHILD)
+#define getpid          _getpid
+#endif /* WATCOMC */
+#define SIGS_DEFINED
+#define PATH_SEP	'\\'
+#ifndef TEST_32B
+#define SUPPORT_64BITS
+#define DSS_HUGE __int64
+#define HUGE_COUNT	1
+#define HUGE_FORMAT "%I64d"
+#endif /* TEST_32B */
+/* need to define process termination codes to match UNIX */
+/* these are copied from Linux/GNU and need to be verified as part of a rework of */
+/* process handling under NT (29 Apr 98); every macro takes the wait status as s */
+#define WIFEXITED(s)	(((s) & 0xFF) == 0)
+#define WIFSIGNALED(s)	(((unsigned int)((s)-1) & 0xFFFF) < 0xFF)	/* was reading a caller variable named status */
+#define WIFSTOPPED(s)	(((s) & 0xff) == 0x7f)
+#define WTERMSIG(s)		((s) & 0x7f)
+#define WSTOPSIG(s)		(((s) & 0xff00) >> 8)
+#endif /* WIN32 */
+
+#ifndef SIGS_DEFINED	/* defaults, used when no platform block above defined these */
+#define KILL(pid) kill((pid), SIGUSR1)	/* kill() takes (pid, sig); the old expansion had them swapped */
+#define SET_HANDLER(proc) signal(SIGUSR1, proc)	/* same signal that KILL sends */
+#define SPAWN   fork
+#define WAIT(res, pid) wait(res)	/* pid is accepted but ignored */
+#endif /* DEFAULT */
+
+#define DSS_PROC        getpid()
+#endif /* DOS */
+
+#ifndef DBNAME
+#define DBNAME "dss"
+#endif /* DBNAME */
+
+#ifndef PATH_SEP
+#define PATH_SEP '/'
+#endif /* PATH_SEP */
+
+#ifndef DSS_HUGE
+#define DSS_HUGE	long
+#define HUGE_COUNT	2
+#endif
+
+#ifndef DOUBLE_CAST
+#define DOUBLE_CAST (double)
+#endif /* DOUBLE_CAST */
+
diff --git a/data/ssb/dbgen/dists.dss b/data/ssb/dbgen/dists.dss
new file mode 100644
index 0000000..72157ef
--- /dev/null
+++ b/data/ssb/dbgen/dists.dss
@@ -0,0 +1,817 @@
+#  Sccsid:     @(#)dists.dss	2.1.8.1
+#
+# distributions have the following format:
+#
+# <token> | <weight> # comment
+#
+# Distributions are used to bias the selection of a token 
+# based on its associated weight. The list of tokens and values 
+# between the keywords BEGIN and END define the distribution named after
+# the BEGIN. A uniformly random value from [0, sum(weights)]
+# will be chosen and the first token whose cumulative weight is greater than
+# or equal to the result will be returned. In essence, the weights for each
+# token represent its relative weight within a distribution.
+#
+# one special token is defined: count (number of data points in the 
+#  distribution). It MUST be defined for each named distribution.
+#-----------------------------------------------------------------------
+# currently defined distributions and their use:
+#  NAME       FIELD/NOTES
+#  ========   ==============
+#  category   parts.category
+#  container  parts.container
+#  instruct   shipping instructions
+#  msegmnt    market segment
+#  names      parts.name
+#  nations    must be ordered along with regions
+#  nations2   stand alone nations set for use with qgen
+#  o_prio     order priority
+#  regions    must be ordered along with nations
+#  rflag      lineitems.returnflag
+#  types      parts.type
+#  colors     embedded string creation; CANNOT BE USED FOR pick_str(), agg_str() perturbs order
+#  articles   comment generation 
+#  nouns      
+#  verbs      
+#  adverbs    
+#  auxillaries 
+#  prepositions
+#  terminators
+#  grammar    sentence formation
+#  np
+#  vp
+###
+# category
+###
+BEGIN category
+COUNT|5
+FURNITURE|1
+STORAGE EQUIP|1
+TOOLS|1
+MACHINE TOOLS|1
+OTHER|1
+END category
+###
+# container
+###
+begin p_cntr
+count|40
+SM CASE|1
+SM BOX|1
+SM BAG|1
+SM JAR|1
+SM PACK|1
+SM PKG|1
+SM CAN|1
+SM DRUM|1
+LG CASE|1
+LG BOX|1
+LG BAG|1
+LG JAR|1
+LG PACK|1
+LG PKG|1
+LG CAN|1
+LG DRUM|1
+MED CASE|1
+MED BOX|1
+MED BAG|1
+MED JAR|1
+MED PACK|1
+MED PKG|1
+MED CAN|1
+MED DRUM|1
+JUMBO CASE|1
+JUMBO BOX|1
+JUMBO BAG|1
+JUMBO JAR|1
+JUMBO PACK|1
+JUMBO PKG|1
+JUMBO CAN|1
+JUMBO DRUM|1
+WRAP CASE|1
+WRAP BOX|1
+WRAP BAG|1
+WRAP JAR|1
+WRAP PACK|1
+WRAP PKG|1
+WRAP CAN|1
+WRAP DRUM|1
+end p_cntr
+###
+# instruct
+###
+begin instruct
+count|4
+DELIVER IN PERSON|1
+COLLECT COD|1
+TAKE BACK RETURN|1
+NONE|1
+end instruct
+###
+# msegmnt
+###
+begin msegmnt
+count|5
+AUTOMOBILE|1
+BUILDING|1
+FURNITURE|1
+HOUSEHOLD|1
+MACHINERY|1
+end msegmnt
+###
+# names
+###
+begin p_names
+COUNT|4
+CLEANER|1
+SOAP|1
+DETERGENT|1
+EXTRA|1
+end p_names
+###
+# nations
+# NOTE: this is a special case; the weights here are adjustments to
+#       map correctly into the regions table, and are *NOT* cumulative
+#       values to mimic a distribution
+###
+begin nations
+count|25
+ALGERIA|0
+ARGENTINA|1
+BRAZIL|0
+CANADA|0
+EGYPT|3
+ETHIOPIA|-4
+FRANCE|3
+GERMANY|0
+INDIA|-1
+INDONESIA|0
+IRAN|2
+IRAQ|0
+JAPAN|-2
+JORDAN|2
+KENYA|-4
+MOROCCO|0
+MOZAMBIQUE|0
+PERU|1
+CHINA|1
+ROMANIA|1
+SAUDI ARABIA|1
+VIETNAM|-2
+RUSSIA|1
+UNITED KINGDOM|0
+UNITED STATES|-2
+end nations
+###
+# nations2
+###
+begin nations2
+count|25
+ALGERIA|1
+ARGENTINA|1
+BRAZIL|1
+CANADA|1
+EGYPT|1
+ETHIOPIA|1
+FRANCE|1
+GERMANY|1
+INDIA|1
+INDONESIA|1
+IRAN|1
+IRAQ|1
+JAPAN|1
+JORDAN|1
+KENYA|1
+MOROCCO|1
+MOZAMBIQUE|1
+PERU|1
+CHINA|1
+ROMANIA|1
+SAUDI ARABIA|1
+VIETNAM|1
+RUSSIA|1
+UNITED KINGDOM|1
+UNITED STATES|1
+end nations2
+###
+# regions
+###
+begin regions
+count|5
+AFRICA|1
+AMERICA|1
+ASIA|1
+EUROPE|1
+MIDDLE EAST|1
+end regions
+###
+# o_prio
+###
+begin o_oprio
+count|5
+1-URGENT|1
+2-HIGH|1
+3-MEDIUM|1
+4-NOT SPECIFIED|1
+5-LOW|1
+end o_oprio
+###
+# rflag
+###
+begin rflag
+count|2
+R|1
+A|1
+end rflag
+###
+# smode
+###
+begin smode
+count|7
+REG AIR|1
+AIR|1
+RAIL|1
+TRUCK|1
+MAIL|1
+FOB|1
+SHIP|1
+end smode
+###
+# types
+###
+begin p_types
+COUNT|150
+STANDARD ANODIZED TIN|1
+STANDARD ANODIZED NICKEL|1
+STANDARD ANODIZED BRASS|1
+STANDARD ANODIZED STEEL|1
+STANDARD ANODIZED COPPER|1
+STANDARD BURNISHED TIN|1
+STANDARD BURNISHED NICKEL|1
+STANDARD BURNISHED BRASS|1
+STANDARD BURNISHED STEEL|1
+STANDARD BURNISHED COPPER|1
+STANDARD PLATED TIN|1
+STANDARD PLATED NICKEL|1
+STANDARD PLATED BRASS|1
+STANDARD PLATED STEEL|1
+STANDARD PLATED COPPER|1
+STANDARD POLISHED TIN|1
+STANDARD POLISHED NICKEL|1
+STANDARD POLISHED BRASS|1
+STANDARD POLISHED STEEL|1
+STANDARD POLISHED COPPER|1
+STANDARD BRUSHED TIN|1
+STANDARD BRUSHED NICKEL|1
+STANDARD BRUSHED BRASS|1
+STANDARD BRUSHED STEEL|1
+STANDARD BRUSHED COPPER|1
+SMALL ANODIZED TIN|1
+SMALL ANODIZED NICKEL|1
+SMALL ANODIZED BRASS|1
+SMALL ANODIZED STEEL|1
+SMALL ANODIZED COPPER|1
+SMALL BURNISHED TIN|1
+SMALL BURNISHED NICKEL|1
+SMALL BURNISHED BRASS|1
+SMALL BURNISHED STEEL|1
+SMALL BURNISHED COPPER|1
+SMALL PLATED TIN|1
+SMALL PLATED NICKEL|1
+SMALL PLATED BRASS|1
+SMALL PLATED STEEL|1
+SMALL PLATED COPPER|1
+SMALL POLISHED TIN|1
+SMALL POLISHED NICKEL|1
+SMALL POLISHED BRASS|1
+SMALL POLISHED STEEL|1
+SMALL POLISHED COPPER|1
+SMALL BRUSHED TIN|1
+SMALL BRUSHED NICKEL|1
+SMALL BRUSHED BRASS|1
+SMALL BRUSHED STEEL|1
+SMALL BRUSHED COPPER|1
+MEDIUM ANODIZED TIN|1
+MEDIUM ANODIZED NICKEL|1
+MEDIUM ANODIZED BRASS|1
+MEDIUM ANODIZED STEEL|1
+MEDIUM ANODIZED COPPER|1
+MEDIUM BURNISHED TIN|1
+MEDIUM BURNISHED NICKEL|1
+MEDIUM BURNISHED BRASS|1
+MEDIUM BURNISHED STEEL|1
+MEDIUM BURNISHED COPPER|1
+MEDIUM PLATED TIN|1
+MEDIUM PLATED NICKEL|1
+MEDIUM PLATED BRASS|1
+MEDIUM PLATED STEEL|1
+MEDIUM PLATED COPPER|1
+MEDIUM POLISHED TIN|1
+MEDIUM POLISHED NICKEL|1
+MEDIUM POLISHED BRASS|1
+MEDIUM POLISHED STEEL|1
+MEDIUM POLISHED COPPER|1
+MEDIUM BRUSHED TIN|1
+MEDIUM BRUSHED NICKEL|1
+MEDIUM BRUSHED BRASS|1
+MEDIUM BRUSHED STEEL|1
+MEDIUM BRUSHED COPPER|1
+LARGE ANODIZED TIN|1
+LARGE ANODIZED NICKEL|1
+LARGE ANODIZED BRASS|1
+LARGE ANODIZED STEEL|1
+LARGE ANODIZED COPPER|1
+LARGE BURNISHED TIN|1
+LARGE BURNISHED NICKEL|1
+LARGE BURNISHED BRASS|1
+LARGE BURNISHED STEEL|1
+LARGE BURNISHED COPPER|1
+LARGE PLATED TIN|1
+LARGE PLATED NICKEL|1
+LARGE PLATED BRASS|1
+LARGE PLATED STEEL|1
+LARGE PLATED COPPER|1
+LARGE POLISHED TIN|1
+LARGE POLISHED NICKEL|1
+LARGE POLISHED BRASS|1
+LARGE POLISHED STEEL|1
+LARGE POLISHED COPPER|1
+LARGE BRUSHED TIN|1
+LARGE BRUSHED NICKEL|1
+LARGE BRUSHED BRASS|1
+LARGE BRUSHED STEEL|1
+LARGE BRUSHED COPPER|1
+ECONOMY ANODIZED TIN|1
+ECONOMY ANODIZED NICKEL|1
+ECONOMY ANODIZED BRASS|1
+ECONOMY ANODIZED STEEL|1
+ECONOMY ANODIZED COPPER|1
+ECONOMY BURNISHED TIN|1
+ECONOMY BURNISHED NICKEL|1
+ECONOMY BURNISHED BRASS|1
+ECONOMY BURNISHED STEEL|1
+ECONOMY BURNISHED COPPER|1
+ECONOMY PLATED TIN|1
+ECONOMY PLATED NICKEL|1
+ECONOMY PLATED BRASS|1
+ECONOMY PLATED STEEL|1
+ECONOMY PLATED COPPER|1
+ECONOMY POLISHED TIN|1
+ECONOMY POLISHED NICKEL|1
+ECONOMY POLISHED BRASS|1
+ECONOMY POLISHED STEEL|1
+ECONOMY POLISHED COPPER|1
+ECONOMY BRUSHED TIN|1
+ECONOMY BRUSHED NICKEL|1
+ECONOMY BRUSHED BRASS|1
+ECONOMY BRUSHED STEEL|1
+ECONOMY BRUSHED COPPER|1
+PROMO ANODIZED TIN|1
+PROMO ANODIZED NICKEL|1
+PROMO ANODIZED BRASS|1
+PROMO ANODIZED STEEL|1
+PROMO ANODIZED COPPER|1
+PROMO BURNISHED TIN|1
+PROMO BURNISHED NICKEL|1
+PROMO BURNISHED BRASS|1
+PROMO BURNISHED STEEL|1
+PROMO BURNISHED COPPER|1
+PROMO PLATED TIN|1
+PROMO PLATED NICKEL|1
+PROMO PLATED BRASS|1
+PROMO PLATED STEEL|1
+PROMO PLATED COPPER|1
+PROMO POLISHED TIN|1
+PROMO POLISHED NICKEL|1
+PROMO POLISHED BRASS|1
+PROMO POLISHED STEEL|1
+PROMO POLISHED COPPER|1
+PROMO BRUSHED TIN|1
+PROMO BRUSHED NICKEL|1
+PROMO BRUSHED BRASS|1
+PROMO BRUSHED STEEL|1
+PROMO BRUSHED COPPER|1
+end p_types
+###
+# colors
+# NOTE: This distribution CANNOT be used by pick_str(), since agg_str() perturbs its order
+###
+begin colors
+COUNT|92
+almond|1
+antique|1
+aquamarine|1
+azure|1
+beige|1
+bisque|1
+black|1
+blanched|1
+blue|1
+blush|1
+brown|1
+burlywood|1
+burnished|1
+chartreuse|1
+chiffon|1
+chocolate|1
+coral|1
+cornflower|1
+cornsilk|1
+cream|1
+cyan|1
+dark|1
+deep|1
+dim|1
+dodger|1
+drab|1
+firebrick|1
+floral|1
+forest|1
+frosted|1
+gainsboro|1
+ghost|1
+goldenrod|1
+green|1
+grey|1
+honeydew|1
+hot|1
+indian|1
+ivory|1
+khaki|1
+lace|1
+lavender|1
+lawn|1
+lemon|1
+light|1
+lime|1
+linen|1
+magenta|1
+maroon|1
+medium|1
+metallic|1
+midnight|1
+mint|1
+misty|1
+moccasin|1
+navajo|1
+navy|1
+olive|1
+orange|1
+orchid|1
+pale|1
+papaya|1
+peach|1
+peru|1
+pink|1
+plum|1
+powder|1
+puff|1
+purple|1
+red|1
+rose|1
+rosy|1
+royal|1
+saddle|1
+salmon|1
+sandy|1
+seashell|1
+sienna|1
+sky|1
+slate|1
+smoke|1
+snow|1
+spring|1
+steel|1
+tan|1
+thistle|1
+tomato|1
+turquoise|1
+violet|1
+wheat|1
+white|1
+yellow|1
+end colors
+################
+################
+## pseudo text distributions
+################
+################
+###
+# nouns
+###
+BEGIN nouns
+COUNT|45
+packages|40
+requests|40
+accounts|40
+deposits|40
+foxes|20
+ideas|20
+theodolites|20
+pinto beans|20
+instructions|20
+dependencies|10
+excuses|10
+platelets|10
+asymptotes|10
+courts|5
+dolphins|5
+multipliers|1
+sauternes|1
+warthogs|1
+frets|1
+dinos|1
+attainments|1
+somas|1
+Tiresias|1
+patterns|1
+forges|1
+braids|1
+frays|1
+warhorses|1
+dugouts|1
+notornis|1
+epitaphs|1
+pearls|1
+tithes|1
+waters|1
+orbits|1
+gifts|1
+sheaves|1
+depths|1
+sentiments|1
+decoys|1
+realms|1
+pains|1
+grouches|1
+escapades|1
+hockey players|1
+END nouns
+###
+# verbs
+###
+BEGIN verbs
+COUNT|40
+sleep|20
+wake|20
+are|20
+cajole|20
+haggle|20
+nag|10
+use|10
+boost|10
+affix|5
+detect|5
+integrate|5
+maintain|1
+nod|1
+was|1
+lose|1
+sublate|1
+solve|1
+thrash|1
+promise|1
+engage|1
+hinder|1
+print|1
+x-ray|1
+breach|1
+eat|1
+grow|1
+impress|1
+mold|1
+poach|1
+serve|1
+run|1
+dazzle|1
+snooze|1
+doze|1
+unwind|1
+kindle|1
+play|1
+hang|1
+believe|1
+doubt|1
+END verbs
+###
+# adverbs
+##
+BEGIN adverbs
+COUNT|28
+sometimes|1
+always|1
+never|1
+furiously|50
+slyly|50
+carefully|50
+blithely|40
+quickly|30
+fluffily|20
+slowly|1
+quietly|1
+ruthlessly|1
+thinly|1
+closely|1
+doggedly|1
+daringly|1
+bravely|1
+stealthily|1
+permanently|1
+enticingly|1
+idly|1
+busily|1
+regularly|1
+finally|1
+ironically|1
+evenly|1
+boldly|1
+silently|1
+END adverbs
+###
+# articles
+##
+BEGIN articles
+COUNT|3
+the|50
+a|20
+an|5
+END articles
+###
+# prepositions
+##
+BEGIN prepositions
+COUNT|47
+about|50
+above|50
+according to|50
+across|50
+after|50
+against|40
+along|40
+alongside of|30
+among|30
+around|20
+at|10
+atop|1
+before|1
+behind|1
+beneath|1
+beside|1
+besides|1
+between|1
+beyond|1
+by|1
+despite|1
+during|1
+except|1
+for|1
+from|1
+in place of|1
+inside|1
+instead of|1
+into|1
+near|1
+of|1
+on|1
+outside|1
+over|1 
+past|1
+since|1
+through|1
+throughout|1
+to|1
+toward|1
+under|1
+until|1
+up|1 
+upon|1
+whithout|1
+with|1
+within|1
+END prepositions
+###
+# auxillaries
+##
+BEGIN auxillaries
+COUNT|18
+do|1
+may|1
+might|1
+shall|1
+will|1
+would|1
+can|1
+could|1
+should|1
+ought to|1
+must|1
+will have to|1
+shall have to|1
+could have to|1
+should have to|1
+must have to|1
+need to|1
+try to|1
+END auxillaries
+###
+# terminators
+##
+BEGIN terminators
+COUNT|6
+.|50
+;|1
+:|1
+?|1
+!|1
+--|1
+END terminators
+###
+# adjectives
+##
+BEGIN adjectives
+COUNT|29
+special|20
+pending|20
+unusual|20
+express|20
+furious|1
+sly|1
+careful|1
+blithe|1
+quick|1
+fluffy|1
+slow|1
+quiet|1
+ruthless|1
+thin|1
+close|1
+dogged|1
+daring|1
+brave|1
+stealthy|1
+permanent|1
+enticing|1
+idle|1
+busy|1
+regular|50
+final|40
+ironic|40
+even|30
+bold|20
+silent|10
+END adjectives
+###
+# grammar
+# first level grammar. N=noun phrase, V=verb phrase,
+# P=prepositional phrase, T=sentence termination
+##
+BEGIN grammar
+COUNT|5
+N V T|3
+N V P T|3
+N V N T|3
+N P V N T|1
+N P V P T|1
+END grammar
+###
+# NP
+# second level grammar. Noun phrases. N=noun, A=article, 
+# J=adjective, D=adverb
+##
+BEGIN np
+COUNT|4
+N|10
+J N|20
+J, J N|10
+D J N|50
+END np
+###
+# VP
+# second level grammar. Verb phrases. V=verb, X=auxiliary, 
+# D=adverb
+##
+BEGIN vp
+COUNT|4
+V|30
+X V|1
+V D|40
+X V D|1
+END vp
+###
+# Q13
+# Substitution parameters for Q13 
+##
+BEGIN Q13a
+COUNT|4
+special|20
+pending|20
+unusual|20
+express|20
+END Q13a
+BEGIN Q13b
+COUNT|4
+packages|40
+requests|40
+accounts|40
+deposits|40
+END Q13b
diff --git a/data/ssb/dbgen/driver.c b/data/ssb/dbgen/driver.c
new file mode 100644
index 0000000..4f4c903
--- /dev/null
+++ b/data/ssb/dbgen/driver.c
@@ -0,0 +1,1144 @@
+/* @(#)driver.c	2.1.8.4 */
+/* main driver for dss benchmark */
+
+#define DECLARER				/* EXTERN references get defined here */
+#define NO_FUNC (int (*) ()) NULL	/* to clean up tdefs */
+#define NO_LFUNC (long (*) ()) NULL		/* to clean up tdefs */
+
+#include "config.h"
+#include <stdlib.h>
+#if (defined(_POSIX_)||!defined(WIN32))		/* Change for Windows NT */
+#ifndef DOS
+#include <unistd.h>
+#include <sys/wait.h>
+#endif
+
+#endif /* WIN32 */
+#include <stdio.h>				/* */
+#include <limits.h>
+#include <math.h>
+#include <ctype.h>
+#include <signal.h>
+#include <string.h>
+#include <errno.h>
+#ifdef HP
+#include <strings.h>
+#endif
+#if (defined(WIN32)&&!defined(_POSIX_))
+#include <process.h>
+#pragma warning(disable:4201)
+#pragma warning(disable:4214)
+#pragma warning(disable:4514)
+#define WIN32_LEAN_AND_MEAN
+#define NOATOM
+#define NOGDICAPMASKS
+#define NOMETAFILE
+#define NOMINMAX
+#define NOMSG
+#define NOOPENFILE
+#define NORASTEROPS
+#define NOSCROLL
+#define NOSOUND
+#define NOSYSMETRICS
+#define NOTEXTMETRIC
+#define NOWH
+#define NOCOMM
+#define NOKANJI
+#define NOMCX
+
+#include "windows.h"
+
+#pragma warning(default:4201)
+#pragma warning(default:4214)
+#endif
+
+#include "dss.h"
+#include "dsstypes.h"
+#include "bcd2.h"
+
+/*
+* Function prototypes
+*/
+void	usage (void);
+int		prep_direct (char *);
+int		close_direct (void);
+void	kill_load (void);
+int		pload (int tbl);
+void	gen_tbl (int tnum, long start, long count, long upd_num);
+int		pr_drange (int tbl, long min, long cnt, long num);
+int		set_files (int t, int pload);
+int		partial (int, int);
+
+
+extern int optind, opterr;
+extern char *optarg;
+long rowcnt = 0, minrow = 0, upd_num = 0;
+double flt_scale;
+#if (defined(WIN32)&&!defined(_POSIX_))
+char *spawn_args[25];
+#endif
+
+
+/*
+* general table descriptions. See dss.h for details on structure
+* NOTE: tables with no scaling info are scaled according to
+* another table
+*
+*
+* the following is based on the tdef structure defined in dss.h as:
+* typedef struct
+* {
+* char     *name;            -- name of the table; 
+*                               flat file output in <name>.tbl
+* long      base;            -- base scale rowcount of table; 
+*                               0 if derived
+* int       (*header) ();    -- function to prep output
+* int       (*loader[2]) (); -- functions to present output
+* long      (*gen_seed) ();  -- functions to seed the RNG
+* int       (*verify) ();    -- function to verify the data set without building it
+* int       child;           -- non-zero if there is an associated detail table
+* unsigned long vtotal;      -- "checksum" total 
+* }         tdef;
+*
+*/
+
+/*
+* flat file print functions; used with -F(lat) option
+*/
+#ifdef SSBM
+int pr_cust (customer_t * c, int mode);
+int pr_part (part_t * p, int mode);
+int pr_supp (supplier_t * s, int mode);
+int pr_line (order_t * o, int mode);
+#else
+int pr_cust (customer_t * c, int mode);
+int pr_line (order_t * o, int mode);
+int pr_order (order_t * o, int mode);
+int pr_part (part_t * p, int mode);
+int pr_psupp (part_t * p, int mode);
+int pr_supp (supplier_t * s, int mode);
+int pr_order_line (order_t * o, int mode);
+int pr_part_psupp (part_t * p, int mode);
+int pr_nation (code_t * c, int mode);
+int pr_region (code_t * c, int mode);
+#endif
+
+/*
+* inline load functions; used with -D(irect) option
+*/
+#ifdef SSBM
+int ld_cust (customer_t * c, int mode);
+int ld_part (part_t * p, int mode);
+int ld_supp (supplier_t * s, int mode);
+
+/*todo: get rid of ld_order*/
+int ld_line (order_t * o, int mode);
+int ld_order (order_t * o, int mode);
+
+#else
+int ld_cust (customer_t * c, int mode);
+int ld_line (order_t * o, int mode);
+int ld_order (order_t * o, int mode);
+int ld_part (part_t * p, int mode);
+int ld_psupp (part_t * p, int mode);
+int ld_supp (supplier_t * s, int mode);
+int ld_order_line (order_t * o, int mode);
+int ld_part_psupp (part_t * p, int mode);
+int ld_nation (code_t * c, int mode);
+int ld_region (code_t * c, int mode);
+#endif
+
+/*
+* seed generation functions; used with '-O s' option
+*/
+#ifdef SSBM
+long sd_cust (int child, long skip_count);
+long sd_part (int child, long skip_count);
+long sd_supp (int child, long skip_count);
+
+long sd_line (int child, long skip_count);
+long sd_order (int child, long skip_count);
+
+#else
+long sd_cust (int child, long skip_count);
+long sd_line (int child, long skip_count);
+long sd_order (int child, long skip_count);
+long sd_part (int child, long skip_count);
+long sd_psupp (int child, long skip_count);
+long sd_supp (int child, long skip_count);
+long sd_order_line (int child, long skip_count);
+long sd_part_psupp (int child, long skip_count);
+#endif
+
+/*
+* header output functions; used with -h(eader) option
+*/
+#ifdef SSBM
+int hd_cust (FILE * f);
+int hd_part (FILE * f);
+int hd_supp (FILE * f);
+int hd_line (FILE * f);
+
+#else
+int hd_cust (FILE * f);
+int hd_line (FILE * f);
+int hd_order (FILE * f);
+int hd_part (FILE * f);
+int hd_psupp (FILE * f);
+int hd_supp (FILE * f);
+int hd_order_line (FILE * f);
+int hd_part_psupp (FILE * f);
+int hd_nation (FILE * f);
+int hd_region (FILE * f);
+#endif
+
+/*
+* data verification functions; used with -O v option
+*/
+#ifdef SSBM
+int vrf_cust (customer_t * c, int mode);
+int vrf_part (part_t * p, int mode);
+int vrf_supp (supplier_t * s, int mode);
+int vrf_line (order_t * o, int mode);
+int vrf_order (order_t * o, int mode);
+int vrf_date (date_t,int mode);
+#else
+int vrf_cust (customer_t * c, int mode);
+int vrf_line (order_t * o, int mode);
+int vrf_order (order_t * o, int mode);
+int vrf_part (part_t * p, int mode);
+int vrf_psupp (part_t * p, int mode);
+int vrf_supp (supplier_t * s, int mode);
+int vrf_order_line (order_t * o, int mode);
+int vrf_part_psupp (part_t * p, int mode);
+int vrf_nation (code_t * c, int mode);
+int vrf_region (code_t * c, int mode);
+#endif
+
+
+#ifdef SSBM
+tdef tdefs[] =
+{
+	/* row layout: name, comment, base rowcount, header fn, {flat-file fn, direct-load fn}, seed fn, verify fn, child, vtotal */
+    	{"part.tbl", "part table", 200000, hd_part,
+		{pr_part, ld_part}, sd_part, vrf_part, PSUPP, 0},
+	{0,0,0,0,{0,0}, 0,0,0,0},	/* all-zero placeholder row */
+	{"supplier.tbl", "suppliers table", 2000, hd_supp,
+	        {pr_supp, ld_supp}, sd_supp, vrf_supp, NONE, 0},
+    
+	{"customer.tbl", "customers table", 30000, hd_cust,
+		{pr_cust, ld_cust}, sd_cust, vrf_cust, NONE, 0},
+	{"date.tbl","date table",2556,0,{pr_date,ld_date}, 0,vrf_date, NONE,0},	/* no header or seed function */
+	/*line order is SF*1,500,000, however due to the implementation
+	  the base here is 150,000 instead of 1,500,000*/
+	{"lineorder.tbl", "lineorder table", 150000, hd_line,
+		{pr_line, ld_line}, sd_line, vrf_line, NONE, 0},
+	{0,0,0,0,{0,0}, 0,0,0,0},	/* all-zero placeholder rows; presumably they keep this table the */
+	{0,0,0,0,{0,0}, 0,0,0,0},	/* same length as the non-SSBM one below -- TODO confirm against */
+	{0,0,0,0,{0,0}, 0,0,0,0},	/* the table index constants in dss.h */
+	{0,0,0,0,{0,0}, 0,0,0,0},
+};
+
+#else
+
+tdef tdefs[] =
+{
+	{"part.tbl", "part table", 200000, hd_part,
+		{pr_part, ld_part}, sd_part, vrf_part, PSUPP, 0},
+	{"partsupp.tbl", "partsupplier table", 200000, hd_psupp,
+		{pr_psupp, ld_psupp}, sd_psupp, vrf_psupp, NONE, 0},
+	{"supplier.tbl", "suppliers table", 10000, hd_supp,
+		{pr_supp, ld_supp}, sd_supp, vrf_supp, NONE, 0},
+	{"customer.tbl", "customers table", 150000, hd_cust,
+		{pr_cust, ld_cust}, sd_cust, vrf_cust, NONE, 0},
+	{"orders.tbl", "order table", 150000, hd_order,
+		{pr_order, ld_order}, sd_order, vrf_order, LINE, 0},
+	{"lineitem.tbl", "lineitem table", 150000, hd_line,
+		{pr_line, ld_line}, sd_line, vrf_line, NONE, 0},
+	{"orders.tbl", "orders/lineitem tables", 150000, hd_order_line,
+		{pr_order_line, ld_order_line}, sd_order, vrf_order_line, LINE, 0},
+	{"part.tbl", "part/partsupplier tables", 200000, hd_part_psupp,
+		{pr_part_psupp, ld_part_psupp}, sd_part, vrf_part_psupp, PSUPP, 0},
+	{"nation.tbl", "nation table", NATIONS_MAX, hd_nation,
+		{pr_nation, ld_nation}, NO_LFUNC, vrf_nation, NONE, 0},	/* NO_LFUNC: no seed function */
+	{"region.tbl", "region table", NATIONS_MAX, hd_region,
+		{pr_region, ld_region}, NO_LFUNC, vrf_region, NONE, 0},	/* NO_LFUNC: no seed function */
+};
+#endif
+int *pids;
+
+
+/*
+* routines to handle the graceful cleanup of multi-process loads
+*/
+
+void
+stop_proc (int signum)	/* signum is accepted but never examined */
+{
+	exit (0);	/* end the calling process with exit status 0 */
+}
+
+void
+kill_load (void)
+{
+#if !defined(U2200) && !defined(DOS)
+	int slot;	/* index into pids[] */
+
+	/* send KILL to every recorded (non-zero) child pid; zero slots are skipped */
+	for (slot = 0; slot < children; slot++)
+		if (pids[slot] != 0)
+			KILL (pids[slot]);
+#endif /* !U2200 && !DOS */
+}
+
+/*
+* re-set default output file names; returns 0, or -1 when reading stdin fails
+*/
+int
+set_files (int i, int pload)
+{
+	char buf[80], *eol, *copy;
+
+	if ((table & (1 << i)) == 0)	/* table i not selected: leave its name alone */
+		return (0);
+	for (;;)			/* table i first, then its child table (if any) */
+	{
+		if (pload == -1)	/* -1: ask for the new name on stdin */
+		{
+			printf ("Enter new destination for %s data: ",
+				tdefs[i].name);
+			if (fgets (buf, sizeof (buf), stdin) == NULL)
+				return (-1);
+			eol = strchr (buf, '\n');
+			if (eol != NULL)
+				*eol = '\0';
+			if (buf[0] == '\0')	/* empty answer: stop without renaming */
+				return (0);
+		}
+		else			/* otherwise append ".<pload>" to the current name */
+			sprintf (buf, "%s.%d", tdefs[i].name, pload);
+		copy = (char *) malloc (strlen (buf) + 1);
+		MALLOC_CHECK (copy);
+		strcpy (copy, buf);
+		tdefs[i].name = copy;
+		if (tdefs[i].child == NONE)
+			break;
+		i = tdefs[i].child;
+		tdefs[i].child = NONE;	/* ends the walk once the child is renamed */
+	}
+
+	return (0);
+}
+
+
+
+/*
+* read the distributions needed in the benchmark
+*/
+void
+load_dists (void)
+{
+	/* every call passes env_config (DIST_TAG, DIST_DFLT), a distribution name, and the set to fill */
+	read_dist (env_config (DIST_TAG, DIST_DFLT), "p_cntr", &p_cntr_set);
+	read_dist (env_config (DIST_TAG, DIST_DFLT), "colors", &colors);
+	read_dist (env_config (DIST_TAG, DIST_DFLT), "p_types", &p_types_set);
+	read_dist (env_config (DIST_TAG, DIST_DFLT), "nations", &nations);
+	read_dist (env_config (DIST_TAG, DIST_DFLT), "regions", &regions);
+	read_dist (env_config (DIST_TAG, DIST_DFLT), "o_oprio",
+		&o_priority_set);
+	read_dist (env_config (DIST_TAG, DIST_DFLT), "instruct",
+		&l_instruct_set);
+	read_dist (env_config (DIST_TAG, DIST_DFLT), "smode", &l_smode_set);
+	read_dist (env_config (DIST_TAG, DIST_DFLT), "category",
+		&l_category_set);
+	read_dist (env_config (DIST_TAG, DIST_DFLT), "rflag", &l_rflag_set);
+	read_dist (env_config (DIST_TAG, DIST_DFLT), "msegmnt", &c_mseg_set);
+
+	/* load the distributions that contain text generation */
+	read_dist (env_config (DIST_TAG, DIST_DFLT), "nouns", &nouns);
+	read_dist (env_config (DIST_TAG, DIST_DFLT), "verbs", &verbs);
+	read_dist (env_config (DIST_TAG, DIST_DFLT), "adjectives", &adjectives);
+	read_dist (env_config (DIST_TAG, DIST_DFLT), "adverbs", &adverbs);
+	read_dist (env_config (DIST_TAG, DIST_DFLT), "auxillaries", &auxillaries);
+	read_dist (env_config (DIST_TAG, DIST_DFLT), "terminators", &terminators);
+	read_dist (env_config (DIST_TAG, DIST_DFLT), "articles", &articles);
+	read_dist (env_config (DIST_TAG, DIST_DFLT), "prepositions", &prepositions);
+	read_dist (env_config (DIST_TAG, DIST_DFLT), "grammar", &grammar);
+	read_dist (env_config (DIST_TAG, DIST_DFLT), "np", &np);
+	read_dist (env_config (DIST_TAG, DIST_DFLT), "vp", &vp);
+	
+}
+
+/*
+* generate a particular table: build count rows of table tnum, starting at row index start, one mk_*() call per row
+*/
+void
+gen_tbl (int tnum, long start, long count, long upd_num)
+{
+	static order_t o;	/* static: its keys are initialized only on the first call */
+	supplier_t supp;
+	customer_t cust;
+	part_t part;
+#ifdef SSBM
+	date_t dt;
+#else
+	code_t code;
+#endif
+	static int completed = 0;	/* bit tnum is set on exit; never read in this function */
+	static int init = 0;	/* non-zero once the order keys have been initialized */
+	long i;
+
+	int rows_per_segment=0;
+	int rows_this_segment=-1;
+	int residual_rows=0;
+
+	if (insert_segments)	/* split count into insert_segments equal parts plus a remainder */
+		{
+		rows_per_segment = count / insert_segments;
+		residual_rows = count - (rows_per_segment * insert_segments);
+		}
+
+	if (init == 0)	/* first call only */
+	{
+		INIT_HUGE(o.okey);
+		for (i=0; i < O_LCNT_MAX; i++)
+#ifdef SSBM
+			INIT_HUGE(o.lineorders[i].okey);	
+#else
+			INIT_HUGE(o.l[i].okey);
+#endif
+		init = 1;
+	}
+
+	for (i = start; count; count--, i++)	/* one iteration per generated row */
+	{
+		LIFENOISE (1000, i);
+		row_start(tnum);
+
+		switch (tnum)
+		{
+		case LINE:
+#ifdef SSBM
+#else
+		case ORDER:
+  		case ORDER_LINE: 
+#endif
+			mk_order (i, &o, upd_num % 10000);
+
+		  if (insert_segments  && (upd_num > 0))
+			if((upd_num / 10000) < residual_rows)	/* the else below pairs with this inner if */
+				{
+				if((++rows_this_segment) > rows_per_segment) 	/* this branch allows one extra row per segment */
+					{						
+					rows_this_segment=0;
+					upd_num += 10000;					
+					}
+				}
+			else
+				{
+				if((++rows_this_segment) >= rows_per_segment) 
+					{
+					rows_this_segment=0;
+					upd_num += 10000;
+					}
+				}
+
+			if (set_seeds == 0)	/* with set_seeds set the row is built but neither verified nor loaded */
+				if (validate)
+					tdefs[tnum].verify(&o, 0);
+				else
+					tdefs[tnum].loader[direct] (&o, upd_num);
+			break;
+		case SUPP:
+			mk_supp (i, &supp);
+			if (set_seeds == 0)
+				if (validate)
+					tdefs[tnum].verify(&supp, 0);
+				else
+					tdefs[tnum].loader[direct] (&supp, upd_num);
+			break;
+		case CUST:
+			mk_cust (i, &cust);
+			if (set_seeds == 0)
+				if (validate)
+					tdefs[tnum].verify(&cust, 0);
+				else
+					tdefs[tnum].loader[direct] (&cust, upd_num);
+			break;
+#ifdef SSBM
+		case PART:
+#else
+		case PSUPP:
+		case PART:
+  		case PART_PSUPP:
+#endif 
+			mk_part (i, &part);
+			if (set_seeds == 0)
+				if (validate)
+					tdefs[tnum].verify(&part, 0);
+				else
+					tdefs[tnum].loader[direct] (&part, upd_num);
+			break;
+#ifdef SSBM
+		case DATE:
+			mk_date (i, &dt);
+			if (set_seeds == 0)
+				if (validate)
+					tdefs[tnum].verify(&dt, 0);
+				else
+					tdefs[tnum].loader[direct] (&dt, 0);	/* loader mode is 0 here, not upd_num */
+			break;
+#else
+		case NATION:
+			mk_nation (i, &code);
+			if (set_seeds == 0)
+				if (validate)
+					tdefs[tnum].verify(&code, 0);
+				else
+					tdefs[tnum].loader[direct] (&code, 0);	/* loader mode is 0 here, not upd_num */
+			break;
+		case REGION:
+			mk_region (i, &code);
+			if (set_seeds == 0)
+				if (validate)
+					tdefs[tnum].verify(&code, 0);
+				else
+					tdefs[tnum].loader[direct] (&code, 0);	/* loader mode is 0 here, not upd_num */
+			break;
+#endif
+		}
+		row_stop(tnum);
+		if (set_seeds && (i % tdefs[tnum].base) < 2)	/* rows whose index mod the base rowcount is 0 or 1 */
+		{
+			printf("\nSeeds for %s at rowcount %ld\n", tdefs[tnum].comment, i);
+			dump_seeds(tnum);
+		}
+	}
+	completed |= 1 << tnum;
+}
+
+/* usage() -- print the synopsis, the per-option help and two sample
+ * invocations to stderr; the -T section depends on whether SSBM is defined */
+void
+usage (void)
+{
+#ifdef SSBM
+	fprintf (stderr, "%s\n%s\n\t%s\n%s %s\n\n",
+		"USAGE:",
+		"dbgen [-{vfFD}] [-O {fhmsv}][-T {pcsdla}]",
+		"[-s <scale>][-C <procs>][-S <step>]",
+		"dbgen [-v] [-O {dfhmr}] [-s <scale>]",
+		"[-U <updates>] [-r <percent>]");
+
+#else
+	fprintf (stderr, "%s\n%s\n\t%s\n%s %s\n\n",
+		"USAGE:",
+		"dbgen [-{vfFD}] [-O {fhmsv}][-T {pcsoPSOL}]",
+		"[-s <scale>][-C <procs>][-S <step>]",
+		"dbgen [-v] [-O {dfhmr}] [-s <scale>]",
+		"[-U <updates>] [-r <percent>]");
+#endif
+	fprintf (stderr, "-b <s> -- load distributions for <s>\n");
+	fprintf (stderr, "-C <n> -- use <n> processes to generate data\n");
+	fprintf (stderr, "          [Under DOS, must be used with -S]\n");
+	fprintf (stderr, "-D     -- do database load in line\n");
+	fprintf (stderr, "-d <n> -- split deletes between <n> files\n");
+	fprintf (stderr, "-f     -- force. Overwrite existing files\n");
+	fprintf (stderr, "-F     -- generate flat files output\n");
+	fprintf (stderr, "-h     -- display this message\n");
+	fprintf (stderr, "-i <n> -- split inserts between <n> files\n");
+	fprintf (stderr, "-n <s> -- inline load into database <s>\n");
+	fprintf (stderr, "-O d   -- generate SQL syntax for deletes\n");
+	fprintf (stderr, "-O f   -- over-ride default output file names\n");
+	fprintf (stderr, "-O h   -- output files with headers\n");
+	fprintf (stderr, "-O m   -- produce columnar output\n");
+	fprintf (stderr, "-O r   -- generate key ranges for deletes.\n");
+	fprintf (stderr, "-O v   -- Verify data set without generating it.\n");
+	fprintf (stderr, "-q     -- enable QUIET mode\n");
+	fprintf (stderr, "-r <n> -- updates refresh (n/100)%% of the\n");
+	fprintf (stderr, "          data set\n");
+	fprintf (stderr, "-s <n> -- set Scale Factor (SF) to  <n> \n");
+	fprintf (stderr, "-S <n> -- build the <n>th step of the data/update set\n");
+#ifdef SSBM
+	fprintf (stderr, "-T c   -- generate customers dimension table ONLY\n");
+	fprintf (stderr, "-T p   -- generate parts dimension table ONLY\n");
+	fprintf (stderr, "-T s   -- generate suppliers dimension table ONLY\n");
+	fprintf (stderr, "-T d   -- generate date dimension table ONLY\n");
+	fprintf (stderr, "-T l   -- generate lineorder fact table ONLY\n");
+	fprintf (stderr, "-T a   -- generate ALL of the above tables\n");	/* accepted by process_options() */
+#else
+	fprintf (stderr, "-T c   -- generate customers ONLY\n");
+	fprintf (stderr, "-T l   -- generate nation/region ONLY\n");
+	fprintf (stderr, "-T L   -- generate lineitem ONLY\n");
+	fprintf (stderr, "-T n   -- generate nation ONLY\n");
+	fprintf (stderr, "-T o   -- generate orders/lineitem ONLY\n");
+	fprintf (stderr, "-T O   -- generate orders ONLY\n");
+	fprintf (stderr, "-T p   -- generate parts/partsupp ONLY\n");
+	fprintf (stderr, "-T P   -- generate parts ONLY\n");
+	fprintf (stderr, "-T r   -- generate region ONLY\n");
+	fprintf (stderr, "-T s   -- generate suppliers ONLY\n");
+	fprintf (stderr, "-T S   -- generate partsupp ONLY\n");
+#endif
+
+	fprintf (stderr, "-U <s> -- generate <s> update sets\n");
+	fprintf (stderr, "-v     -- enable VERBOSE mode\n");
+	fprintf (stderr,
+		"\nTo generate the SF=1 (1GB), validation database population, use:\n");
+	fprintf (stderr, "\tdbgen -vfF -s 1\n");
+	fprintf (stderr, "\nTo generate updates for a SF=1 (1GB), use:\n");
+	fprintf (stderr, "\tdbgen -v -U 1 -s 1\n");
+}
+
+/*
+* pload() -- handle the parallel loading of tables (defined after partial() below)
+*/
+/*
+* int partial(int tbl, int s) -- generate the s-th of `children` steps of the named table
+*/
+int
+partial (int tbl, int s)
+{
+	long per_step;		/* row count returned by set_state() */
+	long leftover;		/* passed to set_state(); used on the last step only */
+	long count;
+	
+	if (verbose > 0)
+		fprintf (stderr, "\tStarting to load stage %d of %d for %s...",
+			s, children, tdefs[tbl].comment);
+	
+	/* file output: select this step's files before any rows are generated */
+	if (!direct)
+		set_files (tbl, s);
+	
+	per_step = set_state(tbl, scale, children, s, &leftover);
+	
+	/* every step starts at per_step * (s - 1) + 1; only the final one adds leftover */
+	count = (s == children) ? per_step + leftover : per_step;
+	gen_tbl (tbl, per_step * (s - 1) + 1, count, upd_num);
+	
+	if (verbose > 0)
+		fprintf (stderr, "done.\n");
+	
+	/* always reports success */
+	return (0);
+}
+
+#ifndef DOS
+/* pload() -- spawn `children` loaders (child c runs partial(tbl, c+1)), wait once per child; returns 0 */
+int
+pload (int tbl)
+{
+	int c = 0, i, status;
+	
+	if (verbose > 0)
+	{
+		fprintf (stderr, "Starting %d children to load %s",
+			children, tdefs[tbl].comment);
+	}
+	for (c = 0; c < children; c++)
+	{
+		pids[c] = SPAWN ();
+		if (pids[c] == -1)
+		{
+			perror ("Child loader not created");
+			kill_load ();
+			exit (-1);
+		}
+		else if (pids[c] == 0)	/* CHILD */
+		{
+			SET_HANDLER (stop_proc);
+			verbose = 0;	/* keep the child's partial() quiet */
+			partial (tbl, c+1);
+			exit (0);
+		}
+		else if (verbose > 0)			/* PARENT */
+			fprintf (stderr, ".");
+	}
+	
+	if (verbose > 0)
+		fprintf (stderr, "waiting...");
+	/* one WAIT per spawned child; abnormal terminations are reported but not fatal */
+	c = children;
+	while (c)
+	{
+		i = WAIT (&status, pids[c - 1]);
+		if (i == -1 && children)
+		{
+			if (errno == ECHILD)
+				fprintf (stderr, "\nCould not wait on pid %d\n", pids[c - 1]);
+			else if (errno == EINTR)
+				fprintf (stderr, "\nProcess %d stopped abnormally\n", pids[c - 1]);
+			else if (errno == EINVAL)
+				fprintf (stderr, "\nProgram bug\n");
+		}
+		else if (! WIFEXITED(status)) {	/* status is only meaningful when the wait succeeded */
+			(void) fprintf(stderr, "\nProcess %d: ", i);
+			if (WIFSIGNALED(status)) {
+				(void) fprintf(stderr, "rcvd signal %d\n",
+					WTERMSIG(status));
+			} else if (WIFSTOPPED(status)) {
+				(void) fprintf(stderr, "stopped, signal %d\n",
+					WSTOPSIG(status));
+			}
+		}
+
+		c--;
+	}
+
+	if (verbose > 0)
+		fprintf (stderr, "done\n");
+	return (0);
+}
+#endif
+
+/* process_options() -- parse the command line into the global run settings; bad input prints usage and exits */
+void
+process_options (int count, char **vector)
+{
+	int option;
+	
+	while ((option = getopt (count, vector,
+		"b:C:Dd:Ffi:hn:O:P:qr:s:S:T:U:v")) != -1)
+	switch (option)
+		{
+		case 'b':				/* load distributions from named file */
+			d_path = (char *)malloc(strlen(optarg) + 1);
+			MALLOC_CHECK(d_path);
+			strcpy(d_path, optarg);
+			break;
+		case 'q':				/* all prompts disabled */
+			verbose = -1;
+			break;
+		case 'i':
+			insert_segments = atoi (optarg);
+			break;
+		case 'd':
+			delete_segments = atoi (optarg);
+			break;
+	  case 'S':				/* generate a particular STEP */
+		  step = atoi (optarg);
+		  break;
+	  case 'v':				/* life noises enabled */
+		  verbose = 1;
+		  break;
+	  case 'f':				/* blind overwrites; Force */
+		  force = 1;
+		  break;
+	  case 'T':				/* generate a specific table */
+		  switch (*optarg)
+		  {
+#ifdef SSBM
+		  case 'c':			/* generate customer ONLY */
+			  table = 1 << CUST;
+			  break;
+		  case 'p':			/* generate part ONLY */
+			  table = 1 << PART;
+			  break;
+		  case 's':			/* generate supplier ONLY */
+			  table = 1 << SUPP;
+			  break;
+		  case 'd':			/* generate date ONLY */
+			  table = 1 << DATE;
+			  break;  
+		  case 'l':			/* generate lineorder table ONLY */
+			  table = 1 << LINE;
+			  break;
+		  case 'a':			/* generate all five tables */
+		          table = 1 << CUST;
+			  table |= 1 << PART;
+			  table |= 1 << SUPP;
+			  table |= 1 << DATE;
+			  table |= 1 << LINE;
+			  break;
+#else
+		  case 'c':			/* generate customer ONLY */
+			  table = 1 << CUST;
+			  break;
+		  case 'L':			/* generate lineitems ONLY */
+			  table = 1 << LINE;
+			  break;
+		  case 'l':			/* generate code table ONLY */
+			  table = 1 << NATION;
+			  table |= 1 << REGION;
+			  break;
+		  case 'n':			/* generate nation table ONLY */
+			  table = 1 << NATION;
+			  break;
+		  case 'O':			/* generate orders ONLY */
+			  table = 1 << ORDER;
+			  break;
+		  case 'o':			/* generate orders/lineitems ONLY */
+			  table = 1 << ORDER_LINE;
+			  break;
+		  case 'P':			/* generate part ONLY */
+			  table = 1 << PART;
+			  break;
+		  case 'p':			/* generate part/partsupp ONLY */
+			  table = 1 << PART_PSUPP;
+			  break;
+		  case 'r':			/* generate region table ONLY */
+			  table = 1 << REGION;
+			  break;
+		  case 'S':			/* generate partsupp ONLY */
+			  table = 1 << PSUPP;
+			  break;
+		  case 's':			/* generate suppliers ONLY */
+			  table = 1 << SUPP;
+			  break;			  
+#endif
+		  default:
+			  fprintf (stderr, "Unknown table name %s\n",
+				  optarg);
+			  usage ();
+			  exit (1);
+		  }
+		  break;
+		  case 's':				/* scale by Percentage of base rowcount */
+		  case 'P':				/* for backward compatibility */
+			  flt_scale = atof (optarg);
+			  if (flt_scale < MIN_SCALE)
+			  {
+				  int i;
+				  
+				  scale = 1;	/* fractional scale: shrink the per-table bases instead */
+				  for (i = PART; i < REGION; i++)
+				  {
+					  tdefs[i].base *= flt_scale;
+					  if (tdefs[i].base < 1)
+						  tdefs[i].base = 1;
+				  }
+			  }
+			  else
+				  scale = (long) flt_scale;
+			  if (scale > MAX_SCALE)
+			  {
+				  fprintf (stderr, "%s %5.0f %s\n\t%s\n\n",
+					  "NOTE: Data generation for scale factors >",
+					  MAX_SCALE,
+					  "GB is still in development,",
+					  "and is not yet supported.\n");
+				  fprintf (stderr,
+					  "Your resulting data set MAY NOT BE COMPLIANT!\n");
+			  }
+			  break;
+		  case 'O':				/* optional actions */
+			  switch (tolower (*optarg))
+			  {
+			  case 'd':			/* generate SQL for deletes */
+				  gen_sql = 1;
+				  break;
+			  case 'f':			/* over-ride default file names */
+				  fnames = 1;
+				  break;
+			  case 'h':			/* generate headers */
+				  header = 1;
+				  break;
+			  case 'm':			/* generate columnar output */
+				  columnar = 1;
+				  break;
+			  case 'r':			/* generate key ranges for delete */
+				  gen_rng = 1;
+				  break;
+			  case 's':			/* calibrate the RNG usage */
+				  set_seeds = 1;
+				  break;
+			  case 'v':			/* validate the data set */
+				  validate = 1;
+				  break;
+			  default:
+				  fprintf (stderr, "Unknown option name %s\n",
+					  optarg);
+				  usage ();
+				  exit (1);
+			  }
+			  break;
+			  case 'D':				/* direct load of generated data */
+				  direct = 1;
+				  break;
+			  case 'F':				/* generate flat files for later loading */
+				  direct = 0;
+				  break;
+			  case 'U':				/* generate flat files for update stream */
+				  updates = atoi (optarg);
+				  break;
+			  case 'r':				/* set the refresh (update) percentage */
+				  refresh = atoi (optarg);
+				  break;
+#ifndef DOS
+			  case 'C':
+				  children = atoi (optarg);
+				  break;
+#endif /* !DOS */
+			  case 'n':				/* set name of database for direct load */
+				  db_name = (char *) malloc (strlen (optarg) + 1);
+				  MALLOC_CHECK (db_name);
+				  strcpy (db_name, optarg);
+				  break;
+			  default:	/* optind may already equal count here, and vector[count] is NULL */
+				  printf ("ERROR: option '%c' unknown.\n",
+					  *(vector[(optind < count) ? optind : optind - 1] + 1));
+			  case 'h':				/* help; default: above also falls through to here */
+				  fprintf (stderr,
+					  "%s Population Generator (Version %d.%d.%d%s)\n",
+					  NAME, VERSION, RELEASE,
+					  MODIFICATION, PATCH);
+				  fprintf (stderr, "Copyright %s %s\n", TPC, C_DATES);
+				  usage ();
+				  exit (1);
+	  }
+
+#ifndef DOS
+	if (children != 1 && step == -1)	/* pload() stores one pid per child in pids[] */
+		{
+		pids = malloc(children * sizeof(pid_t));
+		MALLOC_CHECK(pids)
+		}
+#else
+	if (children != 1 && step < 0)
+		{
+		fprintf(stderr, "ERROR: -C must be accompanied by -S on this platform\n");
+		exit(1);
+		}
+#endif /* DOS */
+
+	return;
+}
+
+/*
+* MAIN -- set defaults, parse options, load distributions, then either
+* generate the update sets (and exit) or generate/validate each selected table.
+*
+* assumes the existence of getopt() to clean up the command line handling
+*/
+int
+main (int ac, char **av)
+{
+	int i;
+	
+	table = (1 << CUST) |
+		(1 << SUPP) |
+		(1 << NATION) |
+		(1 << REGION) |
+		(1 << PART_PSUPP) |
+		(1 << ORDER_LINE);
+	force = 0;
+	insert_segments=0;
+	delete_segments=0;
+	insert_orders_segment=0;
+	insert_lineitem_segment=0;
+	delete_segment=0;
+	verbose = 0;
+	columnar = 0;
+	set_seeds = 0;
+	header = 0;
+	direct = 0;
+	scale = 1;
+	flt_scale = 1.0;
+	updates = 0;
+	refresh = UPD_PCT;
+	step = -1;
+#ifdef SSBM
+	tdefs[LINE].base *=
+		ORDERS_PER_CUST;			/* have to do this after init */
+#else
+	tdefs[ORDER].base *=
+		ORDERS_PER_CUST;			/* have to do this after init */
+	tdefs[LINE].base *=
+		ORDERS_PER_CUST;			/* have to do this after init */
+	tdefs[ORDER_LINE].base *=
+		ORDERS_PER_CUST;			/* have to do this after init */
+#endif
+	fnames = 0;
+	db_name = NULL;
+	gen_sql = 0;
+	gen_rng = 0;
+	children = 1;
+	d_path = NULL;
+	
+#ifdef NO_SUPPORT
+	signal (SIGINT, exit);
+#endif /* NO_SUPPORT */
+	process_options (ac, av);
+#if (defined(WIN32)&&!defined(_POSIX_))
+	for (i = 0; i < ac; i++)
+	{
+		spawn_args[i] = malloc ((strlen (av[i]) + 1) * sizeof (char));
+		MALLOC_CHECK (spawn_args[i]);
+		strcpy (spawn_args[i], av[i]);
+	}
+	spawn_args[ac] = NULL;
+#endif
+	
+	if (verbose >= 0)
+		{
+		fprintf (stderr,
+			"%s Population Generator (Version %d.%d.%d%s)\n",
+			NAME, VERSION, RELEASE, MODIFICATION, PATCH);
+		fprintf (stderr, "Copyright %s %s\n", TPC, C_DATES);
+		}
+	
+	load_dists ();
+	/* have to do this after init */
+	tdefs[NATION].base = nations.count;
+	tdefs[REGION].base = regions.count;
+	
+	/* 
+	* updates are never parallelized 
+	*/
+	if (updates)
+		{
+		/* 
+		 * set RNG to start generating rows beyond SF=scale
+		 */
+		double fix1;
+		long extra_rows;	/* set_state() takes a long *; &i cast to long * is too small on LP64 */
+#ifdef SSBM
+		set_state (LINE, scale, 1, 2, &extra_rows); 
+		fix1 = (double)tdefs[LINE].base / (double)10000; /*represent the %% percentage (n/100)%*/
+#else
+		set_state (ORDER, scale, 1, 2, &extra_rows); 
+		fix1 = (double)tdefs[ORDER_LINE].base / (double)10000;
+#endif
+		rowcnt = (int)(fix1 * scale * refresh);
+		if (step > 0)
+			{
+			/* 
+			 * adjust RNG for any prior update generation
+			 */
+			sd_order(0, rowcnt * (step - 1));
+			sd_line(0, rowcnt * (step - 1));
+			upd_num = step - 1;
+			}
+		else
+			upd_num = 0;
+
+		while (upd_num < updates)
+			{
+			if (verbose > 0)
+#ifdef SSBM
+				fprintf (stderr,
+				"Generating update pair #%d for %s [pid: %d]",
+				upd_num + 1, tdefs[LINE].comment, DSS_PROC);
+#else
+				fprintf (stderr,
+				"Generating update pair #%d for %s [pid: %d]",
+				upd_num + 1, tdefs[ORDER_LINE].comment, DSS_PROC);
+
+#endif
+			insert_orders_segment=0;
+			insert_lineitem_segment=0;
+			delete_segment=0;
+			minrow = upd_num * rowcnt + 1;
+#ifdef SSBM
+			gen_tbl (LINE, minrow, rowcnt, upd_num + 1);
+#else
+			gen_tbl (ORDER_LINE, minrow, rowcnt, upd_num + 1);
+#endif
+			if (verbose > 0)
+				fprintf (stderr, "done.\n");
+#ifdef SSBM
+			pr_drange (LINE, minrow, rowcnt, upd_num + 1);
+#else
+			pr_drange (ORDER_LINE, minrow, rowcnt, upd_num + 1);
+#endif
+			upd_num++;
+			}
+
+		exit (0);	/* update generation never falls through to the table load below */
+		}
+	
+	/**
+	** actual data generation section starts here
+	**/
+/*
+ * open database connection or set all the file names, as appropriate
+ */
+	if (direct)
+		prep_direct ((db_name) ? db_name : DBNAME);
+	else if (fnames)
+		for (i = PART; i <= REGION; i++)
+		{
+			if (table & (1 << i))
+				if (set_files (i, -1))
+				{
+					fprintf (stderr, "Load aborted!\n");
+					exit (1);
+				}
+		}
+		
+/*
+ * traverse the tables, invoking the appropriate data generation routine for any to be built
+ */
+	for (i = PART; i <= REGION; i++)
+		if (table & (1 << i))
+		{
+			if (children > 1 && i < NATION)	/* tables from NATION upward are never split */
+				if (step >= 0)
+				{
+					if (validate)
+					{
+						INTERNAL_ERROR("Cannot validate parallel data generation");
+					}
+					else
+						partial (i, step);
+				}
+#ifdef DOS
+				else
+				{
+					fprintf (stderr,
+						"Parallel load is not supported on your platform.\n");
+					exit (1);
+				}
+#else
+				else
+				{
+					if (validate)
+					{
+						INTERNAL_ERROR("Cannot validate parallel data generation");
+					}
+					else
+						pload (i);
+				}
+#endif /* DOS */
+				else
+				{
+					minrow = 1;
+					if (i < NATION)
+						rowcnt = tdefs[i].base * scale;
+					else
+						rowcnt = tdefs[i].base;
+#ifdef SSBM
+					if(i==PART){
+					    rowcnt = tdefs[i].base * (floor(1+log((double)(scale))/(log(2))));
+					}
+					if(i==DATE){
+					    rowcnt = tdefs[i].base;
+					}
+#endif
+					if (verbose > 0)
+						fprintf (stderr, "%s data for %s [pid: %ld]",
+						(validate)?"Validating":"Generating", tdefs[i].comment, (long) DSS_PROC);
+					gen_tbl (i, minrow, rowcnt, upd_num);
+					if (verbose > 0)
+						fprintf (stderr, "done.\n");
+				}
+				if (validate)
+					printf("Validation checksum for %s at %ld GB: %0x\n", 
+						 tdefs[i].name, (long) scale, tdefs[i].vtotal);
+		}
+			
+		if (direct)
+			close_direct ();
+			
+		return (0);
+}
+
+
+
+
+
+
+
+
+
+
+
diff --git a/data/ssb/dbgen/driver.o b/data/ssb/dbgen/driver.o
new file mode 100644
index 0000000000000000000000000000000000000000..11ddc431f6fe96551e2cb1815bf6424d62eaf0c3
GIT binary patch
literal 41400
zcmeI5dwdkt`Tr*nFhtA-MH>a}RD%XfV!}0v6cY%rXn>SkQJ^90Zb&5Ajk_Bz-T*_;
zZ3s}WShZ4%g?i=d712Tv6uf*DuSKPbmr^3Ah*m*SzRz>!IlCVYnOOU~yuN?*yx5(Y
z_j%6eJm)fJ&di)F^M{v?=#Z3T>5ya%u@Xm`TGrg#6Z=JKx5zr#>TZ4EIpdqM$=-&2
zDI1yY;V)+1hF|cmb#>nOCwaT~1aHIUByYj#3ZCX|>cl%u{Z}-875)3hRN7Zb_r5h{
z^JY$8{Hx+g#S@DsOdL<zReo<%a<42h^2SOsC+D@SV;5tGUfbne;XO+!c@|wrEcR`2
z?5on|UKDGMiv7K`Svk?#%qe}Fz0JuByqk6<508C5x+yuiG<Ilx8O3a1vC`OQ>uBS6
zK?lkkZ}Z$FXD9ZIx3R^u=pkkKsW-OO!M=A$^A1kuZSJ1o-Sk27s92xw#Z6PY7d3o)
z#Ixu<j!9%(ytO2`=m>@4hVPGf8mnm!Vn;P~PxdwsV6~0xQo<i^)5%OXu(xsF8e5`Q
zZ{UhU((zf#S}-`%@-*g?xKh|#P`vJsq~~oufknK{juz+OZBq4<$Vla@il&&iX+q{?
zzxU_eRo&P3@HT9znli0;YVow<Ul*5eE{W~myVJ?z4V8!+PBmDHw>ev77W>rOteV^#
zdrxX|t9T$q6-ssb9|P*%|EUMvZ~pi9hlVXFQ>K}t)xv#M9cUo>&Krwnri_ZcF)H>l
zj9!Z}CsS-*OOray$)qQJGH49!;q|@bjcxbd>iznV=fV9YvDV(7$4WCZXuRtwLuoW~
zbmJ$UMZsjt8XeoWwIs7U4<%mTGo{V_7mV7pJGu2vI;l6NQw7RNrA9^?xT5)T7M3>u
zaRntun~oem(z|f;8cH{+U}yaYTU9b1ANDRSUf0_0(4jI)j2gx3_Wc-hZ|rUFrf)9r
zZu)o#jiV<u?CR+4y<LU}&xWoY+8wJ}UnCE`>o2-^asHdn)b=lp9rS+vG07dzdz%hk
z;O+gixBAueb9jI}$hxtoWysM(jjq=&qZ^NSmhlBnNPVX4corQdx`944ZYIXcBWehh
zi-cC`p$FL54)3iOetl(Ly0_t*B+t@jYVc7F`;t719-%#NQwep8=Bb^m(%v8Hi==1S
zf$xtT(R$x?nzJglM~{-e<ZHZ1+lL(4^#euqm1Jgk3!c8}oCfL`Nt8?X0_r-R=HApt
zMlIaR^}t<*i_&^Bv7^nFb!fxvQ*x>__C1zdd)1__B}Z<2el4|kD|_#4ek5}_*OQ#L
zx&|BHph#Uur+b^9QdX);)uqF!%hm^#gcC`5I+q{kQss?j*22y=N@kq%F_~4FnX%O|
znGKc9IMOw<bS_zjWM;RO*|*;rC1a~&GJAupxY!)&lG%jk?bZjBSv1o=rc5_NHm7-;
zugtWM$>TDW_^6m;T539%$IFJXx4g~j9KWz>LM~mZ)mUUqIixeT4y6RnY^(LPU0s|3
zrO6q^Xky?@%IZ(_F5JSS9?T8Yz~D>-_fZxqVVT~RI#ZES2kY~uy@`qRf%Uzp2dMd)
zbV_HSb2`X+o^rTG_GvHjto9<${kJ3A_4cR#wrtkFoieArl$q^Cp4ndH&)bX4YcH~A
zo00!YPdKT~l+qJ=w--6Ey~uXGZMXXRwkO+e*K9Yk-L84Yzn$mM_EL6hFS6YeiLWom
zo`MZqGMs6Wb32zF`!4ncOp54=nAeiH7G6gh@uNjg(NU@I!%?warLp(rM#wAEC%H?D
zJzaV}pK6w85nsfcBQz=c%1g5-Z^N@mGIHoUI(=DM_B*mv7vuOa=4o6*%(?P3Y)R+A
zY_r!#w=2o&u2BZm(x%c5DW%TsYwUA@PLa*Ksn&sXQH{N;hf+TBc1+HU`*#5RquGhJ
zo6L67kvo$|$JA$Q30bI-@0dH~jN;gLZZ$i<yROx%r>T4L)-SmZ+9+X3hZJ$3-STO<
z`TwQ&Jj*^()vAX3*wcK=?BOe?kEULI8V{4(&Et;HT&%P?v+*Du$qjm`a|tSSZW>$9
z=OoT_OO;Z6Z6tH2(E5TEe`w%!W~`JmwS7{F)MMg#1Uz~0Ed3?P$kPY%X@$~`4X3W4
zd~Ql!(QxocbRo@&SLi}ry;1txMn3;lg1z_Y24W|UX&@0(70`Hurx5XOH<8Q6b3o{J
zK4QAtWxzF5`_wh7G<KlvF7{PPX7YSb<3jSn+we(pX~Ey4H}Di8*(uj%Dop)5I``-+
ziZ)pd2UDYWH5^QiW;Pr=rT$4>mJ`}4%ii13jc?j^qcuvk8hfiW_Ho<sO{JO1v26{z
z<~Ot?4NjV$v)~XpeG#ScJoex1cCps8$*AG|`RyAesWxTzt@YfD$23T5j2b}|{+Kf+
zr8suj8~dm9z4(2SniK7<D|OE4(wOdg@drX%y)o5e`1UE4=5<cLxS)95A)e*`sAnzG
zf2j~O-Ts)K0hGl4q2#FbJd0kV8@7f$!x|0_@?2}veyqjoOK$Kc?Vx*t^{V@Ex#dxg
zO6EAW_u+P9Z)z`AwWqj6ozrUdlW%I)iTPA|a#R&~esd=I<!S6m-_lv6a$5Uv%Oq|n
zTYK`cGq!miOKR9tG5EweQO{#7(!NwOeYfKzO5x1^!DkSN{55e*Ia;k-6UTh>+xmFo
z*yiKaiDSNpYrTb!+nZykT1?#49#ilGo^Gn(2R!NOPO#E*-u5~34|qL&TBP8Wp3hq7
z_}mtXa_VUD&nVZUC%%;2=|b+kg)3iqqtDScry#A*@U%WrLo&}UZC<dQI-@Gc-gIv)
z&D+pYR$4%V@=fYC;92tWIrcK;fal_E?EWj>7}*{2Hokf_-7mkg_XW(`>)FuF+hjE*
zFK^i6Z#a13oVl@<QNHBGrAhm#D<*Pp9dxuet>+lWKQWGvqhm;ZkggMO*n0Hg+jRJ2
z8V{<a_4l0Ck*0dJT9?vs+ou6NS9Z>#)=OP&#9)-GK|-V#+5Brm$q9NWY5n|LdQQCV
z_KrV#H1Ro|s;T4|%d?>@Nnb9xE2*oD)24DucBxhT1lL8ey2<-j#;qT5*JhkY`m||1
zg^H#JiiU$p(Js#A@krD|PnhYMTKoyKw|Q_Qd*`?uA8_k;6tBCl-HI`CR-?W%`LvIk
z9%azP)9DZ|vR<NZ{P;ac47&*(fXD7~<LDZZVkS~Ldpy)Tdo+f|tB)tANqW>dR+lT!
z6k;ZpD%(W7Q_V70nK#cll<p*69et#yHI3Y{^fbe>=w`kII#*6FJz86IIUOr8{RB#N
zIsIn9nf1wAo7TUmc2!-t?Eh)!q^_3UdMZ_AX)`?+dxBiF);rDMJP%<Pl+$`FS3Vu^
z{90}G(5WUJG<<GSz;kZ=_<*O+Hm~nFZ%h&A1&%uX8eg4%j-ERdC@s&@v6QiXW#YUq
zX?={cZw||y%;$;8tTRt-u9FzYSF&|{bz&R#Bzaft{hji@IJQZ5tY&YL+_-q%CTe@j
z!1(C9t&Q;t(IzyR(*-%-DM<5TawOglmR&)$o9JC_?a5_L=M;O1ZrhyOqL?a~w_#Hf
zWUTw!(n;zRP*uA=agQXA9$Wd{SE9z`R-*NCzU*j)Mf}L@sP6bAH~!x4{N$7>ik`i9
zZ#dM&vuqUE#dg4(#^~QRko=L@%c^>5_V?fV+MGrL$4osF(+KCLwehRDgT#sYt|HR9
znyXE2`LfzlQF-ZPT_oAw$PcICF4&wL<JX>a+tHD(?y9BY^|0(x?s`tvx)mHpo3704
zuCh(o-axjx0vlJ+fmvBE5>povtgp94AnEEZ4Tp~NEd4Su9V2s#*|LeMJ=J8&>)RB+
z<&OQ8`oop%wR-s8_?)Nl_mqUrwNH57GAo@fDK(iX`V4(q`KwRS1D?sMtd*Xu9f_G*
z*0xH|gy-qF7YUJ4&ik|b@zjTAYR+Vc;<Mg5;CXsS=Y-LOod-NqU#Rp<+|H4eo=I<0
z9NomxiN9GT*0Y6GRJMjg$)3jBxR6Z^WmG~PcO4<+AfJ-H37OsLo$>Yj&RD86e(`l$
z$WyYdw5jAIdU&d;rR^K5)?$^9aUzt;{X^?mj`dMjrW$SL@=}&5oFyxJ<IUdvB)aJ^
z9ssx6oK|<aGx(l0BYw|Xf3m(`wbf%1Rjj@5s10Y62)!aXz5JFxTs=Dw&hc5JD{5*&
zKD)A_x+dVaE24I<NMSF3s^zbaL?b!=NW>}`Kfb(V)c6TjUAZq94O_lYO(-0pqv*W4
zK*S1GM5{x=h!qaZRNGK_s4iR`vZ{lTXt>@NwIa14f57rpL<2KJ;dxfLvZi9DRT~M+
ztPMu3V5pwbk}ESJR)zo4fG<jZ@vflyxz#l_72#@P(LlJinsVdJE5gxgUk$;!aG)*}
zsdikL8Lp_Utq5DeI%{^F)ytP+O&ni5a(JQDE0Rk8x~L5mFg4Yhl0DHLTwmuWuOdUH
z*;BH^_Taie*cS*!hfK4wFS9vhXWM~b#f+MO{mY4?hEK4$AgPv$QeTw1!j1%@sa6;N
zjG2L;ojtp9gq<C+^MKftQ=UM7C>Y2|wT4wy*ZA$4P=!Abwu7Ol?F$FU5x+I3f=Uvc
znUj-ar49?#lM%-<wL?L>uG(*N)1;P#LtIpQM*U3Z$QOu2D5}u5!oJylJ5oI}SW%;F
zBhgS@UBKT58GDgxO$^QohUNtAs48HvqBg)aV^swGc6CJMJ1-EmBlUH4p)mPIZjT(w
zF_9>>h+Q4D{ejtmnowOWRaqaZVY}CWoZL#*;CfEzhw8X-?05#l$Bi31uFw{IPA}g%
zcD>^YWifvI(9w3*_-IAg9`CCP)K=I-13_OEHR>#Sf}B@S(BICs`{w56TEjwh^TO3L
ztGL4|$GNLcs;H@^MxiRSMh1d`u(MlA6>V3{P{sIJD!Zy0t}+znv{R^B3+-Mt{%KYn
zb*r4{j2gA$RE)(HP*Y8(3UhrBkw{-X)o-mt>H{6geM_a{9Z67A9SnrR)a)uEek8H#
zBpn{4j&277bEtx;uAS4%7H8RUYGn?o;{ySIL`Tqx8xGC!(b+^5QbV1DdU(Vh9Pw4u
z1cpqTl0A&NcR1vuzCI;;Jn?9tj`}$Vr{_+|zJmI0$yIzRjJ2<*tg4O7zKU!o*jLo~
zBL13+s~qcu{cJJylWka~OXSD8*)u4cA$$&TN4I%A30KdkS7)m7nqqkd&8`KNU40~=
zcG*$t&;hE*kWHr*w_`xH8QUjLzbO-gd;*n>9pA^UrIxT~1Z>jq+jFX;Rd)9HX>4v~
zm#7rvhd-o5W>io<$|vf?+yu(Z%JyqNBGlkfs{NV(4Gtt4ofDvQo1LtrL6Xi96|yU}
zF13v>kYkUfA!|;!np_RcrNUAtJlcMQwvQKyM#89LV~6Uab@eomkiS*hj;gG#redBQ
zt)kPemTIJ8Ci$9OeYBr6&IZEKcKjsQopz`Wa%pwZscyM9soIdHaLP5-_LB!xUvWRj
z|FYDM%nL><=BkF1n$h(z*5;ZZ`w&$?cDUM4Z561jsIQ?xOO;9$Q4IiGchtSOglwOP
zjW}&q72qM!sbR9Kb?m4=`0ITEn=U-{wLuyh5|zdEHP#Lr{>}=_v%?j^nbdlG-iW_6
zyl^1<*^d3BK)AYco--KGD6U)}X<cq^!0M=^qI7hHkbdf1HtcYqG8~9h*;&E!@^W)~
z_v&SbDygw_f1=aDv1je^d?MN-DtvVLq5*D1Z|>X5K&sK=)6%h@QO}nL5@J9-nNL=c
zR?eR1T(_Jnk42}b&nUmIUX4mL&Q;Us9O2>D8Rf^0DZMzJ-vqnPNXO%!Ya+@gV%R5!
z!*(*FbY_221EO0Jvad1nNsN<~RDniW_(JaV?|6UZ<3t_F37gHWj@R@_!^aIBJASxa
z)vfHyb$nbWGAl5Lh7&$bjvl|b4pS$<;KYqJe#9^G?5w<zLwonJXUm0%YCe9F)rIQn
z>4c}&=)=QQp)|x%Ix-_s-L~Q=#dwM?e05Z#3-us1j76d}FwoVW`v`qImkORks1P@&
zC?8Ke!Zeuaq`e|)HX-u|>S=1WKo`qkC|pZ3!+ELJn6VRv7uqFU2U0QIpEwhBaiN(b
z_vwf|#L3ROI8+~|QzJs>Kc7LW^7srGU3{@kTK3T4_OP*|%SuNTkC{MMW4;`h#cReM
z=?wRLlF;d?x^6E&O<3t)uSjpMvI%r%M<`GBf+j-jI_Fas2$N5>6+s#(XzCR7Ro6HU
z%_tmJSmG}nQCVGB6)YTESXOvhxG+*UzHma}#KPH^7mC|3VbFc2uA(|@pB1lBJyW2G
z0*7H0L7tpRO9yN^d8onyHBN~WRi*Xy+=SewnT>B&q`p>~O3k|c_Q;_$4auD=-9jh(
z&-Q;o4P1F%_Ea}FicX;CV?QGB9rpjvu>H6nmm5DyX)i@?eS5LT#{H1q;V#FfX^;Gn
z-u_Yi;P77$Kcx4sEB(hZQ3Lzc(>}}UUgWU-7pQH~GUxcZ%|)LTeDnR#oZSw$a)&u~
zmc{KsVZJ`LMh+WRXlG5FLAN&bc3xgi|D4?Hyn1z*cXi*sIl29N3sy<^1th<8<YzV`
z$4s=SH?%O*XQ^6nNykZ-kF%2IrYH62dR)qKIw>=V@!}^A%4T?4dU0AxNg9c9T)v92
ztRmiyO-pa+5K2oa-fWf9;RI$Bp5T*vL|Xc?4nxy2mUbMPW;Z7fP0L!+sW>fnap&T+
z{D$L7(pDyQY)VQ^%O_TxmPIyXJ2WkQD5)J!$+PIQias3v$z7TJSlppFEu*1hNt)ep
zPlvROV&xe{ao*eM!=VQkxc$6GrmgJM@vCI_yt(|mDw0E2t~762x;HJuOZk4$F)hW)
zB~&Ewyl#T?Tf)9A?pT~=Hzb#&<#xC`HO(eMJ~}1xl37&J^yNlq-^Y@)vW}bFs)y^j
zMYO*9;qsQG6?Occit8PkR@ULt<7c`Syohu&WZ0<T@$#5*6?MEf$<Zk%(P5p8LQ)d9
z!0sknpC4%}zN00DF(Nxsinc;W)vJ$=;rT)m*{@A;xK7DS#+bKA$KNjz{OmHxI?m!g
z&q3Ssen;YF;6A}RYeYxeb3b*?GPM)`%2s>)3gGC#tMS<0j2R@{qqd2!!NSi;AhlO4
zoYw_n7@-b2S@8*-L|!D^NtZ}bCY<{xhKcH+ljX!GVlELrIf2yPbm7ic9jLEL;Z+Hw
z_MB;HC+jkUhsA!5!RHEJVDPJjFE;oh;Y$tv8{sz^e7SI5euv>^b<oMW8zJrAF8qFj
z-z8kVNrbpH!k+=gyAKF|$>5I&f5+hKg@0)97U8=M{;cp;gTJVbJF9UPqh1yJ&c-RZ
zL+n?gG3~!A-0VLe39mEkKUT+`tWONSU-*{>|4JQqwze7kuy7|`BC><>yK{W})-fr<
z`Nao@G~sg%-d*?_gP$gRhrw;(e4)i~ws70veT0`8ysz*D2G1A1*5E^g?=pCa@Dv%Z
zv`_rkh!%}LxdwOYrL#5N;FFYnCo5BJwTfj;6`d7^{S2`$Hteg!ew$$*6n@a)&b5;K
z)B~+#eYx0I8GM0o{xc#Bi-m7D_!8mfrQ~|yxqMxPutK==>MjxSd*O2pdwvVa^}NR5
z_bA@UQuBs{jI~zm_Z#+)iTzBqO(b|q?Cs78#(PhT{gsCOHnA@=>|YXIXYkiWXSJd8
zw(#u+-zB`k;Jbz2Zt#7=9e)#LI3WB}!~US~cMbl%@D4`*PgcjBtuo_Mo~k&{Fpm!D
z!c&gM)+vg2vQpGG5!qAh(-TPTohdxS;8~*cQ$r_5c%i}jtK-hra-&@wyH3_5!@fxD
zTMYXVig&X3&tNc=3OC1@al+Rd_Wb98Ja4(f;J;SKovr;wzJ9TP(y+f&>}{i6>coDl
zVLw~!cNq3pDSo_VW1(n&k=V~Q?3)ztWbIJfM5613A29f>>X^=7LuaMp#~*XuT&;N5
zgs|*AAojUN%%h5TvUaL%BJwHW|1kJw;Yr3g`8@FxX=bWJciDNJxY;*3>MgO?c12?U
zA@OcBzjfwWmZi^weZs{o&Odk2|4Q^HNCI^)$ou3s>n{+l^*bw`c=%yU`MT1!E&oq*
z;XiZXSuXrM7e3g94|U;Q7e3a7PjcaY7hX@C+xG;jR}Si4n6?XC?3;*lJ?DzO9*1sr
zv0o|n&iUuu<8wZ%g_jvR4+~#TTMk<PNo{ZN&BFD3L)*V9Jl(K=U-*6_-%o^ZH~3-U
zYYm=4*L$vq<pw`V_<lKMv_EGHuQT|m!u9n|+h+=2YuNV@o@?-Y;bjId5uR@NIa>H~
z!~PQC+nwr<*Z&NyW7r3T?>Fpu{>1&NgUmP7a}%<u6Z;~ur$ZGM&`w+DV@&Mz`L6A6
z5ME~J-0q@tx7e>0dp$nf?_%FT*BjNohW+C%_8VRJRu}%V3;(MN|A08x=jrm|LOnMk
z-}boJe<t?5)0_h6`QS~|9$c;#qg-FR=p>1~ZW~>1U5K+kSIYwh^~{MhJ;b%GWT7^v
zxbV|m_?a#|%Y|pV@FB$6&$TBy-l*qUv^&Pdej;(MAFnvC%S(T_-e$RYU2t3Fy;O9Z
z=PQZlKbH%iEc#UKD$IA$SuFNhVyFG#1+VPq8iW5<xVF>V+g$V?6np1+Ny5*^h1VJS
z8-%Yk_%p&+82lySRl;?7-xPkGVgIi1rwseu!lxVdpAv7Y{{Kmw+x<^6psMGMl=d6Z
z$v5=V<$UjN@GRlE1|LMct$c@xy)E{-{x5W~AMe7axbPXGFU5+_m#SUtBQAWN3%|;R
z|Hg%1@4|oQ!tZk7_qgzfUHFqOe6tIG!G*u(!gspxk6idZ7yhLSKkULg(fvSM{pti4
zezFTc!-b#i!gE~sKo@=iaUOrx8Rx|a;dREmE3p)|I!McX`dXo!JgX)cs*cd2)lh96
zuLASu<XSnU!_-$E@Ac)ce*D#+zXtHvK>ixUU-|r1z+ZVz5|t)TCCXE&@>H@sl`c;u
z%u^}zRMI?^wy#Rt*U3+%?W@xERcZUGw0%|DzA9~BmA0=++fUi_Q#SpSPCup7&v9Jo
z^iyg3skHr6+I}i+e`Via+4ooW{hg93&HhTWze?L*rR}fM_E%{KsI&uA+5sx<0F`!t
zN;^QM9iY+<P-zFKv;$Pyfhz4lm3E*?J5Z$^sL~EpX$Pvb16A6AD(yg(c92RtNTnU5
zEC;DngOueUr8Y>Z4N?k&oK#97U!^TjBws1yEBkz9pRaWCm1e$5o3AwUowO=dfuo?(
z7O1oZ%DzA;6exuPm9{{oEl_Fm@>GJnTxFG)tITL2nbU8SKwhp&m6xlM<>ji}@^YQz
zxsqIEm`6R0SMQYb;z^4ZAV#fOv`)927YJE;*^fm_T_cKHv?8Zmt!A{;$}lTROZFmG
zd3l7^ZU>{4<-V#}q!+FU1T9_%M4EKutD8qAM_Kr*gI49tKs2J_tHQLQ6?|bbs{}jJ
zjaK+(v1&Q5G`HBTa<#<V3Iu1D`$ECW>Y1{@yqtWa)#PGeEH_t?$1XYN73Z|{a-NE=
zDc5Vy<xs6amwmksT@KZ1bhYm*=QZhS*IAVQe|b4Ntq3aTHN?((P0D8`ubhrpwX@3A
zieS|w5nj`4=`}?bE$OA23(<0RS|A^y0m%Wh{)?0G@;l!0!eid@@;%<tlB<B_udl76
zOjxd-7miw$YHg}o+FV^hOPQ$yP@|9~tvM?X*4J8fs;|%@_;84tQ7s|2Xi3#9tCCix
z`qbKVYF3I4hNxRnJ<<v+T7R!rV9}a?wXoNsZAEyd<tJyNRTgy&0^BvCp&H9shnXlj
zuWNMT{1z{@<J^4YXbpMB%Oxo{AODR(IVBFw34}RUk_?tdE~~LBc|m!^a#oCTCs9j7
zRdkq^f6GET>PKpgCarQV=LGCub<j_Jn)<D@gsT0Bc1u%Ub4Y7>YvcTA@6{h3P!A5%
zUo3E|&|gMa5x;Ipe>o@T;?o&t(O)f)1*ItWt5vC%T8mESz)Z_mL#?W1s<_su3CU1x
zYvh=8)$#0brL#kZs_n^8ZOKq=$xv<CQ0<VR>KtUKwq&TbWT>`ms0~!FAp^DL2(?2B
zswkwOwq&5TY@l{{r61L+oRajCSI-f6K79p!=nrjG=pol*zTLlyKJ*u-D(LsPnrnOg
z9+x@4>(tzNpX=NY@FEe-7tuZkwqHme&7Jp{4!;KYO4{eZ_I&HA?O$O?f&FiwkLGVN
zq`;i>()<I46qq;CNAp7rDKKA5AI-<fa|+IvpT=umA)M{61@62*a{T0b5^Y}(_A%hs
z0_S(W+WvmXm!H;Z{s{0BfVTi|0{%ScF9GhnH*v~~?cRk8N`cGOOdp-^X~KEKj{-E$
z1kShcn)d;nWxxl5{c_;H0M30<>kI{*-vS>E9QA(%I@g1}^Zegwmu|p)po98B;HW<r
z^lt#2`M~)Rf%e~d&hOMG>MwKAcb?}ve%=T=&hvbS^G`Qv{kuU2_4RW`Zb$TAzc1lr
zH-XM3@V`6oXI=DP0gn3bfDWI_+RqPw^O_dT_4@<%6aClkIZXc#gU&6W(^2jdIQkUe
z`n|HLf0Edn`lo}=3ed>}{uAK6K?nWs4;<@f2<Y4jI>UkUJ-M!*3qc3<Cjv+Pa?s&5
zB3h>s_^H5Wfez}=29EmtJf8#C^KJCe`qu(K4fqn!LH%2Rqy8U2hu5xX{WZY(+NAmY
zpo9ABfTRA?pmPW4JO{id@RvXb_1^-H`X7SM??LBd;HLxs40KTc8{nv)%pH~j*Z-aL
z(e-dV@H2p?3+D~$_W+Liy+CIr=<xdj4xH~#>7(<_10B>K3>@`GfX*L4hu<S`V11iD
zT7Mkqp#C)As6P{Q?gAbD)mILz|1<h%{Rrrw{?))y|2ok5Bj_vxekSl6K?n8k1djTD
z2Ax%)vljSSz#j!2)NcWf`p<*T-JtUd@J!%uf)47x4;=OPg3g~n=K%0t!1;Xv2i~B*
z^UDrSd##~3-CmXS+?4~jBcCGo%7f8cgtJZ~eYE}W4g06$yt>EW-Nlb5U3A_s?AJ)X
zdw}CS?K9vwPx}@)&eLcKjtcAs&eKwX<2>yr!nqzW-!p(?zUKnRe9r@Y%(nnI<~tE|
zxC)&0NtQJo_}RpD{nr4;d}j+c%exlrxv6w}tpoc%gZ>7v=YFE^Q?>)Y2l!jS?*-1!
zT{-A}rStt7_%#&2Mds7r3upiDqmM3cGCjBCz&e<38gSOp_C>(&C$9Y;Bb@bl?WMM#
z3ie$8n(qdV^>zR_=6eYEgP?P=^lQ#{E$|+~&2nV{$NC%wob!F1J{-youKlkBj{aOD
zoc;MP`e^&>z#i-QCgA9gUT@9*vp;&B;EQ0-T<gCE9R2xBIQ#Pu*#8sk(Vq^oWQ6lY
ze@+L^`ReyY1qRpt6ahznCJASM9tMA=f<5|E4IKSh2%P<K)-}@lVS{Ub{s<iXc>*}r
z=QF_10Y6_C&i-Eu{2kzr0RIehxSy|*@%+?OI-tNhc;A{Ooa;Y}KHC3W;Jtz813wq|
z1;Bp}d<5`5z()hm2Co00m;Gr1?h||VAIITKK?m)x0G<On3qa>F;MamZ>Z|~M9@yXE
zqO%I@QRh+MxuBzd9*&N=T<Fgxut%L&f#-qFUxk}G?}9z*d<L8zbH&^JOBbDQ!5($G
z9#4rWaC`NmkM2L+gq!|kfIaH;2F~ZY*2!_v836XEGXnSk(7DJ(XB^n0jt}@i(5Z6K
z34uN8EC9}b8=}kA=%TX(>`~_q;Q63)my6Dy!5($i11|ubjV?ORf<5Z|6*xcJ)aBag
zqVplxqt2JW&j+1DE;?#5M|S4;(5)*$;k*_^`*VtLvmc%g_UKOz@LzyVKNp=sut%MX
zfDZwkG8dg+f&B$w?*oqdwV<;E?4vIBjlhe*{x_h5<Iiuw9`n5mcroa#cF}nN>``YU
z@S&iy#YN`@ut%Mpz=wg(`z|_rz#erD0WSfa?_G2{@y!DTvmc%Ud^pAHe%M2}*$>YI
zd-SIt@DZSs@1k=7*rQGv@R6W1(M4w}*rQG@a4+bDU3BJwJ?i`h_$bg>=Av^m*rU#B
z;1`0<eJ(l=gFWhO0e%taJm;db9qdu(ec+{_v)e`I6R<~}?}3j7osP1;(Cj~{!p(lz
z1Na!wv0ZdZ!Tx@-*Y{bM0OxtE=9dBI_g}i7&jrqFRy3b4oc(_Scns`OXBF`EV1F;@
ztOouN*rWfS0KW$84*|z~J9x;50@vqQ`e^^tfMa`Q2<LJw0sFIoF92QwI#{kTV2|ax
z1ne>2S-{IczYcV8Jii=xGuZzLbg<sm0_XbC?fx=wtj~9VV|{)A{FjjLr@*mX2ZfvU
zc?9gypDz3|i~_e8`u`K)=ua{5ag;{4uNU}u;A4SL06qmc|BbEInJ%3D=e3oZ2Y_D*
zJOmt{Bh3c=%fWsD*#8CiwP4Tn!2kOJhnvCvaj;(r9P_;wbg<sm0mt{(n?Pp+=)d4%
z|1NNBuRXxAKO6x4jiCRni+$H_WJH1c!zB9X`tK&3+Xc^y46w)Y_6ClAmIKFnxC}V@
ze}!<fKi>+x1^iqMI_!@=?;ZsEO<=zj>@nXRz<&kye*+z?huvV0<vIu)^*iuOTnc8n
zGK6#eqy3pK_QSz`GNsY&^-JKWGsQ*cQn0@mbY=rbodqsBOI+;l0DcMR-vv5YpZ5W8
z2KyI4XA1CFf%BuX?)2gCmcfhU`RWJ2H<O*Nw~vK$yDS3!1@K1T-+|5+;Hmr)odWAD
zrjOP?RXDdVPmMLVfu8`p4{)sifuN7q%_879E{y~oJ|Fe?b}`sL1Nk<AJ=V{1uzwcp
zZwC7{!2b*!bslihc?9gYg3eapsPm$W&Z}U*4RqcGj`i>baMV8t`p<!m?qJ-GQ-Pl(
z9gq1m;HLq99{Ab7Um&jAWgu|$=R)Az&%dM(hbh1p(y`W=<-)IU;Y}|5HWz-c3x5*$
zugM?X&$j}{{``t?K3}juzX|+N(AfnX`}sc5$9nr5?6DmWf&Gi1pTvtuC~!NL(?|P%
z5^$WKX9AxN_Ibc7fae3B0lWw}>Wl>L1N$++{lG5)j`i#pZr1ZG7yB!L2S9%z=-{~R
z{1~=VZ(KjRAFcqMYl!pM$6*)f{5Kuzdj1F4V>=#yib|&1<t5tH`aOWZ4E*Q7w*x;P
z_$$CK1pX@Ui-F$*ybAcez~=(5B(BS~2spOOQsHL1+yMMh(76LR_WxC&{~-ADhzoxS
zIM(xPppWDHPOyIs{5b#|b-s4d`3~$~2c466@e>6;?@%X0IM)yQ^E0r=`DZq8)ENLe
zi%3hi%MjprefNUS8(=>X>=y%{3HIE_wSEvdUf(YVj{RW~=;L^}6gal~O`!88<a;;p
znZO?booj(V3HDXMUjbeXd<XDLfqw)X%k`;nv!DMH>}P?_Vc<2uJ7&-U1@0%c^wIU~
z0gm~eD%{NX4B&WvWdX-}&INtEjt>To{m=`X=OOxjWG3iie+z;=-bX9|j{WLd;MlKj
z0FL#rLbzEEcLB%pJ_9;9uh<U!1uB;whu;Ri2KYz7d0waOKLx%6_(9-r0q<}s9Z=wQ
zf15s9=LF#Q1J3}C<NTSxaXilkj@OHP;5e=h1HKOO9RnQa4-<jENcOtlP6f{6sxDV0
za2_W$uLI8gO7km#^L1AL-b~DeF9(j-&D((UxTp2+29DR?2Z7^tb3JfAFSO1xz_$Q@
zNjUe<r|F~X=MAvu_SNUn-@tw;*nb4}Hv-=aoa<)^cO(i2fZs&NI^RRU**^X|C8u#J
zryp{AX@Amyb3f6%Cvd!uoGYBm`wo3{zC*yC^VNJf@F4I~;AOxk1IO_^2pq@rdf+&o
zF9eR`Nt1AMJh>k1XMsPr0>|-uC2$<i?*oqcJ|^7E_YJV;^HJB^dtm<r@O@y<^{mgA
zFM)@^KB)&CP~dKZ`rUxzeC90RJ1LF!r#Emue>E=vz8-iHaGZCI23`mD6M$oXs1?rj
z_BXJPfIa%N5IFkZ1Um16&TqjUb?yNB_rQKN*kgGg0?zGnqdd3S1RVRr3!uLQ^j`;i
z)PD~+mTNcgU7)ic^xp^mHQ1wmhn{3af%^&aF2J!JPXUhO+gZSOkd_`lvw^<_d=T)r
zfe!_a{)_^?lk9aroC16o@M_`Qe^vvp1AFxUYT(#j%R%Qc(76-%6~O-lI+p=|2smHA
z^mzLzaGtm5avcW!1;CFxoen5)d!e5hz{3=$bxs$~^??4L104O&1|2*v1_8g9w6y=l
zpz}E7I~wfK&tC(NfV~fN(9bI1AAo%jbkLu<VE-i8F9dt^=X&7i&n=*X{@e~6{dpL4
zc0#`Efn$H$1|0kO%fM0RP2jsg|2^SmKi>`ZnC}<BvEI%&gG4ECzruca4)CiePPcm@
z@Lk09`920XUU#a2W4V?IXa7H>kM6g(fPEDBoxpLvdN1&LuzwQxY~Z_qqfY8i$%q2i
z1NwQk3(pnK{=Y{b?f(d{$9f(Ed=A)81U?sd0QfxMmjTEAe1&lK2d^*Jy4c?e_Snxi
zf<3n58(@$3<L`q#&L@wv$q@?d&qwsp^^<1P0R`sG^wFFd2d;nqe3-&Vjva3`Khl|x
z62E(=c{j1|>O{xw_4?^_gHP+?fbIsrLG&{W-dXPRdm8-iQygF${C8rXY4E>@eU`!R
z68k;||C8`sgWn^(zri06o^SAngbz0OW5SCJzFv5V!8Zu^8r&o0Dm8e<$qwK-9tYiC
z&x_6k2gmtK!Y3R2o0A+c)!=ia0jC?hi|F_b{;u#UgMT2r#^Ae!*BSg{;ZcL@=SOo5
zen9N6H27D-7a05-;foCZz3}S{-a$I-5`%XZzTDuc!f!PA6shMG20u~kZ#TFt^}o{K
zr-}V4gFh(tYYcv-*xzUHQsHY2ev0TkV(=WXUuW>X!k;quK;bP0FBHDT;BQL(Z!`E%
zv47FvBZO}^_+hE%*A4y$(cfY4RdOEfH25UZ-(~Pignwl4)lzSJ3_e5b_ZhrW_<n<5
zD*Q`>>-T2|4ZcS5J#6r&gws!qsi4P)>B5r@exvXdgKrbw)!;7*PdE5>;oS}Xy6_Bx
z?-1V8;5&ue2G{TZ^xxU&a%~a&EW>_}@ID6D?+f+sRBE05V&C7eUnSi69rDC@dr<gb
z!~U>vz3xEkw+JsW?7PS~?=|>5J^mZq7OsC!Q|m7jKEbfh7e3kGG2v4U-Yk5&!G9~<
zZ}6Lh<L^|kPg8~0820)-R-M8BAofv%>-oT3ga29VuQd1q;R_6|->WS$_@iQfoxz_F
zuKylU*Yg9y<+QUnSu%ZkbaY6HL};%6&R<S%W&d}v;f*c(Y`<Ifd6Omk%>N<#yvdb)
z=6cLwln=aB_Ia~X_Sv5O<lq10u*TWM{U0oR9q^B3pEoVC&pLZ$pEuiNpSd3E80qnu
z`6mn{?2s+<Pc@{i9uJwbpQYl@L9x;NVqMP+DDdVpwdZ{4af!Ls<QP5vF#nu^1RHb?
zXh>T<o^bRRz>C11{j3&$^!dxt%)`Q`gZ-DFqt6$P{wHwV&zXM(d^zZ_{#ByC!r=Tn
z$Q;%H|Jp(7do6HXw;Zt!_#v=w0sb!VZNR?)Zp*$tUv$)9gUfJeO*Z%hIoADC>*(Ks
ze%-LwzvF9O7xeGZn&Ykh-C14FT3`QO>=r4P=KA+pdorM)x&Hl5D<?9qmVQKyw)8Vs
zYFnlE!hxEKoM>Qf)XJe>Q>w6XX3&39oJ0SeDIKtR9{tqdQHS))ZYqsqA_vNvaNN`v
ziL#+1LkIi|g7i~`{Ci?n4*jQufgJjeud>xouF3JtV0{k#R2TiYS9G4J2CB*{=|7PU
zIHmeO|F%hjt4?nB;(|r8ccL9fYDcO$4*9Vv_v@lKaduk9Ios>bceW?ehSR1vhw<Nm
zK3lkSe~GoMadv$7s2wx^J`~HEX8P&2lk5?lp>t>nmLvMlscDZj`^Jg0qtoAJ2(v8*
zy)BY`Grg6`R@Co|{Pm$u&*u&AOE={tDkO8kDBP$GokLaEva2>@>AS7^_wq|N3e)BO
zp6mZO`sn;KIFdpIeK-rFnAXZ{j<qb?h_Dq+d`+d@w(?(di(``^={bLP#ms*eG0vak
zF|^19Gan)C>-2g%msnfrRl`zTx{+QV>+NDmFWJSd)45|%kY;cW=$rH6pts9lUq#5*
zU`daR_jNkW@1z}W7qk6~B)zQ-Wvk1r@#D0^nr8axlHU1$Ftl}|{xo@rcG_ycmftxR
z`^AvkhwDt|ulrA1d2^C={rH=<avygR?O#qGGd=eSql*!t-*vx8ZLG0eIS|bH>qc>H
RrC)oSlV}s9<y|xV{{TpgDkA^@

literal 0
HcmV?d00001

diff --git a/data/ssb/dbgen/dss.ddl b/data/ssb/dbgen/dss.ddl
new file mode 100644
index 0000000..2fe3e70
--- /dev/null
+++ b/data/ssb/dbgen/dss.ddl
@@ -0,0 +1,70 @@
+-- Sccsid:     @(#)dss.ddl	2.1.8.1
+CREATE TABLE TPCD.NATION  ( N_NATIONKEY  INTEGER NOT NULL,
+                            N_NAME       CHAR(25) NOT NULL,
+                            N_REGIONKEY  INTEGER NOT NULL,
+                            N_COMMENT    VARCHAR(152));
+
+CREATE TABLE TPCD.REGION  ( R_REGIONKEY  INTEGER NOT NULL,
+                            R_NAME       CHAR(25) NOT NULL,
+                            R_COMMENT    VARCHAR(152));
+
+CREATE TABLE TPCD.PART  ( P_PARTKEY     INTEGER NOT NULL,
+                          P_NAME        VARCHAR(55) NOT NULL,
+                          P_MFGR        CHAR(25) NOT NULL,
+                          P_BRAND       CHAR(10) NOT NULL,
+                          P_TYPE        VARCHAR(25) NOT NULL,
+                          P_SIZE        INTEGER NOT NULL,
+                          P_CONTAINER   CHAR(10) NOT NULL,
+                          P_RETAILPRICE DECIMAL(15,2) NOT NULL,
+                          P_COMMENT     VARCHAR(23) NOT NULL );
+
+CREATE TABLE TPCD.SUPPLIER ( S_SUPPKEY     INTEGER NOT NULL,
+                             S_NAME        CHAR(25) NOT NULL,
+                             S_ADDRESS     VARCHAR(40) NOT NULL,
+                             S_NATIONKEY   INTEGER NOT NULL,
+                             S_PHONE       CHAR(15) NOT NULL,
+                             S_ACCTBAL     DECIMAL(15,2) NOT NULL,
+                             S_COMMENT     VARCHAR(101) NOT NULL);
+
+CREATE TABLE TPCD.PARTSUPP ( PS_PARTKEY     INTEGER NOT NULL,
+                             PS_SUPPKEY     INTEGER NOT NULL,
+                             PS_AVAILQTY    INTEGER NOT NULL,
+                             PS_SUPPLYCOST  DECIMAL(15,2)  NOT NULL,
+                             PS_COMMENT     VARCHAR(199) NOT NULL );
+
+CREATE TABLE TPCD.CUSTOMER ( C_CUSTKEY     INTEGER NOT NULL,
+                             C_NAME        VARCHAR(25) NOT NULL,
+                             C_ADDRESS     VARCHAR(40) NOT NULL,
+                             C_NATIONKEY   INTEGER NOT NULL,
+                             C_PHONE       CHAR(15) NOT NULL,
+                             C_ACCTBAL     DECIMAL(15,2)   NOT NULL,
+                             C_MKTSEGMENT  CHAR(10) NOT NULL,
+                             C_COMMENT     VARCHAR(117) NOT NULL);
+
+CREATE TABLE TPCD.ORDERS  ( O_ORDERKEY       INTEGER NOT NULL,
+                           O_CUSTKEY        INTEGER NOT NULL,
+                           O_ORDERSTATUS    CHAR(1) NOT NULL,
+                           O_TOTALPRICE     DECIMAL(15,2) NOT NULL,
+                           O_ORDERDATE      DATE NOT NULL,
+                           O_ORDERPRIORITY  CHAR(15) NOT NULL,  -- R
+                           O_CLERK          CHAR(15) NOT NULL,  -- R
+                           O_SHIPPRIORITY   INTEGER NOT NULL,
+                           O_COMMENT        VARCHAR(79) NOT NULL);
+
+CREATE TABLE TPCD.LINEITEM ( L_ORDERKEY    INTEGER NOT NULL,
+                             L_PARTKEY     INTEGER NOT NULL,
+                             L_SUPPKEY     INTEGER NOT NULL,
+                             L_LINENUMBER  INTEGER NOT NULL,
+                             L_QUANTITY    DECIMAL(15,2) NOT NULL,
+                             L_EXTENDEDPRICE  DECIMAL(15,2) NOT NULL,
+                             L_DISCOUNT    DECIMAL(15,2) NOT NULL,
+                             L_TAX         DECIMAL(15,2) NOT NULL,
+                             L_RETURNFLAG  CHAR(1) NOT NULL,
+                             L_LINESTATUS  CHAR(1) NOT NULL,
+                             L_SHIPDATE    DATE NOT NULL,
+                             L_COMMITDATE  DATE NOT NULL,
+                             L_RECEIPTDATE DATE NOT NULL,
+                             L_SHIPINSTRUCT CHAR(25) NOT NULL,  -- R
+                             L_SHIPMODE     CHAR(10) NOT NULL,  -- R
+                             L_COMMENT      VARCHAR(44) NOT NULL);
+
diff --git a/data/ssb/dbgen/dss.h b/data/ssb/dbgen/dss.h
new file mode 100644
index 0000000..8f78d89
--- /dev/null
+++ b/data/ssb/dbgen/dss.h
@@ -0,0 +1,610 @@
+/*
+ * Sccsid:     @(#)dss.h	2.1.8.5
+ *
+ * general definitions and control information for the DSS code 
+ * generator; if it controls the data set, it's here
+ */
+#ifndef DSS_H
+#define  DSS_H
+
+#ifdef SSBM
+#define NAME			"SSBM (Star Schema Benchmark)"
+#define VERSION           1
+#define RELEASE           0
+#define MODIFICATION      0
+#define PATCH             ""
+
+
+/*global variables*/
+/*SSBM added DATE table*/
+#define  DATE           4
+
+/*SSBM use the lineorder without partsupp and order table*/
+#define  L_SKEY_MIN   1
+#define  L_SKEY_MAX (tdefs[SUPP].base * scale)
+
+#endif
+
+#ifdef TPCH
+#define NAME			"TPC-H"
+#define VERSION           1
+#define RELEASE           3
+#define MODIFICATION      0
+#define PATCH             ""
+#endif
+#ifdef TPCR
+#define NAME			"TPC-R"
+#define VERSION           1
+#define RELEASE           3
+#define MODIFICATION      0
+#define PATCH             ""
+#endif
+#ifndef NAME
+#error Benchmark version must be defined in config.h
+#endif
+#define TPC             "Transaction Processing Performance Council"
+#define C_DATES         "1994 - 2000"
+
+#include "config.h"
+#include "shared.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#ifdef SSBM
+#include <math.h>
+#endif
+
+#define  NONE		-1
+#define  PART		0
+#define  PSUPP		1
+#define  SUPP		2
+#define  CUST		3
+#define  ORDER		4
+#define  LINE		5
+#define  ORDER_LINE     6
+#define  PART_PSUPP     7
+#define  NATION		8
+#define  REGION		9
+#define  UPDATE		10
+#define  MAX_TABLE	11
+#define  ONE_STREAM	1
+#define  ADD_AT_END	2
+
+#ifdef MAX
+#undef MAX
+#endif
+#ifdef MIN
+#undef MIN
+#endif
+#define MAX(a,b) (((a) > (b)) ? (a) : (b)) /* NB: each argument may be evaluated twice */
+#define MIN(A,B) (((A) < (B)) ? (A) : (B)) /* NB: each argument may be evaluated twice */
+
+#define INTERNAL_ERROR(p)  {fprintf(stderr,"%s", p);abort();}
+#define LN_CNT  4
+static char lnoise[4] = {'|', '/', '-', '\\' };
+#define LIFENOISE(n, var)	\
+	if (verbose > 0) fprintf(stderr, "%c\b", lnoise[((var) % LN_CNT)]) /* spinner glyph; n is unused */
+
+#define MALLOC_CHECK(var) \
+    if ((var) == NULL) \
+        { \
+        fprintf(stderr, "Malloc failed at %s:%d\n",  \
+            __FILE__, __LINE__); \
+        exit(1);\
+        }
+#define OPEN_CHECK(var, path) \
+    if ((var) == NULL) \
+        { \
+        fprintf(stderr, "Open failed for %s at %s:%d\n",  \
+            path, __FILE__, __LINE__); \
+        exit(1);\
+        }
+#ifndef MAX_CHILDREN
+#define MAX_CHILDREN    1000
+#endif
+
+/*
+ * macros that control sparse keys
+ *
+ * refer to Porting.Notes for a complete explanation
+ */
+#ifndef BITS_PER_LONG
+#define BITS_PER_LONG   32
+#define MAX_LONG        0x7FFFFFFF
+#endif /* BITS_PER_LONG */
+#define SPARSE_BITS      2
+#define SPARSE_KEEP      3
+#define  MK_SPARSE(key, seq) \
+         ((((((key)>>3)<<2)|((seq) & 0x0003))<<3)|((key) & 0x0007)) /* 2 seq bits inserted above the low 3 key bits */
+
+#define RANDOM(tgt, lower, upper, stream)	dss_random(&tgt, lower, upper, stream)
+#ifdef SSBM
+typedef struct{
+  char * name;
+  int start_day;
+  int start_month;
+  int end_day;
+  int end_month;
+} season;
+typedef struct {
+  char * name;
+  int month;
+  int day;
+} holiday;
+
+
+#endif	
+     
+
+typedef struct
+{
+   long      weight;
+   char     *text;
+}         set_member;
+
+typedef struct
+{
+   int      count;
+   int      max;
+   set_member *list;
+   long *permute;
+}         distribution;
+
+/*
+ * some handy access functions 
+ */
+#define DIST_SIZE(d)		((d)->count)	/* number of entries recorded in the distribution */
+#define DIST_MEMBER(d, i)	(((set_member *)((d)->list + (i)))->text)	/* text of the i-th entry */
+
+typedef struct
+{
+   char     *name;
+   char     *comment;
+   long      base;
+   int       (*header) ();
+   int       (*loader[2]) ();
+   long      (*gen_seed)();
+   int       (*verify) ();
+   int       child;
+   unsigned long vtotal;
+}         tdef;
+
+typedef struct SEED_T {
+	long table;
+	long value;
+	long usage;
+	long boundary;
+	} seed_t;
+
+
+#if defined(__STDC__)
+#define PROTO(s) s
+#else
+#define PROTO(s) ()
+#endif
+
+/* bm_utils.c */
+char	*env_config PROTO((char *var, char *dflt));
+long	yes_no PROTO((char *prompt));
+int     a_rnd PROTO((int min, int max, int column, char *dest));
+int     tx_rnd PROTO((long min, long max, long column, char *tgt));
+long	julian PROTO((long date));
+long	unjulian PROTO((long date));
+FILE	*tbl_open PROTO((int tbl, char *mode));
+long	dssncasecmp PROTO((char *s1, char *s2, int n));
+long	dsscasecmp PROTO((char *s1, char *s2));
+int		pick_str PROTO((distribution * s, int c, char *target));
+void	agg_str PROTO((distribution *set, long count, long col, char *dest));
+void	read_dist PROTO((char *path, char *name, distribution * target));
+void	embed_str PROTO((distribution *d, int min, int max, int stream, char *dest));
+#ifndef STDLIB_HAS_GETOPT
+int		getopt PROTO((int arg_cnt, char **arg_vect, char *options));
+#endif /* STDLIB_HAS_GETOPT */
+long	set_state PROTO((int t, long scale, long procs, long step, long *e));
+
+/* rnd.c */
+long	NextRand PROTO((long nSeed));
+long	UnifInt PROTO((long nLow, long nHigh, long nStream));
+double	UnifReal PROTO((double dLow, double dHigh, long nStream));
+double	Exponential PROTO((double dMean, long nStream));
+void	dss_random(long *tgt, long min, long max, long seed);
+void	row_start(int t);
+void	row_stop(int t);
+void	dump_seeds(int t);
+
+/* text.c */
+#define MAX_GRAMMAR_LEN	12	/* max length of grammar component */
+#define MAX_SENT_LEN	256 /* max length of populated sentence */
+#define RNG_PER_SENT	27	/* max number of RNG calls per sentence */
+
+int		dbg_text PROTO((char * t, int min, int max, int s));
+
+#ifdef DECLARER
+#define EXTERN
+#else
+#define EXTERN extern
+#endif            /* DECLARER */
+
+
+EXTERN distribution nations;
+EXTERN distribution nations2;
+EXTERN distribution regions;
+EXTERN distribution o_priority_set;
+EXTERN distribution l_instruct_set;
+EXTERN distribution l_smode_set;
+EXTERN distribution l_category_set;
+EXTERN distribution l_rflag_set;
+EXTERN distribution c_mseg_set;
+EXTERN distribution colors;
+EXTERN distribution p_types_set;
+EXTERN distribution p_cntr_set;
+
+/* distributions that control text generation */
+EXTERN distribution articles;
+EXTERN distribution nouns;
+EXTERN distribution adjectives;
+EXTERN distribution adverbs;
+EXTERN distribution prepositions;
+EXTERN distribution verbs;
+EXTERN distribution terminators;
+EXTERN distribution auxillaries;
+EXTERN distribution np;
+EXTERN distribution vp;
+EXTERN distribution grammar;
+
+
+EXTERN long scale;
+EXTERN int refresh;
+EXTERN int resume;
+EXTERN long verbose;
+EXTERN long force;
+EXTERN long header;
+EXTERN long columnar;
+EXTERN long direct;
+EXTERN long updates;
+EXTERN long table;
+EXTERN long children;
+EXTERN long fnames;
+EXTERN int  gen_sql;
+EXTERN int  gen_rng;
+EXTERN char *db_name;
+EXTERN int  step;
+EXTERN int	set_seeds;
+EXTERN int  validate;
+EXTERN char *d_path;
+
+/* added for segmented updates */
+EXTERN int insert_segments;
+EXTERN int delete_segments;
+EXTERN int insert_orders_segment;
+EXTERN int insert_lineitem_segment;
+EXTERN int delete_segment;
+ 
+
+#ifndef DECLARER
+extern tdef tdefs[];
+
+#endif            /* DECLARER */
+
+
+/*****************************************************************
+ ** table level defines use the following naming convention: t_ccc_xxx
+ ** with: t, a table identifier
+ **       ccc, a column identifier
+ **       xxx, a limit type
+ ****************************************************************
+ */
+
+/*
+ * defines which control the parts table
+ */
+#define  P_SIZE       126
+#ifdef SSBM
+#define  P_NAME_SCL   3     /*5 change to 3 according to the new schema*/
+#else
+#define  P_NAME_SCL   5
+#endif
+#define  P_MFG_TAG    "Manufacturer#"
+#define  P_MFG_FMT     "%s%01d"
+#define  P_MFG_MIN     1
+#define  P_MFG_MAX     5
+#define  P_BRND_TAG   "Brand#"
+#define  P_BRND_FMT   "%s%02d"
+#define  P_BRND_MIN     1
+
+/*#ifdef SSBM
+#define  P_BRND_MAX     5
+#else*/
+#define  P_BRND_MAX 40
+/*#endif*/
+
+#define  P_SIZE_MIN    1
+#define  P_SIZE_MAX    50
+#define  P_MCST_MIN    100
+#define  P_MCST_MAX    99900
+#define  P_MCST_SCL    100.0
+#define  P_RCST_MIN    90000
+#define  P_RCST_MAX    200000
+#define  P_RCST_SCL    100.0
+/*
+ * defines which control the suppliers table
+ */
+#define  S_SIZE     145
+#define  S_NAME_TAG "Supplier#"
+#define  S_NAME_FMT "%s%09ld"
+#define  S_ABAL_MIN   -99999
+#define  S_ABAL_MAX    999999
+#define  S_CMNT_MAX    101      
+#define  S_CMNT_BBB    10       /* number of BBB comments/SF */
+#define  BBB_DEADBEATS 50       /* % that are complaints */
+#define  BBB_BASE  "Customer "
+#define  BBB_COMPLAIN  "Complaints"
+#define  BBB_COMMEND   "Recommends"
+#define  BBB_CMNT_LEN  19
+#define  BBB_BASE_LEN  9
+#define  BBB_TYPE_LEN  10
+
+/*
+ * defines which control the partsupp table
+ */
+#define  PS_SIZE      145
+#define  PS_SKEY_MIN  0
+#define  PS_SKEY_MAX  ((tdefs[SUPP].base - 1) * scale)
+#define  PS_SCST_MIN  100
+#define  PS_SCST_MAX  100000
+#define  PS_QTY_MIN   1
+#define  PS_QTY_MAX   9999
+/*
+ * defines which control the customers table
+ */
+#define  C_SIZE       165
+#define  C_NAME_TAG   "Customer#"
+#define  C_NAME_FMT   "%s%09d"
+#define  C_MSEG_MAX    5
+#define  C_ABAL_MIN   -99999
+#define  C_ABAL_MAX    999999
+/*
+ * defines which control the order table
+ */
+#define  O_SIZE          109
+#define  O_CKEY_MIN      1
+#define  O_CKEY_MAX      (long)(tdefs[CUST].base * scale)
+#define  O_ODATE_MIN     STARTDATE
+#define  O_ODATE_MAX     (STARTDATE + TOTDATE - \
+                         (L_SDTE_MAX + L_RDTE_MAX) - 1)
+#define  O_CLRK_TAG      "Clerk#"
+#define  O_CLRK_FMT      "%s%09d"
+#define  O_CLRK_SCL      1000
+#define  O_LCNT_MIN      1
+#define  O_LCNT_MAX      7
+
+/*
+ * defines which control the lineitem table
+ */
+#define  L_SIZE       144L
+#define  L_QTY_MIN    1
+#define  L_QTY_MAX    50
+#define  L_TAX_MIN    0
+#define  L_TAX_MAX    8
+#define  L_DCNT_MIN   0
+#define  L_DCNT_MAX   10
+#define  L_PKEY_MIN   1
+
+#ifdef SSBM
+/*part table log based*/
+#define  L_PKEY_MAX   (tdefs[PART].base * (floor(log((double)scale))+1))
+#else
+#define  L_PKEY_MAX   (tdefs[PART].base * scale)
+#endif
+
+#define  L_SDTE_MIN   1
+#define  L_SDTE_MAX   121
+#define  L_CDTE_MIN   30
+#define  L_CDTE_MAX   90
+#define  L_RDTE_MIN   1
+#define  L_RDTE_MAX   30
+/*
+ * defines which control the time table
+ */
+#define  T_SIZE       30
+#define  T_START_DAY  3     /* wednesday ? */
+#define  LEAP(y)  ((!((y) % 4) && ((y) % 100))?1:0) /* divisible by 4 and not by 100; no 400-year exception */
+
+/*******************************************************************
+ *******************************************************************
+ ***
+ *** general or inter table defines
+ ***
+ *******************************************************************
+ *******************************************************************/
+#define  SUPP_PER_PART 4
+#define  ORDERS_PER_CUST 10 /* sync this with CUST_MORTALITY */
+#define  CUST_MORTALITY 3  /* portion which have no orders */
+#define  NATIONS_MAX  90 /* limited by country codes in phone numbers */
+#define  PHONE_FMT    "%02d-%03d-%03d-%04d"
+#define  STARTDATE    92001
+#define  CURRENTDATE  95168
+#define  ENDDATE      98365
+#define  TOTDATE      2557
+#define  UPD_PCT      10
+#define  MAX_STREAM   47
+#define  V_STR_LOW    0.4
+#define  PENNIES    100 /* for scaled int money arithmetic */
+#define  Q11_FRACTION (double)0.0001
+/*
+ * max and min SF in GB; Larger SF will require changes to the build routines
+ */
+#define  MIN_SCALE      1.0
+#define  MAX_SCALE   1000.0
+/*
+ * beyond this point we need to allow for BCD calculations
+ */
+#define  MAX_32B_SCALE   1000.0
+#define INIT_HUGE(v)	{ \
+			v = (DSS_HUGE *)malloc(sizeof(DSS_HUGE) * HUGE_COUNT); \
+			MALLOC_CHECK(v); \
+			}
+#define FREE_HUGE(v)	free(v)
+#ifdef SUPPORT_64BITS
+#define LONG2HUGE(src, dst)		*dst = (DSS_HUGE)src	
+#define HUGE2LONG(src, dst)		*dst = (long)src
+#define HUGE_SET(src, dst)		*dst = *src	
+#define HUGE_MUL(op1, op2)		*op1 *= op2	
+#define HUGE_DIV(op1, op2)		*op1 /= op2	
+#define HUGE_ADD(op1, op2, dst)	*dst = *op1 + op2	
+#define HUGE_SUB(op1, op2, dst)	*dst = *op1 - op2	
+#define HUGE_MOD(op1, op2)		*op1 % op2	
+#define HUGE_CMP(op1, op2)		((*(op1) == *(op2)) ? 0 : ((*(op1) < *(op2)) ? -1 : 1))	/* yields 0, -1 or 1 */
+#else
+#define LONG2HUGE(src, dst)		{*dst = src; *(dst + 1) = 0;}
+#define HUGE2LONG(src, dst)		{ dst=0 ; \
+					bcd2_bin(dst, (src + 1)); \
+					bcd2_bin(dst, src); }
+#define HUGE_SET(src, dst)		{ *dst = *src ; *(dst + 1) = *(src + 1); }
+#define HUGE_MUL(op1,op2)		bcd2_mul(op1, (op1 + 1), op2)
+#define HUGE_DIV(op1,op2)		bcd2_div(op1, (op1 + 1), op2)
+#define HUGE_ADD(op1,op2,d)		{ \
+					HUGE_SET(op1, d); \
+					bcd2_add(d, (d + 1), op2); \
+					}
+#define HUGE_SUB(op1,op2,d)		{ \
+					HUGE_SET(op1, d); \
+					bcd2_sub(d, (d + 1), op2); \
+					}
+#define HUGE_MOD(op1, op2)		bcd2_mod(op1, (op1 + 1), op2)
+#define HUGE_CMP(op1, op2)		((bcd2_cmp((op1), ((op1) + 1), (op2)) == 0)?0:\
+					    ((bcd2_cmp((op1), ((op1) + 1), (op2)) < 0)?-1:1))
+#endif /* SUPPORT_64BITS */
+
+/******** environmental variables and defaults ***************/
+#define  DIST_TAG  "DSS_DIST"		/* environment var to override ... */
+#define  DIST_DFLT "dists.dss"		/* default file to hold distributions */
+#define  PATH_TAG  "DSS_PATH"		/* environment var to override ... */
+#define  PATH_DFLT "."				/* default directory to hold tables */
+#define  CONFIG_TAG  "DSS_CONFIG"	/* environment var to override ... */
+#define  CONFIG_DFLT "."			/* default directory to config files */
+#define  ADHOC_TAG  "DSS_ADHOC"		/* environment var to override ... */
+#define  ADHOC_DFLT "adhoc.dss"		/* default file name for adhoc vars */
+
+/******* output macros ********/
+#ifndef SEPARATOR
+#define SEPARATOR '|' /* field separator for generated flat files */
+#endif
+/* Data type flags for a single print routine */
+#define DT_STR		0
+#ifndef MVS
+#define DT_VSTR		DT_STR
+#else
+#define DT_VSTR		1
+#endif /* MVS */
+#define DT_INT		2
+#define DT_HUGE		3
+#define DT_KEY		4
+#define DT_MONEY	5
+#define DT_CHR		6
+
+int dbg_print(int dt, FILE *tgt, void *data, int len, int eol);
+#define PR_STR(f, str, len)		dbg_print(DT_STR, f, (void *)str, len, 1)
+#define PR_VSTR(f, str, len) 	dbg_print(DT_VSTR, f, (void *)str, len, 1)
+#define PR_VSTR_LAST(f, str, len) 	dbg_print(DT_VSTR, f, (void *)str, len, 0)
+#define PR_INT(f, str) 			dbg_print(DT_INT, f, (void *)str, 0, 1)
+#define PR_HUGE(f, str) 		dbg_print(DT_HUGE, f, (void *)str, 0, 1)
+#define PR_KEY(f, str) 			dbg_print(DT_KEY, f, (void *)str, 0, -1)
+#define PR_MONEY(f, str) 		dbg_print(DT_MONEY, f, (void *)str, 0, 1)
+#define PR_CHR(f, str)	 		dbg_print(DT_CHR, f, (void *)str, 0, 1)
+#define  PR_STRT(fp)   /* any line prep for a record goes here */
+#define  PR_END(fp)    fprintf(fp, "\n")   /* finish the record here */
+
+#ifdef SSBM
+#define  PR_DATE(tgt, yr, mn, dy)	\
+   sprintf(tgt, "19%02d%02d%02d", yr, mn, dy)
+#else
+#ifdef MDY_DATE
+#define  PR_DATE(tgt, yr, mn, dy)	\
+   sprintf(tgt, "%02d-%02d-19%02d", mn, dy, yr)
+#else
+#define  PR_DATE(tgt, yr, mn, dy)	\
+sprintf(tgt, "19%02d-%02d-%02d", yr, mn, dy)
+#endif /* MDY_DATE */
+#endif
+/*
+ * verification macros
+ */
+#define  VRF_STR(t, d) {char *xx = d; while (*xx) tdefs[t].vtotal += *xx++;}
+#define  VRF_INT(t,d)  tdefs[t].vtotal += d
+#ifdef SUPPORT_64BITS
+#define  VRF_HUGE(t,d)	tdefs[t].vtotal = *((long *)&d) + *((long *)(&d + 1))
+#else
+#define VRF_HUGE(t,d)	tdefs[t].vtotal += d[0] + d[1]
+#endif /* SUPPORT_64BITS */
+/* assume float is a 64 bit quantity */
+#define  VRF_MONEY(t,d)	tdefs[t].vtotal = *((long *)&d) + *((long *)(&d + 1))
+#define  VRF_CHR(t,d)	tdefs[t].vtotal += d
+#define  VRF_STRT(t)  
+#define  VRF_END(t)  
+
+/*********** distributions currently defined *************/
+#define  UNIFORM   0
+
+/*
+ * seed indexes; used to separate the generation of individual columns
+ */
+#define  P_MFG_SD  0
+#define  P_BRND_SD 1
+#define  P_TYPE_SD 2
+#define  P_SIZE_SD 3
+#define  P_CNTR_SD 4
+#define  P_RCST_SD 5
+#define  PS_QTY_SD 7
+#define  PS_SCST_SD   8
+#define  O_SUPP_SD 10
+#define  O_CLRK_SD 11
+#define  O_ODATE_SD   13
+#define  L_QTY_SD  14
+#define  L_DCNT_SD 15
+#define  L_TAX_SD  16
+#define  L_SHIP_SD 17
+#define  L_SMODE_SD   18
+#define  L_PKEY_SD 19
+#define  L_SKEY_SD 20
+#define  L_SDTE_SD 21
+#define  L_CDTE_SD 22
+#define  L_RDTE_SD 23
+#define  L_RFLG_SD 24
+#define  C_NTRG_SD 27
+#define  C_PHNE_SD 28
+#define  C_ABAL_SD 29
+#define  C_MSEG_SD 30
+#define  S_NTRG_SD 33
+#define  S_PHNE_SD 34
+#define  S_ABAL_SD 35
+#define  P_NAME_SD 37
+#define  O_PRIO_SD 38
+#define  HVAR_SD   39
+#define  O_CKEY_SD 40
+#define  N_CMNT_SD 41
+#define  R_CMNT_SD 42
+#define  O_LCNT_SD 43
+#define  BBB_JNK_SD    44          
+#define  BBB_TYPE_SD   45         
+#define  BBB_CMNT_SD   46         
+#define  BBB_OFFSET_SD 47         
+
+#endif            /* DSS_H */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/data/ssb/dbgen/dss.ri b/data/ssb/dbgen/dss.ri
new file mode 100644
index 0000000..fb4c002
--- /dev/null
+++ b/data/ssb/dbgen/dss.ri
@@ -0,0 +1,100 @@
+-- Sccsid:     @(#)dss.ri	2.1.8.1
+-- TPCD Benchmark Version 8.0
+
+CONNECT TO TPCD;
+
+--ALTER TABLE TPCD.REGION DROP PRIMARY KEY;
+--ALTER TABLE TPCD.NATION DROP PRIMARY KEY;
+--ALTER TABLE TPCD.PART DROP PRIMARY KEY;
+--ALTER TABLE TPCD.SUPPLIER DROP PRIMARY KEY;
+--ALTER TABLE TPCD.PARTSUPP DROP PRIMARY KEY;
+--ALTER TABLE TPCD.ORDERS DROP PRIMARY KEY;
+--ALTER TABLE TPCD.LINEITEM DROP PRIMARY KEY;
+--ALTER TABLE TPCD.CUSTOMER DROP PRIMARY KEY;
+
+
+-- For table REGION
+ALTER TABLE TPCD.REGION
+ADD PRIMARY KEY (R_REGIONKEY);
+
+-- For table NATION
+ALTER TABLE TPCD.NATION
+ADD PRIMARY KEY (N_NATIONKEY);
+
+ALTER TABLE TPCD.NATION
+ADD FOREIGN KEY NATION_FK1 (N_REGIONKEY) references TPCD.REGION;
+
+COMMIT WORK;
+
+-- For table PART
+ALTER TABLE TPCD.PART
+ADD PRIMARY KEY (P_PARTKEY);
+
+COMMIT WORK;
+
+-- For table SUPPLIER
+ALTER TABLE TPCD.SUPPLIER
+ADD PRIMARY KEY (S_SUPPKEY);
+
+ALTER TABLE TPCD.SUPPLIER
+ADD FOREIGN KEY SUPPLIER_FK1 (S_NATIONKEY) references TPCD.NATION;
+
+COMMIT WORK;
+
+-- For table PARTSUPP
+ALTER TABLE TPCD.PARTSUPP
+ADD PRIMARY KEY (PS_PARTKEY,PS_SUPPKEY);
+
+COMMIT WORK;
+
+-- For table CUSTOMER
+ALTER TABLE TPCD.CUSTOMER
+ADD PRIMARY KEY (C_CUSTKEY);
+
+ALTER TABLE TPCD.CUSTOMER
+ADD FOREIGN KEY CUSTOMER_FK1 (C_NATIONKEY) references TPCD.NATION;
+
+COMMIT WORK;
+
+-- For table LINEITEM
+ALTER TABLE TPCD.LINEITEM
+ADD PRIMARY KEY (L_ORDERKEY,L_LINENUMBER);
+
+COMMIT WORK;
+
+-- For table ORDERS
+ALTER TABLE TPCD.ORDERS
+ADD PRIMARY KEY (O_ORDERKEY);
+
+COMMIT WORK;
+
+-- For table PARTSUPP
+ALTER TABLE TPCD.PARTSUPP
+ADD FOREIGN KEY PARTSUPP_FK1 (PS_SUPPKEY) references TPCD.SUPPLIER;
+
+COMMIT WORK;
+
+ALTER TABLE TPCD.PARTSUPP
+ADD FOREIGN KEY PARTSUPP_FK2 (PS_PARTKEY) references TPCD.PART;
+
+COMMIT WORK;
+
+-- For table ORDERS
+ALTER TABLE TPCD.ORDERS
+ADD FOREIGN KEY ORDERS_FK1 (O_CUSTKEY) references TPCD.CUSTOMER;
+
+COMMIT WORK;
+
+-- For table LINEITEM
+ALTER TABLE TPCD.LINEITEM
+ADD FOREIGN KEY LINEITEM_FK1 (L_ORDERKEY)  references TPCD.ORDERS;
+
+COMMIT WORK;
+
+ALTER TABLE TPCD.LINEITEM
+ADD FOREIGN KEY LINEITEM_FK2 (L_PARTKEY,L_SUPPKEY) references 
+        TPCD.PARTSUPP;
+
+COMMIT WORK;
+
+
diff --git a/data/ssb/dbgen/dsstypes.h b/data/ssb/dbgen/dsstypes.h
new file mode 100644
index 0000000..ce2b7d8
--- /dev/null
+++ b/data/ssb/dbgen/dsstypes.h
@@ -0,0 +1,312 @@
+ /* 
+ * Sccsid:     @(#)dsstypes.h	2.1.8.1
+ *
+ * general definitions and control information for the DSS data types
+ * and function prototypes
+ * Modified for SSBM prototype
+ */
+
+/*
+ * typedefs
+ */
+#ifdef SSBM
+typedef struct
+{
+    long            custkey;
+    char            name[C_NAME_LEN + 1];
+    int             nlen;
+    char            address[C_ADDR_MAX + 1];
+    int             alen;
+    char            city[CITY_FIX+1];
+    int             nation_key;
+    char            nation_name[C_NATION_NAME_LEN+1];
+    int             region_key;
+    char            region_name[C_REGION_NAME_LEN+1];
+    char            phone[PHONE_LEN + 1];
+    char            mktsegment[MAXAGG_LEN + 1];
+}               customer_t;
+#else
+typedef struct
+{
+    long            custkey;
+    char            name[C_NAME_LEN + 1];
+    char            address[C_ADDR_MAX + 1];
+    int             alen;
+    long            nation_code;
+    char            phone[PHONE_LEN + 1];
+    long            acctbal;
+    char            mktsegment[MAXAGG_LEN + 1];
+    char            comment[C_CMNT_MAX + 1];
+    int             clen;
+}               customer_t;
+#endif
+
+/* customers.c */
+long mk_cust   PROTO((long n_cust, customer_t * c));
+int pr_cust    PROTO((customer_t * c, int mode));
+int ld_cust    PROTO((customer_t * c, int mode));
+
+#ifdef SSBM
+
+typedef struct
+{
+    DSS_HUGE	    *okey;  /*for clustering line items*/
+    int             linenumber; /*integer, constrain to max of 7*/
+    long            custkey;
+    long            partkey;
+    long            suppkey;
+    char            orderdate[DATE_LEN];
+    char            opriority[MAXAGG_LEN + 1];
+    long            ship_priority;
+    long             quantity;
+    long           extended_price;
+    long           order_totalprice;
+    long           discount;
+    long           revenue;
+    long           supp_cost;
+    long           tax;
+    char            commit_date[DATE_LEN] ;
+    char            shipmode[O_SHIP_MODE_LEN + 1];
+}  lineorder_t;
+#else
+typedef struct
+{
+    DSS_HUGE	    *okey; 
+    long            partkey;
+    long            suppkey;
+    long            lcnt;
+    long            quantity;
+    long            eprice;
+    long            discount;
+    long            tax;
+    char            rflag[1];
+    char            lstatus[1];
+    char            cdate[DATE_LEN];
+    char            sdate[DATE_LEN];
+    char            rdate[DATE_LEN];
+    char           shipinstruct[MAXAGG_LEN + 1];
+    char           shipmode[MAXAGG_LEN + 1];
+    char           comment[L_CMNT_MAX + 1];
+    int            clen;
+}               line_t;
+#endif
+
+#ifdef SSBM
+typedef struct
+{
+    DSS_HUGE	    *okey;
+    long            custkey;
+    int             totalprice;
+    char            odate[DATE_LEN];
+    char            opriority[MAXAGG_LEN + 1];
+    char            clerk[O_CLRK_LEN + 1];
+    int             spriority;
+    long            lines;
+    lineorder_t     lineorders[O_LCNT_MAX];
+}   order_t;
+#else
+typedef struct
+{
+    DSS_HUGE	    *okey;
+    long            custkey;
+    char            orderstatus;
+    long            totalprice;
+    char            odate[DATE_LEN];
+    char            opriority[MAXAGG_LEN + 1];
+    char            clerk[O_CLRK_LEN + 1];
+    long            spriority;
+    long            lines;
+    char            comment[O_CMNT_MAX + 1];
+    int            clen;
+    line_t          l[O_LCNT_MAX];
+}               order_t;
+#endif
+
+/* order.c */
+long	mk_order	PROTO((long index, order_t * o, long upd_num));
+int		pr_order	PROTO((order_t * o, int mode));
+int		ld_order	PROTO((order_t * o, int mode));
+void	ez_sparse	PROTO((long index, DSS_HUGE *ok, long seq));
+#ifndef SUPPORT_64BITS
+void	hd_sparse	PROTO((long index, DSS_HUGE *ok, long seq));
+#endif
+
+#ifdef SSBM
+/*SSBM removes the part supplier table*/       
+#else
+typedef struct
+{
+    long            partkey;
+    long            suppkey;
+    long            qty;
+    long            scost;
+    char           comment[PS_CMNT_MAX + 1];
+    int            clen;
+}               partsupp_t;
+#endif
+
+#ifdef SSBM
+typedef struct
+{
+    long           partkey;
+    char           name[P_NAME_LEN + 1];
+    int            nlen;
+    char           mfgr[P_MFG_LEN + 1];
+    char           category[P_CAT_LEN + 1];
+    char           brand[P_BRND_LEN + 1];
+    char           color[P_COLOR_MAX + 1];
+    int            clen;
+    char           type[P_TYPE_MAX + 1];
+    int            tlen;
+    long            size;
+    char           container[P_CNTR_LEN + 1];
+}               part_t;
+#else
+typedef struct
+{
+    long           partkey;
+    char           name[P_NAME_LEN + 1];
+    int            nlen;
+    char           mfgr[P_MFG_LEN + 1];
+    char           brand[P_BRND_LEN + 1];
+    char           type[P_TYPE_LEN + 1];
+    int            tlen;
+    long           size;
+    char           container[P_CNTR_LEN + 1];
+    long           retailprice;
+    char           comment[P_CMNT_MAX + 1];
+    int            clen;
+    partsupp_t     s[SUPP_PER_PART];
+}               part_t;
+#endif
+
+/* parts.c */
+long mk_part   PROTO((long index, part_t * p));
+int pr_part    PROTO((part_t * part, int mode));
+int ld_part    PROTO((part_t * part, int mode));
+
+#ifdef SSBM
+typedef struct
+{
+    long            suppkey;
+    char            name[S_NAME_LEN + 1];
+    char            address[S_ADDR_MAX + 1];
+    int             alen; 
+    char            city[CITY_FIX +1];
+    int             nation_key;
+    char            nation_name[S_NATION_NAME_LEN+1];
+    int             region_key;
+    char            region_name[S_REGION_NAME_LEN+1];
+    char            phone[PHONE_LEN + 1];
+}               supplier_t;
+#else
+typedef struct
+{
+    long            suppkey;
+    char            name[S_NAME_LEN + 1];
+    char            address[S_ADDR_MAX + 1];
+    int             alen;
+    long            nation_code;
+    char            phone[PHONE_LEN + 1];
+    long            acctbal;
+    char            comment[S_CMNT_MAX + 1];
+    int             clen;
+}               supplier_t;
+#endif
+
+/* supplier.c */
+long mk_supp   PROTO((long index, supplier_t * s));
+int pr_supp    PROTO((supplier_t * supp, int mode));
+int ld_supp    PROTO((supplier_t * supp, int mode));
+
+#ifdef SSBM
+/*todo: add new date table*/
+
+typedef struct
+{
+   long            datekey;
+   char            date[D_DATE_LEN+1];
+   char            dayofweek[D_DAYWEEK_LEN+1] ;
+   char            month[D_MONTH_LEN+1];
+   int             year;
+   int             yearmonthnum;
+   char            yearmonth[D_YEARMONTH_LEN+1];
+   int             daynuminweek;
+   int             daynuminmonth;
+   int             daynuminyear;
+   int             monthnuminyear;
+   int             weeknuminyear;
+   char            sellingseason[D_SEASON_LEN + 1];
+   int             slen;
+   char            lastdayinweekfl[2];
+   char            lastdayinmonthfl[2];
+   char            holidayfl[2];
+   char            weekdayfl[2];
+}      date_t;
+
+/* date.c */
+
+long mk_date   PROTO((long index, date_t * d));
+int pr_date    PROTO((date_t * date, int mode));
+int ld_date    PROTO((date_t * date, int mode));
+
+#endif
+
+typedef struct
+{
+    long            timekey;
+    char            alpha[DATE_LEN];
+    long            year;
+    long            month;
+    long            week;
+    long            day;
+} dss_time_t;               
+
+/* time.c */
+long mk_time   PROTO((long index, dss_time_t * t));
+
+
+
+/*
+ * this assumes that N_CMNT_LEN >= R_CMNT_LEN 
+ */
+typedef struct
+{
+    long            code;
+    char            *text;
+    long            join;
+    char            comment[N_CMNT_MAX + 1];
+    int             clen;
+}               code_t;
+
+/* code table */
+int mk_nation   PROTO((long i, code_t * c));
+int pr_nation    PROTO((code_t * c, int mode));
+int ld_nation    PROTO((code_t * c, int mode));
+int mk_region   PROTO((long i, code_t * c));
+int pr_region    PROTO((code_t * c, int mode));
+int ld_region    PROTO((code_t * c, int mode));
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/data/ssb/dbgen/history.html b/data/ssb/dbgen/history.html
new file mode 100644
index 0000000..fb6ed32
--- /dev/null
+++ b/data/ssb/dbgen/history.html
@@ -0,0 +1,586 @@
+<!-- @(#)history.html	2.1.8.5 -->
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2//EN">
+
+<HTML>
+
+<HEAD>
+<TITLE>DBGEN Release Notes</TITLE>
+<META NAME="GENERATOR" CONTENT="Arachnophilia 3.4">
+<META NAME="FORMATTER" CONTENT="Arachnophilia 3.4">
+</HEAD>
+
+<BODY BACKGROUND="" BGCOLOR="#ffffdd" TEXT="#000000" LINK="#0000ff" VLINK="#800080" ALINK="#ff0000">
+<h1>Release notes for DBGEN and QGEN</h1>
+<hr>
+These notes are taken from the History file which is distributed with the TPC-D soft appendix.<p>
+
+<UL>
+<li><A NAME="20000511">Changes as of 5/11/00</A><ul>
+   <li> versions: TPCH 1.3.0, TPCR 1.3.0
+   <li> Corrected update range rollover after 1000 update segments
+   <li> Corrected problem in permute.c causing invalid substitutions in Q16
+   </ul>
+<li><A NAME="19991011">Changes as of 10/11/99</A><ul>
+   <li> versions: TPCH 1.2.0a, TPCR 1.1.0a
+   <li> Corrected range setting of segmented updates that was causing extra file to be generated
+   <li>Porting corrections for DigUnix 
+   </ul>
+<li><A NAME="990830">Changes as of 08/30/99</A><ul>
+   <li> versions: TPCH 1.2.0, TPCR 1.1.0
+   <li> reduced parameter substitution range for Q18
+   <li> added new option to specify location of dists file (-b)
+   <li> added DBGEN option to suppress all output (-q)
+   </ul>
+<li><A NAME="990816">Changes as of 08/16/99</A><ul>
+   <li> versions: TPCH 1.1.0a, TPCR 1.0.1e
+   <li> prevent "reuse" of original data in update files
+   <li> correction to lint target in makefile.suite
+   <li> removal of vestigial l_partkey predicate from 21.sql
+   <li> reorder lineitem/order join in q5
+   <li> removal of table aliases from 2.sql
+   <li> randomize seeding of qgen RNG to close bug 52
+   <li> correct possible round off error in segmented update files
+   <li> corrected soft copy answer set for Q22
+   <li> corrected precision of answer set for Q19
+   </ul>
+<li><a name="990708">Changes as of 07/08/99</a><ul>
+	<li>versions: TPCH 1.1.0, TPCR 1.0.1
+  	<li>WORKLOAD must be set to either TPCH or TPCR in the makefile
+	<li>unneeded reference to part table removed from q21 template
+	</ul>
+<li><A NAME="101">Changes as of 06/04/99</A><ul>
+   <li> version 1.0.1d
+   <li> Restarted version numbering to match specification revisions for
+      TPC-H and TPC-R
+   <li> Corrected answer set for Q13
+   <li> Corrected parameter substitutions for Q16, Q17, Q19, Q20, Q21, Q22
+   <li> Corrected RNG initialization in qgen.c
+   <li> added adhoc.c adhoc.h to code base to support randomized data sets;
+      currently disabled
+   <li> replaced calls to UnifInt() row_stop with call to NthElement()
+   <li> Corrected a problem that caused small negative money values to 
+		print as a positive value
+   <li> Simplification of PR_xxx macros
+   <li> QGEN building correct parameter logs again
+	</ul>
+
+</ul>
+<p>
+<B>
+******************<br>
+* NOTE NOTE NOTE *<br>
+******************<br>
+Below this line the file refers to TPC-D which was retired in favor of
+TPC-H and TPC-R. Since the new specifications are numbered from 1.0.0
+the program version was reset.<br>
+******************<br>
+* NOTE NOTE NOTE *<br>
+******************<br>
+</B>
+<p>
+<ul>
+
+<li><A NAME="201old">Changes as of 01/05/99</A><ul>
+   <li> version 2.0.1
+   <li> added 1999 to the copyright notice
+   <li> corrected C++ compilation problem
+   <li> sub-select phrasing corrected in Q4, Q21, Q22
+   <li> added support for segmenting update files (contributed by Larry Kemp, HP)
+	</ul>
+<li><A NAME="200old">Changes as of 12/08/98</A><ul>
+   <li> version 2.0.0 
+   <li> Removed permute.h from clean target in makefile.suite
+   </ul>
+<li><A NAME="2008old">Changes as of 11/17/98</A><ul>
+   <li> version 2.0.0 Alpha 8
+   <li> corrected o_custkey overrun bug
+   <li> removed upper bound on -C command option
+   <li> added static permute.h to distribution to match the specification
+   </ul>
+<li><A NAME="2007old">Changes as of 10/23/98</A><ul>
+   <li> version 2.0.0 Alpha 7
+   <li> removed references to DSS_SEED and SEED_TAG
+   <li> minor query template cleanup
+   <li> V2 answer sets added
+   <li> correction to hd_sparse for SF &gt; 300
+   <li> added static declaration to row types in gen_tbl to fix update problem
+   <li> permuted params to Q22
+</ul>
+<li><a name="2006">Changes as of 5/20/98</a><ul>
+   <li>version 2.0.0 Alpha6b
+   <li>removed trailing apostrophe from dists.dss nouns for Tandem loader
+   <li>corrected mk_sparse() problem with alpha6
+   <li>added 64b support for NCR/Metaware
+   <li>corrected generation of parent/child tables in parallel
+   <li>renamed ORDER table to ORDERS table
+   <li>revision of DBGEN synced with revision of 2.0 specification
+   <li>portability changes to process termination provided by John Matzka
+   <li>portability changes for Watcom C provided by Andrew Eisenberg
+   <li>standardized query template format
+   <li>queries now include a consistent header format
+</ul>
+<li>Changes as of 4/28/98<ul>
+   <li>version 2.0.0 Alpha5
+   <li>NO RELEASE OF ALPHA 5 ; skipped to sync spec/DBGEN revision levels</ul>
+<li><a name="2004">Changes as of 7 April 98</a><ul>
+<li>version 2.0.0 Alpha4
+<li>Query template corrections for Q9, Q12, Q15a, Q22
+<li>Parallel generation of parent/child tables fixed
+<li>Minor corrections to dists.dss
+<li>Portability changes for HPUX
+</ul>
+
+<li><a name="2003">Changes as of 3/24/98</a><ul>
+<li>version 2.0.0 Alpha3
+<li>include substitution parameters for Q22
+<li>correct substitution parameters for Q16 under AIX
+<li>include permute.h until unix/NT makefile fix
+<li>correct orderkey generation
+</ul>
+<li><a name="2002">Changes as of 3/20/98</a><ul>
+<li> version 2.0.0 Alpha2
+<li> correct runtime malloc error from bad INIT_HUGE macro
+<li> improve pseudo text distribution in comments
+<li> fix problem with parallelism of data gen
+<li> re-enable generation of parent/child tables
+<li> remove recombination code for parallel flat files
+</ul>
+<li><A NAME="2001old">Changes as of 3/11/98</A><UL>
+<li> version 2.0.0 Alpha1
+<li> removed the TIME table
+<li> removed the need for seed files
+<li> made 1GB the validation database size
+<li> add pseudo text support in comments
+<li> correct character selection in a_rnd()
+<li> correct population of P_NAME
+<li> removed unclaimed variants
+<li> added new queries 18-22, replaced Q13
+</ul><li><A NAME="131old">Changes as of 2/6/98</A><UL>
+<li> version 1.3.1
+<li> Revised 64 bit support to clean up bcd2_bin()and mk_sparse()
+<li> Add 64b support for NT
+</ul><li>Changes as of 12/31/97<ul>
+<li> version 1.3.0
+<li> support for seed generation &gt; 1TB (data gen still to be tested)
+<li> rework of 64b support
+<li> added bcd support for subtraction, comparison, modulo
+<li> added 1998 to the copyright notice
+<li> clarified comments in dists.dss
+<li> corrected substitution problem in Q11
+<li> standardized fopen() error messages with OPEN_CHECK()
+<li> introduced PATH_SEP in config.h to allow changes in path separators
+</ul><li>Changes as of 12/15/96<ul>
+<li> version 1.2.0
+<li> corrected typos in queries 8a, 8c, 8d, 11a, 12F and 14F, 17a
+<li> added variant 15c
+<li> defined MAX_SCALE and MIN_SCALE; issued error messages for SF &gt; 1000
+since implementation is incomplete
+<li> seed file generation can now be resumed with dbgen -R &lt;n&gt; ...
+<li> corrected slight compile bug under Solaris 2.5.1
+<li> documented compile problems under SunOS
+</ul><li><a name="xxx">Changes as of 8/1/96</a><ul>
+<li> version 1.1.0D
+<li> included new variants for queries 8 and 15
+<li> re-introduced answer sets in the source tree
+</ul><li><a name="xxx">Changes as of 5/1/96</a><ul>
+<li> version 1.1.0C
+<li> unified version numbering of DBGEN and QGEN
+<li> updated BUGS list
+<li> removed FAQ from soft appendix; web site will keep the current
+version of the FAQ
+<li> added 1996 to the copyright notice
+<li> corrected bug in PR_DATE macro; NO CHANGE TO DATA SET
+<li> properly initialize param values for cleaner logging
+<li> adjusted output format of Q11 param to allow scaling to 1TB
+<li> corrected typos in variant 14c
+<li> corrected data type for YEAR in variant 8c
+<li> corrected typos in variant 10a
+<li> added variant 8d
+</ul><li><a name="xxx">Changes as of 1/23/96</a><ul>
+<li> qgen version 1.1.0B
+<li> include support for ANSI semantics
+<li> improved patch for seed sensitivity
+</ul><li><a name="xxx">Changes as of 1/23/96</a><ul>
+<li> updated BUGS list
+<li> dbgen version 1.1.0A
+<li> patch to limit BCD2 fields to 12 characters for columnar output
+<li> qgen version 1.1.0A
+<li> patch to fix the "unknown flag" problem
+<li> patch to fix the seed sensitivity problem
+</ul><li><a name="xxx">Changes as of 12/19/95</a><ul>
+<li> updated BUGS list
+<li> dbgen version 1.1.0
+<li> upped default value of MAX_CHILDREN to 1000
+<li> corrected naming of detail tables in incremental load
+<li> corrected range delete output
+<li> forced delete files to truncate existing files
+<li> removed fixed size tables from seed generation
+<li> corrected overflow problem with large scale seed generation
+<li> allow date generation as MM-DD-YY based on config.h #define
+<li> correct truncation problem with columnar output in PR_VSTR()
+<li> added support for Windows NT
+<li> added PLATFORM macro to makefile, removed platform defines from
+config.h
+<li> removed MAX_CHILDREN define from config.h (set to 1000 in dss.h)
+<li> qgen version 1.1.0
+<li> correct SET_OUTPUT macro to TDAT
+<li> use %ld in output for q17; portability
+<li> add support for SQLSERVER database dialect
+<li> add support for SYBASE database dialect
+<li> adjust parameter ranges for Q1, Q3, Q6
+<li> add -T/-t option to usage summary
+<li> added support for Windows NT
+</ul><li><a name="xxx">Changes as of 09/01/95</a><ul>
+<li> qgen version 1.0.1
+<li> formalized version numbering
+<li> -p now generates correct query permutations
+<li> added separate version number for qgen
+<li> corrected Q3 substitution problem
+<li> updated permissible range for Q10
+<li> corrected rowcount_dflt and the MAX row indicator (-1)
+<li> expanded param logging to include all possible parameters
+<li> allowed qgen's -d option to be used at all scale factors
+<li> made parameter substitution permutation-independent
+<li> added qgen support for END_TRAN (-E) and DFLT_NUM (-N)
+<li> correct handling of :n directive
+<li> added more complete explanation of QGEN to README
+<li> rename of random to rndm, for portability
+<li> dbgen version 1.0.1
+<li> formalized version numbering
+<li> inclusion of SF=1 seed file
+<li> correct typo in usage() update example
+<li> patch to driver.c to allow correct updates
+<li> documentation change to README to clarify seed/stage/update
+interaction
+<li> corrected minor glitch in "open failed" error msg in print.c
+<li> added missing line continuation to makefile.suite
+<li> seed files are now based on scale factor and number of generators
+<li> seed files now hold seeds for one "step" of a given build
+<li> clean up of parallel load routines
+<li> inclusion of faster seed generation routines from Susanne Englert
+<li> removed the -E(xisting) option
+<li> assure proper scaling of O_CUSTKEY
+<li> corrected default update percentage
+<li> proper handling of child tables with '-O f'
+<li> removed seed files from the distribution
+<li> modified rpb_routine() to limit contribution of partkey in
+retailprice
+<li> added '-S(tep)' option to allow multi-stage loads
+<li> roll in of 32 bit speed_seed routines from Dick Shelton
+<li> miscellaneous typo corrections in the documentation
+<li> cleanup of usage output
+</ul><li><a name="xxx">Changes as of 05/08/95</a><ul>
+<li> version 1.0
+<li> add Teradata defines to tpcd.h for QGEN
+<li> add :c to query templates for database CONNECT syntax
+<li> add examples of DBGEN and QGEN usage to README
+<li> add -T option to qgen to allow time table usage
+<li> query template names only require .sql suffix, rest is arbitrary
+</ul><li><a name="xxx">Changes as of 03/13/95</a><ul>
+<li> version 9.1
+<li> surround DBNAME with ifndef in config.h
+<li> remove -DDBNAME from makefile.suite
+<li> sync varchar handling with 9.1 draft
+</ul><li><a name="xxx">Changes as of 02/21/95</a><ul>
+<li> version 9.0a
+<li> fixed bug in qgen that incorrectly included rnd.h
+<li> included revised DDL with Changes for char/varchar and l_quantity
+<li> updated DBGEN help message to include new single table options for
+order/lineitem and part/partsupp
+<li> included handling for multi-set seed files TPCDSEED.xxx
+<li> generated seeds up through 400GB; headed to 1TB!
+<li> ANSI lint cleanup; more needed
+<li> UF2 now defaults to key lists; use "-O r" to generate key ranges
+also note, this routine does NOT use the BCD2_*
+routines. As a result, it WILL fail if the keys being deleted
+exceed 32 bits. Since this would require ~660 update iterations,
+this seems an acceptable oversight
+</ul><li><a name="xxx">Changes as of 01/19/95</a><ul>
+<li> version 9.0
+<li> allowed command line seeding of RNG for QGEN
+<li> order and number of params in QGEN now matches
+presentation in spec
+<li> fixed bug in time table format of O_ORDERDATE
+<li> changed l_QUANTITY to FLOAT in dss.ddl
+<li> reworked QGEN options to be more useful
+<li> allowed creation of sparse keys beyond 32 bits (for 1TB)
+<li> removed unused '#ifdef' and associated code
+<li> allowed independent generation of master/detail tables
+(eg, order/lineitem)
+</ul><li><a name="xxx">Changes as of 12/06/94</a><ul>
+<li> version 8.6
+<li> fixed renaming of flat files for child tables
+<li> various documentation fixes
+<li> added naming convention section to Porting.Notes
+<li> added -DIBM flag to config.h
+<li> synced up QGEN with draft 8.1
+</ul><li><a name="xxx">Changes as of 10/25/94</a><ul>
+<li> version 8.5a
+<li> corrected bug in columnar output of pr_supp
+<li> added pr_drange to generate a list of order keys to be
+deleted instead of generating SQL
+<li> added '-O d' to generate range delete as SQL
+<li> updated default values for QGEN to sync with spec 8.1
+<li> corrected MK_SPARSE to reflect groups of 8
+<li> corrected a bug in o_orderstatus
+<li> regenerated seed files for SF in [1,10]
+<li> ANSI cleanup (primarily function declarations)
+</ul><li><a name="xxx">Changes as of 10/11/94</a><ul>
+<li> version 8.5
+<li> remove deletes/inserts to other than order/lineitem
+<li> increased cardinality for part.type part.container
+<li> '-r' argument is now integer; percentage in basis points
+<li> initial roll-in of new update scheme
+<li> added BBB comments to supplier table
+</ul><li><a name="xxx">Changes as of 9/27/94</a><ul>
+<li> version 8.4
+<li> all money calculations now use integer math. This should
+bring everyone's data sets into exact agreement.
+</ul><li><a name="xxx">Changes as of 9/21/94</a><ul>
+<li> version 8.3b
+<li> fixed handling of MAX_STREAM
+<li> added floor function to RPRICE bridge
+<li> misc lint cleanup (type fixes, new prototypes, etc.)
+<li> MONEY format becomes lf for DOS
+<li> further cleanup of PR_VSTR and its length argument
+<li> change to parameter generation for Q6 to allow for float
+discount
+</ul><li><a name="xxx">Changes as of 9/15/94</a><ul>
+<li> version 8.3a
+<li> isolated MONEY format for Unisys (Lf) using DOS
+<li> make sure all arguments to MAKE_MONEY were double's
+<li> rolled in NEW_PTEXT to allow Berni to experiment
+</ul><li><a name="xxx">Changes as of 9/12/94</a><ul>
+<li> version 8.3
+<li> added -T n and -T r to usage to match getopt() and README
+<li> changed PR_MONEY to remove leading blanks
+<li> included revised DDL from Berni
+<li> included some MVS portability fixes in re malloc.h
+<li> cleaned up error messages in qgen and made #define ofp usage
+universal
+<li> additional DOS portability changes
+<li> added {c,a}len to provide specific length for columnar
+output of varchar
+<li> added PR_VSTR to handle varchar printing under MVS
+<li> fixed bit masking in a_rnd and cleaned up prototype match
+with V_STR
+<li> PR_MONEY now used %Lf
+<li> added revised pseudo text under NEW_PTEXT ifdef for
+experiments
+</ul><li><a name="xxx">Changes as of 9/09/94</a><ul>
+<li> version 8.2
+<li> l_discount and l_tax are now fractional (per teleconference)
+<li> money calculations moved to scaled integer math to clean up
+answer sets
+<li> changed PR_FLT() to PR_MONEY to clarify usage
+<li> portability changes for SYBASE: dbname -&gt; db_name
+STATUS -&gt; DBGEN_STATUS
+<li> added nations2 to dists.dss to handle qgen needs for now
+<li> reintroduced #ifndef DOS
+<li> reintroduced U2200 define to control kill_load()
+<li> broke out nation and region separately in -T option
+<li> updated dss.ddl based on mail from Berni
+</ul><li><a name="xxx">Changes as of 8/31/94</a><ul>
+<li> version 8.1
+<li> scaling for clerks needed to be 1000 (was 100)
+<li> added qgen parameter for scale
+<li> changed qgen parameter from s)tream to p)ermutation
+<li> synced qgen parameter values with 8.0 spec
+<li> corrected duplications in dists.dss
+</ul><li><a name="xxx">Changes as of 8/24/94</a><ul>
+<li> version 8.0
+<li> added sparse keys to lineitem/order
+<li> added varchar generation for comments/addresses
+<li> added variable lineitems/orders
+<li> removed ifdef for normalized code_tables
+<li> included code for parameter generation and template-&gt;EQT
+routines
+<li> updated README and Porting.Notes to reflect QGEN
+<li> included DDL and RI examples from Berni
+</ul><li><a name="xxx">Changes as of 6/15/94</a><ul>
+<li> version 7.0b (numbers now match spec revision)
+<li> rework of code tables to properly map nation/region; when
+compiled with -DCODE_TABLES distributions are taken from
+code.dss and two additional fields are generated for
+customers and suppliers, [cs]_ncode and [cs]_rcode,
+immediately following [cs]_region
+<li> replaced ifdef's around DEAD_DATA with opposites. DEAD_DATA
+is now the default
+<li> worked through code to see that it conformed to 7.0
+specification
+<li> adjusted scale factors/rowcounts for 1 GB == sf1
+<li> brought help message in line with current code
+<li> fixed order per customer at 10
+<li> make suppkey scalable in lineitem/partsupp
+</ul><li><a name="xxx">Changes as of 4/25/94</a><ul>
+<li> version 1.5
+<li> added the customers with no orders; Compile with -DDEAD_DATA
+to activate the change.
+<li> added the code table for nation and region;
+Compile with -DCODE_TABLES to activate the change.
+</ul><li><a name="xxx">Changes as of 3/17/94</a><ul>
+<li> version 1.41
+<li> completed implementation of JULIAN_DAY after talks with Berni
+<li> misc cleanup in usage/README files
+<li> removed all tabs and capped line length at 75
+<li> added -n option to allow naming of inline-loaded database
+</ul><li><a name="xxx">Changes as of 3/16/94</a><ul>
+<li> version 1.4
+<li> prototyped julian day/month for query re-write work. Compile
+with -DJULIAN_DAY to enable
+<li> removed gen_times() from driver.c
+<li> added VMS ifdef to config.h to clean up fork/signal issues
+<li> added ICL ifdef to config.h to clean up getopt() issues
+<li> changed header file references to config.h from machine.h
+</ul><li><a name="xxx">Changes as of 3/2/94</a><ul>
+<li> version 1.31
+<li> corrected format of C_NAME to match S_NAME and O_CLERK
+<li> re-allowed fractional scale factors &lt; 1 (updates not
+contiguous)
+<li> added DSS_CONFIG environment variable
+<li> reworked read_dist() to look for DSS_DIST in DSS_CONFIG
+<li> updated the README file
+</ul><li><a name="xxx">Changes as of 2/16/94</a><ul>
+<li> version 1.3
+<li> added command line options for parallel load and data set
+expansion
+<li> changed dists.dss delimiter to | for portability
+<li> limited scale factors to integer values
+<li> added command line option for seed file generation
+<li> added all seed files to distribution for SFs 1 - 10
+<li> moved machine.h to config.h and added MAX_CHILDREN define
+<li> added 'f' flag to options to allow renaming of output files
+<li> added generation of SQL delete statements to match updates
+(Note: updates are still single-threaded; -C is cleared
+by -U)
+<li> corrected field sizing in dsstypes.h typedefs to match v 6.4
+<li> update percentage default set to 1%
+</ul><li><a name="xxx">Changes as of 12/3/93</a><ul>
+<li> version 1.2
+<li> added command line option to adjust update percentage
+<li> fixed update generation for proper primary key ordering
+<li> renamed UUSR/PRC to RUSSIA/CHINA in dists.dss
+<li> cleaned up phone number generation to be consistent regard-
+less of order of evaluation
+<li> adjusted size of lineitem comment to bring data in line with
+100 MB == SF=1
+</ul><li><a name="xxx">Changes as of 10/15/93</a><ul>
+<li> added command line option for update data creation
+<li> miscellaneous porting and cleanup changes
+<li> reworked table generation to allow reuse for updates
+<li> added comment field to tdefs structure
+<li> added load_state and store_state to sync data gen and
+update gen
+</ul><li><a name="xxx">Changes as of 7/26/93</a><ul>
+<li> combined loader and header stubs in load_stubs.c
+<li> separated Revision History (this file) from README
+<li> simplified makefile
+<li> removed redundancies from colors distribution
+<li> added getopt() for portability
+<li> created Porting.Notes
+<li> adjusted scaling rules
+<li> added help option to the command line
+</ul><li><a name="xxx">Changes as of 2/26/93</a><ul>
+<li> combined all typedefs in one header: dsstypes.h
+<li> combined flat file generation in print.ec
+<li> combined typedef population in build.ec
+<li> added -P to control rowcnt scaling (P for percentage)
+<li> added -D option for Direct data generation and added
+appropriate hooks in tdefs[] structure
+<li> added -F option for flat file generation
+<li> reused -T option (use -P 0.1 to build test size database)
+now accepts suboptions c,o,p,s for single table builds.
+<li> dropped -M option (scaling is now by rowcount)
+<li> added -O option for optional controls. Currently defined:
+-O t <li> generate optional time table a join fields in
+order/lineitem
+-O h <li> generate headers for flat file output
+-O m <li> generate fixed column-length output
+<li> removed dynamic memory allocation, redundant calls to
+UnifInt, etc to improve performance
+</ul><li><a name="xxx">Changes as of 1/12/92</a><ul>
+<li> julian() changed to handle orders -&gt; orderdate correctly
+<li> rflag distributions corrected in dists.dss
+<li> sea, gold removed from color distribution to clean up substring
+problems
+<li> part-&gt; number and supplier-&gt; adjusted for 1-based indexing
+<li> time-&gt; day changed to be day of month, not day of year
+<li> t.week changed to be week in year, not day of week
+</ul><li><a name="xxx">Changes as of 11/18/92</a><ul>
+<li> checked line length and tab for transmission
+<li> another chapter in the portability wars. added #include
+"machine.h" to dss.h (which is included by everyone else). Any
+machine particular porting changes should go here.
+<li> fixed fixed-field formats to prevent double printing
+<li> expanded PR_FLT formats to %010.2
+</ul><li><a name="xxx">Changes as of 10/21/92</a><ul>
+<li> added fixed format and column header handling; users of headers
+will have to define the header functions to be called in
+int (*tdefs.header)()
+</ul><li><a name="xxx">Changes as of 10/09/92:</a><ul>
+<li> added ansi prototypes and recompiled with gcc -ansi. users may
+need to change the CC definition in the makefile and the contents
+of CFLAGS to reflect their particular ansi compiler.
+<li> replaced all int references with long
+<li> replaced all float references with double
+<li> found and fixed odate/julian problem TS mentioned in 10/09 phone
+call
+
+</ul><li><a name="xxx">Changes as of 9/09/92:</a><ul>
+<li> Park/Miller random number generator included
+<li> clerk scaling changed to 100 * scale
+<li> parts.name always built from 5 selections from colors set
+<li> test scaling changed to ~60MB (TEST_SCALING == 10)
+<li> logarithmic scaling removed
+<li> mfgcost removed and retail/supplier cost bounds adjusted
+<li> agg_str memory leak fixed
+<li> independent RNG streams on a per column basis
+</ul>
+</ul>
+This is the revised data generator for DSS.
+
+The rewrite tried to accomplish four things:
+<ol>
+<li>identify and isolate
+all the implicit assumptions about limits, bounds, ranges, distributions, etc.;
+<li>standardize the way any given table was generated/
+printed to ease understanding and maintenance;
+<li>bring the generator
+in line with the current work of the committee and the excellent spec
+that Indira put together;
+<li> provide an easy way to adjust distributions, string contents and to facilitate experimentation to get a
+better idea of the impact of data population changes.
+</ol><p>
+
+The files included are:<p>
+<dl>
+<dt>driver.c
+<dd>main and the calling routines for the generator
+<dt>dist.c
+<dd>should really be named dss_util.c; misc routines
+<dt>customer.c     <dd> generation and print routines for customer table
+<dt>orders.c    <dd>        ""             ""      order table
+<dt>parts.c        <dd>            ""             ""      parts/partsupp
+<dt>suppliers.c    <dd>            ""             ""      suppliers table
+<dt>time.c         <dd>            ""             ""      time table
+<dt>customer.h     <dd> associate header files; contain structure
+definitions
+<dt>dss.h                  <dd>dss.h holds the large number of assumptions and
+<dt>orders.h               <dd>values that have been used as IFDEFs.
+<dt>parts.h  <dd>
+<dt>suppliers.h<dd>
+<dt>time.h   <dd>
+<dt>dists.dss   <dd> string selections and weights; used to build
+distributions
+
+</dl>
+<p>
+Running make will create an executable (using the compiler flags in
+CFLAGS, the ld flags in LDFLAGS and the libraries in LIBS [-O, -s,
+and -lm by default]) which will create flat files suitable for dbload.
+
+
+
+</BODY>
+
+</HTML>
diff --git a/data/ssb/dbgen/load_stub.c b/data/ssb/dbgen/load_stub.c
new file mode 100644
index 0000000..e3339b5
--- /dev/null
+++ b/data/ssb/dbgen/load_stub.c
@@ -0,0 +1,281 @@
+/*****************************************************************
+ *  Title:      load_stub.c
+ *  Sccsid:     @(#)load_stub.c	2.1.8.1
+ *  Description:
+ *              stub routines for:
+ *          inline load of dss benchmark
+ *          header creation for dss benchmark
+ *
+ *****************************************************************
+ */
+
+#include <stdio.h>
+#include "config.h"
+#include "dss.h"
+#include "dsstypes.h"
+
+int 
+close_direct(void)
+{
+    /* Stub hook: no post-load cleanup is implemented; add it here if needed. */
+    return(0);   /* unconditionally 0 */
+}
+
+int 
+prep_direct(void)
+{
+    /* Stub hook: no pre-load preparation is implemented; add it here if needed. */
+    return(0);   /* unconditionally 0 */
+}
+
+int 
+hd_cust (FILE *f)
+{
+    static int count = 0;   /* call counter; static, so it keeps its value between calls */
+    /* Customer header stub: f is never used. The notice goes to stdout on the first call only. */
+    if (! count++)
+        printf("No header has been defined for the customer table\n");
+
+    return(0);   /* always 0, whether or not the notice was printed */
+}
+
+int 
+ld_cust (customer_t *cp, int mode)
+{
+    static int count = 0;   /* call counter; static, so it keeps its value between calls */
+    /* Customer load stub: cp and mode are never used. The notice goes to stdout on the first call only. */
+    if (! count++)
+        printf("%s %s\n",
+            "No load routine has been defined",
+            "for the customer table");
+
+    return(0);   /* always 0 */
+}
+
+int 
+hd_part (FILE *f)
+{
+    static int count = 0;   /* call counter; static, so it keeps its value between calls */
+    /* Part header stub: f is never used. The notice goes to stdout on the first call only. */
+    if (! count++)
+        printf("No header has been defined for the part table\n");
+
+    return(0);   /* always 0 */
+}
+
+int 
+ld_part (part_t *pp, int mode)
+{
+    static int count = 0;   /* call counter; static, so it keeps its value between calls */
+    /* Part load stub: pp and mode are never used. The notice goes to stdout on the first call only. */
+    if (! count++)
+        printf("No load routine has been defined for the part table\n");
+
+    return(0);   /* always 0 */
+}
+
+int 
+ld_psupp (part_t *pp, int mode)
+{
+    static int count = 0;   /* call counter; static, so it keeps its value between calls */
+    /* Psupp load stub: pp and mode are never used. The notice goes to stdout on the first call only. */
+    if (! count++)
+        printf("%s %s\n",
+            "No load routine has been defined for the",
+            "psupp table");   /* the format supplies the newline; a second one here printed a blank line */
+
+    return(0);   /* always 0 */
+
+}
+
+
+int 
+hd_supp (FILE *f)
+{
+    static int count = 0;   /* call counter; static, so it keeps its value between calls */
+    /* Supplier header stub: f is never used. The notice goes to stdout on the first call only. */
+    if (! count++)
+        printf("No header has been defined for the supplier table\n");
+
+    return(0);   /* always 0 */
+}
+
+int 
+ld_supp (supplier_t *sp, int mode)
+{
+    static int count = 0;   /* call counter; static, so it keeps its value between calls */
+    /* Supplier load stub: sp and mode are never used. The notice goes to stdout on the first call only. */
+    if (! count++)
+        printf("%s %s\n",
+            "No load routine has been defined",
+            "for the supplier table");   /* the format supplies the newline; a second one here printed a blank line */
+
+    return(0);   /* always 0 */
+}
+
+
+int 
+hd_order (FILE *f)
+{
+    static int count = 0;   /* call counter; static, so it keeps its value between calls */
+    /* Order header stub: f is never used. The notice goes to stdout on the first call only. */
+    if (! count++)
+        printf("No header has been defined for the order table\n");
+
+    return(0);   /* always 0 */
+}
+
+int 
+ld_order (order_t *p, int mode)
+{
+    static int count = 0;   /* call counter; static, so it keeps its value between calls */
+    /* Order load stub: p and mode are never used. The notice goes to stdout on the first call only. */
+    if (! count++)
+        printf("%s %s\n",
+            "No load routine has been defined",
+            "for the order table");
+
+    return(0);   /* always 0 */
+}
+
+int ld_line (order_t *p, int mode)   /* explicit int: implicit-int declarations were removed in C99 */
+{
+    static int count = 0;   /* call counter; static, so it keeps its value between calls */
+    /* Lineitem load stub: p and mode are never used. The notice goes to stdout on the first call only. */
+    if (! count++)
+        printf("%s %s\n",
+            "No load routine has been defined",
+            "for the line table");
+
+    return(0);   /* always 0 */
+}
+
+
+
+int 
+hd_psupp (FILE *f)
+{
+    static int count = 0;   /* call counter; static, so it keeps its value between calls */
+    /* Part-supplier header stub: f is never used. The notice goes to stdout on the first call only. */
+    if (! count++)
+        printf("%s %s\n",
+            "No header has been defined for the",
+            "part supplier table");
+
+    return(0);   /* always 0 */
+}
+
+
+int 
+hd_line (FILE *f)
+{
+    static int count = 0;   /* call counter; static, so it keeps its value between calls */
+    /* Lineitem header stub: f is never used. The notice goes to stdout on the first call only. */
+    if (! count++)
+        printf("No header has been defined for the lineitem table\n");
+
+    return(0);   /* always 0 */
+}
+
+int 
+hd_nation (FILE *f)
+{
+    static int count = 0;   /* call counter; static, so it keeps its value between calls */
+    /* Nation header stub: f is never used. The notice goes to stdout on the first call only. */
+    if (! count++)
+        printf("No header has been defined for the nation table\n");
+
+    return(0);   /* always 0 */
+}
+
+#ifdef SSBM
+#else
+int 
+ld_nation (code_t *cp, int mode)
+{
+    static int count = 0;   /* call counter; static, so it keeps its value between calls */
+    /* Nation load stub (compiled only when SSBM is not defined): cp and mode are never used; notice printed once. */
+    if (! count++)
+        printf("%s %s\n",
+            "No load routine has been defined",
+            "for the nation table");
+
+    return(0);   /* always 0 */
+}
+
+int 
+hd_region (FILE *f)
+{
+    static int count = 0;   /* call counter; static, so it keeps its value between calls */
+    /* Region header stub (compiled only when SSBM is not defined): f is never used; notice printed once. */
+    if (! count++)
+        printf("No header has been defined for the region table\n");
+
+    return(0);   /* always 0 */
+}
+
+int 
+ld_region (code_t *cp, int mode)
+{
+    static int count = 0;   /* call counter; static, so it keeps its value between calls */
+    /* Region load stub (compiled only when SSBM is not defined): cp and mode are never used; notice printed once. */
+    if (! count++)
+        printf("%s %s\n",
+            "No load routine has been defined",
+            "for the region table");
+
+    return(0);   /* always 0 */
+}
+
+int 
+ld_order_line (order_t *p, int mode)
+{
+    ld_order(p, mode);   /* order loader first ... */
+    ld_line (p, mode);   /* ... then the lineitem loader, given the same order_t */
+    /* Compiled only when SSBM is not defined. Both return values are discarded. */
+    return(0);   /* always 0 */
+}
+
+int 
+hd_order_line (FILE *f)
+{
+    hd_order(f);   /* order header first ... */
+    hd_line (f);   /* ... then the lineitem header, on the same FILE */
+    /* Compiled only when SSBM is not defined. Both return values are discarded. */
+    return(0);   /* always 0 */
+}
+
+int 
+ld_part_psupp (part_t *p, int mode)
+{
+    ld_part(p, mode);     /* part loader first ... */
+    ld_psupp (p, mode);   /* ... then the part-supplier loader, given the same part_t */
+    /* Compiled only when SSBM is not defined. Both return values are discarded. */
+    return(0);   /* always 0 */
+}
+
+int 
+hd_part_psupp (FILE *f)
+{
+    hd_part(f);    /* part header first ... */
+    hd_psupp(f);   /* ... then the part-supplier header, on the same FILE */
+    /* Compiled only when SSBM is not defined. Both return values are discarded. */
+    return(0);   /* always 0 */
+}
+#endif
+
+#ifdef SSBM
+int
+ld_date (date_t *d, int mode)
+{
+    /* Date load stub (compiled only when SSBM is defined): d and mode are unused and nothing is printed. */
+    return(0);   /* always 0 */
+}
+
+#endif
+
+
+
+
+
+
diff --git a/data/ssb/dbgen/load_stub.o b/data/ssb/dbgen/load_stub.o
new file mode 100644
index 0000000000000000000000000000000000000000..a30638600e6732f5376411c496494e6482852588
GIT binary patch
literal 6680
zcmds*e~4676vywVyRK&IW|pR=uhLv8hnaPCS0P*3%E8QEu_P(&&d$7THg{*HnK!ap
znPpLF3mZ`o2Kk49^iTaI1^rmcKZHUe41x%X{wXjpBMJ%ZeD6DF_Uv_Dt0n{;c=z1<
zdFPyW?tAy$cV?cxcjJANVlg8UGi%M*NT@N}Zyf7)*lvfJX&TLMGZ&tJALGx)_YTzo
z4sVIQbOSoij|TOjS?a)<hScy~fIp$JG<D!agAFi&+qLjW4cm~Z;guMDf^088(iJ9f
zg>L_O-v6TnuHICxUZg8r7Ex~Dn}XGCHWj>_pU(OvFPAQRy}qCKvVLEG-p_h{g_0NK
zd@oZe2Zceh2-3X+zV)(x1BG<fD-|jMSzqMMW|tDbm@Wm?*p$yd#;s;G&THa)#P2fl
zDp!idf&Po<J<<ItlyE!wW=(v5!;B`j^9JZX_Xqx<gtm^C;FnJa{e?VxMLg6cWp7D&
zx+!ALq2+WVrsMC!7}xU@Tja=BvJYsyuQBU8JJ)#4UA>ijQ1Oz<cxybdJXx{B<i3`c
zc%pR~V}4LmNH@`e?SB2HE;GrPQLxE$KxT)^#P)78v7x5e{25alMrg~1&jLO-oi$j(
zVer3)SsQe^v8k)E!N%Rmd;{|w^MJW)|8fLB8WHEi2>a6!_GcpOr|<{W&2vEnZ;jv^
znY;7KM(`(@%l+l_YATpcF~44@b#ijeiU7|TPkOvyPmK|m=O`8^AHeTqbK#4aH#;0-
z*e`ZC;(~h)Z)YxRO5%4gpX=DCn9KSU`z_3y9Q*CeXE?lvd4t1q%q6bqB6BvUnS?TA
zl`Wd&l+{n!wtgJL;8{!R4Cbxn1v{uWb2OxWxc2KzkHcTF<2p0q@Yn6Q-Yj(ZVYWZ%
z@FUFsbokqLTyO4h_&aQ$;590z-(&ku0j!TRf5_qCJ6dlZbNHurO!IZ(hk4hT7ae|z
z?MEGscYPMEI1zc5eqb(R(Z4X4b!FdGV!zWNYl^y6IZr*rzn{ql)|2Rxwd`jShy818
zC-LVn|H9#mm`^4Rk@(HbA7CzV@H`OzImf=8`6<VK9rHTc_;vgZ%r`rH8}m0DzJvLX
z%q5=;^LhM15kAO#D|5O32GvP-IjHo;GbU4@?^rz9BAujk65LtMx}9|!>s72*vTkL)
zf^{__j}s=ovC}q5^jgrYK+}q5C7M-e+R(J4S&b$cx>z5|62y8Xty|LiB`KGILfP-m
z_LuxjV2UNbDE(Zv8^xa~R)Vq_;6ZnHvDBXr`nof@U4|_1oyW<l#y1})tfmt(w)Fxm
zs~uDeHiVc62ul{&X48Q!Ip>9sAK!ob?U#N2wg@i!7VNL1<F5;$*pisW*cX>5zLokA
ziQgO!&BLr+UQm1%^(Fpp#jjU9+-Jl1cz4A<+)u-NmMR|ZgJbzD=D0^$OFqIsP#iyB
zCH}|E-T8i{?6Jf>9)BCb&nk{@t;EN^199U|qxT0QdCGTlRs>(F_#E<+_;JPOD!x*2
zeLwGJj(qg}d{EhAzK8|c8NnY{{3f!Ke0Hn&di@+!_J}Y0^Q#g3nBrL568~Mr_4+xk
zxSsDQb9cU{l|Ay2z5crheopcEDxW`8d_CW5@L);g`AXd~JAyA$9BV=5n^0WOw@q<9
z-*wF0`F1IL%(nr_kvtN?%ZlGjc9PFt#dSW<D6aE4#N5s2EoF~<a2|3jf`6nqY9h(!
zgyN`Wg@3NN&gX09ZazONd*mZ>E`m>GBb@f=kn7H1j`(^#c*-7Kv0oO!*D8LCvhQSm
zQGBw2ARkWIj$3Uz#^pPmayaf0q{p^n_EHxOIb7<W5r<3tbIjpV|BO0Z>Yr~MF7=Pz
zS1~WCe;U;L2rl)H=WwZiP(MNB{z(0^#o<!_<Qy*b&whtX{bS<gJ%jY02I-&_>YVgS
z{y;h&_(Oq-)8{&E;=SdviI)l(z&shJl0*-wAjuR42mL&G<O_ixr(gEV%RxG`i^q@V
zEAe8fQ1nZ|9&$x|KiAz?N)P(sH2-z$Q3OoY{pHg^46=jQVK@qHr4Nlg5^J=B4lSuh
z9&upoi@1*LNAlgdow;)vbJztOjvot;jQyR9n6CMo4CK8K2jr?HNdv@e(UAH_*#E3^
zLCIfm2iey0AE^ri?h9456+h9N**{UW2s@2@-=w~i)8Z$3JNwId(QDiX$|uB6>~hqZ
zN(bH-EgkIdX-IvUzu^7UK}^@bhy6#K3%dSKkZ;ZP!yDCp_Q(3co+kdXerjEt%*<I#
uisXBK;P}7fRyth&X~f*ML&$uI3zAfv{GFk#scZS4MDa;-FjUpn_5T~2kQ2!O

literal 0
HcmV?d00001

diff --git a/data/ssb/dbgen/makefile b/data/ssb/dbgen/makefile
new file mode 100644
index 0000000..86cb0fe
--- /dev/null
+++ b/data/ssb/dbgen/makefile
@@ -0,0 +1,127 @@
+# @(#)makefile.suite	2.1.8.1
+################
+## CHANGE NAME OF ANSI COMPILER HERE
+################
+CC      = gcc
+# Current values for DATABASE are: INFORMIX, DB2, TDAT (Teradata)
+#                                  SQLSERVER, SYBASE
+# Current values for MACHINE are:  ATT, DOS, HP, IBM, ICL, MVS, 
+#                                  SGI, SUN, U2200, VMS, LINUX
+# Current values for WORKLOAD are:  SSBM, TPCH, TPCR
+DATABASE=DB2 
+MACHINE =LINUX 
+WORKLOAD =SSBM 
+#
+# add -EDTERABYTE if orderkey will exceed 32 bits (SF >= 300)
+# and make the appropriate change in gen_schema() of runit.sh
+CFLAGS	= -DDBNAME=\"dss\" -D$(MACHINE) -D$(DATABASE) -D$(WORKLOAD)
+LDFLAGS = 
+# The OBJ,EXE and LIB macros will need to be changed for compilation under
+#  Windows NT
+OBJ     = .o
+EXE     =
+LIBS    = -lm
+#
+# NO CHANGES SHOULD BE NECESSARY BELOW THIS LINE
+###############
+TREE_ROOT=/tmp/tree
+#
+PROG1 = dbgen$(EXE)
+PROG2 = qgen$(EXE)
+PROGS = $(PROG1) $(PROG2)
+#
+HDR1 = dss.h rnd.h config.h dsstypes.h shared.h bcd2.h
+HDR2 = tpcd.h permute.h
+HDR  = $(HDR1) $(HDR2)
+#
+SRC1 = build.c driver.c bm_utils.c rnd.c print.c load_stub.c bcd2.c \
+	speed_seed.c text.c permute.c
+SRC2 = qgen.c varsub.c 
+SRC  = $(SRC1) $(SRC2)
+#
+OBJ1 = build$(OBJ) driver$(OBJ) bm_utils$(OBJ) rnd$(OBJ) print$(OBJ) \
+	load_stub$(OBJ) bcd2$(OBJ) speed_seed$(OBJ) text$(OBJ) permute$(OBJ)
+OBJ2 = build$(OBJ) bm_utils$(OBJ) qgen$(OBJ) rnd$(OBJ) varsub$(OBJ) \
+	text$(OBJ) bcd2$(OBJ) permute$(OBJ) speed_seed$(OBJ)
+OBJS = $(OBJ1) $(OBJ2)
+#
+SETS = dists.dss 
+DOC=README HISTORY PORTING.NOTES BUGS
+DDL  = dss.ddl dss.ri
+OTHER=makefile.suite $(SETS) $(DDL) 
+# case is *important* in TEST_RES
+TEST_RES = O.res L.res c.res s.res P.res S.res n.res r.res
+#
+DBGENSRC=$(SRC1) $(HDR1) $(OTHER) $(DOC) $(SRC2) $(HDR2) $(SRC3)
+QD=1.sql 2.sql 3.sql 4.sql 5.sql 6.sql 7.sql 8.sql 9.sql 10.sql \
+	11.sql 12.sql 13.sql 14.sql 15.sql 16.sql 17.sql 18.sql \
+	19.sql 20.sql 21.sql 22.sql
+VARIANTS= 8a.sql 12a.sql 13a.sql 14a.sql 15a.sql 
+ANS   = 1.ans 2.ans 3.ans 4.ans 5.ans 6.ans 7.ans 8.ans 9.ans 10.ans 11.ans \
+	12.ans 13.ans 14.ans 15.ans 16.ans 17.ans 18.ans 19.ans 20.ans \
+	21.ans 22.ans
+QSRC  = $(FQD) $(VARIANTS)
+ALLSRC=$(DBGENSRC) 
+TREE_DOC=tree.readme tree.changes appendix.readme appendix.version answers.readme queries.readme variants.readme
+JUNK  = 
+#
+all: $(PROGS)
+$(PROG1): $(OBJ1) $(SETS) 
+	$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $(OBJ1) $(LIBS)
+$(PROG2): permute.h $(OBJ2) 
+	$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $(OBJ2) $(LIBS)
+clean:
+	rm -f $(PROGS) $(OBJS) $(JUNK)
+lint:
+	lint $(CFLAGS) -u -x -wO -Ma -p $(SRC1)
+	lint $(CFLAGS) -u -x -wO -Ma -p $(SRC2)
+
+tar: $(DBGENSRC) 
+	tar cvhf $(PROG1).tar $(DBGENSRC) 
+dbgenshar: $(DBGENSRC)
+	shar -o dbgen.shar $(DBGENSRC)
+zip: $(DBGENSRC)
+	zip dbgen $(DBGENSRC)
+tree: $(DBGENSRC) $(FQD) $(VARIANTS) $(TREE_DOC) $(ANS)
+	rm -rf $(TREE_ROOT)
+	mkdir $(TREE_ROOT) 
+	mkdir $(TREE_ROOT)/appendix 
+	mkdir $(TREE_ROOT)/appendix/queries 
+	mkdir $(TREE_ROOT)/appendix/variants 
+	mkdir $(TREE_ROOT)/appendix/dbgen 
+	mkdir $(TREE_ROOT)/appendix/answers 
+	cp tree.readme $(TREE_ROOT)/README
+	cp appendix.readme $(TREE_ROOT)/appendix/README
+	cp answers.readme $(TREE_ROOT)/appendix/answers/README
+	cp queries.readme $(TREE_ROOT)/appendix/queries/README
+	cp variants.readme $(TREE_ROOT)/appendix/variants/README
+	cp tree.changes $(TREE_ROOT)/CHANGES
+	cp appendix.version $(TREE_ROOT)/appendix/VERSION
+	cp $(FQD) $(TREE_ROOT)/appendix/queries
+	cp $(VARIANTS) $(TREE_ROOT)/appendix/variants
+	cp $(DBGENSRC) $(TREE_ROOT)/appendix/dbgen
+	cp $(ANS) $(TREE_ROOT)/appendix/answers
+	(cd $(TREE_ROOT); tar chf - .) |compress > tree.tar.Z
+	(cd $(TREE_ROOT); zip -r  - . )  > tree.zip
+	date > tree.update
+portable:
+	@ for f in $(SRC) $(HDR) ; \
+	do  \
+	expand $$f > /tmp/$$f; \
+	awk 'length > 72 { print FILENAME ":" NR " too long " }' /tmp/$$f ; \
+        rm /tmp/$$f ; \
+	done
+release:
+	@chkout $(SRC) $(HDR)
+	@ for f in $(SRC) $(HDR) ; \
+		do \
+		expand $$f > /tmp/$$f ; \
+		mv /tmp/$$f $$f ; \
+		done
+	@chkin $(SRC) $(HDR)
+
+rnd$(OBJ): rnd.h
+$(OBJ1): $(HDR1)
+$(OBJ2): dss.h tpcd.h config.h
+$(QSRC) $(ALLSRC): 
+	get -r`cat .version` ./SCCS/s.$@
diff --git a/data/ssb/dbgen/makefile.suite b/data/ssb/dbgen/makefile.suite
new file mode 100644
index 0000000..5ab13d1
--- /dev/null
+++ b/data/ssb/dbgen/makefile.suite
@@ -0,0 +1,127 @@
+# @(#)makefile.suite	2.1.8.1
+################
+## CHANGE NAME OF ANSI COMPILER HERE
+################
+CC      = 
+# Current values for DATABASE are: INFORMIX, DB2, TDAT (Teradata)
+#                                  SQLSERVER, SYBASE
+# Current values for MACHINE are:  ATT, DOS, HP, IBM, ICL, MVS, 
+#                                  SGI, SUN, U2200, VMS, LINUX
+# Current values for WORKLOAD are:  SSBM, TPCH, TPCR
+DATABASE= 
+MACHINE = 
+WORKLOAD = 
+#
+# add -EDTERABYTE if orderkey will exceed 32 bits (SF >= 300)
+# and make the appropriate change in gen_schema() of runit.sh
+CFLAGS	= -O -DDBNAME=\"dss\" -D$(MACHINE) -D$(DATABASE) -D$(WORKLOAD)
+LDFLAGS = -O
+# The OBJ,EXE and LIB macros will need to be changed for compilation under
+#  Windows NT
+OBJ     = .o
+EXE     =
+LIBS    = -lm
+#
+# NO CHANGES SHOULD BE NECESSARY BELOW THIS LINE
+###############
+TREE_ROOT=/tmp/tree
+#
+PROG1 = dbgen$(EXE)
+PROG2 = qgen$(EXE)
+PROGS = $(PROG1) $(PROG2)
+#
+HDR1 = dss.h rnd.h config.h dsstypes.h shared.h bcd2.h
+HDR2 = tpcd.h permute.h
+HDR  = $(HDR1) $(HDR2)
+#
+SRC1 = build.c driver.c bm_utils.c rnd.c print.c load_stub.c bcd2.c \
+	speed_seed.c text.c permute.c
+SRC2 = qgen.c varsub.c 
+SRC  = $(SRC1) $(SRC2)
+#
+OBJ1 = build$(OBJ) driver$(OBJ) bm_utils$(OBJ) rnd$(OBJ) print$(OBJ) \
+	load_stub$(OBJ) bcd2$(OBJ) speed_seed$(OBJ) text$(OBJ) permute$(OBJ)
+OBJ2 = build$(OBJ) bm_utils$(OBJ) qgen$(OBJ) rnd$(OBJ) varsub$(OBJ) \
+	text$(OBJ) bcd2$(OBJ) permute$(OBJ) speed_seed$(OBJ)
+OBJS = $(OBJ1) $(OBJ2)
+#
+SETS = dists.dss 
+DOC=README HISTORY PORTING.NOTES BUGS
+DDL  = dss.ddl dss.ri
+OTHER=makefile.suite $(SETS) $(DDL) 
+# case is *important* in TEST_RES
+TEST_RES = O.res L.res c.res s.res P.res S.res n.res r.res
+#
+DBGENSRC=$(SRC1) $(HDR1) $(OTHER) $(DOC) $(SRC2) $(HDR2) $(SRC3)
+QD=1.sql 2.sql 3.sql 4.sql 5.sql 6.sql 7.sql 8.sql 9.sql 10.sql \
+	11.sql 12.sql 13.sql 14.sql 15.sql 16.sql 17.sql 18.sql \
+	19.sql 20.sql 21.sql 22.sql
+VARIANTS= 8a.sql 12a.sql 13a.sql 14a.sql 15a.sql 
+ANS   = 1.ans 2.ans 3.ans 4.ans 5.ans 6.ans 7.ans 8.ans 9.ans 10.ans 11.ans \
+	12.ans 13.ans 14.ans 15.ans 16.ans 17.ans 18.ans 19.ans 20.ans \
+	21.ans 22.ans
+QSRC  = $(FQD) $(VARIANTS)
+ALLSRC=$(DBGENSRC) 
+TREE_DOC=tree.readme tree.changes appendix.readme appendix.version answers.readme queries.readme variants.readme
+JUNK  = 
+#
+all: $(PROGS)
+$(PROG1): $(OBJ1) $(SETS) 
+	$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $(OBJ1) $(LIBS)
+$(PROG2): permute.h $(OBJ2) 
+	$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $(OBJ2) $(LIBS)
+clean:
+	rm -f $(PROGS) $(OBJS) $(JUNK)
+lint:
+	lint $(CFLAGS) -u -x -wO -Ma -p $(SRC1)
+	lint $(CFLAGS) -u -x -wO -Ma -p $(SRC2)
+
+tar: $(DBGENSRC) 
+	tar cvhf $(PROG1).tar $(DBGENSRC) 
+dbgenshar: $(DBGENSRC)
+	shar -o dbgen.shar $(DBGENSRC)
+zip: $(DBGENSRC)
+	zip dbgen $(DBGENSRC)
+tree: $(DBGENSRC) $(FQD) $(VARIANTS) $(TREE_DOC) $(ANS)
+	rm -rf $(TREE_ROOT)
+	mkdir $(TREE_ROOT) 
+	mkdir $(TREE_ROOT)/appendix 
+	mkdir $(TREE_ROOT)/appendix/queries 
+	mkdir $(TREE_ROOT)/appendix/variants 
+	mkdir $(TREE_ROOT)/appendix/dbgen 
+	mkdir $(TREE_ROOT)/appendix/answers 
+	cp tree.readme $(TREE_ROOT)/README
+	cp appendix.readme $(TREE_ROOT)/appendix/README
+	cp answers.readme $(TREE_ROOT)/appendix/answers/README
+	cp queries.readme $(TREE_ROOT)/appendix/queries/README
+	cp variants.readme $(TREE_ROOT)/appendix/variants/README
+	cp tree.changes $(TREE_ROOT)/CHANGES
+	cp appendix.version $(TREE_ROOT)/appendix/VERSION
+	cp $(FQD) $(TREE_ROOT)/appendix/queries
+	cp $(VARIANTS) $(TREE_ROOT)/appendix/variants
+	cp $(DBGENSRC) $(TREE_ROOT)/appendix/dbgen
+	cp $(ANS) $(TREE_ROOT)/appendix/answers
+	(cd $(TREE_ROOT); tar chf - .) |compress > tree.tar.Z
+	(cd $(TREE_ROOT); zip -r  - . )  > tree.zip
+	date > tree.update
+portable:
+	@ for f in $(SRC) $(HDR) ; \
+	do  \
+	expand $$f > /tmp/$$f; \
+	awk 'length > 72 { print FILENAME ":" NR " too long " }' /tmp/$$f ; \
+        rm /tmp/$$f ; \
+	done
+release:
+	@chkout $(SRC) $(HDR)
+	@ for f in $(SRC) $(HDR) ; \
+		do \
+		expand $$f > /tmp/$$f ; \
+		mv /tmp/$$f $$f ; \
+		done
+	@chkin $(SRC) $(HDR)
+
+rnd$(OBJ): rnd.h
+$(OBJ1): $(HDR1)
+$(OBJ2): dss.h tpcd.h config.h
+$(QSRC) $(ALLSRC): 
+	get -r`cat .version` ./SCCS/s.$@
diff --git a/data/ssb/dbgen/makefile_win b/data/ssb/dbgen/makefile_win
new file mode 100644
index 0000000..1712a11
--- /dev/null
+++ b/data/ssb/dbgen/makefile_win
@@ -0,0 +1,85 @@
+VC="c:/Program Files/Microsoft Visual Studio 9.0/VC"
+WIN_INC="C:\Program Files\Microsoft SDKs\Windows\v6.0A\Include"
+WIN_LIB="C:\Program Files\Microsoft SDKs\Windows\v6.0A\Lib"
+#VC = "C:\Program Files\Microsoft Visual Studio .NET 2003\Vc7"
+VCLIB = $(VC)\LIB
+# @(#)makefile.suite	2.1.8.1
+################
+## CHANGE NAME OF ANSI COMPILER HERE
+################
+CC      =cl.exe
+# Current values for DATABASE are: INFORMIX, DB2, TDAT (Teradata)
+#                                  SQLSERVER, SYBASE
+# Current values for MACHINE are:  ATT, DOS, WIN32 HP, IBM, ICL, MVS, 
+#                                  SGI, SUN, U2200, VMS, LINUX
+# Current values for WORKLOAD are:  SSBM, TPCH, TPCR
+DATABASE=DB2 
+MACHINE =WIN32 
+WORKLOAD =SSBM 
+#
+# add -EDTERABYTE if orderkey will exceed 32 bits (SF >= 300)
+# and make the appropriate change in gen_schema() of runit.sh
+CFLAGS	= -DDBNAME=\"dss\" -D$(MACHINE) -D$(DATABASE) -D$(WORKLOAD) /I$(VC)\include /I$(WIN_INC)
+
+#LDFLAGS = -O
+# The OBJ,EXE and LIB macros will need to be changed for compilation under
+#  Windows NT
+OBJ     = .obj
+EXE     = .exe
+LIBS    =$(VCLIB)\libcmt.lib $(VCLIB)\oldnames.lib $(VCLIB)\oldnames.lib $(WIN_LIB)\kernel32.lib
+#
+# NO CHANGES SHOULD BE NECESSARY BELOW THIS LINE
+###############
+#
+PROG1 = dbgen$(EXE)
+PROG2 = qgen$(EXE)
+PROGS = $(PROG1) $(PROG2)
+#
+HDR1 = dss.h rnd.h config.h dsstypes.h shared.h bcd2.h
+HDR2 = tpcd.h permute.h
+HDR  = $(HDR1) $(HDR2)
+#
+SRC1 = build.c driver.c bm_utils.c rnd.c print.c load_stub.c bcd2.c \
+	speed_seed.c text.c permute.c
+SRC2 = qgen.c varsub.c 
+SRC  = $(SRC1) $(SRC2)
+#
+OBJ1 = build$(OBJ) driver$(OBJ) bm_utils$(OBJ) rnd$(OBJ) print$(OBJ) \
+	load_stub$(OBJ) bcd2$(OBJ) speed_seed$(OBJ) text$(OBJ) permute$(OBJ)
+OBJ2 = build$(OBJ) bm_utils$(OBJ) qgen$(OBJ) rnd$(OBJ) varsub$(OBJ) \
+	text$(OBJ) bcd2$(OBJ) permute$(OBJ) speed_seed$(OBJ)
+OBJS = $(OBJ1) $(OBJ2)
+#
+SETS = dists.dss 
+DOC=README HISTORY PORTING.NOTES BUGS
+DDL  = dss.ddl dss.ri
+OTHER=makefile.suite $(SETS) $(DDL) 
+# case is *important* in TEST_RES
+TEST_RES = O.res L.res c.res s.res P.res S.res n.res r.res
+#
+DBGENSRC=$(SRC1) $(HDR1) $(OTHER) $(DOC) $(SRC2) $(HDR2) $(SRC3)
+QD=1.sql 2.sql 3.sql 4.sql 5.sql 6.sql 7.sql 8.sql 9.sql 10.sql \
+	11.sql 12.sql 13.sql 14.sql 15.sql 16.sql 17.sql 18.sql \
+	19.sql 20.sql 21.sql 22.sql
+VARIANTS= 8a.sql 12a.sql 13a.sql 14a.sql 15a.sql 
+ANS   = 1.ans 2.ans 3.ans 4.ans 5.ans 6.ans 7.ans 8.ans 9.ans 10.ans 11.ans \
+	12.ans 13.ans 14.ans 15.ans 16.ans 17.ans 18.ans 19.ans 20.ans \
+	21.ans 22.ans
+QSRC  = $(FQD) $(VARIANTS)
+ALLSRC=$(DBGENSRC) 
+TREE_DOC=tree.readme tree.changes appendix.readme appendix.version answers.readme queries.readme variants.readme
+JUNK  = 
+#
+all: $(PROGS)
+
+$(PROG1): $(OBJ1) $(SETS) 
+	$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $(OBJ1) $(LIBS)
+
+$(PROG2): permute.h $(OBJ2) 
+	$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $(OBJ2) $(LIBS)
+
+clean:
+	del /F $(PROGS) $(OBJS) $(JUNK)
+
+$(OBJ1): $(HDR1)
+$(OBJ2): dss.h tpcd.h config.h
diff --git a/data/ssb/dbgen/permute.c b/data/ssb/dbgen/permute.c
new file mode 100644
index 0000000..b34f04c
--- /dev/null
+++ b/data/ssb/dbgen/permute.c
@@ -0,0 +1,175 @@
+/* @(#)permute.c	2.1.8.3 */
+/*
+* permute.c -- a permutation generator for the query 
+*              sequences in TPC-H and TPC-R
+*/
+
+#ifdef TEST
+#define DECLARER
+#endif
+#include "config.h"
+#include "dss.h"
+#ifdef TEST
+#include <stdlib.h>
+#if (defined(_POSIX_)||!defined(WIN32))		/* Change for Windows NT */
+#include <unistd.h>
+#include <sys/wait.h>
+#endif /* WIN32 */
+#include <stdio.h>				/* */
+#include <limits.h>
+#include <math.h>
+#include <ctype.h>
+#include <signal.h>
+#include <string.h>
+#include <errno.h>
+#ifdef HP
+#include <strings.h>
+#endif
+#if (defined(WIN32)&&!defined(_POSIX_))
+#include <process.h>
+#pragma warning(disable:4201)
+#pragma warning(disable:4214)
+#pragma warning(disable:4514)
+#define WIN32_LEAN_AND_MEAN
+#define NOATOM
+#define NOGDICAPMASKS
+#define NOMETAFILE
+#define NOMINMAX
+#define NOMSG
+#define NOOPENFILE
+#define NORASTEROPS
+#define NOSCROLL
+#define NOSOUND
+#define NOSYSMETRICS
+#define NOTEXTMETRIC
+#define NOWH
+#define NOCOMM
+#define NOKANJI
+#define NOMCX
+#include <windows.h>
+#pragma warning(default:4201)
+#pragma warning(default:4214)
+#endif
+#endif
+
+long NextRand(long seed);
+long *permute(long *set, int cnt, long stream);
+long *permute_dist(distribution *d, long stream);
+long seed;
+char *eol[2] = {" ", "},"};
+extern seed_t Seed[];
+#ifdef TEST
+tdef tdefs = { NULL };
+#endif
+
+
+#define MAX_QUERY	22
+#define ITERATIONS	1000
+#define UNSET	0
+
+/*
+ * Shuffle a[0..c-1] when a is non-NULL, otherwise step to the next entry of
+ * the array remembered from the last shuffle; returns a pointer to an entry.
+ */
+long *permute(long *a, int c, long s)
+{
+	static long *base;	/* array handed in on the most recent shuffle */
+	static long cursor;	/* offset of the entry handed back to the caller */
+	long held;
+	int k;
+	if (a == (long *)NULL)
+		cursor += 1;
+	else {
+		base = a;
+		for (k = 0; k < c; k++)
+			a[k] = k;
+		for (k = 0; k < c; k++) {
+			/* swap slot k with the slot RANDOM stores in cursor (args: 0..c-1, stream s) */
+			RANDOM(cursor, 0L, (long)(c - 1), s);
+			held = a[cursor];
+			a[cursor] = a[k];
+			a[k] = held;
+			cursor = 0;
+		}
+	}
+	if (cursor >= c)
+		cursor -= c;
+	return(base + cursor);
+}
+
+/* Permute d's index array (allocated on first use); with d == NULL, step
+ * within the distribution remembered from an earlier non-NULL call. */
+long *
+permute_dist(distribution *d, long stream)
+	{
+	static distribution *dist = NULL;	/* distribution from the last non-NULL call */
+	int i;
+
+	if (d != NULL)
+		{
+		if (d->permute == (long *)NULL)
+			{
+			d->permute = (long *)malloc(sizeof(long) * DIST_SIZE(d));
+			MALLOC_CHECK(d->permute);
+			for (i=0; i < DIST_SIZE(d); i++)
+				*(d->permute + i) = i;
+			}
+		dist = d;
+		return(permute(dist->permute, DIST_SIZE(dist), stream));
+		}
+	if (dist != NULL)
+		return(permute(NULL, DIST_SIZE(dist), stream));
+	INTERNAL_ERROR("Bad call to permute_dist");
+	return((long *)NULL);	/* every path returns a value, even if INTERNAL_ERROR comes back */
+	}
+
+
+#ifdef TEST
+
+/* TEST driver: prints <streams> shuffles of 1..MAX_QUERY as a C array initializer. */
+int main(int ac, char *av[])
+	{
+	long *sequence, 
+		i,
+		j,
+		streams = UNSET,
+		*a;
+	char sep;
+	int index = 0;
+	
+	set_seeds = 0;
+	sequence = (long *)malloc(MAX_QUERY * sizeof(long));
+	MALLOC_CHECK(sequence);
+	a = sequence;
+	for (i=0; i < MAX_QUERY; i++)
+		*(sequence + i) = i;
+	if (ac < 3) 
+		goto usage;
+	Seed[0].value = (long)atoi(av[1]);
+	streams = atoi(av[2]);
+	if (Seed[0].value == UNSET || streams == UNSET) 
+		goto usage;
+	
+	index = 0;
+	printf("long permutation[%ld][%d] = {\n", streams, MAX_QUERY);	/* streams is long */
+	for (j=0; j < streams; j++)
+		{
+		sep = '{';
+		printf("%s\n", eol[index]);
+		for (i=0; i < MAX_QUERY; i++)
+			{
+			printf("%c%2ld", sep, *permute(a, MAX_QUERY, 0) + 1);	/* permute() yields long * */
+			a = (long *)NULL;
+			sep = ',';
+			}
+		a = sequence;
+		index=1;
+		}
+	printf("}\n};\n");
+	return(0);
+usage:
+	printf("Usage: %s <seed> <streams>\n",av[0]);
+	printf("  uses <seed> to start the generation of <streams> permutations of [1..%d]\n", MAX_QUERY);
+	return(-1);
+	}
+#endif /* TEST */
diff --git a/data/ssb/dbgen/permute.h b/data/ssb/dbgen/permute.h
new file mode 100644
index 0000000..bf5e8c4
--- /dev/null
+++ b/data/ssb/dbgen/permute.h
@@ -0,0 +1,47 @@
+/*
+ * @(#)permute.h	2.1.8.1
+ */
+long permutation[41][22] =
+{
+  {14, 2, 9,20, 6,17,18, 8,21,13, 3,22,16, 4,11,15, 1,10,19, 5, 7,12},
+  {21, 3,18, 5,11, 7, 6,20,17,12,16,15,13,10, 2, 8,14,19, 9,22, 1, 4},
+  { 6,17,14,16,19,10, 9, 2,15, 8, 5,22,12, 7,13,18, 1, 4,20, 3,11,21},
+  { 8, 5, 4, 6,17, 7, 1,18,22,14, 9,10,15,11,20, 2,21,19,13,16,12, 3},
+  { 5,21,14,19,15,17,12, 6, 4, 9, 8,16,11, 2,10,18, 1,13, 7,22, 3,20},
+  {21,15, 4, 6, 7,16,19,18,14,22,11,13, 3, 1, 2, 5, 8,20,12,17,10, 9},
+  {10, 3,15,13, 6, 8, 9, 7, 4,11,22,18,12, 1, 5,16, 2,14,19,20,17,21},
+  {18, 8,20,21, 2, 4,22,17, 1,11, 9,19, 3,13, 5, 7,10,16, 6,14,15,12},
+  {19, 1,15,17, 5, 8, 9,12,14, 7, 4, 3,20,16, 6,22,10,13, 2,21,18,11},
+  { 8,13, 2,20,17, 3, 6,21,18,11,19,10,15, 4,22, 1, 7,12, 9,14, 5,16},
+  { 6,15,18,17,12, 1, 7, 2,22,13,21,10,14, 9, 3,16,20,19,11, 4, 8, 5},
+  {15,14,18,17,10,20,16,11, 1, 8, 4,22, 5,12, 3, 9,21, 2,13, 6,19, 7},
+  { 1, 7,16,17,18,22,12, 6, 8, 9,11, 4, 2, 5,20,21,13,10,19, 3,14,15},
+  {21,17, 7, 3, 1,10,12,22, 9,16, 6,11, 2, 4, 5,14, 8,20,13,18,15,19},
+  { 2, 9, 5, 4,18, 1,20,15,16,17, 7,21,13,14,19, 8,22,11,10, 3,12, 6},
+  {16, 9,17, 8,14,11,10,12, 6,21, 7, 3,15, 5,22,20, 1,13,19, 2, 4,18},
+  { 1, 3, 6, 5, 2,16,14,22,17,20, 4, 9,10,11,15, 8,12,19,18,13, 7,21},
+  { 3,16, 5,11,21, 9, 2,15,10,18,17, 7, 8,19,14,13, 1, 4,22,20, 6,12},
+  {14, 4,13, 5,21,11, 8, 6, 3,17, 2,20, 1,19,10, 9,12,18,15, 7,22,16},
+  { 4,12,22,14, 5,15,16, 2, 8,10,17, 9,21, 7, 3, 6,13,18,11,20,19, 1},
+  {16,15,14,13, 4,22,18,19, 7, 1,12,17, 5,10,20, 3, 9,21,11, 2, 6, 8},
+  {20,14,21,12,15,17, 4,19,13,10,11, 1,16, 5,18, 7, 8,22, 9, 6, 3, 2},
+  {16,14,13, 2,21,10,11, 4, 1,22,18,12,19, 5, 7, 8, 6, 3,15,20, 9,17},
+  {18,15, 9,14,12, 2, 8,11,22,21,16, 1, 6,17, 5,10,19, 4,20,13, 3, 7},
+  { 7, 3,10,14,13,21,18, 6,20, 4, 9, 8,22,15, 2, 1, 5,12,19,17,11,16},
+  {18, 1,13, 7,16,10,14, 2,19, 5,21,11,22,15, 8,17,20, 3, 4,12, 6, 9},
+  {13, 2,22, 5,11,21,20,14, 7,10, 4, 9,19,18, 6, 3, 1, 8,15,12,17,16},
+  {14,17,21, 8, 2, 9, 6, 4, 5,13,22, 7,15, 3, 1,18,16,11,10,12,20,19},
+  {10,22, 1,12,13,18,21,20, 2,14,16, 7,15, 3, 4,17, 5,19, 6, 8, 9,11},
+  {10, 8, 9,18,12, 6, 1, 5,20,11,17,22,16, 3,13, 2,15,21,14,19, 7, 4},
+  { 7,17,22, 5, 3,10,13,18, 9, 1,14,15,21,19,16,12, 8, 6,11,20, 4, 2},
+  { 2, 9,21, 3, 4, 7, 1,11,16, 5,20,19,18, 8,17,13,10,12,15, 6,14,22},
+  {15,12, 8, 4,22,13,16,17,18, 3, 7, 5, 6, 1, 9,11,21,10,14,20,19, 2},
+  {15,16, 2,11,17, 7, 5,14,20, 4,21, 3,10, 9,12, 8,13, 6,18,19,22, 1},
+  { 1,13,11, 3, 4,21, 6,14,15,22,18, 9, 7, 5,10,20,12,16,17, 8,19, 2},
+  {14,17,22,20, 8,16, 5,10, 1,13, 2,21,12, 9, 4,18, 3, 7, 6,19,15,11},
+  { 9,17, 7, 4, 5,13,21,18,11, 3,22, 1, 6,16,20,14,15,10, 8, 2,12,19},
+  {13,14, 5,22,19,11, 9, 6,18,15, 8,10, 7, 4,17,16, 3, 1,12, 2,21,20},
+  {20, 5, 4,14,11, 1, 6,16, 8,22, 7, 3, 2,12,21,19,17,13,10,15,18, 9},
+  { 3, 7,14,15, 6, 5,21,20,18,10, 4,16,19, 1,13, 9, 8,17,11,12,22, 2},
+  {13,15,17, 1,22,11, 3, 4, 7,20,14,21, 9, 8, 2,18,16, 6,10,12, 5,19}
+};
diff --git a/data/ssb/dbgen/permute.o b/data/ssb/dbgen/permute.o
new file mode 100644
index 0000000000000000000000000000000000000000..0f4af181e3b8728699c965d1dc722fffd073600e
GIT binary patch
literal 3248
zcmbVNO>7%g5Ppu`)}&?Qv_*>AQY}b@RM3^3=0I9N-lS<>CASi^Mo4KJHujoW^-r{3
zLrO&=a1y$5j3RO2f)H0u6{o_1hK8R50`<Zj5GtjSs*s8vpvcU;eN(?~><c69&dfJI
zZ+7<0dvl|y<6WT;BMPw>SxY07u`AJ*oD^b`#aIu!+q3^~^A$5~o=KnP3(cPS_rS=#
ze@}Qnaqk6mT(~zo+c?$qHm;(^y<^`nURw$Ap&Qi=?yaR7?hn!0olq)sGv(b*c{i`2
z+b!YSK;L?d*SB*o%Dr9uyDf+#@!D!MHFUT7J+D6|^tv$dM&!o$IrFS}&OE=$YXS@L
z%snp6L*LH*$Pcb_PgvHji}JB_?=MmP5~lY4;kCIKbN6uXaX<H^824`14~wcW{pzv$
z(<d_b`B%S({Yt%&7++X%_la*DUHEPGYhFJML6PhDtwZ6t^|jB;t2xZxcvR$Uh_b)L
zUh}$Wl!z7St|96tmUTn!-Q3%#$6MX`qta|Pd0k}tq7MT6Tn&*+1dA6g1~hFu#j0x?
z8Ftbt6v`PrXXOiaR<~UJK;_7RY&RRWvU&z=x?9$z_e3^dahcBEe%4RY?mad#azyV>
zPgYBARZk?0K_h-BQ59-pc3{AW4<3}Th>pNJ1`qas>{Oa{F}8wO7#^hEK4YP`US^@0
zSZME#ZP8_D$Mb;ilQz+V<67+fu3@dO5gyj`rHHBZFK#ilcx`Lf<!-It)O4^7Yq4Q1
zYHo)({DQMG{#WD>ztm-FeT!l6twjpmTA$f+m!6v${+S5-v$@%V-+<Q56o!Z(VND^x
z@y8)Wea3o$BaS^nA{^zg=o(5b+5;l`lOORXgp1haEoYFPl6dz+C}Yn^{K)_w2YsjV
zU847ZJqHifsD>m#L4WcSy$0+gJnj6?bil_u;Q0=CsRLf^fL{W<mu-c&PgL6+sJ+Vn
zqJAm^RIy#-FsqcSPR2F{7|w_hXGUs7yb>r4kULA4^0^Zwmvn!kiedy=#m(A|!zLzj
z(@wtR<|Z;z7nnVhcUkUDC-2(Knk+jmv&#imvF)rVbib?@=0?m6&C9D2$DOtVp8Ie(
zAWci0{ONn<131PY)qRF@`B46k132zqq)&bJz@PGb-U0tPfYW5$kvQs@o9ge9BZau8
z;DZv!I`Nyx|EPjLqTpi+-lO0zDfw{^P@ZuG$C)I2Qptm77S;cT;;+{KuHvt*`+<Vv
zEK>dtl{`BY{38Y5rQje6LNo5>qrUp!JRn@$3SiTv9s7n4(Q8s5oaUDqm5W8!ngs4T
zKA$4Vu?v>r+A}UQvX*NxW3p0VhEqlb-4g~3gK!cENCw<IR)HCra<OQ`R2Zc)+(~c`
z9ICih=7Q8;FIA0crwo^ndr>B^rzUcaRkW$x|IKxXq@Jirl2Z8ujS2-+wij&p&y$J|
zidW=O8<S1NYZ+r{-sqgi^A~Zt??4pvOK>5H2VuO&FlR9SmUPr*95Gs-^aKd#IzWKr
z6rT*>v_A1su%jGYKQ6zU&QVGe)RSOn*T0o*eneabV*hYIQ~lJxG+?+s`GF`XDc8sT
zs6=%WKMzKdRlkCaS22`>zIe}IsX_N?3Fu%`iRve85d_Q`j9-?(Wpb1}82=97?fSRg
zrhj-2Q9Sjp-P*WRpZuY;fAOqs7msIZ@ZV#6zuf=xs3;WdAD*S{;&oZ*vJ#78F#bQv
C$)b(`

literal 0
HcmV?d00001

diff --git a/data/ssb/dbgen/print.c b/data/ssb/dbgen/print.c
new file mode 100644
index 0000000..932a576
--- /dev/null
+++ b/data/ssb/dbgen/print.c
@@ -0,0 +1,1006 @@
+/* @(#)print.c	2.1.8.2 */
+/* generate flat files for data load */
+#include <stdio.h>
+#ifndef VMS
+#include <sys/types.h>
+#endif
+
+#if defined(SUN)
+#include <unistd.h>
+#endif
+
+#if defined(LINUX)
+#include <unistd.h>
+#endif /*LINUX*/
+
+#include <math.h>
+
+#include "dss.h"
+#include "dsstypes.h"
+#include <string.h>
+
+#include <stdio.h>
+#include <time.h>
+
+
+/*
+ * Function Prototypes
+ */
+FILE *print_prep PROTO((int table, int update));
+int pr_drange PROTO((int tbl, long min, long cnt, long num));
+
+/*
+ * Open the output stream for a table. Unless the global `updates` is set this
+ * is tbl_open(); otherwise an update (update > 0) or delete file is opened.
+ */
+FILE *
+print_prep(int table, int update)
+{
+	char upath[128];
+	FILE *res;
+	int seg;
+
+	if (!updates) {
+		res = tbl_open(table, "w");
+		OPEN_CHECK(res, tdefs[table].name);
+		return(res);
+	}
+
+	if (update > 0 && insert_segments) {
+		/* NOTE(review): nonzero strcmp() means the name differs from "orders.tbl" -- confirm intent */
+		if (strcmp(tdefs[table].name, "orders.tbl") != 0)
+			seg = ++insert_orders_segment;
+		else
+			seg = ++insert_lineitem_segment;
+		sprintf(upath, "%s%c%s.u%d.%d",
+			env_config(PATH_TAG, PATH_DFLT),
+			PATH_SEP, tdefs[table].name, update%10000, seg);
+	} else if (update > 0) {
+		/* update file name without a segment suffix */
+		sprintf(upath, "%s%c%s.u%d",
+			env_config(PATH_TAG, PATH_DFLT),
+			PATH_SEP, tdefs[table].name, update);
+	} else if (delete_segments) {
+		/* delete file names are built from -update */
+		++delete_segment;
+		sprintf(upath, "%s%cdelete.u%d.%d",
+			env_config(PATH_TAG, PATH_DFLT), PATH_SEP, -update%10000,
+			delete_segment);
+	} else {
+		sprintf(upath, "%s%cdelete.%d",
+			env_config(PATH_TAG, PATH_DFLT), PATH_SEP, -update);
+	}
+
+	/* fopen()'s result goes back to the caller unchecked */
+	return(fopen(upath, "w"));
+}
+
+/* Print one field to target in the layout selected by format; returns 0. */
+int
+dbg_print(int format, FILE *target, void *data, int len, int sep)
+{
+	long dollars, cents;	/* long, to match the %ld conversions used for DT_MONEY */
+
+	switch(format)
+	{
+	case DT_STR:
+		if (columnar)
+			fprintf(target, "%-*s", len, (char *)data);
+		else
+			fprintf(target, "%s", (char *)data);
+		break;
+#ifdef MVS
+	case DT_VSTR:
+		/* note: only used in MVS, assumes columnar output */
+		fprintf(target, "%c%c%-*s", 
+			(len >> 8) & 0xFF, len & 0xFF, len, (char *)data);
+		break;
+#endif /* MVS */
+	case DT_INT:	/* the value is the pointer itself, converted to long */
+		if (columnar)
+			fprintf(target, "%12ld", (long)data);
+		else
+			fprintf(target, "%ld", (long)data);
+		break;
+	case DT_HUGE:
+#ifndef SUPPORT_64BITS
+        if (*(long *)((long *)data + 1) == 0) \
+           if (columnar) fprintf(target, "%12ld", *(long *)data);
+           else fprintf(target, "%ld", *(long *)data);
+        else
+           if (columnar) fprintf(target, "%5ld%07ld", 
+				*(long *)((long *)data + 1), *(long *)data);
+           else fprintf(target,"%ld%07ld", 
+				*(long *)((long *)data + 1), *(long *)data);
+#else
+		fprintf(target, HUGE_FORMAT, *(DSS_HUGE *)data);
+#endif /* SUPPORT_64BITS */
+		break;
+	case DT_KEY:
+		fprintf(target, "%ld", (long)data);
+		break;
+	case DT_MONEY:	/* data holds an amount in cents; printed as dollars.cents */
+		cents = (long)data;
+		if (cents < 0)
+			{
+			fprintf(target, "-");
+			cents = -cents;
+			}
+		dollars = cents / 100;
+		cents %= 100;
+		if (columnar)
+			fprintf(target, "%12ld.%02ld", dollars, cents);
+		else
+			fprintf(target, "%ld.%02ld", dollars, cents);
+		break;
+	case DT_CHR:
+		if (columnar)
+			fprintf(target, "%c ", (char)data);
+		else
+			fprintf(target, "%c", (char)data);
+		break;
+	}
+
+#ifdef EOL_HANDLING
+	if (sep)
+#endif /* EOL_HANDLING */
+	if (!columnar && (sep != -1))	/* no separator in columnar mode or when sep is -1 */
+		fprintf(target, "%c", SEPARATOR);
+	
+	return(0);
+}
+
+#ifdef SSBM
+/* Emit one SSBM customer row; the output file is opened on first use. */
+int pr_cust(customer_t *c, int mode)
+{
+   static FILE *out = NULL;
+
+   if (!out)
+        out = print_prep(CUST, 0);
+
+   PR_STRT(out);
+   PR_INT(out, c->custkey);
+   PR_VSTR(out, c->name, C_NAME_LEN);
+   /* length argument: derived from C_ADDR_LEN when columnar, else c->alen */
+   PR_VSTR(out, c->address, (columnar)?(long)(ceil(C_ADDR_LEN * V_STR_HGH)):c->alen);
+   PR_STR(out, c->city, CITY_FIX);
+   PR_STR(out, c->nation_name, C_NATION_NAME_LEN);
+   PR_STR(out, c->region_name, C_REGION_NAME_LEN);
+   PR_STR(out, c->phone, PHONE_LEN);
+   PR_STR(out, c->mktsegment, MAXAGG_LEN);
+   PR_END(out);
+
+   return 0;
+}
+
+#else
+/* Emit one customer row in the non-SSBM layout; file opened on first use. */
+int pr_cust(customer_t *c, int mode)
+{
+   static FILE *out = NULL;
+
+   if (!out)
+        out = print_prep(CUST, 0);
+
+   PR_STRT(out);
+   PR_INT(out, c->custkey);
+   PR_VSTR(out, c->name, C_NAME_LEN);
+   /* length argument: derived from C_ADDR_LEN when columnar, else c->alen */
+   PR_VSTR(out, c->address, (columnar)?(long)(ceil(C_ADDR_LEN * V_STR_HGH)):c->alen);
+   PR_INT(out, c->nation_code);
+   PR_STR(out, c->phone, PHONE_LEN);
+   PR_MONEY(out, c->acctbal);
+   PR_STR(out, c->mktsegment, C_MSEG_LEN);
+   PR_VSTR_LAST(out, c->comment,
+       (columnar)?(long)(ceil(C_CMNT_LEN * V_STR_HGH)):c->clen);
+   PR_END(out);
+
+   return 0;
+}
+#endif
+
+/*
+ * print the numbered order 
+ */
+#ifdef SSBM
+
+#else
+/* Emit one order row; the output file is reopened whenever mode changes. */
+int pr_order(order_t *o, int mode)
+{
+    static FILE *out = NULL;
+    static int prev_mode = 0;
+
+    if (!out || prev_mode != mode)
+        {
+        /* close the stream used for the previous mode before switching */
+        if (out != NULL)
+            fclose(out);
+        out = print_prep(ORDER, mode);
+        prev_mode = mode;
+        }
+    PR_STRT(out);
+    PR_HUGE(out, o->okey);
+    PR_INT(out, o->custkey);
+    PR_CHR(out, o->orderstatus);
+    PR_MONEY(out, o->totalprice);
+    PR_STR(out, o->odate, DATE_LEN);
+    PR_STR(out, o->opriority, O_OPRIO_LEN);
+    PR_STR(out, o->clerk, O_CLRK_LEN);
+    PR_INT(out, o->spriority);
+    PR_VSTR_LAST(out, o->comment, (columnar)?(long)(ceil(O_CMNT_LEN * V_STR_HGH)):o->clen);
+    PR_END(out);
+
+    return 0;
+}
+#endif
+
+/*
+ * print an order's lineitems
+ */
+#ifdef SSBM
+/* Emit every lineorder row of order o (SSBM); reopens the file when mode changes. */
+int pr_line(order_t *o, int mode)
+{
+    static FILE *out = NULL;
+    static int prev_mode = 0;
+    long      n;
+    int days;
+    char buf[100];
+
+    if (!out || prev_mode != mode)
+        {
+        if (out != NULL)
+            fclose(out);
+        out = print_prep(LINE, mode);
+        prev_mode = mode;
+        }
+
+    /* one output record per lineorder, fields written in the order listed */
+    for (n = 0; n < o->lines; n++)
+        {
+        PR_STRT(out);
+        PR_HUGE(out, o->lineorders[n].okey);
+        PR_INT(out, o->lineorders[n].linenumber);
+        PR_INT(out, o->lineorders[n].custkey);
+        PR_INT(out, o->lineorders[n].partkey);
+        PR_INT(out, o->lineorders[n].suppkey);
+        PR_STR(out, o->lineorders[n].orderdate, DATE_LEN);
+        PR_STR(out, o->lineorders[n].opriority, O_OPRIO_LEN);
+        PR_INT(out, o->lineorders[n].ship_priority);
+        PR_INT(out, o->lineorders[n].quantity);
+        PR_INT(out, o->lineorders[n].extended_price);
+        PR_INT(out, o->lineorders[n].order_totalprice);
+        PR_INT(out, o->lineorders[n].discount);
+        PR_INT(out, o->lineorders[n].revenue);
+        PR_INT(out, o->lineorders[n].supp_cost);
+        PR_INT(out, o->lineorders[n].tax);
+        PR_STR(out, o->lineorders[n].commit_date, DATE_LEN);
+        PR_STR(out, o->lineorders[n].shipmode, O_SHIP_MODE_LEN);
+        PR_END(out);
+        }
+
+    return 0;
+}
+#else
+/* Emit every lineitem row of order o (non-SSBM); reopens the file when mode changes. */
+int pr_line(order_t *o, int mode)
+{
+    static FILE *out = NULL;
+    static int prev_mode = 0;
+    long      n;
+    int days;
+    char buf[100];
+
+    if (!out || prev_mode != mode)
+        {
+        if (out != NULL)
+            fclose(out);
+        out = print_prep(LINE, mode);
+        prev_mode = mode;
+        }
+
+    /* one output record per lineitem, fields written in the order listed */
+    for (n = 0; n < o->lines; n++)
+        {
+        PR_STRT(out);
+        PR_HUGE(out, o->l[n].okey);
+        PR_INT(out, o->l[n].partkey);
+        PR_INT(out, o->l[n].suppkey);
+        PR_INT(out, o->l[n].lcnt);
+        PR_INT(out, o->l[n].quantity);
+        PR_MONEY(out, o->l[n].eprice);
+        PR_MONEY(out, o->l[n].discount);
+        PR_MONEY(out, o->l[n].tax);
+        PR_CHR(out, o->l[n].rflag[0]);
+        PR_CHR(out, o->l[n].lstatus[0]);
+        PR_STR(out, o->l[n].sdate, DATE_LEN);
+        PR_STR(out, o->l[n].cdate, DATE_LEN);
+        PR_STR(out, o->l[n].rdate, DATE_LEN);
+        PR_STR(out, o->l[n].shipinstruct, L_INST_LEN);
+        PR_STR(out, o->l[n].shipmode, L_SMODE_LEN);
+        PR_VSTR_LAST(out, o->l[n].comment,
+            (columnar)?(long)(ceil(L_CMNT_LEN * V_STR_HGH)):o->l[n].clen);
+        PR_END(out);
+        }
+
+    return 0;
+}
+#endif
+
+/*
+ * print the numbered order *and* its associated lineitems
+ */
+#ifdef SSBM
+#else
+/* Print order o, then its lineitems, after aliasing ORDER's name to ORDER_LINE's. */
+int pr_order_line(order_t *o, int mode)
+{
+    /* tdefs[ORDER].name is overwritten on every call */
+    tdefs[ORDER].name = tdefs[ORDER_LINE].name;
+    pr_order(o, mode);
+    pr_line(o, mode);
+    return 0;
+}
+#endif
+
+/*
+ * print the given part
+ */
+#ifdef SSBM
+/* Emit one SSBM part row; the output file is opened on first use. */
+int pr_part(part_t *part, int mode)
+{
+    static FILE *out = NULL;
+
+    if (!out)
+        out = print_prep(PART, 0);
+
+    PR_STRT(out);
+    PR_INT(out, part->partkey);
+    /* length argument: P_*_LEN when columnar, the stored length otherwise */
+    PR_VSTR(out, part->name, (columnar)?(long)P_NAME_LEN:part->nlen);
+    PR_STR(out, part->mfgr, P_MFG_LEN);
+    PR_STR(out, part->category, P_CAT_LEN);
+    PR_STR(out, part->brand, P_BRND_LEN);
+
+    /* need to handle color */
+    PR_VSTR(out, part->color, (columnar)?(long)P_COLOR_LEN:part->clen);
+    PR_VSTR(out, part->type, (columnar)?(long)P_TYPE_LEN:part->tlen);
+    PR_INT(out, part->size);
+    PR_STR(out, part->container, P_CNTR_LEN);
+    PR_END(out);
+
+    return 0;
+}
+
+#else
+/* Emit one part row in the non-SSBM layout; the file is opened on first use. */
+int pr_part(part_t *part, int mode)
+{
+    static FILE *out = NULL;
+
+    if (!out)
+        out = print_prep(PART, 0);
+
+    PR_STRT(out);
+    PR_INT(out, part->partkey);
+    /* length argument: P_*_LEN when columnar, the stored length otherwise */
+    PR_VSTR(out, part->name, (columnar)?(long)P_NAME_LEN:part->nlen);
+    PR_STR(out, part->mfgr, P_MFG_LEN);
+    PR_STR(out, part->brand, P_BRND_LEN);
+    PR_VSTR(out, part->type, (columnar)?(long)P_TYPE_LEN:part->tlen);
+    PR_INT(out, part->size);
+    PR_STR(out, part->container, P_CNTR_LEN);
+    PR_MONEY(out, part->retailprice);
+    /* the comment is the final field before PR_END */
+    PR_VSTR_LAST(out, part->comment,
+        (columnar)?(long)(ceil(P_CMNT_LEN * V_STR_HGH)):part->clen);
+    PR_END(out);
+
+    return 0;
+}
+#endif
+
+/*
+ * print the given part's suppliers
+ */
+#ifdef SSBM
+/* SSBM does not have a partsupplier table */
+#else
+/* Emit the SUPP_PER_PART supplier rows of a part; file opened on first use. */
+int pr_psupp(part_t *part, int mode)
+{
+    static FILE *out = NULL;
+    long      n;
+
+    if (!out)
+        out = print_prep(PSUPP, mode);
+
+    /* one record per entry of part->s[] */
+    for (n = 0; n < SUPP_PER_PART; n++)
+        {
+        PR_STRT(out);
+        PR_INT(out, part->s[n].partkey);
+        PR_INT(out, part->s[n].suppkey);
+        PR_INT(out, part->s[n].qty);
+        PR_MONEY(out, part->s[n].scost);
+        PR_VSTR_LAST(out, part->s[n].comment, (columnar)?(long)(ceil(PS_CMNT_LEN * V_STR_HGH)):part->s[n].clen);
+        PR_END(out);
+        }
+
+    return 0;
+}
+#endif
+
+/*
+ * print the given part *and* its suppliers
+ */
+#ifdef SSBM
+/* SSBM does not have a partsupplier table */
+#else
+/* Print a part, then its suppliers, after aliasing PART's name to PART_PSUPP's. */
+int pr_part_psupp(part_t *part, int mode)
+{
+    /* tdefs[PART].name is overwritten on every call */
+    tdefs[PART].name = tdefs[PART_PSUPP].name;
+    pr_part(part, mode);
+    pr_psupp(part, mode);
+    return 0;
+}
+#endif
+
+
+#ifdef SSBM
+/* Emit one SSBM supplier row; the output file is opened on first use. */
+int pr_supp(supplier_t *supp, int mode)
+{
+    static FILE *out = NULL;
+
+    if (!out)
+        out = print_prep(SUPP, mode);
+
+    PR_STRT(out);
+    PR_INT(out, supp->suppkey);
+    PR_STR(out, supp->name, S_NAME_LEN);
+    /* length argument: derived from S_ADDR_LEN when columnar, else supp->alen */
+    PR_VSTR(out, supp->address,
+        (columnar)?(long)(ceil(S_ADDR_LEN * V_STR_HGH)):supp->alen);
+    PR_STR(out, supp->city, CITY_FIX);
+    PR_STR(out, supp->nation_name, C_NATION_NAME_LEN);
+    PR_STR(out, supp->region_name, C_REGION_NAME_LEN);
+    PR_STR(out, supp->phone, PHONE_LEN);
+    PR_END(out);
+
+    return 0;
+}
+#else
+/*
+ * pr_supp (non-SSBM layout): write one supplier record.
+ *
+ *   supp - supplier to write
+ *   mode - forwarded to print_prep() when the output is first opened
+ *
+ * Fields are written in this order: suppkey, name, address,
+ * nation_code, phone, acctbal, comment. The stream from
+ * print_prep(SUPP, mode) is cached in a function-local static. With
+ * 'columnar' set, address and comment are written with the fixed
+ * lengths ceil(S_ADDR_LEN * V_STR_HGH) and ceil(S_CMNT_LEN * V_STR_HGH)
+ * instead of supp->alen / supp->clen. Always returns 0.
+ */
+int pr_supp(supplier_t *supp, int mode)
+{
+    static FILE *out = NULL;
+
+    if (!out)
+        out = print_prep(SUPP, mode);
+
+    PR_STRT(out);
+    PR_INT(out, supp->suppkey);
+    PR_STR(out, supp->name, S_NAME_LEN);
+    PR_VSTR(out, supp->address,
+        (columnar) ? (long)(ceil(S_ADDR_LEN * V_STR_HGH)) : supp->alen);
+    PR_INT(out, supp->nation_code);
+    PR_STR(out, supp->phone, PHONE_LEN);
+    PR_MONEY(out, supp->acctbal);
+    PR_VSTR_LAST(out, supp->comment,
+        (columnar) ? (long)(ceil(S_CMNT_LEN * V_STR_HGH)) : supp->clen);
+    PR_END(out);
+
+    return 0;
+}
+#endif
+
+#ifdef SSBM
+#else
+/*
+ * pr_nation: write one nation record (code, text, join, comment).
+ *
+ *   c    - code table entry to write
+ *   mode - forwarded to print_prep() when the output is first opened
+ *
+ * The stream from print_prep(NATION, mode) is cached in a
+ * function-local static. With 'columnar' set, the comment is written
+ * with the fixed length ceil(N_CMNT_LEN * V_STR_HGH) instead of
+ * c->clen. Always returns 0.
+ */
+int pr_nation(code_t *c, int mode)
+{
+    static FILE *out = NULL;
+
+    if (!out)
+        out = print_prep(NATION, mode);
+
+    PR_STRT(out);
+    PR_INT(out, c->code);
+    PR_STR(out, c->text, NATION_LEN);
+    PR_INT(out, c->join);
+    PR_VSTR_LAST(out, c->comment,
+        (columnar) ? (long)(ceil(N_CMNT_LEN * V_STR_HGH)) : c->clen);
+    PR_END(out);
+
+    return 0;
+}
+
+/*
+ * pr_region: write one region record (code, text, comment).
+ *
+ *   c    - code table entry to write
+ *   mode - forwarded to print_prep() when the output is first opened
+ *
+ * The stream from print_prep(REGION, mode) is cached in a
+ * function-local static. With 'columnar' set, the comment is written
+ * with the fixed length ceil(R_CMNT_LEN * V_STR_HGH) instead of
+ * c->clen. Always returns 0.
+ */
+int pr_region(code_t *c, int mode)
+{
+    static FILE *out = NULL;
+
+    if (!out)
+        out = print_prep(REGION, mode);
+
+    PR_STRT(out);
+    PR_INT(out, c->code);
+    PR_STR(out, c->text, REGION_LEN);
+    PR_VSTR_LAST(out, c->comment,
+        (columnar) ? (long)(ceil(R_CMNT_LEN * V_STR_HGH)) : c->clen);
+    PR_END(out);
+
+    return 0;
+}
+#endif
+
+/* 
+ * NOTE: this routine does NOT use the BCD2_* routines. As a result,
+ * it WILL fail if the keys being deleted exceed 32 bits. Since this
+ * would require ~660 update iterations, this seems an acceptable
+ * oversight
+ *
+ * pr_drange: write the keys to be deleted for the given 'num'.
+ *
+ *   tbl - table id handed to print_prep() whenever the output is opened
+ *   min - first key; min, min+1, ... are each mapped through MK_SPARSE()
+ *   cnt - number of keys to process
+ *   num - a value different from the previous call closes the current
+ *         output and opens a new one with print_prep(tbl, -num)
+ *
+ * The output format is selected by globals:
+ *   gen_sql  - "delete from ... between start and last" statements for
+ *              tdefs[ORDER] and tdefs[LINE], each pair followed by
+ *              "commit work;"
+ *   gen_rng  - a start/last record per run of keys (runs are only
+ *              merged when gen_rng == 1 and keys are consecutive)
+ *   neither  - one PR_KEY record per key; if delete_segments is
+ *              non-zero the output is additionally closed and reopened
+ *              after a computed number of rows
+ *
+ * Returns 0 on success and -1 when print_prep() returns NULL.
+ *
+ * NOTE(review): last_num starts at 0, so a first call with num == 0
+ *   skips the open and dfp stays NULL -- presumably callers never pass
+ *   0; confirm.
+ * NOTE(review): 'child' is an int initialised from the long 'min'
+ *   (consistent with the 32-bit caveat above).
+ * NOTE(review): (10000 / refresh) divides by the global 'refresh';
+ *   nothing here guards against it being 0 or larger than 10000.
+ * NOTE(review): in gen_sql mode each statement covers the previous
+ *   run [start, last]; the last run is only flushed by the gen_rng
+ *   block after the loop, which writes a start/last record rather
+ *   than SQL -- confirm this is intended.
+ */
+int
+pr_drange(int tbl, long min, long cnt, long num)
+{
+    static int  last_num = 0;
+    static FILE *dfp = NULL;
+    int child = -1;
+    long start, last, new;
+
+	static int rows_per_segment=0;
+	static int rows_this_segment=0;
+	static int residual_rows=0;
+
+    if (last_num != num)    /* num differs from the previous call: switch output */
+        {
+        if (dfp)
+            fclose(dfp);
+        dfp = print_prep(tbl, -num);
+        if (dfp == NULL)
+            return(-1);
+        last_num = num;
+		rows_this_segment=0;
+        }
+
+    start = MK_SPARSE(min, (num - 1)/ (10000 / refresh));
+    last = start - 1;    /* one below the first key, so the first key counts as consecutive */
+    for (child=min; cnt > 0; child++, cnt--)
+        {
+        new = MK_SPARSE(child, (num - 1) / (10000 / refresh));
+        if (gen_rng == 1 && new - last == 1)    /* key extends the current run */
+            {
+            last = new;
+            continue;
+            }
+	if (gen_sql)    /* SQL output covering the run [start, last] */
+	    {
+	    fprintf(dfp, 
+		"delete from %s where %s between %ld and %ld;\n",
+		    tdefs[ORDER].name, "o_orderkey", start, last);
+	    fprintf(dfp, 
+		"delete from %s where %s between %ld and %ld;\n",
+		    tdefs[LINE].name, "l_orderkey", start, last);
+	    fprintf(dfp, "commit work;\n");
+	    }
+	else 
+	    if (gen_rng)    /* range output: one start/last record */
+                {
+                PR_STRT(dfp);
+                PR_INT(dfp, start);
+                PR_INT(dfp, last);
+                PR_END(dfp);
+                }
+            else    /* per-key output, optionally split into segments */
+                {
+				if (delete_segments)
+					{
+					if(rows_per_segment==0)    /* computed once; the statics keep it for later calls */
+						{
+						rows_per_segment = (cnt / delete_segments);
+						residual_rows = (cnt % delete_segments);
+						rows_per_segment++;
+						}
+					if(delete_segment <= residual_rows)    /* roll over once the row count exceeds rows_per_segment */
+						{
+						if((++rows_this_segment) > rows_per_segment)
+							{
+							fclose(dfp);
+							dfp = print_prep(tbl, -num);
+							if (dfp == NULL) return(-1);
+							last_num = num;
+							rows_this_segment=1;
+							}
+						}
+					else    /* roll over one row earlier (>= instead of >) */
+						{
+						if((++rows_this_segment) >= rows_per_segment)
+							{
+							fclose(dfp);
+							dfp = print_prep(tbl, -num);
+							if (dfp == NULL) return(-1);
+							last_num = num;
+							rows_this_segment=1;
+							}
+						}
+					}
+                PR_STRT(dfp);
+                PR_KEY(dfp, new);
+                PR_END(dfp);
+                }
+	start = new;    /* the current key begins the next run */
+	last = new;
+        }
+    if (gen_rng)    /* write the run still pending when the loop ends */
+	{
+	PR_STRT(dfp);
+	PR_INT(dfp, start);
+	PR_INT(dfp, last);
+	PR_END(dfp);
+	}
+    
+    return(0);
+}
+
+#ifdef SSBM
+/*
+ * pr_date: write one date record.
+ *
+ *   d    - date entry to write
+ *   mode - not used; print_prep() is always called with 0
+ *          NOTE(review): the other pr_* routines here forward 'mode'
+ *          to print_prep(); confirm the literal 0 is intentional.
+ *
+ * The stream from print_prep(DATE, 0) is cached in a function-local
+ * static. With 'columnar' set, sellingseason is written with the fixed
+ * length D_SEASON_LEN instead of d->slen. The four flag fields are
+ * written with a hard-coded length of 2. Always returns 0.
+ */
+int
+pr_date(date_t *d, int mode)
+{
+    static FILE *out = NULL;
+
+    if (!out)
+        out = print_prep(DATE, 0);
+
+    PR_STRT(out);
+    PR_INT(out, d->datekey);
+    PR_STR(out, d->date, D_DATE_LEN);
+    PR_STR(out, d->dayofweek, D_DAYWEEK_LEN);
+    PR_STR(out, d->month, D_MONTH_LEN);
+    PR_INT(out, d->year);
+    PR_INT(out, d->yearmonthnum);
+    PR_STR(out, d->yearmonth, D_YEARMONTH_LEN);
+    PR_INT(out, d->daynuminweek);
+    PR_INT(out, d->daynuminmonth);
+    PR_INT(out, d->daynuminyear);
+    PR_INT(out, d->monthnuminyear);
+    PR_INT(out, d->weeknuminyear);
+    PR_VSTR(out, d->sellingseason,
+        (columnar) ? (long)D_SEASON_LEN : d->slen);
+    PR_STR(out, d->lastdayinweekfl, 2);
+    PR_STR(out, d->lastdayinmonthfl, 2);
+    PR_STR(out, d->holidayfl, 2);
+    PR_STR(out, d->weekdayfl, 2);
+    PR_END(out);
+
+    return 0;
+}
+
+#endif
+/*
+ * verify functions: routines which replace the pr_routines and generate a pseudo checksum 
+ * instead of generating the actual contents of the tables. Meant to allow large scale data 
+ * validation without requiring a large amount of storage
+ */
+#ifdef SSBM
+/*
+ * vrf_cust (SSBM layout): pass a customer's fields to the VRF_* macros
+ * under the CUST tag instead of printing them.
+ *
+ * Fields, in order: custkey, name, address, city, nation_name,
+ * region_name, phone, mktsegment. 'mode' is not used.
+ * Always returns 0.
+ */
+int vrf_cust(customer_t *c, int mode)
+{
+    (void)mode;
+
+    VRF_STRT(CUST);
+    VRF_INT(CUST, c->custkey);
+    VRF_STR(CUST, c->name);
+    VRF_STR(CUST, c->address);
+    VRF_STR(CUST, c->city);
+    VRF_STR(CUST, c->nation_name);
+    VRF_STR(CUST, c->region_name);
+    VRF_STR(CUST, c->phone);
+    VRF_STR(CUST, c->mktsegment);
+    VRF_END(CUST);
+
+    return 0;
+}
+
+#else
+/*
+ * vrf_cust (non-SSBM layout): pass a customer's fields to the VRF_*
+ * macros under the CUST tag instead of printing them.
+ *
+ * Fields, in order: custkey, name, address, nation_code, phone,
+ * acctbal, mktsegment, comment. 'mode' is not used.
+ * Always returns 0.
+ */
+int vrf_cust(customer_t *c, int mode)
+{
+    (void)mode;
+
+    VRF_STRT(CUST);
+    VRF_INT(CUST, c->custkey);
+    VRF_STR(CUST, c->name);
+    VRF_STR(CUST, c->address);
+    VRF_INT(CUST, c->nation_code);
+    VRF_STR(CUST, c->phone);
+    VRF_MONEY(CUST, c->acctbal);
+    VRF_STR(CUST, c->mktsegment);
+    VRF_STR(CUST, c->comment);
+    VRF_END(CUST);
+
+    return 0;
+}
+#endif
+
+/*
+ * verify (checksum) the numbered order
+ */
+#ifdef SSBM
+#else
+/*
+ * vrf_order: pass an order's fields to the VRF_* macros under the
+ * ORDER tag instead of printing them.
+ *
+ * Fields, in order: okey, custkey, orderstatus, totalprice, odate,
+ * opriority, clerk, spriority, comment. 'mode' is not used.
+ * Always returns 0.
+ */
+int vrf_order(order_t *o, int mode)
+{
+    (void)mode;
+
+    VRF_STRT(ORDER);
+    VRF_HUGE(ORDER, o->okey);
+    VRF_INT(ORDER, o->custkey);
+    VRF_CHR(ORDER, o->orderstatus);
+    VRF_MONEY(ORDER, o->totalprice);
+    VRF_STR(ORDER, o->odate);
+    VRF_STR(ORDER, o->opriority);
+    VRF_STR(ORDER, o->clerk);
+    VRF_INT(ORDER, o->spriority);
+    VRF_STR(ORDER, o->comment);
+    VRF_END(ORDER);
+
+    return 0;
+}
+#endif
+
+/*
+ * verify (checksum) an order's lineitems
+ */
+#ifdef SSBM
+/*
+ * vrf_line (SSBM layout): pass every lineorder of an order to the
+ * VRF_* macros under the LINE tag instead of printing it.
+ *
+ * Walks o->lineorders[0 .. o->lines-1]; the fields of each entry are
+ * processed in the order listed below. 'mode' is not used.
+ * Always returns 0.
+ */
+int vrf_line(order_t *o, int mode)
+{
+    int row = 0;
+
+    (void)mode;
+
+    while (row < o->lines) {
+        VRF_STRT(LINE);
+        VRF_HUGE(LINE, o->lineorders[row].okey);
+        VRF_INT(LINE, o->lineorders[row].linenumber);
+        VRF_INT(LINE, o->lineorders[row].custkey);
+        VRF_INT(LINE, o->lineorders[row].partkey);
+        VRF_INT(LINE, o->lineorders[row].suppkey);
+        VRF_STR(LINE, o->lineorders[row].orderdate);
+        VRF_STR(LINE, o->lineorders[row].opriority);
+        VRF_INT(LINE, o->lineorders[row].ship_priority);
+        VRF_INT(LINE, o->lineorders[row].quantity);
+        VRF_INT(LINE, o->lineorders[row].extended_price);
+        VRF_INT(LINE, o->lineorders[row].order_totalprice);
+        VRF_INT(LINE, o->lineorders[row].discount);
+        VRF_INT(LINE, o->lineorders[row].revenue);
+        VRF_INT(LINE, o->lineorders[row].supp_cost);
+        VRF_INT(LINE, o->lineorders[row].tax);
+        VRF_STR(LINE, o->lineorders[row].commit_date);
+        VRF_STR(LINE, o->lineorders[row].shipmode);
+        VRF_END(LINE);
+        row++;
+    }
+
+    return 0;
+}
+
+#else
+/*
+ * vrf_line (non-SSBM layout): pass every lineitem of an order to the
+ * VRF_* macros under the LINE tag instead of printing it.
+ *
+ * Walks o->l[0 .. o->lines-1]; the fields of each entry are processed
+ * in the order listed below. 'mode' is not used. Always returns 0.
+ */
+int vrf_line(order_t *o, int mode)
+{
+    int row = 0;
+
+    (void)mode;
+
+    while (row < o->lines) {
+        VRF_STRT(LINE);
+        VRF_HUGE(LINE, o->l[row].okey);
+        VRF_INT(LINE, o->l[row].partkey);
+        VRF_INT(LINE, o->l[row].suppkey);
+        VRF_INT(LINE, o->l[row].lcnt);
+        VRF_INT(LINE, o->l[row].quantity);
+        VRF_MONEY(LINE, o->l[row].eprice);
+        VRF_MONEY(LINE, o->l[row].discount);
+        VRF_MONEY(LINE, o->l[row].tax);
+        VRF_CHR(LINE, o->l[row].rflag[0]);
+        VRF_CHR(LINE, o->l[row].lstatus[0]);
+        VRF_STR(LINE, o->l[row].sdate);
+        VRF_STR(LINE, o->l[row].cdate);
+        VRF_STR(LINE, o->l[row].rdate);
+        VRF_STR(LINE, o->l[row].shipinstruct);
+        VRF_STR(LINE, o->l[row].shipmode);
+        VRF_STR(LINE, o->l[row].comment);
+        VRF_END(LINE);
+        row++;
+    }
+
+    return 0;
+}
+#endif
+
+/*
+ * verify (checksum) the numbered order *and* its associated lineitems
+ */
+#ifdef SSBM
+#else
+/*
+ * vrf_order_line: run vrf_order() and then vrf_line() on the same
+ * order, forwarding 'mode' to both. Their results are ignored.
+ * Always returns 0.
+ */
+int vrf_order_line(order_t *o, int mode)
+{
+    (void)vrf_order(o, mode);
+    (void)vrf_line(o, mode);
+    return 0;
+}
+#endif
+
+/*
+ * verify (checksum) the given part
+ */
+#ifdef SSBM
+/*
+ * vrf_part (SSBM layout): pass a part's fields to the VRF_* macros
+ * under the PART tag instead of printing them.
+ *
+ * Fields, in order: partkey, name, mfgr, brand, type, size,
+ * container, category. 'mode' is not used. Always returns 0.
+ */
+int vrf_part(part_t *part, int mode)
+{
+    (void)mode;
+
+    VRF_STRT(PART);
+    VRF_INT(PART, part->partkey);
+    VRF_STR(PART, part->name);
+    VRF_STR(PART, part->mfgr);
+    VRF_STR(PART, part->brand);
+    VRF_STR(PART, part->type);
+    VRF_INT(PART, part->size);
+    VRF_STR(PART, part->container);
+    VRF_STR(PART, part->category);
+    VRF_END(PART);
+
+    return 0;
+}
+
+#else
+/*
+ * vrf_part (non-SSBM layout): pass a part's fields to the VRF_*
+ * macros under the PART tag instead of printing them.
+ *
+ * Fields, in order: partkey, name, mfgr, brand, type, size,
+ * container, retailprice, comment. 'mode' is not used.
+ * Always returns 0.
+ */
+int vrf_part(part_t *part, int mode)
+{
+    (void)mode;
+
+    VRF_STRT(PART);
+    VRF_INT(PART, part->partkey);
+    VRF_STR(PART, part->name);
+    VRF_STR(PART, part->mfgr);
+    VRF_STR(PART, part->brand);
+    VRF_STR(PART, part->type);
+    VRF_INT(PART, part->size);
+    VRF_STR(PART, part->container);
+    VRF_MONEY(PART, part->retailprice);
+    VRF_STR(PART, part->comment);
+    VRF_END(PART);
+
+    return 0;
+}
+#endif
+
+/*
+ * verify (checksum) the given part's suppliers
+ */
+#ifdef SSBM
+#else
+/*
+ * vrf_psupp: pass each of a part's SUPP_PER_PART supplier entries to
+ * the VRF_* macros under the PSUPP tag instead of printing them.
+ *
+ * Per entry, in order: partkey, suppkey, qty, scost, comment.
+ * 'mode' is not used. Always returns 0.
+ */
+int vrf_psupp(part_t *part, int mode)
+{
+    long idx = 0;
+
+    (void)mode;
+
+    while (idx < SUPP_PER_PART) {
+        VRF_STRT(PSUPP);
+        VRF_INT(PSUPP, part->s[idx].partkey);
+        VRF_INT(PSUPP, part->s[idx].suppkey);
+        VRF_INT(PSUPP, part->s[idx].qty);
+        VRF_MONEY(PSUPP, part->s[idx].scost);
+        VRF_STR(PSUPP, part->s[idx].comment);
+        VRF_END(PSUPP);
+        idx++;
+    }
+
+    return 0;
+}
+#endif
+
+/*
+ * verify (checksum) the given part *and* its suppliers
+ */
+#ifdef SSBM
+#else
+/*
+ * vrf_part_psupp: run vrf_part() and then vrf_psupp() on the same
+ * part, forwarding 'mode' to both. Their results are ignored.
+ * Always returns 0.
+ */
+int vrf_part_psupp(part_t *part, int mode)
+{
+    (void)vrf_part(part, mode);
+    (void)vrf_psupp(part, mode);
+    return 0;
+}
+#endif
+
+#ifdef SSBM
+/*
+ * vrf_supp (SSBM layout): pass a supplier's fields to the VRF_* macros
+ * instead of printing them.
+ *
+ * Fields, in order: suppkey, name, address, nation_key, nation_name,
+ * region_key, region_name, phone. 'mode' is not used.
+ * Always returns 0.
+ *
+ * Every macro call now carries the SUPP tag. Six of them previously
+ * used CUST (apparently copied from vrf_cust) although the record is
+ * opened and closed with VRF_STRT(SUPP)/VRF_END(SUPP); presumably the
+ * tag selects the table whose checksum is updated, so supplier data
+ * was being credited to the customer table.
+ *
+ * NOTE(review): pr_supp() for SSBM prints 'city' and prints neither
+ * nation_key nor region_key, so the verified field set differs from
+ * the printed one. Left unchanged here -- confirm which set is
+ * intended.
+ */
+int
+vrf_supp(supplier_t *supp, int mode)
+{
+    VRF_STRT(SUPP);
+    VRF_INT(SUPP, supp->suppkey);
+    VRF_STR(SUPP, supp->name);
+
+    VRF_STR(SUPP, supp->address);
+    VRF_INT(SUPP, supp->nation_key);
+    VRF_STR(SUPP, supp->nation_name);
+    VRF_INT(SUPP, supp->region_key);
+    VRF_STR(SUPP, supp->region_name);
+    VRF_STR(SUPP, supp->phone);
+    VRF_END(SUPP);
+
+    return(0);
+}
+
+#else
+/*
+ * vrf_supp (non-SSBM layout): pass a supplier's fields to the VRF_*
+ * macros under the SUPP tag instead of printing them.
+ *
+ * Fields, in order: suppkey, name, address, nation_code, phone,
+ * acctbal, comment. 'mode' is not used. Always returns 0.
+ */
+int vrf_supp(supplier_t *supp, int mode)
+{
+    (void)mode;
+
+    VRF_STRT(SUPP);
+    VRF_INT(SUPP, supp->suppkey);
+    VRF_STR(SUPP, supp->name);
+    VRF_STR(SUPP, supp->address);
+    VRF_INT(SUPP, supp->nation_code);
+    VRF_STR(SUPP, supp->phone);
+    VRF_MONEY(SUPP, supp->acctbal);
+    VRF_STR(SUPP, supp->comment);
+    VRF_END(SUPP);
+
+    return 0;
+}
+#endif
+
+#ifdef SSBM
+#else
+/*
+ * vrf_nation: pass a nation entry's fields (code, text, join,
+ * comment) to the VRF_* macros under the NATION tag instead of
+ * printing them. 'mode' is not used. Always returns 0.
+ */
+int vrf_nation(code_t *c, int mode)
+{
+    (void)mode;
+
+    VRF_STRT(NATION);
+    VRF_INT(NATION, c->code);
+    VRF_STR(NATION, c->text);
+    VRF_INT(NATION, c->join);
+    VRF_STR(NATION, c->comment);
+    VRF_END(NATION);
+
+    return 0;
+}
+
+/*
+ * vrf_region: pass a region entry's fields (code, text, comment) to
+ * the VRF_* macros under the REGION tag instead of printing them.
+ * 'mode' is not used. Always returns 0.
+ *
+ * The record is closed with VRF_END(REGION). The previous argument,
+ * 'fp', is not declared anywhere in this function (it looks like a
+ * leftover from pr_region) and did not match VRF_STRT(REGION).
+ */
+int
+vrf_region(code_t *c, int mode)
+{
+   VRF_STRT(REGION);
+   VRF_INT(REGION, c->code);
+   VRF_STR(REGION, c->text);
+   VRF_STR(REGION, c->comment);
+   VRF_END(REGION);
+
+   return(0);
+}
+#endif
+
+
+#ifdef SSBM
+/*
+ * vrf_date: pass a date entry's fields to the VRF_* macros under the
+ * DATE tag instead of printing them.
+ *
+ * The field list matches what pr_date() writes: datekey, date,
+ * dayofweek, month, year, yearmonthnum, yearmonth, daynuminweek,
+ * daynuminmonth, daynuminyear, monthnuminyear, weeknuminyear,
+ * sellingseason, lastdayinweekfl, lastdayinmonthfl, holidayfl,
+ * weekdayfl. 'holidayfl' was previously missing here although
+ * pr_date() prints it, so that column was never verified.
+ * NOTE(review): adding it changes the DATE verification total; any
+ * stored reference values would need regenerating.
+ *
+ * 'mode' is not used. Always returns 0.
+ */
+int vrf_date(date_t * d, int mode)
+{
+    VRF_STRT(DATE);
+    VRF_INT(DATE, d->datekey);
+    VRF_STR(DATE, d->date);
+    VRF_STR(DATE, d->dayofweek);
+    VRF_STR(DATE, d->month);
+    VRF_INT(DATE, d->year);
+    VRF_INT(DATE, d->yearmonthnum);
+    VRF_STR(DATE, d->yearmonth);
+    VRF_INT(DATE, d->daynuminweek);
+    VRF_INT(DATE, d->daynuminmonth);
+    VRF_INT(DATE, d->daynuminyear);
+    VRF_INT(DATE, d->monthnuminyear);
+    VRF_INT(DATE, d->weeknuminyear);
+    VRF_STR(DATE, d->sellingseason);
+    VRF_STR(DATE, d->lastdayinweekfl);
+    VRF_STR(DATE, d->lastdayinmonthfl);
+    VRF_STR(DATE, d->holidayfl);
+    VRF_STR(DATE, d->weekdayfl);
+    VRF_END(DATE);
+    return(0);
+
+}
+#endif
+
diff --git a/data/ssb/dbgen/print.o b/data/ssb/dbgen/print.o
new file mode 100644
index 0000000000000000000000000000000000000000..6f3f5d4fe452bcb611c5ec0ed707adcd5ab68540
GIT binary patch
literal 19760
zcmbuG4}4VBmB-&C0RqBIT2WCD223<jm?i-vAjM1w@Pbi;1A^<HkjzX-WHO2K2SU{f
zJ4AgQMs#gatLxget*%@Dtu9O06@s8r-F8{*TEwdOXIUat@#88e+Vq_J?wvWgnKQAU
z-p^;|-S@q}d+xdC-gn=9lb72T)Gy3)I1H5>#ylf)rl?^YH#7Nak*zkqV-y*mxh8()
zUAoL?Za8)~VC$QJnJg$Ng{*09EWQ(t6}@KS`=W+x<K#S$eAfMi>#ilxNE-K{+~04e
z3U->i4i%WGGf;0Pr{<ZdnIda_as-dvy{?VFgky3Tp4FySI%=&F%qm?<v?R23R<CLO
zpP4vT=v`|1UpL$KKbd^1)!g;fRO`U5latC`Y};RSfnk)r?px;V`JS*b22E{W4_`Yl
z2HW6c2wP@ij~k8I-q<MlW6`~UdC^$WUsw0TP|4h|2jg(<_1^1t5Aw2aS$}_j?v4)X
zz_;_RsPB3(Edv^h+AXJA9o97L0~?)XC*H~(nr{=CqQ3Y31l^J&HK+qP0w><`eLG9{
zWh~iIh7s?+2ciEZb9HZnw+9EfJ|)IKJx@l%JP}hy44au0lOQvCJ8>|DIq{XAvexGd
ztUZbT;jTNbz>%|Fx1bmH)bkf?Hy!)t^&e7mH#WJ>IxwIW>yu0iePe`ycZK~vynr(G
zOzTa)-v8j}<PtoOg5+g|X7{h-b;M}$G~VcXQHK{CZB84SSf9*i4IQ16vvfi($5jjQ
zQQpMY{jToq;FE}#xH7!ff$q1uymkJ)X4|hPbst?~?)tK%*6PLUa@Xoz0dHA)(TGjs
z{rl-ooZcenV9|Txz|kjJao>%2?WrqdvTMg{y!29#x^|qhN3#;^Ss(AbMK#qo0|-tc
zpJ|;^7F`=JfKm5ugD;~8J}_vfn@O>F?Sw^7E+Z*lp_H60Bz8H<MO!`NkV+rp`N_*_
z&6KayuAz*j#ZY8QW23N|Y>bw#kmW9>JS59UGv(`Y*T2_-dQvhrs|o27`X@SQ&w$-K
z;M~}muc&E>%eij{T<7ZIth2thK9g6n+1(!-Gh4(eHc{BI-r0#&L|)dm&PAhjiCc;r
z4A)&gh<RP=q2l$xUVsmT$9F)wYM<Jn&-mc?9m-Ok8M>tJ$PnenhbTXpt6Y}h9muo}
zW*f{gQc@^-qs<)Xo}4Fg^4aCo{qkJRL;b2D%BzPcSCf&3m$u(9M13_`x%w+})=&9b
zWw{7pmWv@S%d=lB`b;$wFFDK&yB`N{>Sl+E8(f>Wf79QeS-}Qh|2GVM{r8-SV@IxU
zH(bPpwqMzXyjZ$C9IIvN(nYd-1XDgeSC@Nm?o(&#!I^oCGd*I7AfAUnEJt*(XGveD
zY#EwJd!CsLDPLDE&y>OJ%LbaH7k=Lza@iQT92X2d9GT$ve7og%F4UO9L<ox16_S01
zrQ_AZEyn>F6mB`bNtS1?ug_q)){?uATxJieWp=suYVR_!c~2Z0XWddfy3YE%&icJ+
zjV?C*_6}?I-dGJ;g@@UJPfjSd-Z}A35~|RVFmZbxhiPT6#jgkx_6#T}u1|ILZ^I4u
z11H4x%=GWAw~m_+m?yqW9Lle^K7yTjzPaoDJago}`c&~tSg+nXk~lP?ZseZ2)OE%E
zmv{HO?s^PnQN8ue0>>wPOJRgf>p*?k5i@bp;o3L{&eaTEE<<Hod0)+_^b{OW{L!Fd
z1=2IH&iX^&K`<sBCzf4qy{ERINnCDVgM9`bvTr}@t*OO-Sq;;+x*cuXHp#9Uo<)kl
zougOSoBTwt!yea5M7UikubB9-Md`P11(~}^0hp5mXx|=a_sql}A%-SYuC=}_>#MWg
z9ynUZ<!Jp0M$38?#tMhZzaK`+Oid7@m3VK2Ir25>6%I%vOf)$luI?A%1UK`3INBPV
zEk1BljoCRcB~0sxm=czVXS#28h1qrwH|Im8?47?0i{yM2J*`g`7uQ;cnYp?T0wNzE
z?a2zgH~Myhhu_}Dy;*JCOnMo2!H|u^eXsX=?+xC@-2*0H&bdjp0_mwcXl{xN{pf?+
zpzHlde{3I8_Tl!yx2ak%L>w4moBCboooW!Wa{8ov<E4p_OnFJpa=$P4X2kECF~CKA
zGm}bhR<hsu{OP3_eOCTwBIx(k=Ctom@2t7{^+VLpx!D<XGY!DaW2tRXIg1V3*J@^G
zgB{3Lexr1TD_7fmt~{KxK678O_wbn|Mc!O~2D>Djbf)#CnH*s{#3gH{mKVCVE-KvE
z8#~iXyy!4pJ9=f2Yir)d-uOpmYDuYUtG6_#!UTnxQ?UXRD{?AApa|tutOLcmT#9?}
zvB$M_10HytxGHyxP$X`{M-p+nN^M>bwM6Xdti5K^d%#RK6q@-(X1-YQOnc!o{oX_H
z`-6P6G4s7gGTGyq>`9q*ZFPVnXPefmY;?zXU0bWi@b1<nt4qy%d$jDKtjn)1m-ZoB
zmC06TvSudRkjbvdWLL^8jBhJ?(8_xNk~m7@IV6dfVN<^AaR{L~doAqAx4|+?0{s`y
zxndaW;vDUzmP`l5^qh(YP&DLFB)y&K%V0?th`Vx#Q%f!a#YMd0uj%a+Fg=Ag4U?`u
z<LOY{@_Sci!~w7z$YD9P<R(zu#4BjGzM>1qm1N=AHE`j8Igbm+;_<GnHREm5jG_b-
zB{>xSn(~2j-}En@VLD(pBA2hlRbZhimxaaEX1=`@wCsbwrj}UnFE&lbE;C&QflgcZ
zFK!@bsl`j7t))5IN-b^$g}joO9^VLx8*|zSgCfi;>;W;1jz}OFiF#tqA;V+T`h1NI
z@N2qJ5-stUL_P76fTtut&O|m43<YCBS||xf*BYIXws6ejH?HgqhTSbqZJ}Vm-O>?p
zmqgu7G5DTS5*THaOuH~@lvGrPz&L(ph5{w!v+OgO!9q_-xs<4!-)(d>3ZGX8*BPNq
z*5A?I-WGGO?TD<t1h>6ICEI0pOQfS6U0d54j07>$9E`0Eg6H6tyD1#NBidUufBqbI
z>9Xc{I2LzTRCs21%BNMt#c9P4D=R(aGs+~E4daco2tGK$7G1H-fO~T<GJGh<guka@
za{S;b!_igb7(aSM;oY!34FQ`VN(|#UyrD027H!J&I>+6f?{&Hp1&f@|IP!kt+~%lp
z_Bw8G?sa&b`y4@l?W3G-ka?X&HO@lsX?Y7l;{^?tt%dwbnQw;te3=LHQ{l5owznWl
zd--b}qcT=)-~L1DSKxMffwL%)mp=<s(5hjS0{Vk2e>|JLhw_v;(a%D(0cmr_R#@4>
zIF!or@=W<yRlXF;@qq@@5ezKEYI9zVbKG6|HBNV`pvGCcX_(hpe*18*vnny7*4dDM
zp~JbN#<?<Y1LU6rJ5^BZb(Vr0s@2HYVY`n=yK(G;-N&VQvX3~fy|Vl?c|`U*nQtTP
zh|GTs<(N?SwPe2+#AZd<4y1cznSD0+8;c8!Px}X#;M~eyIUj0dOZf)=@H8;thTj2t
zQ{qB-#NQ;4S8E)9@j_lDxoT;O<mH;aOmdtjn3R1F<k5~BAtYsIHsq0y6Evg0RC0_b
zCZ+d59`*Pe3ns-|Adg(Nr}!GlQHx2H-zv_;?Lp+!*WXGGQ++_XLvnF*ASL#-HO#;`
z(R7bE8D?BbA>_A6Uaav4CI6nrACcUv@n1;3MB`6LevQVTmK@_j)4z(7VaClALjK<+
z-=Ohbl6PzT70EYi{5O){ukqI<|GCEBl>8};zbpB_X#9W0xf<0>;Zdm<e>})w<5S7^
z5{B6m@cAyxAeF@03@3Sj)Erc0cpSy>a~SSsIR4csoBi)I9M{8a`l}g!9m8V`-^lQL
z8U7H%A7%Kn4ByJ|SAgTVsPPc@Fo^$uGx~!JKg96E(*8-gV2FDj*#B7a2DyZadmZpE
z7(3r6J8~+D`yJ?q!MczB-yt_G;ywp_jO70&7aDPY1AZ3pY~wqL;TJI6!|;oNqd))P
zg2o_;`yU+VNj?^TF@mJ-dy6EWbh^#ey{|#?JLN`4-T$tUe2sV<HVkzSY?l0TxlpF>
zgK9t7(&?!vH(ELyL!Q}2s3{t2Z0`sJJyk|$BNWYo63@jp569a*GmQX%8AhaIZL~4g
z+7@k$23NHQVP`tsW}U%E8dVyRV6-g|ZwfV{-cw-&&}O;esh=;tDlk`xU(@kx7JkjZ
zubKFDF@9Cy*KGU(FY#N{sSuSaM6C)@qe4`v5Opeqp$bv8Le#Akbt^0EU36n-B-m-h
zI|EIzVAN;}M}v`=ayn|n0>PH35sgLs?VYO7-pwnG5e%<s^ml|?+Ey8jjZqQFmPUW;
zYNMqCcIT>6s4X09iv`=$)$F~oI*l3w&kcB1XcYCM;6fw<rdx>VU{_lVTZ=XMS7Td^
z_zYkKnpZ(@;K{)7cZA~Y;id=-SUeUL-+t(Bqdy*v;Ss$tTKu7oXb?|3n<4^7<DH#&
z9*8uBR|R3fT42mtja9*LWAvtw(XuuIT@a^{@G2pMpc!i-&|fr+8DSYS!X{>fQOsb3
z?fDCTzz_+hqk;KT_^T_Vc{av<yx8L^zMOE}+k9=y`<n^J>s9d^CCBCDLc+TwNBb8M
zo?!To2>%Yz+xrT*F9E|#RkdsHEA0Ic?)eq}E#xtw9xq|V-$w`u{hS6L#g8L|gnGQR
z75^L|B-G<xPVtGb?qNbb?p+iwV|X>;G@h3cPWyWm;k3WYB}ac~e?vr%OOWzsJ<-$t
zs&yLuFNcpB_xp*R3S#GH41b((>i^S(Q~#frT=!o+PoO`zR4V_~Glf20qfrP6547FW
z38(FjlU#4NlIYI@TdLiKL{H;(HPO>~axKwU5<6W)k9&P(|2D$ueE1RJXA}Jv!lx7d
zh~(I>8H7JU_)Nl|Cj4T;_Y?bQOZoW$;Z?xZJULGIIYgf)e+bb1aS@KsKFa=h$<Z9O
zUrP9RqMt+T(EipDJwE3u``0r1PNJvn-by%acQa$>VWOw)KEvq6+Y8{I-rtvropXu*
z2N*ja68%J?|ANuy;ebHGcB!9ZCD+GmJkitmT*vSh!Y4r+D*j!P<AILXM#AS1{k_DF
zoA6%{J+<>J(NjNn5I&jM*~i#<n{evSCk(g$<Y31WLp~oqm?jOgj~Hj<wf3n|COMj$
z1s@fkOBsC~qhCh!w7((3X}fC}JDV8&7DoRoM!%KO|0kn=gV7&j^j|P~mwZ0a<9xQ{
zdYs*i{z9Us<6cep`DDBnF?LpJe6Ae#)f&gSizy;G`ibwAsQK_i!p|qXTeG9=CpE6@
zZ`Zie?~z=O&p}52A*26_(HD#Wry=R~&yie>m-45W(a&M@wT%86M(=0z>ll54(LccG
zA7%7CjD9zxf0NOF!05kZ^!f5RT#x_RlI!t5kI~O&^z#}0HH_ZR=+`m&1fzd|(Lc)Q
zdl>z0M*k|K{~giO^&x+xJ<nA<@!1B`B#q-99aEX)`nb#_db*z16OQY-vU4?KC&cL2
zF#1i5ehZ`j6{FwE=wBpy+OIbWr~UeXvGWPhQ#-@u?@B8EDn3P$>v2Ax(U&v&g^a#|
z(JyE8O+-)U&sxIIhw)9X|BRi#)A)HfsUSV2@nRbr+lYO--}o(~KfvfeBzhYEQ-tF<
zs&<FT-}UtPkC$AHm(rIo`b!x70!Du=qYpCrn;CsKqu<2nw-7xY-^U22<NFk2=b*+X
z$Z<TZ@v)L0C-&+7!7<tvVEk#jg_5iID?4XtT(x_y#^o|-%p}|kZK!#B58*Yy)pO!M
z5svdjwfmgp`aIdo==U@FHyQm0L{G=%6wyy1{^yN>6G#}JGWg7uzf+7P9IrXm-!llO
z{hA~>_6zr7N<WS0aqcTVgK%tjlWg}A!fCsU2&e5Xkz8+g1*2~xyoC4@CU)i%ek;*q
zyE2T%?F`?faW#J))3}-s&q}V(hi#1hMWUzU`vK9@`FXk%oQ8zsTMHi*|0#sic{_`6
z{C!O67Z8qdklQfh8jWK=Fg0sj#i3Q>N*^Yi{{Hp@!fE^y47W6{+TEgY75@h%M?dI#
z@+9GOJ$at7vq$4<J$Y5*%AbQ8SN^;!x$e)Wgj0V`GIk2(-+OS}F>cBqr^b~(=Mr8E
z`t-OHj&_uvS4xh4()(<H(T5p*oYCJ#^mLv-q3P9n`?AKB|NDp?>gQobf1J?|#~=M6
zsdy^;MUv}rIG@p%Gx~*$zJbxVF#1kLe><bUo6-N0(LclJUuN|C82w>Jf1J?|#~-`1
z#b5G4@n`hqjD9ZB)BT>0aGd{Y+^=WsL>c|9jQ(Cm|M!gkIY$2iqko;zzs=}BWb_{s
zJ-r?a@Q)3UF#dF1I!$u@dYMc(-uGw7`~F;p-@x$o4F5-l?_v01hCAioztMl{=VcmK
z?}%N+@RbZ-&G0o0-@x#nGW-RG{};pGVEB6sKgRGA49}A{4i!%ow=oPK$MEwQeksG7
z8U7QE*TOhsdWPZq8UFtaKU2OBq57-rmoj`l!-EV@X#7LledA$<_i9{Scl#OsM~3$^
z{2ck7hw87gKbPTuqj7u>2~(Wm_c2`ji!Rht?J7I3G5Y-sKg94;3?FlraCcC<V;Me$
z;pGgU$MAZFuVT2x@PB6bZy0X(){ZmA`Vf9W!gU`Lp5kYM1ZrQYxEpVtkks>&;_B~_
z^0uG0Gg{h_msgrEmVBITF3n$-+^unWYTT`H^?q5e#?^aahcvF<3mYxxsq+7%?C4aD
ztM|X^HLl+O`nkr{`&oN5uHMf&pmFtn)-jE%_p?M`pbyGV^?ue^1ca;ivkXskU3;vl
z8IG}teQZ^^NHEmoi3Pi2h6i4(X)-*`(Wv2xbl?ffE8q<)j}X~xU>Up)g)j5NyIO`P
z+yVb+0I$za6R#7=^Ht%vr!xX?AVp&9q@7@EV@sr|J!t#-*L=I7GKOflW~5r4za!_n
zDwg3?rw0Bl9H&-wn%J3g{P6&iIoUqqV;I&|^IpwQ+!G;}(;sqnny3T{$fMr=RN!b+
zukV)i8?^eWeT6F^gElJ_kVjSj4UMbz)o}roV_r_1wECoMUp-f<0(lv#QxUgp@#|V>
zOCFa>j317X@?VALa$wo~ABQiAK=Rm!^tY<7{J#!LFkhZl*qMd0KE{_ORZksp3aUKS
zMPz--Ft2Ol!0@*cJ$_xXz8lJDQvNHJf(+W!>))^HYXy4!JE3m2_`4U`EmTQ|G5)wV
ztNJSb+1kdM+wj7GR49K4!pCNu>w5i>!1TEb$mpGd--fYT^Iwl2K6YfQzrqyNjr;6F
I`dhF6XE<_@r~m)}

literal 0
HcmV?d00001

diff --git a/data/ssb/dbgen/qgen b/data/ssb/dbgen/qgen
new file mode 100755
index 0000000000000000000000000000000000000000..2a11803792d98fb9b5136bfb27cf16ef12225fc6
GIT binary patch
literal 78240
zcmeFa3wRVo)<4_{Gz26jDuZ}sK%+qkhN}cb5@4V~5+WfAt_UF+NHiodnII@P9ioiG
zD7%Ww>Z+)ycw13H0}3R7NmSIJsPVEoUUy<}HzK}9Wu5=;RMqL}X%cY%pZk2@_dbqh
zs_Wd(sj5@e)iup>*OUpdF)<o{I%yYcRC6XtNV-A<`$uTfwGmppmag^D&eXa9-x+^Q
zXD^##fy9T)I9b&?p(MWEbkR<Ex&&)nvUVzoucJ@$sS?AbmMH7Qr_-gAzVLC1Z?oLG
zQr5Y&@(F#L6uqq+aGHd3Y2~BXEZQsai$01j_Y9ZyRT;8mJ_;A{tx|le6d#wGiYJ#G
zPqLBzMyPn&%T(a#!=;_CQ1RKz@sdsBl624~=Sxq)|Bb$E#TVQm<=9J)@{dc_cPaQN
zF2DAZs0gPieI2#KD&;qq)KBPhL2=RC;X^MdE=VaZDydwQvS`Hcl;J}MR+J7LEbwHP
z6MxjFCT8VQbJP$_iPK_N$U4g-Tu+53`l8!!KkmcRb`N~C@37al?CE`3`K-1^q9MD9
zhiK?S^0>*lJ^&X)BjTaoh|1})1RIFI3-FhG#_H4ex!+ijcF=$N;5|Q__}qg(oCt8c
zpS}>(9<mtJ?ZJH=z*9Srb4dsA<2rx?tU2*#|GWi2d-gou0sQF>;7dDzzt#c#qz>#{
z)B)Vxft&|BfIrp&d`buKdpdy6?g0Kk2k<ihcjC|fSqVUUalW<#_?!;lojQom!Vc&U
zcL2}pK+dThz_)fF=kyNfDGjyf@B1CV_jCZC&;fjN2k`M7*pu7={K^jC*LDD>eBi{N
z{qqk1+Vl5&z)#Rl)zYcM<HPeqHm>^#Nc7imkP-a~8~tbnHx!(58hwKJBl<ZudTsWk
z>6x<&+~w~1MHL=*`Si?j#ib?g>3MUD-P-Kg^B0ts%&zd{m3wB-mZa?x&MPi0E!T=m
z=Yu7GQQqu%MJ0K~Mc2EvdGp<#3a!FZo?l4#{JdgM(E_)&Ag>q|4al;h0-*CeU_{2x
zSGpGD6_sf7$|^ni_%1Ih@yrvo3N6o5I*-1J=&M+pS6b#SAu*+89&KKJacPBHC@Wh`
zg{Sly4T!vQVU}lcnS1u!*`&9^Q&3t7hIz%66@?lEm0ssACytVQ$yp4k^9oqFyJVqO
zq0GpeTUzcR0q*j05?)k7_Lt`i4*)AEB@KC=qS6x0y{L%%Dyk^WPfMF!QIVfl0za-R
zFY-`m;dTBsv-1nDnLRJBs91yd3ut;9PFMNzYT}ehW5>-NJaF*9VVZfJW?l|4uP!vN
zhD3lvBfz0({aF0hNqom*_M#g9*I8ZZs*J(R#<*CmtK~Wt{W5}y(Yje`P=OCg_Ms2b
zJerEX%2SKFQE$Fi!F+$b3)7~qJ%&1cNY@<-NA;76isCiM-K=0Ii+WdWhpIm(_`7Hg
zs@_xJJ8N&N`euRer2SphrT$o<pXBp8kjIyH4nZ|-g<9t^yhg3R<AMThjz_LH+2B`i
zl60*$c!`3?eHtyNT)~|-_(BCAVS`_<;Mq3#G6gTR!EaLV6*l<I3ck(;zg59E+2FS+
zc!Lf8Cj}4M;CCsw_F1&QPbzqC8~k|%Pqo3fE4X2U?@{nMHu!4_zQP7)y=!dnMn%8g
z2LD*WH`(C-Q1BWX{Gft2+TcGcc+dv#{EX@+Hh8>(YoABQt(Sr)+TeW@+-ZX+D|oUE
zo~q!fHh7wXkFdeVD0sRJ?ox2W2A`zh**18Vg3qwQa}<1z4St1!7uw(j3SMS|FHrDB
zHuypXUtxnUSMXIf_{|Ew#s&{4_&OVWt%9$&!S7e_jW+m31>a<YKds<3-D5<<bN*TM
zqU;B~X`zk}!*5V<rwzVL!IN$9l?q;FgZmYHkqv&Qf;ZUU_ba%wN3=bUD|oUE{xr+)
z0KUQo-=^qS+2A`AywL`)Rq!Soyg|W}<D>0tRPa<A{5=I9VS|67;45tK|55NJ8~iZK
zKR#M7+cUxj@A{H#-&HpF@e01i20ux`*V*74=b#PVThWJXaHoQgNQkzxKkK!@hq4?S
ze2jv(+TezQx7pw`6kJP;*82wqkF&wAQt(6@JWs)U+u#KX?zF+@D|oUEUZmiuHh8gu
zr`zBq3U1io*D3fq8+^Hfx7y%d1#h##S-y5cbbMAT`ZybWt%4`o;D1)|-ZuE%3U1io
z_bPa{4bJ^)h7G<!(XY3`pHT2N8@yV<wG*TLdQrjSY;d#PZSWn6zPAlttKd!>e2;=B
z+u-#Io@l!dGSq%pZyS8Ff;W94$1Cmmbds0iJ7|G-wZKCbcrOdQ)dD}y0&laxQ!H@4
zAEbRQx~Jq1-`^3O_MG^`_jd$OWgtqvPb4_)Q}Jj0$MTA7mn|gg8!hlyCPul*0_S^6
zLRVYhq=P>-7C6O@KMfW*&uxTjw7_xWV1Ak`@a`rQ_q7)IaTa*U0`Fmgw_4!1(J()4
z7WnZd6c?&pC_V`mc$@{EXn`kM;3rt%y)AI-{lC)!KgmL$Y=P6hC4W*aa0dfXj<CQ_
zw!qUZ@ZJ`<VS%4wfoEIbYAau0XIS8;S?K3j;HO*Qg%&vNb@8Xn0zZ?1C>L4aeJt=5
z7P!*_UuA*!wZPX{;QcJ{br$$p7WjG#{A>$+qXnL1fp4<F)vcqzR$JibTIg#m@MH_T
z!2<7Zfj3&<=Ud>vU;Z9}-y`sQ1b&ae|E>spmvH7!hW9|6;n&~ascA-4jVCtTXn6O;
z?G}!Phh4ox(+-A{u0fH|M<e<|!W?Y=G#n1E5p`O)9c<ob)@i|Zu({T((}MM2^D}0h
z7H$WdA292*U_03SN3%`~wS&z!nsr+I9BjVUtkZ(+U~`^Xr-j<V=1a{wEzk}&XP9+b
zm>p~$WY%dxcCfjxS*L~A!R8apIxWBsHpiHCT6i67{`MF4mlj;`->lO@3;vsRT42F{
zvrY>u_;1!}K?VQKIxVE&zgedRJN!56w4j3jW}OyN@ZYS{0t)_{by_&Vf3r>tCirjG
zX`uxF%{ncR;J;a?g%SKW>wQH1+kdnFPEr5VtkVJr{+o4L_`rX&P75CRZ`Ns{1OLrB
zEpXt!S*L{!{O3A)U*>k|ed9IJ|4{!D+a1>W-);4GZ1vY|^_Ok+?Y8<BTm30p{b5`E
zUR!;wtsbz|t8De9wtA(lzQ9&@+v>Ay^;}y$(^hxc>Z5J-VYYgTt)677pJA&zZ1s3s
zUANW$y)C*uzqi%DvDLq{)&Fj*zhkSvZmYj+t8cf}w^-|gzM16Pca`D$%<z8Enmye$
zsAkYBMqqR<x?|XJ2PhBC8<@~%B|0q<Xkc{nH&p1KQIXc>aUiuW1QDgyZsA};pA|&6
zTa}>SD(HtjOGW=*3}35J`_C~(ZCfWJrq0;+i>D_9xRijnaB!aJ_smcEcg5)K;MFS6
z&o#WG`+-t3d|!Hc82-`cpc2}8I2;ZYAeq(aYe5$?3%sU&;%~kVNV>=cpW#~?GU{Dz
zYi7r2hCef8Y@0t%ehgoO@x#}KuMxJz8nr);F?_EY{TmHmt+8!%Vd_tMVECz4n&Svx
z`_mYrDQfJSMP@ZnCC1ezCBOMY&gT!`v&U)W>T3AjFlv7RSF7RMWfbiSVfHZk*OD+&
z9gH<%8jV2DDvBaq3J6JbwawaXdf`2g4o{j4UnLC8Z74AOos;PP$MCt@2Gtn;+y*1y
z3L3tB+x`LhI|>A<K9p^ML?S~+AcsVdux;PA1l(xt5a#*nL%jqtv&q=zO=-G`{<nJ7
z*%FIu8m~}+2xNDM+3+TQ=_)eq4Z~MoO@4)3$bCQsVGGccJ5eYC`jvvJ&hWh{srpE&
zkfK89h-$cv>PNIcWVI?PGz(EVY*cScDrn2@UKg7p?DE!hNqGLmRf&SryDK)~d2Lnf
z8lc(=jPqqYJ$mSrpTpr-tdURU$R8PXh<Ay=(v(lnRN-}f=z|F&>Vy=5jY4i0NLN$6
zmN{YKw7T@pXduY&=QbfQh$}ghc?$`O1g1;GWunq0)wwpRAPhgkn@fJOdE*o#!dDqQ
z!W+>%@CNb0Wnd8&QRG}L4641E3?kFyRh?|G5@-~}-kM|(ZabBtK(qPIF8B)V!^8~7
zg1)ii{d&KzHO(7}G4e%Nww^|$_!9wp%teHbt{{OpB#dBZ8Vde&n2;MZyt`(IXuKCX
z<402HI8zYDzt?P~Z!rAYk(|Yf^Yb=kY9M=tGTAVE?^1K!jrEHF1&qa{ejBYPWbX~#
zAR`|%be{n3+Chs6#36@F0Q29m*bu>hrPPgjieZUv^cECGbt5MN--ZE<E|H7&)|*>2
zZC=8^UO8waoY!zLAyH36d(KO^pmyOYkOhj&R#*cL8b%?qsik*2h{u6gV^Ku;4y_9%
zO~NS>rJB@R6KB+eT(}$Z1~I@yq6vz~Awi>ssp%Ct!pV8b3Z?GRWW*dv`m&N75kRDE
z5z;lkI-&XX-T%<Es+xqVM<8@3El@)l)Pg?!Ss)O-C#a~ZsT_psTX4Pg1#&CkNXEQn
z1cvpeg0s&HKZL_OX>f*KhYTt0JHQ~VSW0tSq@|HGYEnr1z$>JENCitPGNmntw0hkE
zdNK0rU7@bTR_|&>4kjbJkk~*rdU5>EUl9a>lg*hlo2rg41yuh^6eo(n>-$8-@fH={
zUEswi@Wnz}4M741Ao><uZi^!&Qrv(-5~tr6p}2!6I!7qhVM>6)B}C!ZD@CCc7K(}^
zkBTUTQwbs!ju$kTWu(3jg!%fw2*Fvnv|5xb_4N>Zx-JT(ukA}=%C}TR>3dHgsMCau
zzUW{28w8LB?50bpe3n%HjG`79SPbOqt4OfEju6z>{fROI3*<clN#hDLl<46|MAS5t
zqmP^gPWs+9e9JI=H)3S1$H-b|%boSEmqZWn>t~a0bju#0xO2&1{UibeTs2$c(H7h4
z%3`z~dGoOp!H|9~9Vox!+b^ha)gX6uGE(A39VeCn7<~TR^@vYTmB9KPLWlJ(U8i}R
z{?S{|v8ofH;V(agjeJ@xtTX;%;5v{U)1uRoiA1h9{Nx|37Ad4mWNXJvku~+Jsc$Ac
zjW9z^;QF|&$;9N_C1$bE$*`?uyp)k0=Xbqitdrvy?o-elCrK%1ODT<#5<N9-mys}4
zRzdi~td}GPy>{DWx$;A+1zLE?brBXpa%mLv7HN=6Dy%KNg3(*kmuxzRYzycYQ>YNR
zLd-E}=trqn71hv^Wt&*Hq%|4|nKZvNZl!x<b3&dphUi`E>(eg+SY3+83>d6I2BFWS
zZ>3G*ZC&wYGWQ_ndRG&XHBap#*+pd07iiiR%Q&oVu@bD2D>zhIA<dw?>OVpDHqat&
zzlue4%M3Z{T}|Ga7|g-P<jnR_TSE&ah-0alVGy-`6#|GU=q;Lp8V@F%kh=!JOVMKi
z{E&c6^NfUL^i@(5$~YJfQ?*7`$9@+MW7dkFy8<`22{URBCd_oL0ndQe4ucZN&K4#j
zQr?=xt+b6`_zs$ZerPns)=@;Q>epWtZT&8~=+Okui3p6_O8XF2fzyP*4q7O%28KhC
zYdy6Y*+bhT+XmyywCIGR3s@f|AZ{z|t8D8|0`Mm~Y9snv$KhDBiO4J#P?r=*)D&>L
z5C9tk2JV|WawXdmhD0^m5B^Imc!;y_Q8>#Y`r{R6wc_-rUu5Jr9IG?wSa`$NIE=c6
zt+dm#?FxL!WfF$!8>2)7AO;x@f%ds@>G=02!tn-M&O@K<$i+1^q{fUD28YOLJ%p{l
zlF31-gf_E~N`?ecz`irY;xBW3hniy=&^ZuaLH)?Lv}(Zg<UcxxzD8I}2^Pf19=`fI
zqNg!=47q~t*oj2nDD)>*V;08UCeq^54aT<P-g~HQbocijQVBTFxjSxsyF}mm9wT+=
zqRWz@w5;Tehr9<Lf6blsvBXz%99--D>50Dj|L+H+Mnswx90k8F$}w#|T05`zWhWY5
zS5u7W)c*8zBhA&6u=;+w!%?$M%Y}qP9DdC(T(2T$7rEp`iYbOp^zF%z1Ge=|YKM2Z
zSTBiTnFwnrQ)2AZZcBp99b%aq(C7bMj-?*BNf9)G?299`V)d02CroJcjh%u-5u~3r
zXxns>!8gqhUnrP^(1jWN`m?D=1@y&>NHpDeR*EZi<M}<Thn&#Xyd&hyl5#LaA;-Jx
zZ1f<=TYSEhcY#&P(^5+7R<dCmt$$2wPL@(=>LDrTD=9;yl;-QmDrHL%Ns*DsrUXMz
z3t(^gM?_N}dI_@vN%4&>M&c_VD2V;(#aK~U?sGx8=Id}+bd5$|HfTkx%eGRCqi3ZX
zhD#Y}ibbTbvsBnuDZB&<+imi6$hX`mdJj1NZ?*ysE+oIGMdX!=LVk;gLJ&svqyFK9
zQI@TMIy|F56tER=kzYUggQ%^5zC?;Y5w81&5uxKSez1ZNw6hg}7-M7Pz0irxq|k8^
zWa0K|Zxnq)yITPTinA%COqJ^qGC5ss1>BBULTwO}=C!2Wx)o4{c5XMWA;^o0t2vY~
zkVC>Ty73FF$?qeiH_8QJlcCR1cLHhDM<|1~zk_Jmn?yuNmdP2$qU9)@*kn{Ymhw)&
z4Lg@|hfnNW0@$J>31M3%DxcGxh-l)a50k}1arfzJ^#X`Hn?GPEw01a`_}(;X!`SR>
zBYv?kZf}3Hzu5k)ddqV>?r1_W%eq$62yD@1c(Ha)bG0Sho{w!@C=TJK29qw&HKuK!
zZXnxwl5K5b-?r|*YGtg=%Kqqu-$Emj3+<GN8_U*7xP9^Mld(I_LrtTG$9UQ2TG0!N
zad#p3eA%?0(`t1smRQMUtds(4s(BC7m+b_!4ZSkf*p@WuPiNErY8FRe_O)5&aNGeZ
z>B&M;`@!4r(EsbfJE+V|6|EjMfD+BEGT}9vU)-f*0&;x*j))#S#vY+Jlt_{C^KHDi
zGp%ip0<3hOk13Z*Z%yy5{n6vLRp4s}W-T;SeQ5tBBs#a%chhD-{v)db?dh~y1*P@r
zw7jRLmj;}NO0)qv$1otZgDJY+^@Wm*o6-7^@df4)TBD+GqM9bE2gD49+_9}286j8C
zJ3bGGy-N?oE$`=d9q_rnXzA&9eL*|4lw)EV>olj=^+k+#=@)U!zQT0dx^*tuOajb&
z{m%=OOSD7?OztN8{+ZX3j>rbK>q{||n-KZ#bsbDk55SnHUf|^Esor&n{KHcmSsbRK
z<t)GJkhGPCi$52YSJ8YyBj#1HvF~*qiisit^de{FD<%w-a8V<=mUB(06xJq%p~Ax2
zV%ih9vZnH7>Z2lN#0r6kUXTJ`^BJ|nVZZBelv^FOq6j+ekQddgxRiE?L=%P#{fwD;
z{ZYfDB;_`JieUQCi_nw&>AkUz2pBOP<ORzFu*<0Ztdr4SZL!d<0;L+PHPWS^L<?vN
zVX!78b0EIw%M@`;`DZ*x?sZ1a70+=_6L-6^p+nb_S%Phfo$claqS%tGY$GKbJn5-Z
zYGkWYg;m)-NTe^DwxVEA>NSnndZVi;VgnkOp$a2R^6pAfE42|~`S@@t6mt%y_7KEn
zBpWgyRIKipCoaR+D%dKmY(8d7Pc3SsJvU{*H8lm&1_(kD5<LI|E)i%rbGpjIzHCF%
z8$?m`)*Z;*M(x+mm|e9$#n!g<@zweo`!|@x*^)Tj+t#&GJWQ)gFC5f@`Nh?UD`9tC
zdXY3i1;tQ}mFjx~oh}_SskbeLnXgV|=Bu$&PC~9x_CrOCr&mOsN2x0@)s-l9rBUkA
zp$;o*Z|!t3zC-A$0*|3RoqGMNpF~!%vaPuBn6X$g(i0##;oELa)<J@0#NdI3uks6D
zZm`}Z2mWM?D7AP-X9?(Q?$b2%$u!p&mFL?~iy0NHq;?6-tJeler7gX^uER)ehr2D?
zq%f%x6M0DCTYA7(8S>?Rf$WIY4;DW|{J95=d_2aj=~LBmsZkqp*0$-seYHRK>ECco
zLjg4<wl3pg)OYt#&%^raI8+`{6`I*;)gOx`&0%Q8vyn!>MutbNfFc&)D3g=2AO}F?
z!IpBBtfJOL0f%-kREjMY+7NE@r}u#5B^mO%C$5(dlh<)+uHcgX@;VXM%g>-|+Ab#p
z<wLRO{;pQLExjo3d6ypQylfTWIelAN$a^3$YQjb@I2-JSPe0=g>KuA6D(JibuDj#9
zr86Y8S|r?fU_B)m-}XYR`Os^kA8uC>l(I}fKYR_5si(@flafGHnZPR8&wwR7FJK8f
zYP@ZWJ*VJnc|UJkiKiF7miP9yWqEpd+h$fCFISztMJH@I0n#6n(nED0g~KhkAUi}o
z(bFK>Vly7e`<8y>t8DV+?l*jy_Gj{b*M8i#(?puz=y!c&<YVn8^Y@Fm)5M)YD?HYk
z{p7b;Q6G6O0qZ(j{>EZj<!@Sw_Ypd<QlwVfw+&%fZqCb>krFhQH#mmw4``j~$JQIz
z1NsYp#p#siJIAb|wR1@90nudRb+yvcF^JW7Yr^XJ6jd>k(KG>0b854NrTIf^SaQ&1
zXP`G!`zkm4ayR)hpPb@zt+zI5h|ViyuJ`9Y>95@6cWp***@&ZAMj-d^{>&$#t6DBj
zycC&$>+h6QDxt&Y+Us{cY2-J7WIX}r0O-%%YvjL0%YT}4eDIDIJ@1=gn2LPucY&r6
zT)-d_V2;7o0tu!CN@(i`^09fydI+E=WE=ijZPD!wznXlNXze3UV5s(#tqvuyQmp@)
z{F=&DyjV@7_A)mb{&o)C_;Bo*D{|i}$7I^BB_COYM$keB?m5IJOZa`!WN7K=jyn+^
zS{zX)VyOFJWpyQrmP?RPWQI!-3#nsexTticFQ6#^&xc%X<`M}u&{>;f-Or}gEvY&h
zWu*H@-M3O(i&HmIXJqJn3<W+#K43#f#_hMb{#CZ96_t9RWCS(|w~(jQjqh|hOgq>_
z{SR+^qXI}zi^To8xQV1?Dct}sJrmxv8Ryf;tcfxD!=D>nA-a2C(|jHV=NdFKE%J#z
zGGpWzHy$X&78G*n#($$b__nMU#oBc!?8C6WssT{y2u6KXBVYRUy4L`qr|(Y@Dxm*u
z9touU`-r%z4{117BJNcKLQ9}32yWc{ZpG~YIEMKook*>riPSG837)A}3Y6%uZ#0pV
zsv7Vb-tAOq302R*L(CJ;){~&MBsZh_H0oi2_=|`&Ff83F{Db%S;eIPg^XuKn_<-a1
zh?@EBshJ|wI6iz8)P8+Dv}IH~rJ8r#Le2YB@LW@2s_~ID>CFy$*h8u4`*(#SP0|r6
zG+Iwp(N!G@6Y<$`mLjh^IyvSqavS#+dh^57mfr26aVl?!L~*7tPDF7XOp~`=h5(A~
zv4dT~hbdQY&L;^0Jz3&*!UQekRxJLi^<*+7!uLWuO_o$*kv#$t(VCq?gfMCw!Ul2i
zpBJEziTa%`xWsgX2UM~t5s=OAgu~6t1Y4k*RG=kAk{3eIhK~w<HEG|BczD}lVeIpw
z1*#;iW3?a>PWF?=B~;x)`u$ZT6zSeywT>>3B?Uz63P8O>1enqCk+q-{rzv179UIAh
z83#Qk2!)Q%^2w`VeXUllfDiK6$Xz5(s;b9<EeZ+!UYS~hoPxsr`%%&`^^1XK_$On{
zzLrK8?z%b~fz$C6xIVND5)I!tpp-F`zQq_6hSOFC9-%Aqp4kKQrbzR~N%L-^f_vnr
zP$|v(fy5qFq>8?rCYs(h$^Ge_MGz^PF^%F<#4{TUG-0CO;43+wS&ZFf<YOBz*6<1#
zc}@eEyE+|-A(zjmz)&iAMQmnCp)^~h2{sQN6zKK4LF->Dx?&)H%Ut4&oE`j8nk$4u
z;1#G!w>68~=H`mo0|$c+Dbf`+t&bM6tkXK>%LepQ-}Z~hdf%2*6t+C+uM#+{A+X|B
zCpKyO3GoW(-RsHt&UBdYq6;e1j!OG6<pSEW<c=e{w(%BLn*~Q@d{{7^4o1CS9)*W;
z@x$V(KIEa3A!Li_jgx#$oP;||0-1z&k?VoA*{Y31zZL2qgL+@pc)-2e>09%R3a#CT
zr4{ZYV?7t3ea_tll{|psFBDQE?K2bnJ24-HL#0sRuPTI$Bn_+E(b@`};++Kny)e?K
zTVIM^Me(Irm3zjSGF4KD|K}X?Eh74OVY9@d>0kd&^rgz~GKpz+b=Y64yevvZ*KHXX
ztT+*qjzR@U3<4RGi%r+j{m3xChy#{W&=v!v;j1zLZkZV^b*q%R)gpD<h8T3Bi0Pjr
zf-<V@h<*e}B^tS%G-#D+h|`W5Gte2;YatMc)|gGjuqjmV(EqBD$G~Bd<kz>;2tu|m
z#`RWpJ?E%B>KbGhv6BO?;YX#+5R|mfE@Ht(gx_KSmLb-IkMF-pCPj}i?|~7tcE<53
zj3GKjmRW<Li8>ZHI2eH5t@bcmW(&A~1`;Fa==MY=ts7lruR7q#%j6n`d>8Q`Ih1fF
zlX{E>dENjw{7$;r5|3eMGm|zrk$hZ@%kN^2-o!!M<dOx(AaB_1Plo-lr84Nt{it~b
z^(NScevF$;C|wH?zTE%?P0&pMrS0~Nh@x9?B;6H)ZfQ58zJ>--%W!#Dig738V-e6*
zbk>$L<bxWoD;VQl@Dc9FKhhFzUkDa~KGNdeY;nu!N60FS$cnP>^*4_2V;ey5gU)e9
z{C)UHx<3iJh#$J}<4Ch+*plnc&{Ql)3c!utuo2ILHfmyaes(*izZ1m@!Z98<DHsD@
zDtHWh_BxM&Ht^QFHlR(kX4<*;yEdSSdDD{4zXeW@-AU)~LjMMn;p=oH1x>voMbX0G
zLQ>sJs($4ULUo)~^?XzHQmBS7n)snHca!Rmt=P)$NC#1Qcf~2nDSMH2T^nL<T)N>f
z_SF+^8-oT#s5hefM~x~k2&03E8|`dtKv>5VZndis!A8x~6-A9!*<JJvv3?VK5be%j
zGis<eV^bF=2$wdfG1;hUqv?iYSu_a%yc?KXODA#Yy;`YehL`hOK763Tu}m>!}-
zFHqzPb*%DBWN?d0x;3Yxh<pns4T%xS0fWxfrm{oxzn~i(U$4BdW7&@h+2Xk%c<aPk
zL4+avR?D!C*~)Ef&?NT*!*6xWlnEw8&f6CD<aA7RuB4((h;|kbKVq{&J;xOGS`<eT
zmOr<~@O~X*q%|h2{+Z@(+?T2r!iI_5b*?c+xf+%Y>DZ3jBbut^fhH9fNQ>U-aEja^
zvM)BX^<7}ZJYkU59ZO*#wRy~|FoSMey-V9P&sih`S(J8=T2<#ReTB|P-w0;<v|K|d
zWY?BLAtwbelCu?55jnku91JL0BcxPf$I#VAJhaW<y5k1LQx(N?a=m^!lwiI@Ac@?$
zrg<X9(Hh}iWaoqSrbfWC24CeK%n3oz69c+@eaH)MeEK^W`G#MAmkL_6{uLBEF@tEK
z-9n@O5-<|*tOPtKIW|ZRk-6HScI#@A9dH!SpkkOPICCX~Q?6hyrV#q|QVTPk*4Y}6
z%oj*zQ805$12c>G2lW0#B-Vp{#Z`T%;A;`kAS644b(sywCPB;Hd5om2h8ivO7&+qC
zzoy|GaD00?i4zOMRN)~8A)e8T^@*d7XmEZ5j2YE;3M1ohC5FJT7X_cXpPOJBc_n1{
z^<~6IDY=R?V(GeNl^8cZ{mB<`q0?Jx{JDEXM?qUWgCBtUU3-$WbnzR2m48JF@;hz>
zDU@IF0!fH36#63Tt+(rO!&yOu#2816l?fm}j#NWtA1ty^rT0o1K37xoi6BMWi%m_h
zs}bGSp^;><DiQ8yKq67d8QmKi<ZYXnaLam33%(kQE9EeRk|*4A%p;>>8ZQn9g%u!$
z6?nR&Km0rp0mmzsQ4znJ3d>sYWYR=NG}CLM!RighVYq?OaW>9!gWoOq<8QMueq6_l
zqa<SzUKhza2MHd>HN+E$pK4=$$YdoJOSG%nXx%A8LTMbQtzn)|kA)>-qp~q3^oR&e
zqKb}zHIO{|obZB{@B&`-D8v7@!e8X{H2596aOMl1+_OcP^H?qlO9*<xoaSZFEWD4U
zWaHQSJcOss)MKh@eD5YqYDGMzODVY|#h=-b;rmt{xxi2(k%3{Wgk;qbx4+B*q8IFf
z|8&7fLteD;EEE3|)cEzM9yB#3Ou~{tKY#@VqL(3rMtYmu<l9Aj)MA}wi|N}0hvOt+
zO?=aI@LT*GU^4ocF-9yLMWlZL)>rG-v*4xg1MxiX;a*sdhK5mV;rR?V3;JY-QEwS+
z_FD4bYS@E^10$gh(b*vEQPH`7kBZJ3;xI5QP&Vxlq-b|1p?yHmiXx%v4RHhS_~1DT
zB(2>=1WzL(zrFz}ftD;gC|Odc8A5PMp~dnL3#PXE(0#iF@t;Wqs>?wG)z+o<ltBE3
z9Lg>*S3?xg9B4RLuAe8GfbM(K#g+2GPa*^Q<pM_c<5!q;H(2TNrRpg(#znR)w_;sV
z%5n({)$I~O7pv+EvZ{JkXrrvkE?gh_i>$t?h;NnE7FE4LRy&K~S05^t)nTeSOIEW~
zHA`0KtLhk8U8Ab0vbt4O`^xHPs(PZV9!Dt%vF#+Q=d0?WTH#TOs(vY}SE=f|s7C)d
z%ZX>7bhp#|d1vb1hrOXJZ`*kZx7>+pRQd};1gF36NSsW6`u$rpZN=VLpMDoEn(qZ0
zddZGwq;ZdsdRjEY(5uN%pPmCUP)wmhZGde8^;Ws_>p1&TzyrfX!Bb%nQsg}PKP1EN
zm`&pSdK}4+z4IH~Q#Cuu1^swHSL}`R={hc&dkEN17#rS(7@z(vE}GjgC4?$4Phh!*
zs|SQoy}PpcAVjgv&y)Q={S$COUlSF|=EIv{^LdnO0*<n5Du#)IZ609SJXYHL1Q;OW
zPLd&QPJ<%9J{y1)dlP;7Ww>abC1xv~-cCy~n`g1*JZZ}%Bmn!^R47}@$d*)T%NMxG
zf-Rz8Th>F0jGyqw@8~0K(MX20<*A)y%i9>Q-iCCa{wgk-_tAWz@5JSby?{MWu<d|B
z`^QfcXMLy+Y#ggw$yT5K2athcIu**kMo@1}llG05_KAY+(@gt%k&k}IauV;?6G?`&
zFJ9O;1ORUXn)W<gG!K%tOjEXuwAk_sUHkL|5`ca}g|cNl*)m4jQk?~OV3;V_77sXO
ztb{*)$9I1dGWI-+LfUfU4vN*I0C*eF@9)D!^TQ+&&r^sPaXAwn`E=v2!1#gj>x)6O
zVsB@kUPf~K`n`&9tu*u(l0mPG2kIt;8VNPxXE<==E6GJKNyRUAYUmd`HTVUo-U~Tl
z;YDCtAgpwBL48U<zdr|(0`WUh&~JE<Egky>XxKq2;44&Vp;gd8KTajBJ(Si%+aoPn
z4Lp=L%hC236)K9qJcTIclDz>(c_tOZM8Q!U2u>Nr@#LW2@e~-K<vuaS#C$Reiu`&W
z04w%7efkVsw9Lg`JxXr_hQmYxT<X`y2mm5CoB-*5eKLunH752NNvL=(Chl@*&giQg
zBwXln=m_14gZ9!E;g3Hvh*KW0yQiSUQ5v~Az62)Y5El-LgcGX1!&5H-j@(m(oc|Kh
zXrZwa(Elp9Nu#`~554;_;pr^<3K2`1qkM`APoKws0dznwF!|&a=1P+<&g7dB;p-v!
z2AF*E3c11LYo&<zNZ+{;zWtvIeTSwIj_j0I^`U_#-=Zg(?*~N%JO3*AUNZUQRek7x
z{?7i*G5P8vd<BwkrO79+>O(tBKEvc&7vUQq`KFkB@~S>`v&olg@)bt-4t*y4>uK`I
ztNKuu$=BQD8xi4qN%DPkDaS=#;YA20pJwuPi}0<Kd>c$Yc~u|!_#=)>=n0PJdx{G2
zoFe&(O+I;5A9}>(Yc%;bM)-P4zM&?cys8hCn0(bH-@*vrN1uwg9L{F{<W+rWxXHKP
z>}L}qd>bU+9+OXA)rWqe6B43duQK^gitrVKPYjjlN7Dw9vv2!L|0doon^4$x1F@$}
zP%&=x`_X~vhw)P(7Qar^e?Yxn&%if*jC%c4zVxppE`OC2OuP4~KTsYy1q9w;B9`Xk
zkY@Tk1}=IzRKPK2lBP||5DQGT)}+)2#O}#d44W?eRXO0gt+2?uCo$CZFbUm*e&7wp
z(Bk%OF!}Z807%lXqCgt#Cl?O|VDasiDP&zWo_IXH8_Oo2J^%*6k|ZjsY8K;>sj$(|
z!#KPHd=H7&sqpCw?}Ecsd*B*b*D?mw`2f*WGMb);`t<2k_3N1^{A-DjPQ%~?dhMq=
zoEa{NJD=c{GQ8V~qSrEP{^$>h&P)m50h)R_-CYD*TM__$PC`R3zCbSTLzhaTe}n)|
zuf+64IzOdu7I4%|BzBRz!(cXi@)4nTJ3ai=DyJdKzd)3JJ@_as>>n4?LwpXg1%|B@
z+$zoq;Fl>mi6ZWIlz_v)V=eMas(2FS;DHLsR)`&R|4I?5B@|kq-miakr?7O@F6i>>
zsgRN;x<<lnEuz8B1jHv|(!84hjxAu2=laHU`A5Dy*VkE{>+|jQHQ}LO48A(WH2L1{
zzuPZZt&@%Hv~m*}rVNK8bVdcgp3HySvgap%A57m3=0p|H;}$`=d@A^^@GJhU3FK1b
zcC!oQ`5jXp2Q*McG%^@xe*-nLX{AY{q?fDDM{jCg0_P*0r+EW88o&1ra<mqQT&#}1
zQTt1u{;j@UzD9BsU!7ta``7wjl~UOgbS(4nXZxO9e4X8sIGZOIQFn6OL<7<4$rbo*
zO>`$sxNRyq@q%ctl83nkmre8Qb5N75*D}=a7!FyH2I-+@f7Kw^_|jyo+VK0Nc4D<;
zN6Y1rAhnEDgGkzjwUXh2rIwah@;uqUYopEnyCc$hkZr+U2gs5%ryMnpz!MCD=LD<c
z*6}1)jG9-rQ16Z&HCN)XU}XF(Cmz4P9!;20{e~PrCy{`_u<=5U8b5Dt=JE3-$@Du0
zf+KqT6oaIL@pB`gX#9w*Wq<zANibS()sab^*i3E1Z6Y>&aTLjy8Kiwiah^+T0#gSL
z9WQQOazTbCh**@>2G#3p9{3}9g`G1xy&i?Rg)qJzv6k1G;C7Uc|MuZCzDmwaqOR|Z
zEP%8u&-7lf?875(XYoUaE|p#DT?aZ7;nD+LMTNMO<mP#lF4Wm#CH%k|GO}kZtqt+m
zaF=JCjo>vw&_fWkzrWuf$wuBGr`0A@{Q`Y{eI|MXrP9ApEn4HPrbc<l8HUi}@dg3z
zj3)rQmd&%VT&`?DA6fDs1m+$_&*(=J8K?Cv&!XE#>**&2#uBx4&Nz|azRoAKTyA6d
z^mZVS;_!9?3x;g_gfRZjG5$=5dWD8d?N^w7;&%#okK>L0yx-Or50AKw5^h`b3$`61
zOKIv*mUhmyXrV=Ho{$!4ipo?QfngFH7AxCvnB&a|<UfP)&%NVet5$lh>mT;vZR7(s
zyYcJ^FD;{ASLpQw!^13eZ+e~_62CH|gVOrF3WHt^jQ&W35JzJ0LJRn-_RSX`2mC_9
ztH)W9^$`3`=4Z4RK=^UY(k5*YJ-KVTU>V-j<7%SLDHM-#1((s25ezc248w@zi$R8K
zs-I8#YGTaWK&+R@QtZ>211_1Ypq_|P`3pji!HQixUMqAbEZRMq=TJcMSmY+g?*#B*
z!d2px4{h|yhZ<>?IF8tgJ`&ZY$z)0hUq}yh2B2LpvCPBi{Dgt7;Pt&C*OlGHbu%VD
zS_l%w1+L)2GsU&=kgkGCL87aTt|%TBuLmm+$m_TxyzWWZBfS1)^^smXUIB1kLNZ>}
zbk(a$?Um%$OvC}PRyNIWwMAZ%5fz72VJ1QlL<oYip@XuagVfLzV)8Fk-i<Ds?*~jY
zt=WL{m09G<Xj&;ywly1Y5@Dq)mMlA@9arLi@pA*R;ol`Ji+5tKo|kaBaHk3G=zRnr
z8P`|N5?-k{fn4}EdiBRl7gg2kK%zXwx%j=l$_8KV9@U4vOB=9(-2Zq=c6-#)c}fq;
z!h3rs+;SSO=!|yc@z6gf0-~O&e1>a&rm`0~7(aN%dP&=P5xzptU_mZ#abJh6@`iRR
zQlL5P_rlaVJ?SA3H-B)P)FWgJCK<^0T4;eO!)MB<w8~fo8NxotRgfc6%i8;dj?gD`
z{3f7J0Gt}-jVGz<xR9#Ilo+G*#AE?hf;4ooj-rQ}&dLL#Dg>>TibCVTDRuk=h(Hax
zR|>rWI_mVf!iAHS3jxR54=A!%71?x<kp$u0ndBY*w9xH8)BXhxk2@x<C-*-Bf$qf=
zh`b=)S>sn>i&d6vdxF$epr$o%>~4t|Jr<A=8zp7>Gx4hiT1r0g_~Yl%nt5NFFS3%1
zsiB2rEG;9wyKw9Vv<*hlE0VU^x$no?wm!Z(Uqk;^Z_O;<J|Cs{<HTx9>`K5`vk|*K
zMJCOG$6B*PnG|sB1BnP!9BxP``qOX<$c#W9Svw0~p#U~*5CK>qbUZIsOr#?b0!<wc
zp#kdjB|^tD-7IcL1dWdtw7?WJ5tf*Ou23y0+5v*Jov~y^S9M?-k1tHV+pT>4tbC^s
zUvJ`j6Wdg>hqsI)EoAt6R=%%AdyNx@x5Y`r1CGte9)k9@yJc{Oy(yxwMs_j>Bnb4A
z3f&9nG}lL-V4c1QIIa09Q&bU)TA^sa#-SJ4r7i<Rb}4?7fhG}4k3giyN!#y%G-|Az
z$GF3HV_VYik*HfCzfQkam~oD2#%CCE!sH*78To>2GURE^;jWezqqxbA`lmM8^uKvW
zojaD>;u4t&s-aU0UG+Fw{sKT+GyRrP20NK8AA{u{`TiJA%OJnTpNpYOd3HWpmgdt)
zIz=P*F;x^Q6)~jZXT+^e-!D{jGgZ6{Ng_B`Disr;0!^Ws`y94lv_hW?v}o>j#>=c9
zWYE-jxjBPfoulj!u2u*;_6j?0?82^&W;>><F}Y(eH}ivxnyF9^0ad4W7ZS!O31ZMt
zW|!^v*8lvM+V8=1cFHfUk?nWiMl$IKfVAc(bxW5TX__?VzhzQ_X;PaU2hybDgh>U$
zq*JUW;aWxM$?qve;G7O;{|M@j5`~2zZotHHQN5htl`$NETdmseeUzJve!nPs)Z>4a
zo%Rr&RSD=HszLhs!(yu2OI115tubdhIm^*HHB<*Qel>C{$#4a+9OwkI>DEI1BrCXs
zCSn>CLrz7B$p}Y?X#GY%lf?Te>Gq<K20ZPU#Gfgwj_iy4^AQ-K4)V}@Z?LrNzU-mq
zZ=s}}V5~gUVVsI2>c}`fa4}g&Nq4SHJF-<Op_nuT9F$)L{z0`dw-+Z~>hvLqi`HCd
zHDnaV@6HkU3jIvyn8jq2X^QV;72gZu>-189wC1y|G9oSef{@W}qyJT+j&x@KXm)0;
z%mdPyjcCet!j;>xhLIg<C^t1cXVoyFeGT6o?65(aB<e^X5={;JtQx-9VDSM<1T{4N
zD8$gx;&Wsb*^sALBmJb6uP5;lb7-c?_k_upXXW!)`OYLhVh;5+`6^7l$E<u8fv-+?
zh~6<7eL;4jF6&e$I^#~VvOaVR4%nDYl1Jiak@#KTMy2MKQS6fo0>CHRv*{u3oPT4v
zFaJr(y#R9S^j?UZ*8Cd2r21Ph#6=&zPm#q5vVQ@riG4%F&%25wh#?H9)^Y{0ieFU*
zcbozMSzXo(r5M89F@)uzSaTGM@gYX>3}JR;6eq)8+bHh3Y`yCLE$14zzs431{n9fK
zf3}VHqX)1_*dId#Gac?M(rHuW6jZ_V41!GiS#DgXti^S}(MxcPUALSEsqg8liO8>4
zDOw!Dcf=5_Umpye8P)X9AwumZBdHknDHR;%JgAn_)O94wuhaTPTA42BA|4J0DRLiV
zY0W?3%e)Ix8$f$;`kW%Bwv5}^XtnMaH{C)8;`WG~6&s|k|4hK2g#bDstFOLW_?Kie
zxN1Ef15CB#VH!d~bWZ$iICf=?XQB=DGWA<}8-hMc3f!MYk%~lMC<JnbPPpwb?!#L;
zuW5<F=^wotap0V;7<dQnM|;i?14G>?uKW=J!;Ru`l@oCr8C#h_E2nR;jZG`36xhux
zC(=F|TR{A1`{NNLml|6>)_#UW|64>~hO6OM#=G!l2$J#kOhLC7jrD7p&u3=8W4+j6
z5xwur`w##+;<gtsoMRK+X4s>`Vl|3mVQ$N%aJl>P^XQDKnh>cWE~ia=5$#MNQv7=F
zPb9K0L@u_9Tw)dZG)SyFUpPRH?Fjm%w)H6--X~x0YCxRO<8xc+2ii3KbrO5B^`UQe
zp|1yAEd~yTHNQ(lk$o!l;|Fis4A1%aT0Yp@Hr~_K+jf~Jq2(ON2-5}~PMNiwg34gA
z&xlG7RNSh9{*YVk&)kfqLzCz|p{3A=Uw#vSet{;FBHwM$O+|fZ#=U|>qGDXtxQIc(
z=!X(uBcsq`d!b&9SGY7NSRzo>^eT8)HO>o1koB%c;W{0N3pn1nkSrEWHW<N!@pAJf
zp^6^ZQM1^7YW>k)qkd}rpG5dZ5uN&wzt$4rZ#sx@mS-&9W3YUZw{5cLOnfaL<88~J
zSdaId-ZBIc9U^)L!b^dl#}x`Z^$*bs6!-zEa%A8;!;E7D{*!xtRp3P%aJ((8aJ0Z*
zIou5VR)o|N_<#P=Zs7A_OY|lbx;7p3KB8;yr}pJeD|+}I(Ze%DDC$GU;gF8%;X!fW
z)!L1)n=5)0&a{4McCLh#9-1a3M!a?6GS7MF_6fJszuM|;o1U<8H+0g)?UaD%4HRd2
z+p;|uHvbb<oO}jvgY0Iru^UC_li#r;Hqv81AYK9jy{-Xsh%o!r)fB1v(Da>zr;VN(
zK%(!;<v2;bTS%ahBqnZprvrW-<tcQ}!>?MM*qCq9t;k(c1%4Sk0J>w)m&Rd<PXj2T
zeYMj5?he*|QwQ3g{fp3!d72jY7@)X_!8<adhH1jeThY8Kk{3A>R^C8W8ns^6e#^*h
zSe4j32Pkw~R|D=bybJc@u3^7@G-g4krBmE^Ae-9Bx4i+ZzAby`du<JU2VTOrt*77z
z&>=t(ZahZ)NL}HG0j{u>g`YZ$i$*+q6c?>VK+scLy#EEJnocx}trg^fs#eg*2A)an
zD1NEw-A=zA))prlI8z{&SGC))f--+xbHxzZvdL(-(06~PxCjmYwPa2IMqkwzDE&g#
zC~>_s%LuF`RiGpvRqFZ&cqJJ{USMPuOSBl_8q?G7H+*Y}6-lvWykAH)*43@lNh&6U
zcBK1+bTUQCVkT1)Su1*+j8EFGgw;f?bfWeA%7*B-R{RliEsZBrX#<`%))>CrYI|eR
zuVCt_t!Nxp%Xis;5NSvJ(<seY#P!9krgcrSX&sCRxSA;$*1Pt{gSTaxcj^8v%ez2E
zO-pAXiBdjBHmz&bZ+@vpf(b}P6J{71^`TURw;Prr5-*lJ=$DEDKjeF;TG|)w=VNz8
z`Kf+ph&OX|A!p=E&+tnIY#>YjT5zO-9{waoKaTg2Vmu!ri-1i<5j4)iGaw(v@geNQ
z;$HggpNJ>Wll&8;wM*MNdlwvv!4ru#t@2S}U(^FCS|D^7^8g+{1n^J9P{8GEoki`c
zer<`6lDo|ZC<h{{%Z7lk26khhdP7}d(T)V*eYJ0Y<!eRc`od0yU{hQ0-c_1oiKTsd
zt6$gx5qbuha<(Z2pTTQ+uhOoT{!;uA)J>Awsv|}`ehVRgwaV9l9QoEB+rK76jX^KW
zY@d*xm74!JP}K2?{&<pze<CPX`gHkGVmAy+kpvMyK_q$r;+$%R0b_vPYxeeM)cW19
zT0E!-Tc&mt-zb^r0Mx6{5F``fXjdRnQI<v<#3$U=U5s#YKDxKRct=!kKgOs&vWFeO
zvj=+*Tlc@IhxJ!Ij7sm)R&8-S=D*hN<TKoqBkBKlz2T}3dc#wHwDpEE<eQDi@$1p^
z*BzPTH&Pl#x<+qYFE5dSD8Xl<!{gt;kXfjKo|({ZXw3;>JzuKjeZ5Phy5>eaoAEBK
z?y?Lyqgp1VBeFn)$N~y$W+vo}YK(*<sB0v(HD~xR3QWD5MNXV7^lt953<GNOk$Tr5
zabu1_LR3goZ8AVMwM5(`iKDe5L%Bj?T4=?OYkl}PJaAJ(LSMqGK3p&Pb2r0>m_~0~
ze8TOxyTdw^0bQ1j1}!3-h<h<2!rgAW2|EgF8cDC2Lra;=Fzbj^uQ;PEZnRkllaaQI
z-lE(|FHN+*G7$+6x1j!9kkH$zBniEbFV^?2JkD0@Yk>HehM1cEyL`FP{=RJUch0f=
zjY^2<os<v}pj0zxB0w+)^BGMySP7^^OT;ZRjvA6E;pqM(904?i!;SVN6Q$R+85wso
zGVYUDSc<!=rPZjNgBqkDqS5(x)NN5C&N91m9|czfk1a7_YY21*&>9{@avB$OFcr5_
zv~+5ohV(>-JGv|z3KY(Hh=^DR2=&wuda4C&_L=ht2tV!gK}8?kQCi_vWAyq*&dl;h
z(O8a%!!KryUh_$o;pNq031YFp(leu)p++_X@=B_On!-soA32p?3M#~nnvVSFNNj@?
z3rZmtl<kTXWZnh%*8%9bOoHzrnLsTs`oig<bR<iReECj|Ba<bvBsX27ylibb8QLga
z;%<eSf;Q+vrFcpTt6OQaG>D53UXRhR<Xv%P>plPWNA(`>fmE0PIpD(qCc6phop=xg
zv5C;ws%}UUzE>JY(+(29SexE=hJI!x{ED8-kZI>r&X4ZN*S2d5-jqqEAZ#HmLF^q0
zsrYxps?39aElzoIEb?0sEa;FI>A;;YmQIP`dKBKBiQ9t1Fi>lL2w!+}2yGaygB=Ju
zJxAL?IBcTKhG+`pXYZ1=dNx&~WXtgDV}NK!Gq8@+1y2vU>(`(F_Pi>ZEVdnbj352b
zN>Uk%Gw>hdiarsu6INe}6=BPXAbXj}#1HNCZMdYDpU^s3e~RE~yOyIn=noJg7YOkh
zMW6mh!r_drD0D9s@~BAE&6v|{cp2$^FhRB-{<#WlOjps2LICtZ+PcIG<?;TjTM-~J
z{?TW|ySUNbf?oV^e&=HRe5jQ80*)1DQ!&g-1@E~3%f~zJI@+GkgF2<!Y}_Qs6O%aB
z64jW}iZs{ZCGQAQpT5IOw&^dRpnpq6|C^}85FDtIW8z_?Pg))8cMuA{A2<!yw-7xT
zK~%RBW@@2jM3xLxXPAzsmD_p)$}h+G<Sb!(s1RO<hODxUoJ8N!!x%WNK6)~%?^z`I
z>7I}Is}+AT_@lRNW$^v_#l-bP6UJ+3Sv;f+q&g}~PT6)X1fgr*AY|0-40j@H(JjQl
zSWLf+k0D~}(J?gY^tpl(Lk)MFw9v%u@4lt22v{p#D-l|Lu!VAGtH|I82kzy>6XiBF
zq935-FOsg~f$MNdFN2W#9k(cqNIh`XM3gWKsiBqb^7{zeK`-;`uUrqg=vlYXV=4UJ
z3u@4_eR>8i{Ekti^;mD_t(@)GPZWf-s*tbnrB#Jq+ooQ`{FmCV&%}}_*a_P!u0!;m
zUHV;r-9qu?BQaqI4W8a4ArPO^4~6J0)i6j4U3!biRY4lJRW-{_f-|^ApVI#fc8oT(
z$_5akRle*?tg2N)Vy4#1iOABHa|M%V%SR;F(^jr$<@}CuguFx7a>f(dscB^J>6r@k
zSEP#Q9z^Txb?AO;U|t!?J<GKQB5Gjt8YJ5qyc9MO^#+_o6#qbF<~lwB>cRm?dE~LN
z9b^q7#R0-IA(;*eHbzdigt{Y4q6+ardWI7NB6+TIW+nk#_?O786o(LTxH3`y+9_pN
z4jYE*ZjQ)kC8F0s#4-rrqBDz;bD3@QqsXnGspvsSab~r#<t%xP$4U1I9bmv&)<#1#
zr>1Euj6u7Igif96H<;WZapF~SlZEkfEZjB5R&de>-1N^|iGQHL=h{Sqf+T3Llg$VU
zby5&H9)B`W!tQp>XeI7LeVDt|Vg|V!|2|O6Z8Jk0(AbEkehM^b>P?i<sAt|w_5P@P
zT^ng%7c{5{@@k-$N%CqYFOlTcAg`u<t?H=3nrgld*|reCrUC7kVhXps3R(2rO*nQY
zelO6@_Q<i`evB1k`~RGC)IW!TO`(<nzlu^!lXReBznHLafE_bbK>vCULPQVSko@GJ
zx)NIu7&p=XYVhYb+44L8f}W?T6rP^cFAM%8<TgS}9AsgHil<B(WUFK9otM3iCK#6o
z^*DnbL>fAmt|eU)9VaBVYih0p=9>Z&vjZ5>gQMEUZcRF}{P7yHJj`lS{$gSd=s(XU
zckN~sL8uq=0=5ZzFMC}X*k#!^7#N!$G4QLorWqF!$I=5al?c8tesaY4)7bcc<50wK
zy<Nk3nuL^AIJzf7|Bk%NCumj#x>G>)y%b$>bU)zOMAtHG4@7vbh(P@jXh8&88G-(d
ze3TMqN1%%$w37fmcF<E*D3()<rMN^Xj#}3jf&PRQxe9WpWmpL6_YzL^f*T`r^luCw
zsWuWay-M{S3Q|#|0qzzne!VL-mkLr#1iOUq2Lg`V2y~u+TK~D4fa9_VdSZlWP6RqA
z0=+r{Jq^&KnJrrS?g)L$57gPCRo@dq?-gjl^l$|FynveHhazAZKkFn`(5ndeBD~i}
zBt94E(ghLdtOztK0(~?B9T9<E8iDo$R5`9`8vWz$xQmO!PfO!=qyGEn_XzwRf!`zW
zdjx)u!0!?GJp#W+;P(jp9)bTX0x|S95PXtS2N&RL2z?EuuR#UcxXKDo=>m88Sz1y>
zQfgX(Hn*~<xL{zumYG*vT$=BkmseEmE^y{~oJkcUlM1?PnG+^X6T}5tPGwnHagoVe
zT%b+P$kHab#%h@v)3l82X((rElXJ81H$}_Hov7uwvbCw>rfXSv^X7QhIBifG*~EWZ
z66`4~FRJh?$g6<#1+YuIyr{&3BB!jpsARr20aj^Q?(3X0-Ff8|&hdGRwZhWkqJq4|
zgEUuO1t_hs!HgWj7eo1C7+(zMiwm{Mc_o#3<%_im?z!dSA~UZ%zfjA7>SDxWu{OD~
z#Eq}w#ac$?e0POM%W;=^+zaNyzNz`1(sH`UDqScc<K6l4BB!zh_Gy`=qL^OkuAu7W
z?t&6|HNCL1T!1E&lOtMAo~N>$E|LZgsX$1QDx66L1MnBs>gl(q%X;e=5d2~01zEF3
zI<@R#cV30tnO9PA9YS&LVt2*4&Qe^Il%6|K8>o%X$(cQFYSx5F6RElLlPa`wZSI2E
zm7b#F3bgsuGIt5L_&l&daHQe8cXMiET@xo|X|AmCTGAh`S~6=%652e!w6et0VRxD4
z&MR=b%gZ6Hw8U9}wsoGHRB>*1VM=z!bVIA~<av^lD*7wM1A!`aE-tNfUYA$mae7Lf
z2x$3r<wYJh*)s-C=Zwpk;&M*C%r$Mol&O~wbWU@nOv}x3UOs8M;mn*m&E*_tOqw!&
znk#FdjL^07-6coq(K0NT=DMcMR8bjNacwaobJTF8q@0!Gn(mx7_41tA6I|2B8E7Ev
zEGJcT*AU?>*SP7<=~Kz#Q3S}$oHX5e`P6Bbi~<Ek;DV%zfk_4DQkTc5nC={xI}HX;
zce-X|Psx~+<;=;-$j&jQPIpet8YR8Zw3HNQWyv)qrPra6iu2|}Ony@Sxz3{bC8gzV
zD6e}QM--3ocLshLd$hmx_-WmT_#>L0_&MXzXbCqJH18|AVbD#mTvb|m1MR@9c)tE`
z>d(m;o9Rr>@#K{|bMgz_3-X*}-6i>j3-ZdZ=}$36%Vg)3=Phtk8p)}gTj444U}&J<
zCNCU_=0$_ye{|LENL*<{ohi=2si~=&HoY9<EiYeCW|x=dyDKV?tDM>Hatxvcc_sO7
z=Qs?Kd}NVvrDcoDi{=+nI>rC=C_>Y6b228nMiS(r(lU}#ak29c&TA{(<wfoa=dAAN
zhyprN>z0z&ts8X%jPZ=DoJr0Kcv0dh%CDfV-YsRW^P&n*`Nc$1T#A85<E^}Cu5hBl
zIj_8Qfg})WL7y+AmAgH8MJ3MsQY73G59nZQffEqqwzA?pk6VqSZYc$(_5%03yvkya
zv%(stg?YuuZi<@hDHQaSx-pUqiz;Av-h4Nh79i-0nXp?*QN(U?4X3YR4e~x&Q##K%
z4|9;DpnNdbJs-B^i2w;1-BOAp{Kcj7on_pL6+|yWBI8pMp@aMqcfLq41$mylxs)~)
zWw(^9Bm68cy-p-LWJHRM7;?^$l+nr|rD!3=s9TCsM)9w3lgn;rX{D#E(j#3EO3Ly)
zg;8dKugrN-$;GBlVx!hy>@0JaFQ`O*q&`Rq8LjEgLmQUhCz7PE+~lipW0odH^u+=+
zb4evlOHO1ycX=MBCg*iUP%9a`rJ$!NeZnMbBO>4kZizEz0!5w}x}_`>rXdZ_Ev<l~
z-BLV}I3Byb(67phF{Zf5z`sZcbeH7KMZ;41<Fto{b?4<5M%tGA9FO)9@ua1|e9Q!k
zo#;v>r5@*eOw;K9@Ey~}#IXd%pe-(@&RgJK=q@fTqb{vF+T|J3vL<Cs9Eq`AR#sY$
zMxl`A=fQ?~7`~-gP+Uw&AZggZ)Oj>IX@GLZl_OJ3UXhHOH+N)yL1EFzl95@(BTGk?
zjVvEoG14<~;Ueuj<iwSDd%<t<cM|gPx%k_FziRwhW$s0t@-@TeE4^KjZ|}fgALL`g
zrJ>Bn-}|aQSJ_s9lCng_z=DbjF)5Fqlrvo`o1I_cDaYJijHsaISzP9>&`Ly4G|RzS
zIff$Al~y{t6ob4}D=I+`t<3jm6$?rW+*&?z$NW;PsLE-+(iT*>=P$ro;nG1v^5_eD
zz~Y0^GCg}7<-d;7?`7`t3K|1ZIathxBGY0S;>^RyL?@w9P$~w(w5*9Ty+-KTOIj7m
z2+Mk^dydOB9*c;KEK18H^AAY{vj)oCHCoK`Nd+UjL90?AMXXSgRM9#nEma#dOdC5b
zBPU05<xZQL?b62PVr7CpNE67gl++<9Ai(92l++7TQU_@nIg>KP+=2?#2c{0!1`pLH
zOv}g`=h7y+re$Vi%_NCeOu~XX(=~0<xD3sO^~Ti9nV1l!j-PY|EYF$la!t|Zm%H61
zQuv@$5|%m;v(+H1$-$drOv=_O%G~)yd06SVk!{cglu>C!45NlCBgRrnIcw-JEz>oA
zQf{U*d+L-)IfiSzHZ)BeG*lZrL<4S6nl?lmGF(g3BsriUcaEJpLmPrQQUV7JRu@Bt
z;-4HIl&TF*)i4HWB|R=9D`R|ymXnb?ev&g|TE<uk1SAg9217CeIYJwBA+46=Ekg?3
zGe|p=sH98yN2}#Qg9Z*tou?FGx&{cVRQhC<Iv3`Z7m1{Uu257`B&Jno-a_2u2q+~4
znR#h3m#2Xox$#&l<8D|-(`PKrxN2ThF?A+ii-{ep_cAK5a9}!figI>LNw1MVyE+^$
z#NYW_!{NK}m$EG!{v-Y-zZed$#oy^Kg~Q|U_b&d@wui&D`0KPI9KH*G{qgIkOYy_4
z5Ab&;e!13$pABBOI~=Ar&SyY2{gFK><4$k+Z1|~Tu)c=}6-by^79l6n;)NnacZx$1
zL#MWMfF`FT_fgi<Bm_ydm<FPvo=qtR(o5#|?M~g!T`-%56@(<^$B_x?^d+sKEAgSF
zK0d*8QD~Z&^H9$~eT9Od2#$+6t<xy_M<Oy&OV103H{!Y`KOClm%#A2RDAVWR`83M@
z^P#1aru_@06J=3RIGm31!mGpKIVd+>6ArIHc}j6OydGr@${LiVrQvW8<<v#s?~Jy-
z9{ecpz5)CwKg2nrIVi7P4t|seQLab%=?d_ptiBQaD8F6_eqGbtRp3YY50vRBSL5`?
z9F%XMT!Hc?oCsNu@-M5vkFwks4u64i)UDxgXB>z+$&bE)@|4x#@OYGyaA>>`<x_Wt
z!>ds)SQ`#+LfHpDq}h-1d6aD^JFP>X>8fd0-VMDdhus6cD4W(}o<KS4LCiBKeUFC2
zdr)r19P$N9@1}4#4o%gHG8yH7XYlwD<^E@(7v<#V!r@gY_n_Q}a^vQ3xB=y+Er>tL
zyPpStH_YXA;72L`wG5PfUI9PKp|61-<+ppmkMe!IjHLmk8!xm8p{&PyjN&kS-gyiB
zC?~!P{U~Fbz>l&D<tmgXybpeq%RT@<%AFsAALY*|<M5)WFFpc4%F!Q#ALSo|;757?
zr{G7K@)`J1euc6D<%6GtALS`ufWHU&6Ut<i^yw)JE#~@ZTFj!vnA3W6jaw5V;im()
z5q<9*q8Z;ck&3hNHxz$su0k85FKWjpPB^~z<b>m{i(8?MId$ZDL(c98sE~Ul%Fvu}
zSTGV0AJXT+Ul8yz;#VKiwHkkY3+N|m__&Twyg7DUe3#hmut?Jg|0MqEfxjO1nfOlE
z#t;GF_u{WF#>g8eO?>RjCjS9|-voYf6n~k?PwxasE)0i@qWI^S`~!f$7WhY_@b{Vc
zDZsY@|1|l<_IG+DmhCGB{uYeQApD8o--~1XYTze&!r|{wM(|&o_B{#ssSCs58BzGD
zCVnsA)2>5K#2An8kLbkm4**|*F@G1(CO$S`+TR0qHv@lT6n>G39{~Jh<cfbq;lDER
zQ-D8ZNjUso6#fkpUkrRB@Pni5>u=h(8u-tUb8d*@FE{z01b+02aQLPu{9+To7x?>u
ze>e*NR}+5#__LA2##-@`UG$$GuxkSFo2~fRCrthUz<&Vz$x-n?Uc}!Co+-fZy9w)4
zE5FOzDfTrXLg*+1&COmMDYDYU)|fih0Dr;CaJVT7|E7uG1pGgM?_#wf_TK{U1WzOI
zuOlC)BF~y_Y<N3$>Z6-9hd}c_a&Q64D4HD66dK9tjXZw}eo|0_Ym<h2c~*!KG#7zp
z3uvCT(`<;)%mK}~+r!~M+i83enw6k=9yFO&pP;ML7!g{bYa?hrSrZP=L>rp+bjlWE
zi$sg@51O9%8Cq2QV;fDs=>@T`1OJdUHl$~*j19?11a8V-!r>~bEuEH_V~pbZ4Cb8y
zRvDen5(0=n8}RM-W36M2ZKr-BuukC1fN#M3^rn@c+Gj_^hC6|)*cc8k0KMsJY=Q8D
zbZiE`2KbAt_zD?M!tV$EBj7hi`EbAK!y(`cFel<5szv?+Q+`j(vD<*hxm*kWS`$AM
z_?}N-PL0BUXyT^>-voS|Rex;Q#Cw3Rc`_XSI*R{O6MrZ0w>}jPKTf~$H1p^K2}B_L
zC;eC>&b7+#G}H8NKk%3B=s3?G0{-cp;jkmBFZD3<bWf}mHq?g0+oR-fHsucmK4UlL
z)F}HuG3}oYe78MV!$je|Cf);lCGc~j^v^W)-wFH+SYsR?h1X5|X5b%g2#3p~`dE>f
zNB0B&1J)uhM%n+YY5yVM|Ae)PH_E>2P5XLce)=Bxj_e-_d>+;`3!>x~nDVCs|2G@}
z&4|K}F!3JX-^5y|qyBp*@GZbMN7)xN?b{6e=6%QoQS!4*`TK$Y25YS8QSrOPjNc*P
z`@VztM)B(=e^0E{60kPAElPitsedT&OM%}Sh2Lc2rvu;Xy>R%YDExCK-UIxzz~i`i
zq<_bjnD{$^?}oMD3sLyLnfT4XuLu6eD158H6X^HPZy14dq^z1GSyGpwk5652{SI}}
zL!vd!DKF?3bo6<0gDg*xg|_G;d2yPmqf3g<sj6I|@TaRf<G=qUOh2U2XdOTwq;m0D
zp$L%L#E1TMSNh<#L43q-jB$nA0P*RqN{1@BP;A<&k0peL5Pdc(1g+ocgA^h@C#!Nr
zFL}Z6GNp*^PE>VtUGZW4XR12W{T!D5A)EsAi>fy&N5ZPkcD1T{p~C;D>SFFxvVY5B
zm1-Be%v7g@q^mMpm2*^CrpgtnT%*eMs@$Z?8dWx`GN{T{Rcd`T@kvyrQ<bTzOjl*L
zD(9$@ehWgM6{=jL%Jr(;q{<pqHmWkH%2ris{iK{kRXSCfs>*a#W~*|JD$7*4LX~S&
zDW(1T@3>(5v`a2Xxw3tXg!Vx}V~I-YbJo&&%{Y)meTe1^{Iy5h9xRIaD3z->NhHaj
z50@nh&f}jSpqAU{uaIooLIvmXv`E3Px6zArIB@hY&e4bIx5%n?lZ}4Ag5PX|f2ZKL
z+T@%NBN4aR=;@!UqtBmg^jQjimyKSmEhLNhep2CyoKgwJiE{1p7C7ZXf>YwyZh;dI
z!6}99vA}UVApEEDH4B__6w!0;#HGgiNhjs5MhiX3|5(*yaZNe$AF58WseVw^$wsRG
ztm>36sowb+eBnd3Q$1eQ$rh^jQgyP4>U~t5VnOv}RZmrAnkwm^Q=^Ydm6KGNrOF&t
zUZKhYRW4BFLRBtT<;|)LsB*0;?^oqURX(jssa?~KH9k`Z4IVOd*zgNSq~*=c#|HHL
z!lJ9MDPB-gT6S%Dg{N}ib&D2X@5~rGZoF&4L}SwAOQvLIP0hY^TF&&`%PyZW^NIni
zr=#*%`Z2GFPXPo_jJa+vDb{wFwNCt8=WBqp<LL_@>L2vsd64AMhv)4RWK}y+l{_!<
zJk9F~o~PBcOaFwE_)veNkD6{Zx`m>T8fIpnK1qVj^#jk_ynbPQtcUI4^#;plKDM9z
zVLw<N+sF2>A3Vl5F6=MIi|yoivA^sO`^)-SFY~irmc#m39_wYhSU>y8`q&Tlm+j}c
z*zMssaQrx499O%Zj{IPKET8qUUF;|O!~U^9te5R!f7uR>3;V<IVt?6wjtBFxUgl?g
z9DnBLc47b6PWGSmv7Kxm`@#OQ|7<76h2y|>vY%`x+r|3X9_DBL><8;(|5!fT$NE_x
z+r@TsoY+6MhvUKVVgEQDTxYp#H|t}&SuXRjAM797$#%0pte@>=JK2A>pZ(@|u>EW|
z$Cc%>Jxs@VmdAFnJuHXqV0+mg*2{5V|Jh#lm*d6ub9~r-jvxEYa#=6q*&de9{;)mF
z&vIEm`^9#$|8{%WZ?>2H;rMX8IG!92w!==(df0B(&;EBLkNsym*njqe?d15dpX@)!
zgX6&Qvir+&*&nu(<=EwO99Td5%YLxEte5Rzzga)Wh5h6>vVR;;wwvu_xpsS5FUw^;
zESLRYxvYo%U_I<N+spQ|f9yZ|&F#p3a67O*uCqNHU$&q1vtE|RcCvo<gY9NJ*lzZl
z?PU8nE^HtB#c|}gG9TN=b~8Qu!}8c(wx8`}KiLnqo9$)$*>1Lj{bfJcevT8{$$qdr
z*30r(KJ&5NESKf5opygY4y>Q;W4&xA`_2AwTsU4FAC3d-V|!UI%VRxud3HH$pS^w9
zFOCD-$9{7h*e~{n<JeK$*dO+T<uf1q#rCsa_K)?meD<H?#s0GW91r%J?d0}k``Lev
z2iwW^u$?TQ=~zG8&wAM&*2DI*9=40^VSm{kjxYPkabW*Be(V?XGd;^;yV+j0pY^jG
z_LuEse^?*e!}hX&>_6MV{&F1HU$&p)$#yay<Jk|ko8>Sc+r{>?9c(w-!~U{<_K*GL
z_^`bk7xtI^WWQNI+rxIU9QKF(V7V-Z?O=KAH`~qrvOjDG+imxY?d5o|UmO?aXFE9_
zte5>@{cJzm%YLyvtdH&Ic(C2<FZ;=Ma6H(4wv+v0`&k~_%laA5_S@}YyV)+*$M&$l
z>=)Z>x6^JH$A{y{_H(>gKGQLt?PtAgKgWaP$ab<{>_7X#_OL#-oBd~h*nhT@<Hqr2
zKe=Q$%VB$&kL_Xq*bbJ@`q*Fgi~Zqvu;1(t+s|=f``JH^3&(-=Ge6tMelVTg57x`_
zSugv=dhGVHA8Zf%%kg5r*?x`-$ARr*{VbpLvtG7?<#YVmFZPG^v3&N2<HGvcK8_Rn
z!R^5IvR@oG*2{3Vi}~3P*3bIbF7}K4W`Efa)^E3u?PohUF6=kk%kg2q8P9syU#4d}
zSU&s1a@ZgCoAq~OFZ;>yVSm{kjt9q!<7D^8j%WMWZkEITupYLX<*^;?KikjoW&hbv
zjz7nN{bu{wZ}yYxc0Ftl>*01_ee4hW&wj8zyB(~b?dLdfoH!orH^+hPWdFEiz3eC3
z$@J_O+r@HOKl{ge*gp1)?c%tw-y8?FpZ#R}*?(>?#xp(3WjmRU<+FXPm+fKy*e|x9
z^|GHF7xtIq!*;WMcE35UY%klvaF)k*+3m2)W&c<|+s}5eo%XnJ9M~`RpWBDy#r|^1
zd~7G{v)jRP*<SXS<+47u&+aGN&+%jXIbIweZfCZe<H&lMj`gxU=3{x>&aBt&FU#dP
zvpsA-`_F!HT-Yy;2iwW<;5f1zwwL)>F5AcPXFFIQ%VE9jFZ;pz*<X$W$BFG>|2aPF
zH~Y)>vmMOO_OV{3W4qWtwu9|vy{wP*+x_Rbu>I^O`^WaNy&NC*lS{Uf<I484oh*;-
zWPNNO`^WaN9juq_VEr5iwx8q1ezN}@4~`SdWj$;s)3bf-59?z;Ssv?Qf7u_li~VK0
zSU>yAc5{5#KaK<Y&HQX1^RaxUXF2R2+r@se{p>IMWw(p<vY+f1$A$f7dpS<*H$6jC
zkLX0f&o3X%kWiqP)72T`usEIpv*B6|P@gC&No)OPe;H=4XRUjcO&26~`~{hN7BDZB
zcKm^~GbbJm-|x+<&TkJ-{?g~I;)gG656`ggSN!t=zNYe0Rbe|ESKOU%#dG^HzA*o>
zi;sr?>9}t^R(zT2Xe{TB56_uo#j{+-ul>_r*HQ7Wd$=nIvL;2${8c~QIi@{+?jJ+O
zE#fXC$hcoJ{=(VE_qFod@neU_MaAEaKlA$+Ii&5%Z`wOH`DpSt4z0iDX!!e=eX`_e
z_$_@F799=m|MFkMj)vcS*JvKfg3RvUhlh5ZcQpRXN-oK;;-688fb+pAv#N$#@jUJs
z-?g}5h855KgYh+6ALk=Gf{fu@KEJ1dQ<K2k`ENFU7<V-JC!KQtR4abG62OBxyx`<0
z|Ji=VFAm+oDJn`o<9E1Ui1MH9X8Z$ZT$yIo&-OF^toIK_+1H;56jRBBiBb0Pp)MXD
z^IuOtt3CPL|KC_G{?$^GgvB%e)Yd(NO(ZTD&+|#w+xwnv#cxss9H&dmD`&QcXZaad
z+?;F0vwZeH=Juy}3KC>?{K?O>3^*Epin~|T__y<S?)pSj{20&khgG)qnx_vI&hsVD
zb8IxvcRa}W{3o4>q7R>MrSnttaf)x8w^n68RdOTtS9La;lM@FaMd632k~<$Kf#C`s
zHEAfAwct2#J8-|{cH{oTouBgt_X}?S?n>ydjTa|-9@M-Z;Ca5MlJ{%l%IAam+%TUH
zq_c_i;q!dIHvZh%lO(%#jw;VpB`5n|+fD-&-vz2nRplU6Mh!j%qvC%{TrDm|T`qbt
zs`{H%ZcuWUsdD9Sq{px1-1!^HV~6kmjpec2$Cccte`EP9Z<~_4<2RCDtK>BNM)Eig
zjY{5os{G_PlK(%yksP*@%ftUAxg6K7FR`ZIraWHBJ4uz?KmMOe@2%uHe`9<5|Hksz
z&!PV%xnq=ELzOdB`3F`0pX#nXNUo~PpM<QEEtCmMAPI<Fi0h!z=7l5}tVu#1lgcC^
z1|^ouNl!nfFX>nM_9Qb|D-5_`<e^w41XOfbu@cKIjJr`mN*FAJxLMt3u?m%vZ3wGz
z%T=;iBISeqedqkXo_mkeao6&ny;aF~&hLEZJKuSqzSDhv*HY#v3zS2Y5oLu^{jD*+
zE5J4VPnFmIneC|`AEI6L`!>e!q})gONlMN4$K<`#*KvFQ|1p0z`(OMs*$M4GNV^YH
zYX3jN_+ylhQ$9%<+WG$;m-Bn_48QgfU!!%Dnijqvl6NX=R<67}(|h$G?l>II;A)x0
z+4+|&7<SPGdloLt&R={n;qI*@=jR#b<g1~EoQtO+<u)FuGgOfC?Ns`5KAnzlt8dZ+
zC-pPL#{yi={i3hGzoWY8M_`(nXZHS;KiTW~UOs$HGv^rn{haE55CO5%Ip8v3uhXC1
z`8}TM??+%N|G(t)=bB90YwG8RuOce;|CI-5cSi-V3F7@cNPib`dA@^x;{P4=U+p|a
z{BGj<`QsSz@c@60_<;cb8Sqp2KXDuiPRGlaA*SKeTbuZD51VOdSn=~<U|)XQeJcBX
zFL(o5FCk19JLk_q0FUD56VDLW`o4*HZ-BoI_^HZu4kzp{aCYXJ)1@MK)c#`PJpq0h
z@i_s0Iq^(@UqQS#!25uo%Fn*nLIIDqL%*{#&-AgrWBfpR74eBTczyN%eTbgQzF~i<
z{cE8w<MX1MT(QReb{@eZ@tbe<xVsw#!3yzLKkRY$TX_T}eX4TDy5KrrW(5A+0(}_|
zM}vCF-7|te9pLv6ZwL4|@e={QANW*$sy~^)pF_^hT+<tj^GAu#5AY`)ewG>M`m6o&
zJ>b&LZ@AaDoBItqf<L02B|*8*)6Rjw&VLa<7~sDr{$PO1T>;bZtD%Vie<Sh30sao+
zM+3auy1mrtpKW@N_<AM(pQ@eLIQ?@?_<Pw+z;W}FX|r6llXv=aI=?$sd=&UuXwUnb
z9(jQYHvpf7and&%4T4ACKixt+b50t+%h_S(g!lsOzq-YP`<Ql!_~W1Qz*cY0JWBf}
zDEA2USKaRQ-Q5ldK1KYY+dNKTendOl&QAOJQ-{wreeUuM+_S`TpTl|}?eLOpe0bdb
zFOa`*_8pY^?=P?)i~bck54-zH5S&f?`|tMnFPQ5Bhcl88@9KgNbiuFdg6{z?{q-RG
zMd!;+z%z;RG$`wn=x+=9@jlw=3+Ch4Ys3!TKDO0@mxr7`J?4q~d|Z!*-*9+ZS7LI1
zj+8qQ^vf~YIhQZObe%j-yvTv0{rfZ884Jcq#Cr7w^?I3hmhtWKGj#1<*YAJm=WDRA
zOaC7HoKJg&Dz67F{?Boj6ix0g7f}DkzxRr}sQ(s+Gm;QXsDB4v%tlOGLHs1^tH0A-
zM|^<u=pU*7Uf|--{J@_NIDD=dJMI;iQ2#pG`O3#V;O;m<Fs7ZcpkHou`pldVcT)e?
z!FaoucyHj(KLGE+_*uqrrKbLg`V&DvP7q&9J35}90Iu(N)rQDtsNWa(e?sj9`@(OC
zzexWPb<b%S57M6Fft@pDoJ^n9xo3jvW_FkH)9dWaHQQeBnjc_>#a;9_I{kTOM$m3s
zyXZfP`8ZWORA|TWoTKaZ4&vK_c77`wR>t|@YVRP6H#6|USlZ!mu<!4polM~8?at30
z^ZZ@j&WBjvyQse;=*LexeJUly1GIB!hX;2u?IGG35Bz_O`nSKwD^5`V+r+QE#{<i;
zF39r(;%78``~dNPCw?B+|EGPjIYE4!e>ib>&mrx<flK=|f_lA7{m1xM<{#3|H1wC~
zp9tp5>0SEqT!;6Vx7_IsxVwfB%yiLT*aeq+A7^3SWw;J(qd%Ky|E=8jcN5=A{NXD+
z!0t48X9wrT!2gK)xa!qCH`2}?;4<IC{o+>YAAhe`boY%Rcqi>24D5f3__km_dcfh>
z$J^eH{vG5IXWwJV|Gy7BlLQ!ku8aOjXCLS1pnrcw{Qh9w_?^S&nu)eIyoBRw#%t03
z)A9L;IEyv21jl&j3+B-r;)eEh{$50UB50pW9FCRXptr2+*fNJ>{o&`J!z_3u?X-jS
z?U!B7Ig7fq&pOzVb)&M%8(u;CS38`MgowJ}AEZAfXonlR=*!(*Q>}Xs(9YqYeIDjK
zZ3pYdSExV6ayQY}hly_s{P`yFGI8CHjsWjLKbi~J;k0v<`sW9B+Q6mVUU#2Ye2Dsg
z=i0f)WZvw7zb1Zy`g>y!@Yrfj(takW*Keu6d)Vu{JKhjH4GznAyD0GIJmOCW^?EaK
z@pH##y&ZR#7lQMte|lhNDfRE}^ZM=%N(7fv|Chmh>8JiHJTHCCXERq3-@MiX-zNTk
z+Q|gtHlqI7w|K>Gup8b;{kwzl&~*AVnh-YumwpV-Cm*Ms@xcFkoSh!?2_D?seS@&~
z8RD<-T(_TP$bGlco-^68PY{>8)&zfLod@LpLwOz{{-E`7cV8lcj}w1@^X`yOHb;mb
zt$Sb@@qZ)!y<0tSg7~w*Wnbvwy4pkhdFs!8zX#l1jtKt5>CZDg!8rN3(`V*{c$N0Y
z+8)&PdDcun4%@V&>szafopY%FBF`b}XGZPNf3<Tl@Tt~^cR9Sr{PRlh&)?G5bzSuT
zrVHKxF7xQ^!T8@v`xh?thOgl`+(Z2X!8p9x;d9N4#w+T&eJkzU(eifwlYZ`VI3o%1
zg)aElY3D9^VT)&g{(K9#^m}`z2M+sWbEJ!%XPy2$Ge-ZloqtUGpZ%mapyTZ&>YHGm
z{)YG*o=dg9^2G!h|Ho+Or}SqgaB1gF!FW59cspp%InE9<C&b%ne<Qy*(Z)EF=h80v
ztGnP^fy+LwpNqW}yx$WX?K$y+N9LdJ<qn7A{1Es#O8v3tz23MN=0@Ps4p;HL!41T3
zBcA8Ht`gq|Ji~BOe3|;AYdrc46Te3MVs3QqZd#;G5Z`^dS9Et9BPjP`OT7&J9HAY#
zCr<o)@=6bUp7=51@3_nZFLAsbr=32&zctwB<dOTfrGKvp`uFF+FZ3#@|7YS0L1#@r
z;}0r*rr_}B`f;!RQRX@qxa*g{^p5ng-!l%!`-foN=%t;g<PEs{Wf9CeoJJF3HSLVi
z4$Ygbw6p(XUfJC-D#7>Uo2>=ey0JDl7}*x6E&$->rBVqu+!u<%k|ES&wpF$HN*%Oc
z&@9yLP^CVYtJs1tkJ#Ms2<SFEQ%WH_e`*aYqy;wDZ01HSuIUG}MoIXJYypHiqsWqo
zS)n&bsVcKurB;vPq5(CGZIt1S3w5YEBwKco5nh<A5PZbh1tu5gEl6^Ta4W$Uma8<%
zISZm0Qv^=}(#W$F)hwxmV4xD!im<RlSZ|=xtjVSSwdJNZU}{&H+GbN2m}FUSqzKTi
zlp4@&6-Qc7ZjH3;&V~~;FO;Fc;I^xU(3qqo0O<~VkL$zDym&ACMKCAi1F)Ck9T)(4
zQRLX`K+IMct_;U!OT96Wi??SlGn+SDX@QE4snoLXG`02hT`uiXQ`oYrz6r6YHXJXw
zG`m=VO)!D@B8@wRMGN2sbBF@s3(Rn3Mbtt_`WVa}1FI|u)vPu4%B>rg5A?4Dyk^7I
zcGX&@uDx=zv1`_ET(Nw;-MD%+=yL3q<tx^&vgYbqRO$zzO0EeCs0xM^1`*hAz;S`_
z5GZ$QJNwqG?_aUfF3c{>!uJ?#H*qmvZI}(M@~TQv$Z?e21&j<W*P=F801Y6T)(%EB
zQ?ZWA#ihp4ZWsnFU=EE}Ev9TYJn<EXU>vE2A5@^~L2WA0VBW@{-D=q?h^9K(zygvR
z1P@OK5Ff1tTU@Pfr4+H0N(8%1E#DYTVP!DP6`RKPZyB(Yb8XAON||X}gw_KTa|O^a
zrE<juTW?A}m<%oEtuWQN8dta5JQxyvb4a`83fFnQr&w}E;K?CtmVw0>yh6f(Q$$UL
z>jte0wSsN91kf;*Qa-?Vj3JM?$Pp|ZRMQ@cg@VWIDi*hc3((du&13IIb-03F(FyC^
zL|-?eLdtky(B;AMBiu{sj<H2yyE2Vtu~Cmb{|TlW__a(4OH-Wg(+U<5Pw3-Hk*=+!
zOOZuH$nq-IQT#YHw@69rREp`|6ArCf%N9zQR#pZ?9<5v6ZiQpA<NIQRaR@ubf@|S?
zx!FmQ;ju$#gK8isi?yL$H^!X`x`7M!AybdB{+bfVx8jrqtd$Mg&lQBz3e#o?y$!mf
z6k7?r1z2x@0qVNpN|dXaGJ2T()vOQs5iSHtrsR7E)MJw+E%lvMh;}+3@{=j3MF}02
z_NofHvod>m0~Wr`;M_47Af9r`fS?Rr6vegzl1r?`qv#py@}MQ9NB#WrZ3n_IbY~rF
zph1`AnhCFD5y*|kk~_!zB}dPe9+}n-!ML}Hl?h!nIeQi@l2yDaTf=a*Xq*#nLyPOS
z46ogM6PhwCzs+id3J8T1n52fQwVce;jj&sQ17HrrIwRylE?Km&U|~YlO3{#Mp=H5g
znQFGG<0MT%*jF2-RKnU?PT|%@aXr6usg)(KHd(vNup0<tn`z;qM9^QlA=%mBFg?)h
z=_%u8uxw*)xdoqPh3r^vuzQ9e9gCs{qRMJ#G-9Y*$1r9}x$Q-?IhqdpoiLW7kI=tP
z$}DPv-4siOTWuN<W`+#Nx)oY2VLWq%6$Xc_?24vTsDr9AsPJHnSs<${x~7EQDH~aB
zO>wu5+Gb<WHn9*&UC==V&&uOytqhn#ZLKctK^s{6t;{7;tXIqqupk+%W7xzE`g3x&
zx?Lqbpe)pd;7jHNb`Y$9qzzk$F&TO-H79|qEmdUEXw|o;>V=ttL7akwOe~hhbIU4e
z=}asqkyRrNtomR?L@$*p^?G{z!1X9)2{v%Haea`_3u74^z`~l+8Nvw*Y=vgEU5$BG
zStVSiXg*JTwnGY)<AN+$CUOgDz6=K4Cb+laDP|T*^!rY6BubZ4avZ0rqOHX?#?ygc
zn>-)Wa6?><mATfnh4)By5GzxvUiCYoo;_3hA+`eyuGU}$yEXiTPqLk3@yNx(=H?c4
z7nUTfctUiG>a|?;tj&+)WR1rt-kshr+?msDc#?-ySe^ct#c8;K^(rrHa&}O*_)c{f
zq3N+5r~AvT<ude4HXf~F4TQu9ak8u-Aq8wUOcs2A#q3aRIE#}A8Ve_nNyr_2FNq3-
zz%F))oYI|oIft2&EsWMsgqLU=mx7TctaGV|MOw31$w>x=8raUVa%?tP1hYeR25`_a
zSvOs?a%_io9o?2Q*<zUo^>P8$H0Evi6T5LaG>oEh)d<FXcFc2=<*_62spU@(o`<aR
zNoqPx%sI^mRPtjlQ@w5{g20>b53eRW`Ij)iN_{`1dGzy?#@`v_FEhW&i&T-a^L~Nx
z{vf}8pH5|ldMd*F??b|`aQC^^U+;%kxr7<{@wW3lD7BYE03QuI^*=%XRo=}4rH!PW
zwf_3PB?2Z{wEQtDs6636=0d|iLDa~f)9_Wrj{ka}yGqO-_o)9`j><hqmAm3KuJ^^O
z9QWBfeXRZyHuAL%siNlB`|DNe{q<VD=GXe~WBwI1sQ2NkJizvp{E{cke;+Vu6SZ$L
zh{%(1{~H<K$tUdpPa`pzKa=s=D#x^9G@|b*RqhY+k6-8&RjMAd<Bf)YR31b$S^h!h
zSDEDclAM}PrTm>CnP30Tpz<TvORmXB^C|Ku0?GW!x1jQMsz@2O-?x~*(@at!c_`bj
z_fh>nEdM)5kn%OZe!od&OA}}yEdN-L|C-0VqROZ6elq#!_)+XX5D*tR%sSs^9r+ip
z5$bDP<w>ZWnaHo-pB?*O=Kme?%GlNN^?P4?nO_@5)(?5KTy4iM0+aUZ<bPJQjM@8L
zFGnOX@o0XP@_ndee$%G%Q7@Ot7l{*3CJ~n48OSWQw0wP!y6iMxu)K#%KAB_$VY&BS
zH0KAn=6_p&d#gT=BuP9NA}&Xn<^LM5I^~hy`K`u3t+%c}GCb6G1Ut_*{#{~F7Wc%X
V<tsD`fvNI;v)60R3o-;z^FQuy3NZix

literal 0
HcmV?d00001

diff --git a/data/ssb/dbgen/qgen.c b/data/ssb/dbgen/qgen.c
new file mode 100644
index 0000000..7931f8a
--- /dev/null
+++ b/data/ssb/dbgen/qgen.c
@@ -0,0 +1,469 @@
+/*
+ * Sccsid:     @(#)qgen.c	2.1.8.2
+ * qgen.c -- routines to convert query templates to executable query
+ *           text for TPC-H and TPC-R
+ */
+#define DECLARER
+
+#include <stdio.h>
+#include <string.h>
+#if (defined(_POSIX_)||!defined(WIN32))
+/*
+#include <unistd.h>
+*/
+#else
+#include "process.h"
+#endif /* WIN32 */
+#include <ctype.h>
+#include <time.h>
+#include "config.h"
+#include "dss.h"
+#include "tpcd.h"
+#include "permute.h"
+
+
+#define LINE_SIZE 512
+
+/*
+ * Function Prototypes
+ */
+void varsub PROTO((int qnum, int vnum, int flags));
+int strip_comments PROTO((char *line));
+void usage PROTO((void));
+int process_options PROTO((int cnt, char **args));
+int setup PROTO((void));
+void qsub PROTO((char *qtag, int flags));
+
+
+
+extern char *optarg;
+extern int optind;
+char **mk_ascdate(void);
+extern seed_t Seed[];
+
+char **asc_date;
+int snum = -1;      /* stream number; qsub() writes 0 while it is negative */
+char *prog;         /* name substituted into the USAGE line by usage() */
+tdef tdefs = { NULL };
+long rndm;
+double flt_scale;
+distribution q13a, q13b;
+int qnum;           /* atoi() of the current query tag, assigned in qsub() */
+
+
+/*
+ * FUNCTION strip_comments(line)
+ * remove all comments from 'line' in place; {} comments may span calls
+ * (state is kept in a static flag), -- comments end the line; returns 0
+ */
+int
+strip_comments(char *line)
+{
+    static int in_comment = 0;  /* set while inside an unterminated {} */
+    char *cp1, *cp2, *dash;
+
+    cp1 = line;     /* start of the text that still has to be scanned */
+
+    while (1)   /* traverse the entire string */
+        {
+        if (in_comment)
+            {
+            if ((cp2 = strchr(cp1, '}')) != NULL) /* comment ends */
+                {
+                /* the regions overlap, so strcpy() would be undefined */
+                memmove(cp1, cp2 + 1, strlen(cp2 + 1) + 1);
+                in_comment = 0;
+                }
+            else
+                {
+                *cp1 = '\0';    /* comment continues on the next line */
+                break;
+                }
+            }
+        else    /* not in_comment */
+            {
+            dash = strstr(cp1, "--");
+            cp2 = strchr(cp1, '{');
+            /* whichever comment opens first decides how to proceed */
+            if (dash != NULL && (cp2 == NULL || dash < cp2))
+                {
+                *dash = '\0';   /* a '--' comment runs to end of line */
+                break;
+                }
+            if (cp2 != NULL) /* comment starts */
+                {
+                in_comment = 1;
+                *cp2 = ' ';
+                cp1 = cp2 + 1;  /* keep the text in front of the comment */
+                }
+            else break;
+            }
+        }
+    return(0);
+}
+
+/*
+ * FUNCTION qsub(char *qtag, int flags)
+ *
+ * copy the template <qroot>/<qtag>.sql (qroot from env_config) to ofp,
+ * stripping comments unless flags & COMMENT and expanding each VTAG
+ * directive (presumably ':', as written below; letters match in either
+ * case); an unknown directive is reported on stderr and skipped
+ *  String      Converted to            Based on
+ *  ======      ============            ===========
+ *  :c          SET_DBASE, db_name      flags & DBASE
+ *  :x          GEN_QUERY_PLAN          flags & EXPLAIN
+ *  :1 .. :9    varsub(qnum, <number>)  leading digits of the directive
+ *  :o          SET_OUTPUT 'osuff/qtag.snum'
+ *                                      flags & OUTPUT
+ *  :s          stream number (snum, 0 while snum is negative)
+ *  :b          START_TRAN              unless flags & ANSI
+ *  :e          END_TRAN                unless flags & ANSI
+ *  :q          qtag
+ *  :n<number>  SET_ROWCOUNT <number>   unless flags & DFLT_NUM
+ */
+void
+qsub(char *qtag, int flags)
+{
+static char *line = NULL,     /* both buffers are allocated on first use */
+    *qpath = NULL;            /* and then kept for later calls */
+FILE *qfp;
+char *cptr,
+    *mark,
+    *qroot = NULL;
+
+    qnum = atoi(qtag);
+    if (line == NULL)
+        {
+        line = malloc(BUFSIZ);
+        qpath = malloc(BUFSIZ);
+        MALLOC_CHECK(line);
+        MALLOC_CHECK(qpath);
+        }
+
+    qroot = env_config(QDIR_TAG, QDIR_DFLT);
+    snprintf(qpath, BUFSIZ, "%s%c%s.sql",   /* never write past qpath */
+		qroot, PATH_SEP, qtag);
+    qfp = fopen(qpath, "r");
+    OPEN_CHECK(qfp, qpath);
+
+    rowcnt = rowcnt_dflt[qnum];     /* NOTE(review): qnum is not range-checked */
+    varsub(qnum, 0, flags); /* set the variables */
+    if (flags & DFLT_NUM)
+        fprintf(ofp, SET_ROWCOUNT, rowcnt);
+    while (fgets(line, BUFSIZ, qfp) != NULL)
+        {
+        if (!(flags & COMMENT))
+            strip_comments(line);
+        mark = line;    /* start of the text not yet written to ofp */
+        while ((cptr = strchr(mark, VTAG)) != NULL)
+            {
+            *cptr = '\0';   /* end the literal text at the directive */
+             cptr++;
+            fprintf(ofp,"%s", mark);
+            switch(*cptr)
+                {
+                case 'b':
+                case 'B':
+                    if (!(flags & ANSI))
+                        fprintf(ofp,"%s\n", START_TRAN);
+                    cptr++;
+                    break;
+                case 'c':
+                case 'C':
+                    if (flags & DBASE)
+                        fprintf(ofp, SET_DBASE, db_name);
+                    cptr++;
+                    break;
+                case 'e':
+                case 'E':
+                    if (!(flags & ANSI))
+                        fprintf(ofp,"%s\n", END_TRAN);
+                    cptr++;
+                    break;
+                case 'n':
+                case 'N':
+                    if (!(flags & DFLT_NUM))
+                        {
+                        rowcnt=atoi(++cptr);
+                        while (isdigit((unsigned char)*cptr) || *cptr == ' ') cptr++;
+                        fprintf(ofp, SET_ROWCOUNT, rowcnt);
+                        }
+                    continue;   /* mark not advanced: rest of this line is dropped */
+                case 'o':
+                case 'O':
+                    if (flags & OUTPUT)
+                        fprintf(ofp,"%s '%s/%s.%d'", SET_OUTPUT, osuff, 
+                            qtag, (snum < 0)?0:snum);
+                    cptr++;
+                    break;
+                case 'q':
+                case 'Q':
+                    fprintf(ofp,"%s", qtag);
+                    cptr++;
+                    break;
+                case 's':
+                case 'S':
+                    fprintf(ofp,"%d", (snum < 0)?0:snum);
+                    cptr++;
+                    break;
+                case 'X':
+                case 'x':
+                    if (flags & EXPLAIN)
+                        fprintf(ofp, "%s\n", GEN_QUERY_PLAN);
+                    cptr++;
+                    break;
+		case '1':
+		case '2':
+		case '3':
+		case '4':
+		case '5':
+		case '6':
+		case '7':
+		case '8':
+		case '9':
+                    varsub(qnum, atoi(cptr), flags & DFLT);
+                    while (isdigit((unsigned char)*++cptr));  /* skip the number */
+                    break;
+                default:
+		    fprintf(stderr, "-- unknown flag '%c%c' ignored\n", 
+                        VTAG, *cptr);
+		    cptr++;
+		    break;
+                }
+            mark=cptr;      /* resume copying right after the directive */
+            }
+        fprintf(ofp,"%s", mark);    /* remainder of the line */
+        }
+    fclose(qfp);
+    fflush(stdout);     /* NOTE(review): output went to ofp - confirm stdout is intended */
+    return;
+}
+
+void
+usage(void)
+{
+    /* banner and copyright first, then the fixed list of options */
+    fprintf(stdout, "%s Parameter Substitution (v. %d.%d.%d%s)\n",
+        NAME, VERSION, RELEASE, MODIFICATION, PATCH);
+    fprintf(stdout, "Copyright %s %s\n", TPC, C_DATES);
+    fprintf(stdout, "USAGE: %s <options> [ queries ]\n", prog);
+    fputs("Options:\n", stdout);
+    fputs("\t-a\t\t-- use ANSI semantics.\n", stdout);
+    fputs("\t-b <str>\t-- load distributions from <str>\n", stdout);
+    fputs("\t-c\t\t-- retain comments found in template.\n", stdout);
+    fputs("\t-d\t\t-- use default substitution values.\n", stdout);
+    fputs("\t-h\t\t-- print this usage summary.\n", stdout);
+    fputs("\t-i <str>\t-- use the contents of file <str> to begin a query.\n", stdout);
+    fputs("\t-l <str>\t-- log parameters to <str>.\n", stdout);
+    fputs("\t-n <str>\t-- connect to database <str>.\n", stdout);
+    fputs("\t-N\t\t-- use default rowcounts and ignore :n directive.\n", stdout);
+    fputs("\t-o <str>\t-- set the output file base path to <str>.\n", stdout);
+    fputs("\t-p <n>\t\t-- use the query permutation for stream <n>\n", stdout);
+    fputs("\t-r <n>\t\t-- seed the random number generator with <n>\n", stdout);
+    fputs("\t-s <n>\t\t-- base substitutions on an SF of <n>\n", stdout);
+    fputs("\t-v\t\t-- verbose.\n", stdout);
+    fputs("\t-t <str>\t-- use the contents of file <str> to complete a query\n", stdout);
+    fputs("\t-x\t\t-- enable SET EXPLAIN in each query.\n", stdout);
+}
+
+/* Parse the command line with getopt(), recording each option in the program's globals; exits after -h or an unrecognized option, otherwise returns 0. */
+int
+process_options(int cnt, char **args)
+{
+    int flag;
+
+    while((flag = getopt(cnt, args, "ab:cdhi:n:Nl:o:p:r:s:t:vx")) != -1)
+        switch(flag)
+            {
+            case 'a':   /* use ANSI semantics */
+                flags |= ANSI;
+                break;
+            case 'b':   /* load distributions from named file */
+                d_path = (char *)malloc(strlen(optarg) + 1);
+                MALLOC_CHECK(d_path);
+                strcpy(d_path, optarg);
+                break;
+            case 'c':   /* retain comments in EQT */
+                flags |= COMMENT;
+                break;
+            case 'd':   /* use default substitution values */
+                flags |= DFLT;
+                break;
+            case 'h':   /* just generate the usage summary */
+                usage();
+                exit(0);
+                break;
+            case 'i':   /* set stream initialization file name */
+                ifile = malloc(strlen(optarg) + 1);
+                MALLOC_CHECK(ifile);
+                strcpy(ifile, optarg);
+                flags |= INIT;
+                break;
+            case 'l':   /* log parameter usages */
+                lfile = malloc(strlen(optarg) + 1);
+                MALLOC_CHECK(lfile);
+                strcpy(lfile, optarg);
+                flags |= LOG;
+                break;
+            case 'N':   /* use default rowcounts */
+                flags |= DFLT_NUM;
+                break;
+            case 'n':   /* set database name */
+                db_name = malloc(strlen(optarg) + 1);
+                MALLOC_CHECK(db_name);
+                strcpy(db_name, optarg);
+                flags |= DBASE;
+                break;
+            case 'o':   /* set the output path */
+                osuff = malloc(strlen(optarg) + 1);
+                MALLOC_CHECK(osuff);
+                strcpy(osuff, optarg);
+                flags |= OUTPUT;
+                break;
+            case 'p':   /* permutation for a given stream */
+                snum = atoi(optarg);
+                break;
+            case 'r':   /* set random number seed for parameter gen */
+                flags |= SEED;
+                rndm = atol(optarg);
+                break;
+            case 's':   /* scale of data set to run against */
+                flt_scale = atof(optarg);
+                if (flt_scale > MAX_SCALE)  /* test the value just parsed; the global `scale` is not set by this option */
+                    fprintf(stderr, "%s %5.0f %s\n%s\n",
+                        "WARNING: Support for scale factors >", (double)MAX_SCALE,
+                        "GB is still in development.",
+                        "Data set integrity is not guaranteed.\n");
+                break;
+            case 't':   /* set termination file name */
+                tfile = malloc(strlen(optarg) + 1);
+                MALLOC_CHECK(tfile);
+                strcpy(tfile, optarg);
+                flags |= TERMINATE;
+                break;
+            case 'v':   /* verbose */
+                flags |= VERBOSE;
+                break;
+            case 'x':   /* set explain in the queries */
+                flags |= EXPLAIN;
+                break;
+            default:    /* args[optind] can be the NULL terminator here; never hand NULL to %s */
+                printf("unknown option '%s' ignored\n", (optind < cnt) ? args[optind] : "");
+                usage();
+                exit(1);
+                break;
+            }
+    return(0);
+}
+
+int
+setup(void)	/* calls mk_ascdate() and then read_dist() once per named distribution; always returns 0 */
+{
+    /* asc_date takes whatever mk_ascdate() returns -- presumably a table of ASCII dates; confirm */
+    asc_date = mk_ascdate();
+    /* every call passes env_config(DIST_TAG, DIST_DFLT) -- presumably the distribution source -- plus a name and a target */
+    read_dist(env_config(DIST_TAG, DIST_DFLT), "p_cntr", &p_cntr_set);
+    read_dist(env_config(DIST_TAG, DIST_DFLT), "colors", &colors);
+    read_dist(env_config(DIST_TAG, DIST_DFLT), "p_types", &p_types_set);
+    read_dist(env_config(DIST_TAG, DIST_DFLT), "nations", &nations);
+    read_dist(env_config(DIST_TAG, DIST_DFLT), "nations2", &nations2);
+    read_dist(env_config(DIST_TAG, DIST_DFLT), "regions", &regions);
+    read_dist(env_config(DIST_TAG, DIST_DFLT), "o_oprio", 
+        &o_priority_set);
+    read_dist(env_config(DIST_TAG, DIST_DFLT), "instruct", 
+        &l_instruct_set);
+    read_dist(env_config(DIST_TAG, DIST_DFLT), "smode", &l_smode_set);
+    read_dist(env_config(DIST_TAG, DIST_DFLT), "category", 
+        &l_category_set);
+    read_dist(env_config(DIST_TAG, DIST_DFLT), "rflag", &l_rflag_set);
+    read_dist(env_config(DIST_TAG, DIST_DFLT), "msegmnt", &c_mseg_set);
+	read_dist(env_config(DIST_TAG, DIST_DFLT), "Q13a", &q13a);
+	read_dist(env_config(DIST_TAG, DIST_DFLT), "Q13b", &q13b);
+
+    return(0);
+}
+
+
+/* qgen driver: parse options, seed the RNG streams, then emit the optional init file, the requested queries and the optional termination file. */
+int main(int ac, char **av)
+{
+    int i;
+    FILE *ifp;
+    char line[LINE_SIZE];
+    char qname[16];             /* decimal text of a query number; roomy enough for any 32-bit int */
+
+    prog = av[0];
+    flt_scale = (double)1.0;
+    flags = 0;
+    d_path = NULL;
+    process_options(ac, av);
+    /* NOTE(review): ofp is not assigned in main before this use -- confirm it is initialized elsewhere */
+    if (flags & VERBOSE)
+        fprintf(ofp, "-- TPC %s Parameter Substitution (Version %d.%d.%d%s)\n",
+            NAME, VERSION, RELEASE, MODIFICATION, PATCH);
+
+    setup();
+
+    if (!(flags & DFLT))        /* perturb the RNG */
+        {
+        if (!(flags & SEED))    /* no -r given: derive a seed from the clock and DSS_PROC */
+            rndm = (long)((unsigned)time(NULL) * DSS_PROC);
+        if (rndm < 0)           /* applies to a user-supplied seed as well */
+            rndm += 2147483647;
+        Seed[0].value = rndm;
+        for (i=1; i <= QUERIES_PER_SET; i++)    /* stream i gets the i-th successor of the base seed */
+            {
+            Seed[0].value = NextRand(Seed[0].value);
+            Seed[i].value = Seed[0].value;
+            }
+        printf("-- using %ld as a seed to the RNG\n", rndm);
+        }
+    else
+        printf("-- using default substitutions\n");
+
+    if (flags & INIT)           /* init stream with ifile */
+        {
+        ifp = fopen(ifile, "r");
+        OPEN_CHECK(ifp, ifile);
+        while (fgets(line, LINE_SIZE, ifp) != NULL)
+            fprintf(stdout, "%s", line);
+        fclose(ifp);            /* do not hold the file open while the queries are generated */
+        }
+
+    if (snum >= 0)              /* a stream number was given: map query numbers through SEQUENCE() */
+        if (optind < ac)
+            for (i=optind; i < ac; i++)
+                {
+                sprintf(qname, "%d", SEQUENCE(snum, atoi(av[i])));
+                qsub(qname, flags);
+                }
+        else
+            for (i=1; i <= QUERIES_PER_SET; i++)
+                {
+                sprintf(qname, "%d", SEQUENCE(snum, i));
+                qsub(qname, flags);
+                }
+    else
+        if (optind < ac)
+            for (i=optind; i < ac; i++)
+                qsub(av[i], flags);
+        else
+            for (i=1; i <= QUERIES_PER_SET; i++)
+                {
+                sprintf(qname, "%d", i);
+                qsub(qname, flags);
+                }
+
+    if (flags & TERMINATE)      /* terminate stream with tfile */
+        {
+        ifp = fopen(tfile, "r");
+        OPEN_CHECK(ifp, tfile); /* same unconditional check as the INIT branch */
+        while (fgets(line, LINE_SIZE, ifp) != NULL)
+            fprintf(stdout, "%s", line);
+        fclose(ifp);
+        }
+
+    return(0);
+}
+
diff --git a/data/ssb/dbgen/qgen.o b/data/ssb/dbgen/qgen.o
new file mode 100644
index 0000000000000000000000000000000000000000..ff21d7d38bed3b98af34b0177c8983e8ecf347e1
GIT binary patch
literal 33552
zcmeHweSDPVneH11X+kwY7cH${(>AQ2CKA3@kYWac3^hU|C|c1l%uJGzWG2oBLaP=H
zs1c%e*Pd$i*Lqgl`m=6jyIqUcVhZBN*0a*qT30<=YP&}wRFPWi*UGu?=e@7w;xS{j
z=bv-VZ{MGp_kFMTey;nv?&o<wW}X@R@`C0wN0pTYSjvLA!SEi?Ah_wu;q6AZ*%%xd
z91;AX=J>Z;V}0-Re$Ih~>D#W5jac8W_vn6X<3MrjqoaeZx^v-<Sbt9$4m~zTF<tiV
z*j6Vi9FR7JiuG&D>I$FU_9YMhVc1FqJ{m;!UjL}HzEdScmq*Wwo*P|uZfmS}--KA-
zoZ4vLA0HemaoglCaANvg#+WIs*>Im6i1p9V<FS4x--Rve8xW4lgN544=mV#^gNM7A
z$F?7oWt%3CZF2F>R%q0CLfm*Bi}gM-PVCfdcuv}chhm9e#rmC)P31R*ikkXf9v0cu
zw`XF|7zBMywUsp+|EIKT?w=^H!QkrALGW-8F*C8g9kH-EW1+*1&lgV9v>T5Xz0s|g
zOY2yF=%N28<zf#`>%(fwZAG`bnwJb8woGjvxLTSwhqhvyN@W_U>4yj&eCNG`2d&^v
z;^4KTf*|^*oq?<hRk8lnqXv6r(=0ak9bkLIN%#IT`o_-6V%T7*Huw<9_x|kAM4$Rl
ziFz-utqf{5{$8pY>)kPGcpPGkOYa^Q_MUST+f=!?Tq4VA?jIei*|zJArgO?Fi?7Uy
z6kgucTzgpT{k=A9Yw%jhBF;vcv$ZbizRFE*ENmQjP&>RPw!NiUsKzyiaOvSNOss=<
z@4c~2;R&*7PHiYv%ROFlgvHzVglkX}dwc|u!wr5b8bo8cX?$9C|9p2z&z5oNUUhtO
zw)DPO*7qo#X;#B=vEH3!QoaSj8%^`1e*5=k7u0MUcwn~6_P&8nPQ}yS`Ec47=4tMG
zds`6EQ8IjK&`Z-w9T1gL(hZ&espC>FT{8AvmA1K5MxbY?#NREKKaaYp=KlF*(f-D#
z<}9qa_5wLaYwjOdlqlcOT(*1e!QIV$2h9*A?tNs!g%?FvL@$b79Btd7S3J2!jk7CB
zOY{L15p|9pkiL=PI=gmtb7v~s<ukNI%%%?{AZ2vr<g4iaGuu*GS5MqOqJ8hi`u^D5
z@A@^?f1Fmfx$osz-z!ahFAW#5_sxSf8{U;ma_^qSy>FgWbM3>jEq9JYd2g)jS?R{B
zT#@w}wRfO7Jc*k7{uK6X|Jt%}qwg`hi0eoh+uZktgZJJjU8$$>N4g4I!b-VSG|QOk
zb@|H%8)e(EKCDjM)`yex3CJ<xaYFnDxfDmY)_i1bA1>o~WDK=84sLMij{3;yOmU+q
zjta$Jl)ZAW!eu}HBWM4M(v2GYnp{qzTWddZI=V#Cr7FC;49*i$t44Vcj2vo1aPVk}
zbq^wZ$vaSUgu9Wn*StD#AXamd?VRe?HP^iOu9P<%rgr~7P&3XQ`+&hK7Q=x!c&9f1
zP$!2!8Y9xTDQO+uI_o1>EF50r1}94UaJzi)ld>I4gvvK9+bQRsyP|uS&hSJU((f(@
zxGuY^h<pC5bYVEN?Ml-3id{_b3=or(|B%bHwR+{h0%Uy1_4fb%I=KFq=cLLDFADD=
z!|y__+BZoq{c<x|)c40^Y*YEfNzT-U+6e+T48%57V0_pUS$M(@zblFjZ@Pi;h8vsX
zYBnZh_&FqOcz0RN#vcjv#v|o58?M)L!3Z}FtUEk5My~!}9x)R}WB+#t^qH0}9SH9m
zEsvy;*sQ(9-^4!XabQT<0?CedSJ$l*)=){)tlc&J-&19YU5om=ei=k3KN9V4sy!Gh
z+jVyT<vT0w>8ZItSzBQ@Uh|N6HkN+=jEa{Mv7fwCd+X$1^pC%-fBY{d@9NvZm$ECu
zdi3roi`^1SJRg%N`D=~!m$#g~sr=FAzQHhBB*?9tZsNG|ycC{#y&`4Se-?WOuU4_)
z+nJV20z6%5^AEVH$<a`UJ~$74L2e$ycU*ZE9Da1Dxo?L&DA)zML*n>ZY4Cdva2KCL
zJ7s6Ae?_gnoh+Ob(|3&8Yr}7|Yc^c14MRu3_dXMSV1vm1i{6_3NDU1=2X$%JOjIbA
z_$hj4S1kSWuy@|B?R)em@0~FDK>zf=?H_+&@-FF#{_*Rz3*;|)_NJztQu^Nv_d~4j
z)!~Xjb@=2h?}K{ZIlB0@a9l-dHvW@l-4vcW$=J!^xm2?;BF@4y@Ty1jWW$A69*~Cy
z-W3gQ(5*OR(8(a&S!p>poo;=<qf$ru;NCtxnZ$SKwO@Qk`=0$s(mzpr+&y*SvxP%W
zt)X5lem?rZU4Locc>O%WcqqOw_=F4yXHI*q)FRi1gQbosuE%vr%C>%-67IeKb^He$
zLwmGRF5Q+@P^~YrNdLK|%GS!KOGUVnPnyEzNLyFh+PzJY?eJ+k2;?c;El1m0o?6`^
zPd{$?xUFk!t(~oldeh~v!d2j|wzB;(+ds_K$cH@8gLv?x9rVKwe1H%7pa(vn2X%p8
z)C)RMFZhKY_(gu?MSJ805AuNzd7%sW;S>4b1Ad_&b@BC}4yYgMg}VBAO8G!O@FO2|
z!6*E{AN(LM^uRB4pf2!(dciOBqaJ97yl9Vns6X1HU*Hcq;UD>+6Z+r-e&HWFQ5Vz!
zI^h#Kp$qw;2kns`K9CRozz==Mk9^Pt-KZ1%K@aMI`oJIRfqn2oH}XL@c+n0%;14>X
z8-9=<dZ82kp&!0c59o(()D^tYgE$-qA9O(vc%TD%;Rkt92l$6x_(i>-AN7HL)DOPF
zi@Z1vJ>Z8Q=s|n%B0qdVC;a<*;2V132lYX{P*2nYI{bL#fo|l7|5AML4;}CiAJB>V
zz$g5p9;gHA<@*IM{6Hsoe16mc`QaBnpci?e1HO?Tb%9UR5&lq5=!Q=4`g)NUyvPGy
z_y8~Rzz6ccH}pb3{J}qbqaWb|{eXPfhaS`y`jH=b!3Ukl4<FDC9ncNm&<TC03-rMk
z>WI3c9rQsr;^7B;&<p+037_x*-OvmD&<!2%3m?#rIzcCVfDd`W2Y$4JZt#K!I(@&W
z1M)*3@<Jzk!yoE`dZ9k31M)#H@`4X}d_JEC`uzTZFVq40;2U*-FZe+nOVtg2-~;?<
z2Vc;Syzqzo;D>+I3x1&=^?+~aM1Mj*{G%Sw2|dsWe#9X^^dm3yAP@8-4|G8f{6Y`v
z3!kV1{G)#Gh4zRC4|GE>^dmob;1@dK2l=1}df^ZLp#y$V2l$14)Dt?<4#(jGy1|2X
z&;|X_0o~98zsL`N@QeCDFX{rn@Co0@4?WNc9{7O|@PY?Azz5&Z4ZrXM9nkIjf?m`E
zzEBsmhfdT3dEo>3p&xqT3wn?b`cV()hF|!E4%7qsp%cEKAAHb@{5TH%z8>g?F64tA
z_=PX%^>zBXP#@G0`cW_NBM!%*A9<l4^*|k=6TaXdKA;Erpd0?-2mYZGbwhpO6Kmk$
zfnKzO9{7U}@FO4m!WaCY9`Fr6(2u%6Km4ICr~~q&J@ml`;(Q;-3x4E<FXZv{!Uy!g
zFX{!~(2u&H4$z1E;75Msg%0qe{_q7q$OnG-L0ym^`cNnMKtDh)e4%d03mm%89zKvC
z`JfBF;2VD71NnV@&<`D`3w%Q_>I2_6jy&*-c<2B>{D233;2Zf%>4i_!2Y#Uk^+3H)
zC*O~M9QvReJn(}&&<#H5fPd&mec>NIQGe6{zM&t!;S>9Q9_T?H^aJw25B$Ri^7%TD
zANo-T)Cu)~Z`1)g;SX!%g-_^2JbXbHc#$9ekO%tU3%XDj_(mO|A3mWU{?T7Jj(G4w
zC*r^leaH(v@CRSekG$}Sy1*~$1KrT)`$k=%7dn8054wCEJ}>+sKlDQfbozBc9pDT8
z(Lbmc{9=uE(20D$4)8)R{DK$xpwIUS{iq-Gqh6>F`Wd=WN908u@`4ZTz=wWDUf(Zx
zQD^9Ze)xwk)CIm!59mZaP)G1UFWP|@`cQx9KtAvwFZ{v>^20CcfI2}B{G&eb4ZqM2
z9cT}I$cs4Wf<NehZsbKi<oErfF3=C3@CSX+i~7JP*3gN%LO*nZ4?2+#`rr@xpaXfK
z1Nl)0=tte)6aG;T)Cs)E1D%M6KKMaC_yiyFz%Tqj7yLpO^20B5qdxG5I>0yDLm%3K
zAMxOUKj?x_=!akU@^v9Ee8Ly%0^iV!I>EP1!ws!2?MP+n6T#W>uC8n%(jHHDrIL|&
zAu=&Pdt!2IP#-k4wzi#f?t-Nk1QYWU6BG0G`AfTkT(G1kl|guWHYf4as;=&{tqYb#
zmM&S|+IHrGW%FYZ<r0Fi!TcqQ7cZE<EV68gHaT5^v(H|%EV6vb(z8yNh{%Z(^QTId
ziOCa#iOHa~b>7*LNv(x=F4CIlOm)X2^HP~aXLmfedUDV(Yu5BgU1aK%DN};^*`9T|
zbVsKum;c6!z(P8k$<Gd|>f%*Zb#;+qJ{5^BZe0|~3r(hwPUP$5*cx|iO)9r4n@`mT
z3+F}B`AEKy?&^x9Gm&I!O{y!~)1Ar`>Q&st8TC`zm0mLxGpjS%%QBIy%cR0jjHEj<
z*<4D@##hZwBs<fyGqV?W&Cbs5nVp-RpIw-}W^Is6=L`AzWIpdo+q9^4S<us#$P{ux
zBHNYC<z=t1t|yfbGI8es>#0F5)uEPxY+JS`m(B+1OumpSCJI5mJDW@eiFhH^k<G0O
za_wF5j-WfA>gdiCf^!<C#r0*C>vQdw`R6WOx?u6L$b$1*nxl&rM_Lz0TUujFmPM8<
zJ{<!?3ugY5P243WCdi)@qa~h;cc%&|>6+pyDOsVYdLxt8)Ju0u2g-jF^OJ2rF3ZI;
z`FO%bwB)jhR6d{1bVOQGIT=0DABj|CezurNq`N`|=e9-{E|{&IIVY@des1K#$fd<p
zE}hCpF2XRE9$XcfBW0MYUEGz8CnH+lTzZuglaI9LvfW`o$XivHa6O(&72?vtiEOua
zuEa_5q;`2B)!ox2m8?g6a%l7<Q|<9$S0R#TyKCZI#gq&sTdL|hUA%NqrVuG~%4jO)
z;~gn!)GecS9RjQB(!;uSXcjtCq9#*tYO?K-b~zKnh)5wDStVVciNsw297k1M*Kqr;
zY)7OAy^+^=R}#%tRhJo#6aGvpA+oZOWV{ex6_>mgSyi|AeLi#9%Mwx+sZLyL<A!!*
zwwx8|oUo+VXpO4sESc6npHj=INVZt$DHcKtE=f<k(0Pa~Y1b2(lbJh|Q=4h;u8Z`f
za^1y(8~f5LIXM|~skroEX0DdFsxCLwE}xQXl{S>Im6UGI6uVc+Xp}2jDi<$E)63G5
zHf&f`CnL%7IZ51!`G}0$cqY<%W<)CQ8dTL4hU@m{`$xu6PnVnv=pxa%)_F^1;;W=1
z^|Zt3CLNiICpw4wQhhZ^f4KU|ut{~~(uH+;A<AS6k&dGDtbnOxy^Q+h(WQ$QEnYZV
z&bgkRY)*=-B}&94N4uO}a{Y+7I!SLVYniX-PpQ-Nyi_i)=j);8s<bP*Gjn2>+y|uR
zMU+-K>k7Se@j^TEhT=v#De~&*zn1^+D$saiW9VuB8NtH&^JhmUox4h|55-7BL;dvn
zDRm7+x7%>#)T#ATrcXB5HPPoCeOA%uk%f!T)$c0}D2C@DJpYt^acNMtc3jyphkdN_
zCV7aOATh_vpDw-nvY>ig@2HNk)s@kYuaMo-6qDtElAT#S?#5B`swdnqdR}#;zkFWx
zq|IZZ)l;sih*qE0`?03#mQhXB=QmZalvT^<wWF%fKc{+SK{hv+Ri7qt(dtRkPTJ3_
z9yhPL(&fIzu?InmKG4OgC&a2Fqt7U-uGG9guw!Rh?i)0BzvkXtE?n1W?%s;gQ_HI-
zMVWV!wA*iO-&0=kbj_$}^@QHhO+x=#S@ncy^|+`g2m<wZtUXNL_UAn=tlnKddPiCH
z9x=bayr6r9vaut%i{gpYgm^1WSsuJL;&<CdkN68m?MoGS=%OCV1i#J>Awr*T6<==v
zp1ZZL6<_P&$~t=J+hn>IF12noCO83rttT7)X%7$Yzhi=vJ$#zQ&-CzceBmj2_@Mg*
zkBnwKzvIX}<H5KDzR>V<0oc0P@YNoEj^RBXzTEJuJp3ZVZ}9Mh;kSEu+U?_A!|=hZ
z;W|cnDY~68fjyoL9r~i--yA~1jrDF-5yU+F8n-_tc*w&yTl`uN|67ZH+{16N_`5y)
z8y5d>9)6eMFM9YrZoeYf<>6Zl_lNrZhEK6`63s$C6~Rgm-)`~Od3fljB6!@x|JCC6
zd-&6a>(}3TdCu?_58rM0bsqk|4A(EM^RmzIH$D7-!^Z^gaj<N^Y4MYA0k`8}Tt(37
z;pNWHnBaWR*I2iYce%q!#yPwqxW(f++IaLgRJ?@O$ckXVi~o$ppX|j?GW>K8uQxpB
z;nNMj+{0%ZzR|-Q4d3kHXBz%@9^P#DZ63bV@H;&`tmByAUJt+6;(y@bNy8uV@JkHe
z>ES(w|J1|R82+?}hjkefJm=y1Qvki|zv$uN`7kDU$-{4O`&If2(L<J-9bOSc?7o15
zUpIV@*Y9^29`oYAE%=diQui9sKMOt%q_oz0jfC$o9<+DQAJR_G=aJ%{68s3cuVKNK
z)^VixUzfo3H_;=-|H1K(4kDIdg{==dygV4NedMi>^L!+p!%N`fOW?<pz$cf$>r3D>
z1lRiDx{Ch)T#5LzT>NmS!<SwMm1mFn1<!dUcvh6aSCzoiCGZ}>bzBwgQK3cZuFIMd
z@%oE<)wv}M8yfG+OT_DST={p}E!#bFO50mX#DCNHUpIc#=UXM>?=6A<lkrS2Kd9$}
zhVSz9>wQIiJ?r6*8s6gJKNWnWarkqKk9qOW7=Dwd=LN$b_u^kH!9OUt*7M``80?<I
zrR}RF;@>v@tsehq8G`B;Z)7Ie@oK?G@-@NY8*S(!{shA#9)7anE3E?&Kh5w)51#`b
z4{tJDrRoBnR>OD8n(<YJ4|sUS@H;&G3d7fXc%R`bJ^W_Fr-W&T^xSGVY6qR)5M2AK
z%06&+?}eo8oq{9QsNCxA68Jwl{?WmD3!ZXpxbfo>@!Lz_Pn5u)bNuDOO5<;_^=`v=
z+rD)}@RH&CJ^c5EziBwmy90(xDck~{GTG2^7(Qp|d$(};q~Vob{HcZ?=HX`;KF-6J
z7#{KP^9`@{@KuI~&soEIIt)L}i_aQ9$HOl(ywPyf;Y!0}9)7Lixc(x(-*7yC0{^Pv
zD?Ogu4e#{uZyDa>;rAN8*2BMV_<9e2(D3UFmtD8~#Be+}0)N8rJ1idO#m@{M@bIS%
z-|gYQG8{h`1J7>_kH~dS7vS$1-ss`48otuQ-!*)_hnLHJQOD;U9$sVkfQKJt_-+pm
zY{Dv?X|vDn>!$`?=}fA=A-GgOX0D$S)HlzcsxQ;@WxBr1&=+^0LAPe=%irkBY5LL-
z9&v{nT+0U6w86D)aE)iVK-aLrwQLBRI(%xF(-BN{1XCTsR7Wt?5lnS$r@FRNUE8Uy
z?KIbRnrk~Pl<nG1b8V-&w$ohOX|C-w*LHe{JBI0wVY*|O?ii+r&K<*a*LJ#VJ3Vac
zTFr2+X1G=}T&o$b)eP5ahU1^%+RkuoXSlXAT-%wh?M&BprfWOXwVLS|X3hw721X{e
zY;r|+6P-D?>RA_DDj&V;b|KppTq>X02dT^&iOsa9JA$^hyqh^`Z%cHp4%)La%NXc1
zM5fS|Z0{=AcCaR%Q<82sYbjjGR4ylN+o^48ZCWPA3Q3uZ2$HMXGBQ;q5ec1IYg^SO
zlbu0cb7%9#_V%D%rYrJ6dq+xN6J0VD7__%{74w~eo4k>R#N3n1b_DXGb67GH7f#Yi
zQ`ro3u1qHAdSt?~Lp^qh{$!hu%OI^YIzg9v&1I6^VF{Abq|oL*zz+)Um@;OPfy|{8
zdxGxOZSj0UCTmiGtlQj1I9u2z0Wu>PPqyiVmp6Cl4t~J=p}5KF9D|GNYQsdL?Yiki
z+s34#?YgN&x1DIy2}ZYhsm?IUi*`sl6ST^NL9jTrwy;#DWdfSc2?}ZH1gTU{I;lNf
zh$mKSKey?$b0D4Ap4XztY)_Ea8POoWI(1o)OzFgmB<j$aI-T@P<qB<j%BN+n&d0Mk
z84LL#^w7q6v^KK545Y5KE2>me8l{q|Ak`xxGi|v{hg<7}ce{?Ce30l&cO`Sur!sja
zomuSBwt1P-)d?}}gJO3ko(npqTT(e0zPUsyu(@UFE13dJ$*NtpI)e_GJe3K-U`<bu
z=?UV+wKC-#*U8O5W>mY=(s^z!IG5_l=F{OAizk&UAH<WFg!6AQM=bMm0_0>myGxo5
z{|;9#oARg-^vd;EmrsQ&_grw6Y|B5_;g%VM>t~U`F%b-&^~ST9;+dzt1b(UEKF@b3
zUOAEXA1Hna;rOf_JYO<B+bLe}>EL;SaQ$=%c=-8R_&lP|X28SG#lqiF>l7F8w}?mU
zS+KmL>AtO>)gnIpJS*hUw-&&Uuz2VPPxyIOsAnm~pF;6mhna-8Qv4Z&FC)C!aFwl}
zJws25;`K9Z;0205kMJugUh8nV)q$U1IQ~X29)9m3{4nDGN5b_HA9{XFxPJBxe7oT)
z<9x!OB>V!x|BZM!FP@LIUpVhRis!s<5I^S)lt32M!+G`d6kT-1dG+%YU3A6cN<UB0
zMOW4G2frs0&UzXQ_v>&v;jAZ0JUmWfgzI;ikarp3Um*Nq;^#VCLh(FKvK0S$;_0V&
zeIJLsw-e5JzfC-xcN@iDNIW|U*Y9S5|7V0V|MMmI-=KK??iF~(s7YD$F!R(H?)S@7
z!uh(V@7r}z9`2VY#p`E!$lF2j++TUZS&zOy*TvU!8^!A!68OJQ_{D@jKsfi;0O5Z{
z@lO)YdY&=d*Yn#F@%t#gjriZ8c-DWI63e3X94~+HHGy!}bDZJ6o;ehM6!DxzIP;ua
zf~T8s?sweBR3~5Oalh4OeBItiJUkxmF<g0$CY|>a&ex3r;^Fh?e^5O8`X$A4{eMUC
z+7-Aj3{w0`(lg3FA61<kUq$#QDE?@}m5!e`CQ&@=oI>%e^E|@i#D7r<o>dgTisIK(
zJdgid37<gmcM$$b!tW#gB;of{{HF;2G2zD${s`gvQ5MGeFG}z~NAW3&{|({C5x&0!
z&z~s%c#40Q;`NyjdPdvl*4pnU5MF7x_AkfRP<({q$CrpdhT=a>@sSeoCsO=pDE^cZ
z@zW^&vlM??iTE=pzLw&fO2nT<@e?V&r9}L4ivMehUr{1HLGdS2d}oRHEX7Zv_(F;J
zFH!ttieFzM{#uGZiQ+exi2pl^KbhihDG~ooia&+o?<x`hU5c-x_$?*ke@OB56#r0(
z_~$78REqx%;rbkYtUX`8PCT6ye~bpnqUTXs{@%3ragzwYL~xvUQw-PhYc=7s2<PYa
z`NY#j@hueJOn4i`^Yd*x;rv{kBb@I?mlHqVkFFt{pJO)@57*%~4}Zt%^KIfukp3-%
zGye~XhwJ$;#dAF$C!Fi~bK>cyyuYS+uIGPHJo|cwaIW(x`ygA#1J~Joc}y_Hxy}<T
zUdIF1`7?xb9VQb$*P(&pxel`l=Q=bIPeyWMe4az`T!-ZpuXR8_UP3t6;Znl64p$J)
zb-31WzYbp^oa^v4;^BVz4&hv%`-q3@vz2hJ&vxSB_b*RTJlE%!6wme9LpayxRl>PG
z?-0)Q8B--im&LD74dGm$qYc;b#&tNJ;<*kd63%sKARd0-F_+@G4owuFmfRR8%L(T?
zTueCE;S$2R4h6&gI$TLO*WpIu;X2$*@%;S%b&BWrFW(|OExvG_e~@tIf7o!<$^4H}
zd{&O(y!#d5%<}^A^icfoC|*C)1J7%O=Lmm`@F|2>jFlZ(beuHEA9xNYd@A8b5zaiv
z60YCf1kYy)pHBFxgmeGSGTiUq1r&b?@tjTZJf6=ZoX7J@;?d7%p|hLfc|6PSirk|0
zNy{Jly^nAn&o>dy<N0>N+3&Xv_x)}moX7M1#G`s}er+#-KSg*!;^Fss!u2y};4d4l
zdXj{{N;u!I-y)oK9{q7?EQ|82kw5SsXSmk$GQw*KpGkN<;eSK;OyX}LypiJ9626e|
z(+F=So^^yTC!G1?#Pdap?<9N{;oZc;{A&qk{;P@SOT^Ph_-w*&ARgwwm2l?2lXxyC
zo_h#Co$&jJhxs2QocSLio-2swr-aWTd>8RB|MP@1|9=qAmBh1;@Xrzc8u2jyyM!};
zg`Ui^==l7y{9)W4PWW8Hk2GAzIrAS!IP;%KJnM<4p71jWpGG{)KbLUkj}gyR#IuC(
zM#7g75A(MX&irZOxte%-2#*q8Bp&8pPdM{mPdvTE^A*D95&qwZhxz|I;mrSC;@Lnv
z-zR)N;XfoE=HE^@^FK*E8;R!^gf|iXEb%b^ZwY7q-xJR@#Pd4g3kZLUc$oildUKLR
z$9YQrFwUC@Ka=pY2<PXGa|q}A;|jvNga`M{l;L{5bP)e)is$><WfWhe_$$14=>IC=
z{CxGD68QH@;Qvhcb&^;0>hh3>*M@NLFCLEgrwrHqKKX;rXG+8m63+Vf6TXmm4ibNo
z@G;f0Ba8O0))RS;BwU|w@%(TC;ryJiknrn?r<HL2JpTg1`T6-;!?g~JiRS^r&msI_
z;$i)dQT*8y{{qF2Bm4m269|8c@EGA^YGg+its8%iU1PZFKjeKn;hW?b>U=8Un+ZRi
z__KsBBzzI!&BW7A`1usCw=(2yBfOjNB=PXP_Em(hr1-B857*&Vis#Ss@1}UY<Ur4T
zgtMOe3D>Sb{C2|mIr&M${rW#c_yxrCBJuFJeT8ryw|^p>$8A&xlPsE-$LB)B{qt)H
z#q;>Qs6_lNg!6g#Bf@$7?<m3l9L2M*cPXCrk2=EHdf)Y7tl{dH^&e01Jl-Y}&hMvA
zA|4)ZO%%`L<SfFOXDQ)4PA(>#$4RH*{y51I&f{bq@$fjmisE^ke}(XsQX`DpI|%P5
zTsGaJ<3#^1RAI8%%3+GP8DJm(4B-jGwN|=-N58kLi*@f1zS?jd1G*qyzt5|SUITRj
zo;O_Qs&oOa-}BWa;$DYw%#YT3_;!n*<l&DReu{_xi{Vo|{9g^9?%_W-T;IFt0-eto
zuJ0*y0sfrfjSd~g|K0E=4}VA-%A)rIUBL6Y;msjDguiS&Egs%5Is}$^IDU_$_YPga
z|Az6b2;m|8ZNpc3xPJdvmt?paipM-|r-xUJ34tyTA8UAzhgTb3@NoUzfG%sp)llB|
z48PpNKV>}YJ$$9<-{9e&vH0se{3?sz?BU`6xJ&*&t{z@*JU4szRKsua@R^3+?%^1>
zcX;?*i@(dm=NW#tho5Qqy&nE~!?$?&V#9yn;Y$tQ>fxA2f5^kXVDSSUezD<?csRcA
z^0<c=tbd>M@Jozmmxt@`IdpkCTn&wroW(!u;cE<k!Nb33_-+rs((vDT_|=B*@$hR5
z-{;}i8@}JeZ!r9I53jOu_@;;J?;dn{H(U+*{kri4Hh+loeYeHqcOk(4-r_60_%g!}
z^YD8N$L~VG^F71Id-2~l9KRbwJigz6c~0OMw-Jx$tCgWq{JsS7yX<qoNnZTZhM(f$
z&l*0(!(T9bx`*#J9KTaR-rpIH->CrKV|b&-v(NA*58rQi%)?(dyxGIwG`z*b-!*)h
zhX?j~6MpXko#lqF@Zu{CU+Lk88J_g;afWw#_;|y+JUnIdmpvX{G#uYMh0cFB9KWXl
z{+i*JdpwgY?|KhE#qbRtKE?3sJp6DQZ<{?lV)#uSKF4@&_HcYIe2a&F!+37@@R;%3
z;o-Ga;rPGH!@p+u-5x&H#>2fHe!lT<@$hLD{{s(C7{1lRpELX+569=x10KH2@;>6>
zJ%&H-;oymQ*Eib@=6HCo?ceU<_#Li)|HAKQ{p&4$2O9D8;P-|>eSTfHeCuD<^2-&s
z?!;Cu)fKNV$PXTZdigy-Jg8qK|F~OyE~`5l-XOnCI%HRVO5@suAw$O!`GOp=U(E#d
z@{d2I>gC_csdGP+u>Foqv0i?ABEPCAtTUEWXIs1eOKd37|Is=kt+fR9^^+m4M%%i}
z-gn@Dl?%Ipem6~TIgLX|xN*~B18T#yzWvp<b~+5bPxx!Y?Y15U>+6x>=9XiJA>E1`
zx*MD#`^xFJAE*xlH{1OX%_7*ab%U%`m(D@*g7*77!`Q{TQ5ucZe#?}Q4ez7C4gYAr
z%-ZYqj+Y5KX(r2)2n=_J-q+c#`wr=d_0NS7kK6gH_18-&@=q{=6|&Y8R&GP(|IDb9
zWU`D4S3|F8kF|bwpzY1>P~4l=Ui*g^*uh$5Vw>Y?buw;*#gDyOV$_#kze(0U;&DJX
z;IGSua{BFK)*kmG#QW{PDhEcY|M^#hEpD=gTK~i45A9L^k#cL3fz$Lg2)0-U=<4(G
p=eO6}f<N{J3VIglYY-f7&t>7>`)jSI_ekycnEt!a(AIwa|2HA8)Bykh

literal 0
HcmV?d00001

diff --git a/data/ssb/dbgen/rnd.c b/data/ssb/dbgen/rnd.c
new file mode 100644
index 0000000..a159446
--- /dev/null
+++ b/data/ssb/dbgen/rnd.c
@@ -0,0 +1,262 @@
+/* @(#)rnd.c	2.1.8.2 
+ *
+ * 
+ * RANDOM.C -- Implements Park & Miller's "Minimum Standard" RNG
+ * 
+ * (Reference:  CACM, Oct 1988, pp 1192-1201)
+ * 
+ * NextRand:  Computes next random integer
+ * UnifInt:   Yields a long uniformly distributed between given bounds 
+ * UnifReal:  Yields a real uniformly distributed between given bounds   
+ * Exponential: Yields a real exponentially distributed with given mean
+ * 
+ */
+
+#include "config.h"
+#include <stdio.h>
+#include <math.h>
+#include "dss.h"
+#include "rnd.h" 
+
+char *env_config PROTO((char *tag, char *dflt));
+void NthElement(long, long *);
+
+/* Store in *tgt a uniform long from [lower, upper] drawn on RNG stream `stream`, and count the draw against the stream actually used. */
+void
+dss_random(long *tgt, long lower, long upper, long stream)
+{
+	/* UnifInt() falls back to stream 0 for an out-of-range id; clamp here too so Seed[] is never indexed out of bounds */
+	if (stream < 0 || stream > MAX_STREAM) stream = 0;
+	*tgt = UnifInt(lower, upper, stream);
+	Seed[stream].usage += 1;
+}
+
+/* Reset the usage counter of every RNG stream (0..MAX_STREAM) to zero; the argument t is not consulted. */
+void
+row_start(int t)
+{
+	int s = 0;
+
+	while (s <= MAX_STREAM)
+		Seed[s++].usage = 0;
+}
+
+/* Close out a row of table t. Each stream tied to t or to tdefs[t].child either has its boundary
+ * raised to its usage (set_seeds on and usage above boundary; reported on stderr) or is handed to
+ * NthElement() with the unused count boundary - usage -- presumably to skip the seed ahead; confirm. */
+void
+row_stop(int t)
+{
+	int s;
+
+	/* need to allow for handling the master and detail together */
+	if (t == ORDER_LINE)
+		t = ORDER;
+	if (t == PART_PSUPP)
+		t = PART;
+
+	for (s = 0; s <= MAX_STREAM; s++)
+		{
+		if (Seed[s].table != t && Seed[s].table != tdefs[t].child)
+			continue;
+		if (!set_seeds || Seed[s].usage <= Seed[s].boundary)
+			{
+			NthElement((Seed[s].boundary - Seed[s].usage), &Seed[s].value);
+			continue;
+			}
+		fprintf(stderr, "\nSEED CHANGE: seed[%d].usage = %d\n", s, Seed[s].usage);
+		Seed[s].boundary = Seed[s].usage;
+		}
+}
+
+/* Print "<stream index>:\t<seed value>" on stdout for every RNG stream whose table field equals tbl. */
+void
+dump_seeds(int tbl)
+{
+	int s = -1;
+
+	while (++s <= MAX_STREAM)
+		if (Seed[s].table == tbl)
+			printf("%d:\t%ld\n", s, Seed[s].value);
+}
+
+/******************************************************************
+
+   NextRand:  Computes next random integer
+
+*******************************************************************/
+
+/*
+ * long NextRand( long nSeed )
+ */
+long
+NextRand(long nSeed)
+
+/*
+ * Given the previous random number nSeed, returns its successor in
+ * the sequence nSeed -> (nSeed * nA) mod nM. Started inside
+ * 1 .. nM-1, the sequence stays inside 1 .. nM-1.
+ */
+
+{
+
+    /*
+     * The routine returns (nSeed * nA) mod nM, where nA (the
+     * multiplier) is 16807, and nM (the modulus) is
+     * 2147483647 = 2^31 - 1.
+     *
+     * nM is prime and nA is a primitive element of the range 1..nM-1.
+     * This means that the map nSeed = (nSeed*nA) mod nM, starting
+     * from any nSeed in 1..nM-1, runs through all elements of 1..nM-1
+     * before repeating.  It never hits 0 or nM.
+     *
+     * To compute (nSeed * nA) mod nM without overflow, use the
+     * following trick.  Write nM as nQ * nA + nR, where nQ = nM / nA
+     * and nR = nM % nA.   (For nM = 2147483647 and nA = 16807,
+     * get nQ = 127773 and nR = 2836.) Write nSeed as nU * nQ + nV,
+     * where nU = nSeed / nQ and nV = nSeed % nQ.  Then we have:
+     *
+     * nM  =  nA * nQ  +  nR        nQ = nM / nA        nR < nA < nQ
+     *
+     * nSeed = nU * nQ  +  nV       nU = nSeed / nQ     nV < nQ
+     *
+     * Since nA < nQ, we have nA*nQ < nM < nA*nQ + nA < nA*nQ + nQ,
+     * i.e., nM/nQ = nA.  This gives a bound on nU as well:
+     * nM > nSeed  =>  nM/nQ >= nSeed/nQ  =>  nA >= nU.
+     *
+     * Using ~ to mean "congruent mod nM" this gives:
+     *
+     * nA * nSeed  ~  nA * (nU*nQ + nV)
+     *
+     * ~  nA*nU*nQ + nA*nV
+     *
+     * ~  nU * (-nR)  +  nA*nV      (as nA*nQ ~ -nR)
+     *
+     * Both products in the last sum can be computed without overflow
+     * (i.e., both have absolute value < nM) since nU*nR < nA*nQ < nM,
+     * and  nA*nV < nA*nQ < nM.  Since the two products have opposite
+     * sign, their sum lies between -(nM-1) and +(nM-1).  If
+     * non-negative, it is the answer (i.e., it's congruent to
+     * nA*nSeed and lies between 0 and nM-1). Otherwise adding nM
+     * yields a number still congruent to nA*nSeed, but now between
+     * 0 and nM-1, so that's the answer.
+     */
+
+    const long      nHi = nSeed / nQ;   /* nU in the derivation above */
+    const long      nLo = nSeed % nQ;   /* nV in the derivation above */
+    long            nNext;
+
+    nNext = nA * nLo - nHi * nR;
+
+    /* a negative difference is brought back into range by adding the modulus */
+    return ((nNext < 0) ? nNext + nM : nNext);
+}
+
+/******************************************************************
+
+   UnifInt:  Yields a long uniformly distributed between given bounds
+
+*******************************************************************/
+
+/*
+ * long UnifInt( long nLow, long nHigh, long nStream )
+ */
+long
+UnifInt(long nLow, long nHigh, long nStream)
+
+/*
+ * Advances stream nStream by one draw and maps the new seed onto the
+ * integers nLow..nHigh, endpoints included; the bounds may be passed in
+ * either order. An nStream outside 0..MAX_STREAM selects stream 0.
+ */
+
+{
+    double          dSpan;
+    long            nOffset;
+
+    if (!(nStream >= 0 && nStream <= MAX_STREAM))
+        nStream = 0;
+
+    if (nHigh < nLow)
+    {
+        const long nSwap = nLow;
+        nLow = nHigh;
+        nHigh = nSwap;
+    }
+
+    dSpan = DOUBLE_CAST (nHigh - nLow + 1);
+    Seed[nStream].value = NextRand(Seed[nStream].value);
+    nOffset = (long) (((double) Seed[nStream].value / dM) * (dSpan));
+    return (nLow + nOffset);
+}
+
+
+
+/******************************************************************
+
+   UnifReal:  Yields a real uniformly distributed between given bounds
+
+*******************************************************************/
+
+/*
+ * double UnifReal( double dLow, double dHigh, long nStream )
+ */
+double
+UnifReal(double dLow, double dHigh, long nStream)
+
+/*
+ * Advances stream nStream by one draw and scales the new seed into the
+ * span between dLow and dHigh (given in either order). Equal bounds
+ * return dLow without drawing. A bad nStream selects stream 0.
+ */
+
+{
+    double          dLo = dLow, dHi = dHigh, dScaled;
+
+    if (!(nStream >= 0 && nStream <= MAX_STREAM))
+        nStream = 0;
+    if (dLow == dHigh)
+        return (dLow);
+    if (dLow > dHigh)
+    {
+        dLo = dHigh;
+        dHi = dLow;
+    }
+
+    Seed[nStream].value = NextRand(Seed[nStream].value);
+    dScaled = ((double) Seed[nStream].value / dM) * (dHi - dLo);
+    return (dLo + dScaled);
+}
+
+
+
+/******************************************************************%
+
+   Exponential:  Yields a real exponentially distributed with given mean
+
+*******************************************************************/
+
+/*
+ * double Exponential( double dMean, long nStream )
+ */
+double
+Exponential(double dMean, long nStream)
+
+/*
+ * Returns a double exponentially distributed with mean dMean, or
+ * 0.0 (without drawing) when dMean <= 0.0. nStream is the random
+ * number stream; a value outside 0..MAX_STREAM selects stream 0.
+ * With u = seed / dM, the result is -dMean * log(1 - u).
+ */
+
+{
+    double          dUnif;
+
+    if (!(nStream >= 0 && nStream <= MAX_STREAM))
+        nStream = 0;
+    if (dMean <= 0.0)
+        return (0.0);
+
+    Seed[nStream].value = NextRand(Seed[nStream].value);
+    dUnif = (double) Seed[nStream].value / dM;        /* unif between 0..1 */
+    return (-dMean * log(1.0 - dUnif));
+}
diff --git a/data/ssb/dbgen/rnd.h b/data/ssb/dbgen/rnd.h
new file mode 100644
index 0000000..a8e8d36
--- /dev/null
+++ b/data/ssb/dbgen/rnd.h
@@ -0,0 +1,80 @@
+/*
+ * Sccsid:  @(#)rnd.h	2.1.8.1
+ * 
+ * rnd.h -- header file for use with the portable random number generator
+ * provided by Frank Stephens of Unisys
+ */
+
+/* function prototypes */
+long            NextRand    PROTO((long));
+long            UnifInt     PROTO((long, long, long));
+double          UnifReal    PROTO((double, double, long));
+double          Exponential PROTO((double, long));
+
+static long     nA = 16807;     /* the multiplier */
+static long     nM = 2147483647;/* the modulus == 2^31 - 1 */
+static long     nQ = 127773;    /* the quotient nM / nA */
+static long     nR = 2836;      /* the remainder nM % nA */
+
+static double   dM = 2147483647.0;	/* the modulus as a double */
+
+/*
+ * macros to control RNG and assure reproducible multi-stream
+ * runs without the need for seed files. Keep track of invocations of RNG
+ * and always round-up to a known per-row boundary.
+ */
+/* preferred solution, but not initializing correctly.
+ * NOTE(review): ?: binds looser than +, so the body parses as (len/5 + (len%5 == 0)) ? 0 : (1 + 1).
+ */
+#define VSTR_MAX(len)	(long)(len / 5 + (len % 5 == 0)?0:1 + 1)
+seed_t     Seed[MAX_STREAM + 1] =	/* NOTE(review): a definition in a header -- confirm exactly one .c file includes rnd.h */
+{	/* initializer order is presumably {table, value, usage, boundary} -- confirm against seed_t */
+    {PART,   1,          0,	1},					/* P_MFG_SD     0 */
+    {PART,   46831694,   0, 1},					/* P_BRND_SD    1 */
+    {PART,   1841581359, 0, 1},					/* P_TYPE_SD    2 */
+    {PART,   1193163244, 0, 1},					/* P_SIZE_SD    3 */
+    {PART,   727633698,  0, 1},					/* P_CNTR_SD    4 */
+    {NONE,   933588178,  0, 1},					/* P_RCST_SD    5  UNUSED 2-4-98 */
+    {PART,   804159733,  0, RNG_PER_SENT * 3},	/* P_CMNT_SD    6 */
+    {PSUPP,  1671059989, 0, SUPP_PER_PART},     /* PS_QTY_SD    7 */
+    {PSUPP,  1051288424, 0, SUPP_PER_PART},     /* PS_SCST_SD   8 */
+    {PSUPP,  1961692154, 0, SUPP_PER_PART * RNG_PER_SENT * 20},     /* PS_CMNT_SD   9 */
+    {ORDER,  1227283347, 0, 1},				    /* O_SUPP_SD    10 */
+    {ORDER,  1171034773, 0, 1},					/* O_CLRK_SD    11 */
+    {ORDER,  276090261,  0, RNG_PER_SENT * 8},  /* O_CMNT_SD    12 */
+	{ORDER,  1066728069, 0, 1},					/* O_ODATE_SD   13 */
+    {LINE,   209208115,  0, O_LCNT_MAX},        /* L_QTY_SD     14 */
+    {LINE,   554590007,  0, O_LCNT_MAX},        /* L_DCNT_SD    15 */
+    {LINE,   721958466,  0, O_LCNT_MAX},        /* L_TAX_SD     16 */
+    {LINE,   1371272478, 0, O_LCNT_MAX},        /* L_SHIP_SD    17 */
+    {LINE,   675466456,  0, O_LCNT_MAX},        /* L_SMODE_SD   18 */
+    {LINE,   1808217256, 0, O_LCNT_MAX},      /* L_PKEY_SD    19 */
+    {LINE,   2095021727, 0, O_LCNT_MAX},      /* L_SKEY_SD    20 */
+    {LINE,   1769349045, 0, O_LCNT_MAX},      /* L_SDTE_SD    21 */
+    {LINE,   904914315,  0, O_LCNT_MAX},      /* L_CDTE_SD    22 */
+    {LINE,   373135028,  0, O_LCNT_MAX},      /* L_RDTE_SD    23 */
+    {LINE,   717419739,  0, O_LCNT_MAX},      /* L_RFLG_SD    24 */
+    {LINE,   1095462486, 0, O_LCNT_MAX * RNG_PER_SENT * 5},   /* L_CMNT_SD    25 */
+    {CUST,   881155353,  0, 9},      /* C_ADDR_SD    26 */
+    {CUST,   1489529863, 0, 1},      /* C_NTRG_SD    27 */
+    {CUST,   1521138112, 0, 3},      /* C_PHNE_SD    28 */
+    {CUST,   298370230,  0, 1},      /* C_ABAL_SD    29 */
+    {CUST,   1140279430, 0, 1},      /* C_MSEG_SD    30 */
+    {CUST,   1335826707, 0, RNG_PER_SENT * 12},     /* C_CMNT_SD    31 */
+    {SUPP,   706178559,  0, 9},      /* S_ADDR_SD    32 */
+    {SUPP,   110356601,  0, 1},      /* S_NTRG_SD    33 */
+    {SUPP,   884434366,  0, 3},      /* S_PHNE_SD    34 */
+    {SUPP,   962338209,  0, 1},      /* S_ABAL_SD    35 */
+    {SUPP,   1341315363, 0, RNG_PER_SENT * 11},     /* S_CMNT_SD    36 */
+    {PART,   709314158,  0, 92},      /* P_NAME_SD    37 */
+    {ORDER,  591449447,  0, 1},      /* O_PRIO_SD    38 */
+    {LINE,   431918286,  0, 1},      /* HVAR_SD      39 */
+    {ORDER,  851767375,  0, 1},      /* O_CKEY_SD    40 */
+    {NATION, 606179079,  0, RNG_PER_SENT * 16},      /* N_CMNT_SD    41 */
+    {REGION, 1500869201, 0, RNG_PER_SENT * 16},      /* R_CMNT_SD    42 */
+    {ORDER,  1434868289, 0, 1},      /* O_LCNT_SD    43 */
+    {SUPP,   263032577,  0, 1},      /* BBB offset   44 */
+    {SUPP,   753643799,  0, 1},      /* BBB type     45 */
+    {SUPP,   202794285,  0, 1},      /* BBB comment  46 */
+    {SUPP,   715851524,  0, 1}       /* BBB junk     47 */
+};
diff --git a/data/ssb/dbgen/rnd.o b/data/ssb/dbgen/rnd.o
new file mode 100644
index 0000000000000000000000000000000000000000..51f039b99d6f87cae74f2d5fdc726f5d2e2c9fba
GIT binary patch
literal 10608
zcmbtZeRNdSwZ9VvGDbRg@H2klP=cLE7!rs`5G0cia_2D#U;z0D#F$JVC10J~2}o<8
z9RuEmXnp!zU4313b-m}SEwxYA;#I3Yoe2q{R{Fqd8}J*gCLJR*pk#rP?tSjrGdtHI
z{o|drX3jak{dx8|d!LiZuF}A=G@DJKW>fA|k~2aTC3jYGTquTx$~48LoKdHr*SaSo
zqILf$It99ZLDnKsw^nd8eEJ}=_}2Q?_*So}jEqguZ;6bhKaV_W$*9(S6OMc@;dh|+
z)QWEXX~iG=*dAZ_iXEW%yOAYGQuK^KSE(a#N2zPq60Pfgmp1!;hh}^>h%{fM*QP}d
zx_yx^7wCS?ct`6h9t{)>>Ib#%tC8VfYFE!IL}%WNKT~3SAkmj`om!XBFZ>b8G%%wX
zrLOodH*WG(`8N4Bn+p)`YmrYKKI2QRdk(675x)ZFI6gIU5S-|9M=mVTXZmY<wSrP5
z9M`(a9ma`(@!2bAENLZvOpEk7ObbH4_QHJ^KD_zAC;!5y8Kc^{ziF}aDvn~KOSSAC
zt)b_pGd4xb?$;Xnf8hhS8(zQaa_F*;Le$5`#&&6{)?+#fQPzyFG~<jGIln+y)YRgm
zTEV%FDVp)QW?ax>CnpEG#eVv2Z}_{um*o&E_ZuZy4lO$lp|sVjr=la8dh0<L{2m8~
zN7T*V!1a!(>tg`qL^azW`Y()1ljk(!&zkWE&3M%erBMop!aN(X4<~Dpl1!Iv<0g`T
zSk`N5$H5VG+SDv9GL~?5v|jbbcBbi7KyHqYsIIK6dUfun?HS1aA+YCUX|Ycx%U#fH
zU%`PemjD~*AlIYdv9Jvare>k4Gb3u%ABF$wx<h9Cg)fz`tu9cD%OCH?sVEo*xNW^@
zP#jjxc0ud<E*UIFgVqqaq~WTV2t>W=f%v&VQN4P%B(0Bu2d<46;As3T=7{7HHeh_T
zUS0hskeY>2hlrePpD*GL0<Yta0Nv9wqV~k5B$-zC!V-b7<~S1T)wQ7Bb394+90R3q
zz+9YPQ2Mv_*{C#@Pznl9phC&8`kfJV-3d_m0VsPLlpQk-cwif<0Q0;G0<WckR#36#
z55N>NPvPB^@eo^&h^-4^o7Um<8yEe?nD2ivAL7+ucMPa-1i_|=f3Jt~&R31W`2xs#
zmLvWijH4rd_0~hMuduhS+swGkJ-z+7-xy9R3K*k->_IIuYU|j)yt`zHq8WW!Y<Q_=
z>(h>XCGZ0;qID2?3zS#kuE^64Y6Dp=zGrf`Hpd_i;Gb9toD#_Of_Luk=ROfNQz$np
z+60PAvNB;I=J^o&>bLch-opXobRheEl7>SFCk&PjhZh~8&6zu>?(MrcHdbEnaX>8}
z4yb{zhAzTf6x>9;f?^=j40c;AW~Vzm9Ta|qXXul9zuNH;d^LmZL_HE|2khbu?3|)L
zupgrZ2?YD=6-8j@Wq~enF7<<me+amkCh`5_l_dBdHFsW$Yn{U#-|<KKY=MGdb)OD}
zOspA}+9&L#+zE8|OW2N4bRGuDlhA%(D3^4=>iUE@aEf&!ckhr3=+gL*9mXP7{IF(<
z1A107{sbO$^!be==n{JKCe(r+b?-58bmoeSMcsQ{nn+G__CvhJ)Xv|+H`Tq!;-iWl
zqFGnYQNQsQD9pF1d*g5hB@Sy?eAG;X1g9M;yU?oLVLzchfSM%`=vsUNZdV)84Le*g
zqp*LOEOTA}e&uDnLSMm=3?ydLt*3EepC?RD%O8lRYh7Zq{l<G*<N`QXk#tUrEFMtQ
z2R0-TaQDUmGjo#e4v;(OZs#fpt)JD>0XU=T1Noq%<1#qh<;KT;qgOKqwAlM_7l|v_
zZ|nDW?V_uw%n>k72C_%ANRLgN3D{34XkV+ea2W;EoY8XIIq6EkSUeCg-Y+-KCc>3}
z!XG(ggM`AUm3Km-R1DHNJA^xXByr9N?t@Gd2@UxsF1ya?3*fw>sTDo6P>4G%zu!I}
zsODuCF!Be68Xys6REM6#u>stoW?7yBLyrL!z32-Vr-UQPt+5XbJqXlt;}clY&}=%d
zc*Vn&1$Q}A;KS%M9L?Bv4}6RCRQEmtz9y3Os_+#Fm9D9yaXZG5UI1P8x3%uAi5MSB
zaBjBa9zzgBOcLpY7AXQ<5P{C<?_^+N8briz<h%{%V_fX;p2L>{nvL(p<M{Q4I(W0r
zD}MYcWLmOW>2u3_o03%JYj4<33#OM%ME~ahZJwQq4j~uq^~NV}Pemu}^WC4#r7`xa
z<b?dZdc#j@IUO?-8_~Dc{_;)}F~6<!^Oq0noZjjf&-dtCxA-}o^pc)```6b>C!`<I
zU7S83CnVo_{;{Q;PWnY@1;YH|9aA|yT~3I8d$n`MMD(Hsj&bp~%1{6NMGC1<W2Fzg
zR^*vT|5N$>+a{tv(ed8*CZhk(&Fu{n(YyY7N+dV&Sp7Tj_>yaghP{>k*4dt0C!()y
zUhd=cXD696*(uj;>dfc#%jAUU>5YfiaXRfE(WAfn)do%{ociDP9-hYOR{1^Jx#EfF
zS6=tT3QjMP3Mi?^nywwUe)6B__YFP@e_JLS*(dtT!RN-UpZq5J&u^+KNJ=O3j_5Ow
zkBsv#n+!;y=4EB${Hu~U<;RvIg)>v>r}O{EeJ@{^icaz?PWR4BMVD!=q(3|(i_@Qx
z{!yG(g~IDo(Mi8=!=5#qPRpb9+a7*eorr$zv+<mX=yQVCjLUzketY`<aq%NM#mAa|
zL?@e{9D8Jp#@G{`_W9a}0=p)n4`2MEZ6f+$M&F)Pbc)aVjaRHqMbD@#EiG{uYrcxI
z(napJV6blEth!C!a9hompnHjXR$YcNt8UR{vl?NrVnva=vIMXhGcuC8qHw>w=QHQ0
zWsS`(o>P2d{N#e`mmaCT*|UGu`K69u-?*Xfjx$SlzWbBV!c~7?>RMND;Bfm%s>y|>
zyhHXb*<U4lWw#=myv~xbDl0Bt<o2w&H{7g;-FbQ5d~fcYys((&b<CUR&CSo2u+V_#
z2`>uVx3Y>g3dARh812JVFve!vd6#0_;j-N@HQn(e*esN}QBV|x5@jLGb0rqTewowt
zP+E~QvwKpJ)4k8`b9(kn_BnGSQ_`MxdVEedkcynHA}1sTCVQily$5@c?MB&slYns#
z%0?!aICImM3rQbIq96Z>{LuVnr{KjQekg%Hj2P(Jmj)txP*=pB*5J(aNxN<!e}p{H
zM8cLjU3-u>GU?k+mk(W2vf$&82Xr4YCV5Jn?zCM(8?d^8lq>o6O1?-Mvb1LiUB1gj
z+*V7T*N_LA2A{1G`#$#I@58wEhcIs4_9CZeAI5FZ6rXcpB)!CWPjiNIApj5*U`Rv}
z)GN3L2c)hXsVh|+w`GivBg&#5r={#Om?;0ukCkaJUdj>N&&4v?dSGnb1Ki5}8BRAu
z+T03sV1kBj*_FSKrDB<ss2)kGc^^9z+&|vTqGm-E`0W!LRj;IkdC8Bvss!-{@lCgW
zlHV<HtA$wtcZmH@ax02iMAN^9n~%?QC6hD5*t`_el|l=?NX*j}w*@x~O1juvPAZlB
z9uC5Ixx`mmaPzU2p2%+^-!1v^-0`+aOwyHg9E5SLm`_pYo+EMt46b1NNHqnE{Io<h
zg5o5%Vu$1>zL-?>edw_!SX&nWsqn`yf&cUp_<vjie-?1`pK8fHFvs>1d{Wu{J>bYs
z?<PMoLwg-Qh<D?GhbHP5^e5nhc$&;9dR+Wf;`tI6^$D=gBp#Kxs7HXEhfgYhE=vBg
zNicv$waE^CpxrC&CNAm|46gw^m7eP*|6?+hL=6IdkHl&JM127q>*X}aY<fUH19NQ4
zB_5S?!fzselT9Ifqr`Vh{|RrAc%k%%@E=G#Dif9PhXKd+w#ozABkd@m<~nb!;tdqf
z$FJM)D;K}=@M|7^<>S}w__YAP7UEZ);4TnDLC-UVgnXWm%@e|TLMHEaCDgLLx=pVM
z=}IL$KIF8eRcQ<A)%YA~Q!4bWrH#R+V6(31b;0^Jr471JNU5%_Zw)mx>-E*OTem57
z;igt8P+iUNir^0YuA1gLWleKKy&rVcwY61;0BC6v&|SfrMx}H|YfCd2Xn=8J%NFn=
z3=)+sjqSl@Eur#oqu$Wk7z`<`E!&l{pe}(1Kq|Kdx1-jIa8pS`o8Hh|tFNtT3<ulF
zg3ZBDjSfpHZfOiRH6x>-D8=DW2yBT-F&NT84aZj3gti3raS)ji>|%$g660~lnr+s3
zMJbZ|N4Px3vJNTu-VrtSf_zBrmi)+v5Bx7B{&R^V|Md)ij^TL3i2oqN@vb2Jh{RDp
zzMlvml{ooHIOZQV#If`dK40RfXF9{huUas->gi|vH#7cWrXOQS`ZG}hG}P~g58*Q<
zZq?s$3H*ml54ZaS<L7oyF&y`i>|VHp9+%8_By&BP4965FJu@VZ>&jyIe8!J7;;&>l
zmIT6U8IF&C!XIQfxBC>sxt`}4&h;E;IM?%CbPgJ>7hg<d_f3go_brBh!1%cy^M42C
z{&Iewtdl4<i|J{XxYh1&7>+KJKkqP{+Z|yz<|y%BU^xEfNqC05@6oQ4;a4&o|Ij7=
zZ!#S3C&Fhl9N&6`FJd_Nvs&WTcy7A{e<#EFzWj{o$zpc@i}7E<@aGuL<9VLpS2BJF
z9(rgPACBL^aC~FZdb1fmo8dl&=P<mI;d2<inc-fBw=f*vuw=J`;atyC62~~Z8UIU+
zpZobL!+Ct(WjJ5&X@>K4rQv2n!?<yOzQJ(5u4N48b?GjNqyN0V?PUDu0>z=5;XFP+
zV>tJ}pXvV=)AI%6=YHDcyT!UL7sI)JH^X^6=Q13B10etJlsK-J-#aTA?qU3QGd(>1
zk1(9)=@Sg+=lNF*=jY)ihVym3!EkPOSmNkEkLM`k=kas{A2y69eyEv$hb8fulF%x9
z;{Qm_pR(Ze?#I6mun|AK`v*-pfz!KxD!Z>yhTi@7uQ+U^CyE1T>nu3E?;9;Rz3-p5
z;Pj3^Yr*LqUnuR6U3xbw-nRRi^qPC2*F$E%l}4dpV~tmbcc|j6tI=x|@4anpiZ|4P
z6J*cx!rPfo;hij}wQYKm1`~WX*S0i?$FR4#MGt!6N31#Uey-i7nDZ^oVQ*`wr8OAR
z?*lbxI=HpEK2*~bG$Zu?`*wpkhDMg>gt9_8nwOD)#6X5I0pi;b&thQ$F$bgg$b_~|
z=D8EbC@ZIl-vw@wIOZ;IyQN*)C!)D&AbZRW)M=G3G=C6LS}p#&r5ts6FhIZP?@_`@
zp7Mfl@}J^R2!vGfwbDM;THZ*G`V~OLG2WrP?UwTQa1h31pZY4mQpvwB<=qx}`kR;f
zZ%g@H3m-Xz@#T&DrXGLapfTsmu`Wmwt-KMy@qUqMm{5@`<=qLy9MHN6{&yIlPOJP%
zDc@=#SmhrF*;MCurQG<1(qTNmQ{hAMnJ~td$~NLrymX-b7?nfAH{R9ovC3oFww_&p
gly&mIGe5Ni%raGQn^Va@n1xyt<w>(kd|T!J0WSrE`Tzg`

literal 0
HcmV?d00001

diff --git a/data/ssb/dbgen/shared.h b/data/ssb/dbgen/shared.h
new file mode 100644
index 0000000..c1c18ce
--- /dev/null
+++ b/data/ssb/dbgen/shared.h
@@ -0,0 +1,140 @@
+/*
+ *  Sccsid:     @(#)shared.h	2.1.8.1
+ *  Modified for SSBM
+ */
+#define N_CMNT_LEN      72
+#define N_CMNT_MAX      152
+#define R_CMNT_LEN      72
+#define R_CMNT_MAX      152
+#define  MONEY_SCL     0.01
+#define  V_STR_HGH    1.6
+
+#ifdef SSBM
+#define  P_NAME_LEN    22
+#define  P_MFG_LEN     6
+#define  P_COLOR_LEN   3
+#define  P_COLOR_MAX   11
+#define  P_TYPE_MAX    25
+#define  P_CAT_LEN     7
+#define  P_CAT_MIN     1
+#define  P_CAT_MAX     5
+#define  P_CAT_SD      97
+#define  S_NATION_NAME_LEN 15
+#define  S_REGION_NAME_LEN 12
+#define  C_NATION_NAME_LEN 15
+#define  C_REGION_NAME_LEN 12
+#define  C_NAT_SD     16
+#define  C_REG_SD      3
+#define  O_SHIP_STRU_LEN 25
+#define  O_SHIP_MODE_LEN 10
+#define  O_SHIP_PRIO_LEN 1
+#define  D_DATE_LEN 18
+#define  D_DAYWEEK_LEN 9
+#define  D_YEARMONTH_LEN 7
+#define  D_SEASON_LEN 12
+#define  D_MONTH_LEN  9
+#define  D_STARTDATE 694245661 /* corresponding to 1/1/1992 1:1:1; NOTE(review): read as seconds since the Unix epoch this is 1992-01-01 06:01:01 UTC, so the stated time presumably assumes UTC-5 -- confirm */
+#define NAMTION_BRIEF_LEN  9 /* NOTE(review): "NAMTION" looks like a misspelling of NATION; check every user before renaming */
+#define CITY_CODE_SEED   15  
+#define NUM_DAYS 2556
+#define NUM_SEASONS 5
+#define NUM_HOLIDAYS 10
+#define CITY_FIX 10
+#else
+
+#define  P_NAME_LEN    55
+#define  P_MFG_LEN     25
+
+#endif
+
+#define  P_BRND_LEN    10
+
+#ifdef SSBM
+#define  P_TYPE_LEN    12
+
+#else
+
+#define  P_TYPE_LEN    25
+
+#endif
+
+#define  P_CNTR_LEN    10
+#define  P_CMNT_LEN    14
+#define  P_CMNT_MAX    23
+#define  P_CAT_SEED    25
+
+#define  S_NAME_LEN    25
+
+#ifdef SSBM
+#define  S_ADDR_LEN    15
+#define  S_ADDR_MAX    25
+#else
+
+#define  S_ADDR_LEN    25
+#define  S_ADDR_MAX    40
+#endif
+
+#define  S_CMNT_LEN    63
+#define  S_CMNT_MAX   101
+#define  PS_CMNT_LEN  124
+#define  PS_CMNT_MAX  199
+
+#ifdef SSBM
+#define  C_NAME_LEN    25
+#define  C_MSEG_MIN    1
+#define  C_MSEG_MAX    5
+#define  C_ADDR_LEN    15
+#define  C_ADDR_MAX    25
+#else
+#define  C_NAME_LEN    18
+#define  C_ADDR_LEN    25
+#define  C_ADDR_MAX    40
+#endif
+
+#define  C_MSEG_LEN    10
+#define  C_CMNT_LEN    73
+#define  C_CMNT_MAX    117
+
+#ifdef SSBM
+#define  O_OPRIO_LEN   8
+
+#else
+#define  O_OPRIO_LEN   15
+
+#endif
+
+#define  O_CLRK_LEN    15
+#define  O_CMNT_LEN    49
+#define  O_CMNT_MAX    79
+#define  L_CMNT_LEN    27
+#define  L_CMNT_MAX    44
+#define  L_INST_LEN    25
+#define  L_SMODE_LEN   10
+#define  T_ALPHA_LEN   10
+#define  DATE_LEN      13  /* long enough to hold either date format */
+#define  NATION_LEN    25
+#define  REGION_LEN    25
+#define  PHONE_LEN     15
+
+#ifdef SSBM
+#define  MAXAGG_LEN    10    /* max component length for an agg str (SSBM build) */
+
+#else
+#define  MAXAGG_LEN    20    /* max component length for an agg str (non-SSBM build) */
+
+#endif
+
+#define  P_CMNT_SD      6
+#define  PS_CMNT_SD     9
+#define  O_CMNT_SD     12
+#define  C_ADDR_SD     26
+#define  C_CMNT_SD     31
+#define  S_ADDR_SD     32
+#define  S_CMNT_SD     36
+#define  L_CMNT_SD     25
+
+
+
+
+
+
diff --git a/data/ssb/dbgen/speed_seed.c b/data/ssb/dbgen/speed_seed.c
new file mode 100644
index 0000000..402b7de
--- /dev/null
+++ b/data/ssb/dbgen/speed_seed.c
@@ -0,0 +1,325 @@
+/* @(#)speed_seed.c	2.1.8.2 */
+#include <stdio.h>
+#include <stdlib.h>
+#include "dss.h"
+
+/*  _tal long RandSeed = "Random^SeedFromTimestamp" (void); */
+
+#define FAKE_V_STR(avg, sd, cnt) /* avg is unused; skips boundary * cnt draws on stream sd */ \
+	ADVANCE_STREAM((sd), \
+		(long)(Seed[(sd)].boundary*(cnt)))
+#define ADVANCE_STREAM(stream_id, num_calls) /* skips num_calls draws on one stream */ \
+        NthElement((num_calls), &Seed[(stream_id)].value)
+
+#define MAX_COLOR 92 /* same figure as the literal 92 that sd_part() multiplies skip_count by */
+long name_bits[MAX_COLOR / BITS_PER_LONG]; /* not referenced again in this file; NOTE(review): the division truncates, so as a bitmap this would hold fewer than MAX_COLOR bits unless BITS_PER_LONG divides 92 -- confirm intent */
+extern seed_t Seed[]; /* per-stream state (.value, .usage, .boundary are used below); no definition in this file */
+
+/* WARNING!  This routine assumes the existence of 64-bit                 */
+/* integers.  The notation used here- "HUGE" is *not* ANSI standard. */
+/* Hopefully, you have this extension as well.  If not, use whatever      */
+/* nonstandard trick you need to in order to get 64 bit integers.         */
+/* The book says that this will work if MAXINT for the type you choose    */
+/* is at least 2**46  - 1, so 64 bits is more than you *really* need      */
+
+static DSS_HUGE Multiplier = 16807;      /* or whatever nonstandard */
+static DSS_HUGE Modulus =  2147483647;   /* trick you use to get 64 bit int */
+
+/* Advances value of Seed after N applications of the random number generator
+   with multiplier Mult and given Modulus.
+   NthElement(Seed[],count);
+
+   Theory:  We are using a generator of the form
+        X_n = [Mult * X_(n-1)]  mod Modulus.    It turns out that
+        X_n = [(Mult ** n) X_0] mod Modulus.
+   This can be computed using a divide-and-conquer technique, see
+   the code below.
+
+   In words, this means that if you want the value of the Seed after n
+   applications of the generator,  you multiply the initial value of the
+   Seed by the "super multiplier" which is the basic multiplier raised
+   to the nth power, and then take mod Modulus.
+*/
+
+/* Nth Element of sequence starting with StartSeed */
+/* Warning, needs 64-bit integers */
+#ifdef SUPPORT_64BITS
+void NthElement (long N, long *StartSeed)
+   {
+   DSS_HUGE Z;              /* running seed value */
+   DSS_HUGE Mult;           /* Multiplier^(2^k) mod Modulus after k rounds */
+   static int ln=-1;        /* calls counted while verbose is on */
+   int i;
+   /* Progress spinner: one report per 1000 calls made while verbose. */
+   if ((verbose > 0) && ++ln % 1000 == 0)
+       {
+       i = (ln / 1000) % LN_CNT;   /* ln is a multiple of 1000 here, so index by report number (as the 32-bit NthElement does) */
+       fprintf(stderr, "%c\b", lnoise[i]);
+       }
+   Mult = Multiplier;
+   Z = (DSS_HUGE) *StartSeed;
+   while (N > 0 )       /* binary exponentiation, lowest bit of N first */
+      {
+      if (N % 2 != 0)    /* testing for oddness, this seems portable */
+         Z = (Mult * Z) % Modulus;
+      N = N / 2;         /* integer division, truncates */
+      Mult = (Mult * Mult) % Modulus;
+      }
+   *StartSeed = (long)Z;
+
+   return;
+   }
+#else
+/* add 32 bit version of NthElement HERE */
+/*
+ *    MODMULT.C
+ *    R. M. Shelton -- Unisys
+ *    July 26, 1995
+ *
+ *    RND_seed:  Computes the nth seed in the total sequence
+ *    RND_shift:  Shifts a random number by a given number of seeds
+ *    RND_ModMult:  Multiplies two numbers mod (2^31 - 1)
+ *
+ */
+
+
+
+#include <math.h>
+#include <stdio.h>       /* required only for F_FatalError */
+
+typedef signed long RND;
+typedef unsigned long URND;
+/* NOTE: despite the name, F_FatalError only prints "Bang!" to stderr and returns; x, y and z are ignored. */
+#define FatalError(e)  F_FatalError( (e), __FILE__, __LINE__ )
+void F_FatalError( int x, char *y, int z ) {fprintf(stderr, "Bang!\n");}
+
+
+/* Prototypes */
+RND RND_seed( RND );
+RND RND_shift( RND, RND );   /* declared only: no definition in this file; NthElement() carries the RND_shift banner */
+static RND RND_ModMult( RND, RND );
+
+
+
+RND 
+RND_seed ( RND Order            )
+{
+   /* Returns Multiplier raised to the power Order, reduced through
+      RND_ModMult.  A negative Order is first replaced by
+      Modulus - 1 + Order. */
+
+   /* 2^30: highest bit that can be set in a value below Modulus. */
+   static const RND TopMask = 0x40000000;
+   RND bit;
+   RND acc = 1L;
+
+   /* Range check; FatalError only reports, so execution continues. */
+   if (!(Order > -Modulus && Order < Modulus))
+      FatalError(1023);
+
+   if (Order < 0)
+      Order = Modulus - 1L + Order;
+
+   /* Find the most significant set bit of Order. */
+   for (bit = TopMask; bit > Order; bit >>= 1)
+      ;
+
+   /* Left-to-right binary exponentiation: square for every bit,
+      then multiply by Multiplier when that bit of Order is set. */
+   for (; bit > 0; bit >>= 1)
+      {
+      acc = RND_ModMult( acc, acc );
+      if (bit & Order)
+         acc = RND_ModMult( acc, Multiplier );
+      }
+
+   return (acc);
+
+}  /*  RND_seed  */
+
+
+
+/***********************************************************************
+
+    RND_shift:  Shifts a random number by a given number of seeds
+
+***********************************************************************/
+
+void 
+NthElement ( long Shift, long *Seed)
+
+{
+   static int ln=-1;   /* calls counted while verbose output is on */
+   RND Power;
+
+   /* Progress spinner: one glyph from lnoise on every 100th call. */
+   if (verbose > 0)
+      {
+      ++ln;
+      if (ln % 100 == 0)
+         fprintf(stderr, "%c\b", lnoise[(ln/100) % LN_CNT]);
+      }
+
+   /* Range checks; FatalError only prints, so execution continues. */
+   if (!(*Seed > 0 && *Seed < Modulus))
+      FatalError(1023);
+   if (!(Shift > -Modulus && Shift < Modulus))
+      FatalError(1023);
+
+   /* Jump ahead: combine the seed with RND_seed(Shift) via RND_ModMult. */
+   Power = RND_seed( Shift );
+   *Seed = RND_ModMult( *Seed, Power );
+   return;
+}  /*  RND_shift  */
+
+
+
+/*********************************************************************
+
+    RND_ModMult:  Multiplies two numbers mod (2^31 - 1)
+
+*********************************************************************/
+
+static RND 
+RND_ModMult ( RND nA, RND nB)
+
+{
+/* Multiplies nA by nB using the identity 2^31 == 1 (mod 2^31 - 1); partial products are held in doubles.  Assumes both inputs are non-negative and below 2^31 -- TODO confirm, callers' range checks do not abort. */
+static const double dTwoPowPlus31 = 2147483648.;
+static const double dTwoPowMinus31 = 1./2147483648.;
+static const double dTwoPowPlus15 = 32768.;
+static const double dTwoPowMinus15 = 1./32768.;
+static const RND    nLowMask = 0xFFFFL;
+static const URND   ulBit31 = 1uL << 31;
+
+double dAH, dAL, dX, dY, dZ, dW;
+RND    nH, nL;
+URND   ulP, ulQ, ulResult;
+/* Split nB into its low 16 bits (nL) and the remaining high part (nH), then form nA*nH and nA*nL. */
+nL = nB & nLowMask;
+nH = (nB - nL) >> 16;
+dAH = (double)nA * (double)nH;
+dAL = (double)nA * (double)nL;
+dX = floor( dAH * dTwoPowMinus15 );   /* dAH = dX * 2^15 + dY */
+dY = dAH - dX*dTwoPowPlus15;
+dZ = floor( dAL * dTwoPowMinus31 );   /* dAL = dZ * 2^31 + dW */
+dW = dAL - dZ*dTwoPowPlus31;
+/* nA*nB = dAH*2^16 + dAL = (dX + dZ)*2^31 + (dY*2^16 + dW); ulP and ulQ hold those two groups. */
+ulQ = (URND)dW + ((URND)dY << 16);
+ulP = (URND)dX + (URND)dZ;
+if (ulQ & ulBit31) { ulQ -= ulBit31; ulP++; }
+/* A set bit 31 is worth 1 modulo 2^31 - 1, so it is cleared and added back as 1 (here and on the line above). */
+ulResult = ulP + ulQ;
+if (ulResult & ulBit31) { ulResult -= ulBit31; ulResult++; }
+
+return (RND)ulResult;
+}
+#endif /* SUPPORT_64BITS */
+
+/* updates Seed[column] using the a_rnd algorithm */
+void
+fake_a_rnd(int min, int max, int column)
+{
+   long len;       /* set by RANDOM; presumably a length in [min, max] -- confirm */
+   long itcount;   /* len / 5, plus one when len is not a multiple of 5 */
+
+   RANDOM(len, (long)min, (long)max, (long)column);
+   itcount = len/5 + ((len % 5L == 0) ? 0L : 1L);
+   /* NOTE(review): this advances .usage, whereas ADVANCE_STREAM advances .value -- confirm */
+   NthElement(itcount, &Seed[column].usage);
+}
+
+
+long 
+sd_part(int child, long skip_count)
+{
+   int stream;
+
+   /* Every stream from P_MFG_SD through P_CNTR_SD moves skip_count draws. */
+   for (stream = P_MFG_SD; stream <= P_CNTR_SD; ++stream)
+      ADVANCE_STREAM(stream, skip_count);
+   /* FAKE_V_STR moves the comment stream by boundary * skip_count draws. */
+   FAKE_V_STR(P_CMNT_LEN, P_CMNT_SD, skip_count);
+   ADVANCE_STREAM(P_NAME_SD, skip_count * 92);
+   return(0L);
+}
+
+long 
+sd_line(int child, long skip_count)
+{
+   int pass;
+   int stream;
+
+   /* O_LCNT_MAX passes over every stream from L_QTY_SD to L_RFLG_SD. */
+   for (pass = 0; pass < O_LCNT_MAX; ++pass)
+      for (stream = L_QTY_SD; stream <= L_RFLG_SD; ++stream)
+         ADVANCE_STREAM(stream, skip_count);
+
+   FAKE_V_STR(L_CMNT_LEN, L_CMNT_SD, skip_count);
+   /* child 1 is the special case that links master and detail: only
+      it goes on to advance the two order streams below */
+   if (child != 1)
+      return(0L);
+
+   ADVANCE_STREAM(O_ODATE_SD, skip_count);
+   ADVANCE_STREAM(O_LCNT_SD, skip_count);
+   return(0L);
+}
+
+long 
+sd_order(int child, long skip_count)        
+{
+   /* streams advanced by skip_count after the comment stream, in order */
+   const int tail[4] = { O_SUPP_SD, O_CLRK_SD, O_PRIO_SD, O_ODATE_SD };
+   int k;
+
+   ADVANCE_STREAM(O_LCNT_SD, skip_count);
+   ADVANCE_STREAM(O_CKEY_SD, skip_count);
+   FAKE_V_STR(O_CMNT_LEN, O_CMNT_SD, skip_count);
+   for (k = 0; k < 4; k++) ADVANCE_STREAM(tail[k], skip_count);
+   return (0L);
+}
+
+long
+sd_psupp(int child, long skip_count)
+{
+   int remaining = SUPP_PER_PART;
+
+   /* SUPP_PER_PART rounds; PS_QTY_SD and PS_SCST_SD each move skip_count draws per round. */
+   while (remaining-- > 0)
+      {
+      ADVANCE_STREAM(PS_QTY_SD, skip_count);
+      ADVANCE_STREAM(PS_SCST_SD, skip_count);
+      }
+   FAKE_V_STR(PS_CMNT_LEN, PS_CMNT_SD, skip_count);
+   return(0L);
+}
+
+long 
+sd_cust(int child, long skip_count)
+{
+   const long phone_draws = 3L * skip_count;   /* C_PHNE_SD moves three draws per skipped row */
+   FAKE_V_STR(C_ADDR_LEN, C_ADDR_SD, skip_count);
+   FAKE_V_STR(C_CMNT_LEN, C_CMNT_SD, skip_count);
+   ADVANCE_STREAM(C_NTRG_SD, skip_count);
+   ADVANCE_STREAM(C_PHNE_SD, phone_draws);
+   ADVANCE_STREAM(C_ABAL_SD, skip_count);
+   ADVANCE_STREAM(C_MSEG_SD, skip_count);
+   return(0L);
+}
+
+long
+sd_supp(int child, long skip_count)
+{
+   /* the four BBB streams, advanced last and in this order */
+   const int bbb[4] = { BBB_CMNT_SD, BBB_JNK_SD, BBB_OFFSET_SD, BBB_TYPE_SD };
+   int k;
+   ADVANCE_STREAM(S_NTRG_SD, skip_count);
+   ADVANCE_STREAM(S_PHNE_SD, 3L * skip_count);
+   ADVANCE_STREAM(S_ABAL_SD, skip_count);
+   FAKE_V_STR(S_ADDR_LEN, S_ADDR_SD, skip_count);
+   FAKE_V_STR(S_CMNT_LEN, S_CMNT_SD, skip_count);
+   for (k = 0; k < 4; k++)
+      ADVANCE_STREAM(bbb[k], skip_count);
+   return(0L);
+}
diff --git a/data/ssb/dbgen/speed_seed.o b/data/ssb/dbgen/speed_seed.o
new file mode 100644
index 0000000000000000000000000000000000000000..41df59e924c934a4e0a10e572b2d3a5103f1d3d8
GIT binary patch
literal 7776
zcmbtYe{j^r72mrAk^;RuR20<0bxN3ka3mOtq1xm^z-1<6C=hFh;1Mo&A(L>)+<k-m
z5~4?Fk8rJ87;T5CTItwM9qB(+Ckp0Qky6_x2t%zRC_g3|s5Ynsgk0a-@7u5}x7JSI
z%=f$dKKps^?Y{5Z?|t{S%r9HubU2t42m3y|rb(1B@9o$6#iCox#xf6czj7e#{^alg
zcTYRUJBB=<{27(M4Y+@F2$XhI+Ck|IyK_H)HV2KN|K5PxGlTj^!tP~9L7#2wzJaxO
zgW2?M>A45^{~U04zBlq3Y1#V&?#j2r?uv6b7<NB!9NInzAe|}>bI(A4vtWB41-N1d
z_!9&;jT1ZxwnreqQ4Bz&Wyc`EhY;X%aDEmEaGjDXfU&p2?wQ|2hp`yWwwHH&2+Pks
z1QY=Sr2{A<1f>Qk?c4!rnE5S%)FRRa>NufBP(8#Q2QPMl-fWHLkBSbDw>!9_hbJG0
z$I|>e9OLf!on7GFjv_+4+b$P<v@w-R0W1RA;oNP5Mg7|W6Cp)FF0h{fCgR#r-EEhO
z`r{Ye6EE}lxoqAlJhzR6VEeHccv9q~54w!UyR*BHS+P`b6IXfisi9PA%d1x)JTL5A
zduv-r*ode3Mk<3b+c4RJG~RNayLlgf^eWJcwoG^pxW&BpOI~<<(;#nocXRO@7_$F%
zG0;E26GMGZ0hcG<L>3Sx^OoaQAL3hVeE$o6h!M4w{&$2uPYmKrm#@&~nz+E@Vliy<
zU9BQh$;FO@NCgje^4j)EE&V^>9fLWE-i}L?rXH+ouP*GgYI$o0kM}wFeGp^|tni8S
zE8L|e;^%2;iBIf=Bo6S_@Ab7o-gpAsi`ia3iKz#F(ed#_M^F4@#?&7EG*60&y7!N9
zQrCVwtzt#5LNGhkA1#91;xk&Bc$)_g^WYaen4*{+ysai|3%~d=@4d_u2VuQMd}=38
zW<JRaJDTPw6c_|n;u!Bu@s3Nt`a2K4%R5eu;7O4^p0vh$gzM#9H<*EQVRMKlH#qta
z;Eepspl%YM^Q5TAvgGs6Vu}*)bD5NQm$$6&RMR;CxO=Zy7*C2)vKLTywNll^qL;7m
zSBmHCf52aX5&VfE2;$$D1<uQoIdF#>77^2h^2g6Rnobx3`r|_m%QYOvzrz1e7foPz
z&i4GI+J9K2o<ph0ffM%JH8pV}?Fu#4|DxK{v{NAxVQ<k3`!Yo$W?&)~B}3k_#J_mL
zo>W=ltn6YM&cKu?Q73N5vt6V9O256U;6YyCiHm6$@io_W?{YQBSLaJvoQSL^=Bs@f
zCS-th-=Y9zp1#&JA=9u4QzccAns1L{vBprSIuL`KFUWF&*=*Cy8I|}nNQ&$>Z=T(L
zN;>=)3prU#EsdXq7oIH`17wTT+5Pk8&GqJ1tZIr_P2T)`Ux6=gMt+k}^Ec0$<;yFW
zO0Y1&!tlU_oB96o3g%?29Wi)o3%P|_$A%@$vCiYTV@y^y{t}_hoiz3qzOX@%n<sjV
zO~aPEJXNl2HW|BwzliO8muH*P?;0P^D0O+A&$!0>T^_$HI~Rr;i8pv7@3~Qu=pQX&
z?<Ve(#Em<h^`l(bFv=MEdY#4}!ghhnvjbyp%kaCr@l59u;j9Fv$L@vUKH|0#_ovPh
z*Z5>ciOahq)9=dNHp1`9i;r~vaFi<->|w0L<pE<6YdqNGQVyEY9@YnQ*?_(aKrN-Q
zAdbO2lIVK}&SM+;j*lpH<vGiRW0;`r7=3OcyA5b(`?Sy}jy+9dFI*dY4#!M_^i1~e
zRJ!quv|oH!1F{OPS-GI=RQU4Q<L9A6;bRS<OoGe7@JqrLiKCXG;55vphz`EDua~=o
zdgB7^ddp(^h}f_!OVp|AX|7PRL=IKl-f&sWtAfxk6?&GaCv|8c;j-T4Sh?tkpGFCZ
zb}8Zd${r+K?lWnGHG30ePUctaC&239@<@`>Mh*CSz|qeaR4-!x0rnW-zlIHuGH5p}
z32i$6yA1gAfTRCkQmKf&2-r&o{>ug&zxL42bSe<BX94RXd^_P{e**Rw!pBjnn?Xl8
z0vGz(MR-u4j2$O@kZ`f506Rl?luA_WC&13bH5PV^l;NCQ_a<LbUK&``P`#+B-eUC;
zUmn~VYGWbhE1Q>xw^?}0$J^a_E5O@qynPpMbMST#-U{)SpT`yi7F1bP_4A|AhA4|!
z)uCvVg&&UAS|Ns^@b`n2TdU{Sht`B57F!#Nu4;gAfk3!1S{t##f#B*o7Otub1*!tk
zNHwd7)P@%VaVZ+cssoKxQ48Dp+DHi7hA4y(1F@#YM(hNeVuB&`NY$E9U{$RZV{8^B
z0?P&W8h-4f&<_;;Zm@4CxV;bXUBTEW1qW9`5FZVf^n+i@D2R`NOXANU2%`JnYr{AA
z$LEit`~S>_1&*g1=^si(2=ejWCh^-8+^yi>QE-(%+kne=zwZAL1AnIh|E&Rk-GCoc
zaD2DOb@eGY*0jXW6OI)>PQmZQiGyG>7B0!3LOAkoR`5S6eEgmI(k_$Keg(%@h4k~L
z0UuIuHEuS&XY{x^grk|tpK0LFQSe)p`7|ni@LetExlzGyQt%%uxSHoD6#o+x{_hq1
z4h6rW_`&Zg8FwVTg0Q(w!RHc=>qS1Ptjxe)sqy7LsnPIm@P$%u@bj?7-$VK)1OGP~
ze=@D}MGfz!`g_ITr&r?_ke`2Oxb%PA;Ac?d@1eNj0~pqg!)l(h==VbAxter0wL0Hx
z;O83n_TNMH{u0lTcykQ=63xGSUo6z{t+cLkgP#hGFW*xu41Brn;XIK4dILYE`R}H@
zZ#3{9)A;haY&Y;<()g#bXhHTF_y-O6QO!>Qt?Pt_PbU1d!A}-_Kgj$nqxy7dcq!pG
z6Ry|KM2-I$@pCl%G~v?>e)2VbHSr4#{8Ej-mG}z{{KXo75Ah!~@B<pZoA@;b{v#TH
zGSyFuf&aM1FD3pi1An*1uO|K;1Ani^zlHcOYj`H%uNnLt(D*xv|5pv)O88-epLaBV
zH`V0_27aH$-%9*X4g8B5U)Fufz#mBm8(IH*$iItlT#ou)_2Om)K|Wqm3aBG-dHyNW
zaCx4wG+dr%p4D)9K55r*dEPju;qtsOo%X$)k34U14VUMQW(}9;jolh9&l}7aTffGt
zS_Q2YwcFLQ7Y)@{`K-`7i}|YISipR%Vln26HlTv^d><SwY*n_tV9bIO1#v*hhm%II
zVU0L0_#zEf$Or%O%!pZ4!8&H^HIXJ?W3-_$6t&iaCB_V`4uqrd?N2%Re;O|s<I<^Y
zhFuj?Zvxea9H2BwmEkuM)=u#-Vs~=!;~Qi#9xgyoMO*nl0DgO-Oo0n=dfgx-q*AF%
ziD66IZWx*lJ@l#DuO$27b0JyidOqM7U!I?c#-zQRr^My@Wm^mb*jM)ma@QH=FYS<n
z!r?*;zgbmjCi{C-5c)EIvaJ9toqhX%S-;FfknHocI6Y*K`Bg>EU$*ryDElh!3fZd|
z_I2(iz>uovPcGC8W2(q?OKc}}(5G(iCE_2nfV%z9z&2g|uA>K^P4-y7W8jkZvM$oa
z1{&+Z!#2oDDoi}1;yFyWN19%{0I@tg1cO|(o8h~*9C4OPXWxv68jxSv&G22f{|}G7
B4GsVR

literal 0
HcmV?d00001

diff --git a/data/ssb/dbgen/tags b/data/ssb/dbgen/tags
new file mode 100644
index 0000000..8a9376c
--- /dev/null
+++ b/data/ssb/dbgen/tags
@@ -0,0 +1,1078 @@
+!_TAG_FILE_FORMAT	2	/extended format; --format=1 will not append ;" to lines/
+!_TAG_FILE_SORTED	1	/0=unsorted, 1=sorted, 2=foldcase/
+!_TAG_PROGRAM_AUTHOR	Darren Hiebert	/dhiebert@users.sourceforge.net/
+!_TAG_PROGRAM_NAME	Exuberant Ctags	//
+!_TAG_PROGRAM_URL	http://ctags.sourceforge.net	/official site/
+!_TAG_PROGRAM_VERSION	5.8	//
+101	history.html	/^<li><A NAME="101">Changes as of 06\/04\/99<\/A><ul>$/;"	a
+131old	history.html	/^<\/ul><li><A NAME="131old">Changes as of 2\/6\/98<\/A><UL>$/;"	a
+19991011	history.html	/^<li><A NAME="19991011">Changes as of 10\/11\/99<\/A><ul>$/;"	a
+20000511	history.html	/^<li><A NAME="20000511">Changes as of 5\/11\/00<\/A><ul>$/;"	a
+2001old	history.html	/^<li><A NAME="2001old">Changes as of 3\/11\/98<\/A><UL>$/;"	a
+2002	history.html	/^<li><a name="2002">Changes as of 3\/20\/98<\/a><ul>$/;"	a
+2003	history.html	/^<li><a name="2003">Changes as of 3\/24\/98<\/a><ul>$/;"	a
+2004	history.html	/^<li><a name="2004">Changes as of 7 April 98<\/a><ul>$/;"	a
+2006	history.html	/^<li><a name="2006">Changes as of 5\/20\/98<\/a><ul>$/;"	a
+2007old	history.html	/^<li><A NAME="2007old">Changes as of 10\/23\/98<\/A><ul>$/;"	a
+2008old	history.html	/^<li><A NAME="2008old">Changes as of 11\/17\/98<\/A><ul>$/;"	a
+200old	history.html	/^<li><A NAME="200old">Changes as of 12\/08\/98<\/A><ul>$/;"	a
+201old	history.html	/^<li><A NAME="201old">Changes as of 01\/05\/99<ul>$/;"	a
+990708	history.html	/^<li><a name="990708">Changes as of 07\/08\/99<\/a><ul>$/;"	a
+990816	history.html	/^<li><A NAME="990816">Changes as of 08\/16\/99<\/A><ul>$/;"	a
+990830	history.html	/^<li><A NAME="990830">Changes as of 08\/30\/99<\/A><ul>$/;"	a
+ADD_AT_END	dss.h	71;"	d
+ADHOC_DFLT	dss.h	487;"	d
+ADHOC_TAG	dss.h	486;"	d
+ADVANCE_STREAM	speed_seed.c	11;"	d	file:
+ALLSRC	makefile	/^ALLSRC=$(DBGENSRC) $/;"	m
+ANS	makefile	/^ANS   = 1.ans 2.ans 3.ans 4.ans 5.ans 6.ans 7.ans 8.ans 9.ans 10.ans 11.ans \\$/;"	m
+ANSI	tpcd.h	18;"	d
+BBB_BASE	dss.h	340;"	d
+BBB_BASE_LEN	dss.h	344;"	d
+BBB_CMNT_LEN	dss.h	343;"	d
+BBB_CMNT_SD	dss.h	590;"	d
+BBB_COMMEND	dss.h	342;"	d
+BBB_COMPLAIN	dss.h	341;"	d
+BBB_DEADBEATS	dss.h	339;"	d
+BBB_JNK_SD	dss.h	588;"	d
+BBB_OFFSET_SD	dss.h	591;"	d
+BBB_TYPE_LEN	dss.h	345;"	d
+BBB_TYPE_SD	dss.h	589;"	d
+BITS_PER_LONG	dss.h	112;"	d
+CC	makefile	/^CC      = gcc$/;"	m
+CFLAGS	makefile	/^CFLAGS	= -O -DDBNAME=\\"dss\\" -D$(MACHINE) -D$(DATABASE) -D$(WORKLOAD)$/;"	m
+CITY_CODE_SEED	shared.h	38;"	d
+CITY_FIX	shared.h	42;"	d
+COMMENT	tpcd.h	20;"	d
+CONFIG_DFLT	dss.h	485;"	d
+CONFIG_TAG	dss.h	484;"	d
+CURRENTDATE	dss.h	426;"	d
+CUST	dss.h	61;"	d
+CUST_MORTALITY	dss.h	422;"	d
+C_ABAL_MAX	dss.h	365;"	d
+C_ABAL_MIN	dss.h	364;"	d
+C_ABAL_SD	dss.h	576;"	d
+C_ADDR_LEN	shared.h	86;"	d
+C_ADDR_LEN	shared.h	90;"	d
+C_ADDR_MAX	shared.h	87;"	d
+C_ADDR_MAX	shared.h	91;"	d
+C_ADDR_SD	shared.h	130;"	d
+C_CMNT_LEN	shared.h	95;"	d
+C_CMNT_MAX	shared.h	96;"	d
+C_CMNT_SD	shared.h	131;"	d
+C_DATES	dss.h	46;"	d
+C_MSEG_LEN	shared.h	94;"	d
+C_MSEG_MAX	dss.h	363;"	d
+C_MSEG_MAX	shared.h	85;"	d
+C_MSEG_MIN	shared.h	84;"	d
+C_MSEG_SD	dss.h	577;"	d
+C_NAME_FMT	dss.h	362;"	d
+C_NAME_LEN	shared.h	83;"	d
+C_NAME_LEN	shared.h	89;"	d
+C_NAME_TAG	dss.h	361;"	d
+C_NATION_NAME_LEN	shared.h	24;"	d
+C_NAT_SD	shared.h	26;"	d
+C_NTRG_SD	dss.h	574;"	d
+C_PHNE_SD	dss.h	575;"	d
+C_REGION_NAME_LEN	shared.h	25;"	d
+C_REG_SD	shared.h	27;"	d
+C_SIZE	dss.h	360;"	d
+DATABASE	makefile	/^DATABASE=DB2 $/;"	m
+DATE	dss.h	20;"	d
+DATE_LEN	shared.h	114;"	d
+DBASE	tpcd.h	12;"	d
+DBGENSRC	makefile	/^DBGENSRC=$(SRC1) $(HDR1) $(OTHER) $(DOC) $(SRC2) $(HDR2) $(SRC3)$/;"	m
+DBNAME	config.h	164;"	d
+DDL	makefile	/^DDL  = dss.ddl dss.ri$/;"	m
+DECLARER	driver.c	4;"	d	file:
+DECLARER	permute.c	8;"	d	file:
+DECLARER	qgen.c	6;"	d	file:
+DECLARER	text.c	10;"	d	file:
+DFLT	tpcd.h	9;"	d
+DFLT_NUM	tpcd.h	23;"	d
+DIGITS_PER_LONG	bcd2.c	18;"	d	file:
+DIST_DFLT	dss.h	481;"	d
+DIST_MEMBER	dss.h	157;"	d
+DIST_SIZE	dss.h	156;"	d
+DIST_TAG	dss.h	480;"	d
+DOC	makefile	/^DOC=README HISTORY PORTING.NOTES BUGS$/;"	m
+DOUBLE_CAST	config.h	177;"	d
+DOUBLE_CAST	config.h	71;"	d
+DSS_H	dss.h	8;"	d
+DSS_HUGE	config.h	106;"	d
+DSS_HUGE	config.h	139;"	d
+DSS_HUGE	config.h	172;"	d
+DSS_HUGE	config.h	64;"	d
+DSS_PROC	config.h	160;"	d
+DSS_PROC	config.h	52;"	d
+DT_CHR	dss.h	504;"	d
+DT_HUGE	dss.h	501;"	d
+DT_INT	dss.h	500;"	d
+DT_KEY	dss.h	502;"	d
+DT_MONEY	dss.h	503;"	d
+DT_STR	dss.h	494;"	d
+DT_VSTR	dss.h	496;"	d
+DT_VSTR	dss.h	498;"	d
+D_DATE_LEN	shared.h	31;"	d
+D_DAYWEEK_LEN	shared.h	32;"	d
+D_MONTH_LEN	shared.h	35;"	d
+D_SEASON_LEN	shared.h	34;"	d
+D_STARTDATE	shared.h	36;"	d
+D_YEARMONTH_LEN	shared.h	33;"	d
+ENDDATE	dss.h	427;"	d
+END_TRAN	tpcd.h	39;"	d
+END_TRAN	tpcd.h	48;"	d
+END_TRAN	tpcd.h	57;"	d
+END_TRAN	tpcd.h	66;"	d
+END_TRAN	tpcd.h	75;"	d
+EOL_HANDLING	config.h	111;"	d
+EXE	makefile	/^EXE     =$/;"	m
+EXPLAIN	tpcd.h	11;"	d
+EXTERN	dss.h	223;"	d
+EXTERN	dss.h	225;"	d
+Exponential	rnd.c	/^Exponential(double dMean, long nStream)$/;"	f
+FAKE_V_STR	speed_seed.c	8;"	d	file:
+FREE_HUGE	dss.h	447;"	d
+F_FatalError	speed_seed.c	/^void F_FatalError( int x, char *y, int z ) {fprintf(stderr, "Bang!\\n");}$/;"	f
+FatalError	speed_seed.c	93;"	d	file:
+GEN_QUERY_PLAN	tpcd.h	37;"	d
+GEN_QUERY_PLAN	tpcd.h	46;"	d
+GEN_QUERY_PLAN	tpcd.h	55;"	d
+GEN_QUERY_PLAN	tpcd.h	64;"	d
+GEN_QUERY_PLAN	tpcd.h	73;"	d
+GET_DIGIT	bcd2.c	20;"	d	file:
+HDR	makefile	/^HDR  = $(HDR1) $(HDR2)$/;"	m
+HDR1	makefile	/^HDR1 = dss.h rnd.h config.h dsstypes.h shared.h bcd2.h$/;"	m
+HDR2	makefile	/^HDR2 = tpcd.h permute.h$/;"	m
+HUGE2LONG	dss.h	450;"	d
+HUGE2LONG	dss.h	460;"	d
+HUGE_ADD	dss.h	454;"	d
+HUGE_ADD	dss.h	466;"	d
+HUGE_CMP	dss.h	457;"	d
+HUGE_CMP	dss.h	475;"	d
+HUGE_COUNT	config.h	107;"	d
+HUGE_COUNT	config.h	140;"	d
+HUGE_COUNT	config.h	173;"	d
+HUGE_COUNT	config.h	65;"	d
+HUGE_DIV	dss.h	453;"	d
+HUGE_DIV	dss.h	465;"	d
+HUGE_FORMAT	config.h	141;"	d
+HUGE_FORMAT	config.h	66;"	d
+HUGE_MOD	dss.h	456;"	d
+HUGE_MOD	dss.h	474;"	d
+HUGE_MUL	dss.h	452;"	d
+HUGE_MUL	dss.h	464;"	d
+HUGE_SET	dss.h	451;"	d
+HUGE_SET	dss.h	463;"	d
+HUGE_SUB	dss.h	455;"	d
+HUGE_SUB	dss.h	470;"	d
+HVAR_SD	dss.h	583;"	d
+INIT	tpcd.h	21;"	d
+INIT_HUGE	dss.h	443;"	d
+INTERNAL_ERROR	dss.h	82;"	d
+ITERATIONS	permute.c	67;"	d	file:
+JDAY	build.c	34;"	d	file:
+JDAY_BASE	build.c	32;"	d	file:
+JMNTH_BASE	build.c	33;"	d	file:
+JUNK	makefile	/^JUNK  = $/;"	m
+KILL	config.h	116;"	d
+KILL	config.h	125;"	d
+KILL	config.h	154;"	d
+LDFLAGS	makefile	/^LDFLAGS = -O$/;"	m
+LEAP	dss.h	411;"	d
+LEAP_ADJ	build.c	30;"	d	file:
+LIBS	makefile	/^LIBS    = -lm$/;"	m
+LIFENOISE	dss.h	85;"	d
+LINE	dss.h	63;"	d
+LINE_SIZE	qgen.c	25;"	d	file:
+LN_CNT	dss.h	83;"	d
+LOG	tpcd.h	15;"	d
+LONG2HUGE	dss.h	449;"	d
+LONG2HUGE	dss.h	459;"	d
+L_CDTE_MAX	dss.h	403;"	d
+L_CDTE_MIN	dss.h	402;"	d
+L_CDTE_SD	dss.h	571;"	d
+L_CMNT_LEN	shared.h	109;"	d
+L_CMNT_MAX	shared.h	110;"	d
+L_CMNT_SD	shared.h	134;"	d
+L_DCNT_MAX	dss.h	390;"	d
+L_DCNT_MIN	dss.h	389;"	d
+L_DCNT_SD	dss.h	564;"	d
+L_INST_LEN	shared.h	111;"	d
+L_PKEY_MAX	dss.h	395;"	d
+L_PKEY_MAX	dss.h	397;"	d
+L_PKEY_MIN	dss.h	391;"	d
+L_PKEY_SD	dss.h	568;"	d
+L_QTY_MAX	dss.h	386;"	d
+L_QTY_MIN	dss.h	385;"	d
+L_QTY_SD	dss.h	563;"	d
+L_RDTE_MAX	dss.h	405;"	d
+L_RDTE_MIN	dss.h	404;"	d
+L_RDTE_SD	dss.h	572;"	d
+L_RFLG_SD	dss.h	573;"	d
+L_SDTE_MAX	dss.h	401;"	d
+L_SDTE_MIN	dss.h	400;"	d
+L_SDTE_SD	dss.h	570;"	d
+L_SHIP_SD	dss.h	566;"	d
+L_SIZE	dss.h	384;"	d
+L_SKEY_MAX	dss.h	24;"	d
+L_SKEY_MIN	dss.h	23;"	d
+L_SKEY_SD	dss.h	569;"	d
+L_SMODE_LEN	shared.h	112;"	d
+L_SMODE_SD	dss.h	567;"	d
+L_TAX_MAX	dss.h	388;"	d
+L_TAX_MIN	dss.h	387;"	d
+L_TAX_SD	dss.h	565;"	d
+MACHINE	makefile	/^MACHINE =LINUX $/;"	m
+MALLOC_CHECK	dss.h	88;"	d
+MAX	dss.h	74;"	d
+MAX	dss.h	79;"	d
+MAXAGG_LEN	shared.h	120;"	d
+MAXAGG_LEN	shared.h	123;"	d
+MAX_32B_SCALE	dss.h	442;"	d
+MAX_CHILDREN	dss.h	103;"	d
+MAX_COLOR	speed_seed.c	14;"	d	file:
+MAX_GRAMMAR_LEN	dss.h	216;"	d
+MAX_LONG	dss.h	113;"	d
+MAX_PARAM	varsub.c	20;"	d	file:
+MAX_PERMUTE	tpcd.h	94;"	d
+MAX_PIDS	tpcd.h	84;"	d
+MAX_QUERY	permute.c	66;"	d	file:
+MAX_SCALE	dss.h	438;"	d
+MAX_SENT_LEN	dss.h	217;"	d
+MAX_STREAM	dss.h	430;"	d
+MAX_TABLE	dss.h	69;"	d
+MAX_VARS	tpcd.h	81;"	d
+MIN	dss.h	77;"	d
+MIN	dss.h	80;"	d
+MIN_SCALE	dss.h	437;"	d
+MK_SPARSE	dss.h	117;"	d
+MODIFICATION	dss.h	14;"	d
+MODIFICATION	dss.h	32;"	d
+MODIFICATION	dss.h	39;"	d
+MONEY_SCL	shared.h	9;"	d
+Modulus	speed_seed.c	/^static DSS_HUGE Modulus =  2147483647;   \/* trick you use to get 64 bit int *\/$/;"	v	file:
+Multiplier	speed_seed.c	/^static DSS_HUGE Multiplier = 16807;      \/* or whatever nonstandard *\/$/;"	v	file:
+NAME	dss.h	11;"	d
+NAME	dss.h	29;"	d
+NAME	dss.h	36;"	d
+NAMTION_BRIEF_LEN	shared.h	37;"	d
+NATION	dss.h	66;"	d
+NATIONS_MAX	dss.h	423;"	d
+NATION_LEN	shared.h	115;"	d
+NOATOM	driver.c	33;"	d	file:
+NOATOM	permute.c	34;"	d	file:
+NOATOM	text.c	35;"	d	file:
+NOCOMM	driver.c	45;"	d	file:
+NOCOMM	permute.c	46;"	d	file:
+NOCOMM	text.c	47;"	d	file:
+NOGDICAPMASKS	driver.c	34;"	d	file:
+NOGDICAPMASKS	permute.c	35;"	d	file:
+NOGDICAPMASKS	text.c	36;"	d	file:
+NOKANJI	driver.c	46;"	d	file:
+NOKANJI	permute.c	47;"	d	file:
+NOKANJI	text.c	48;"	d	file:
+NOMCX	driver.c	47;"	d	file:
+NOMCX	permute.c	48;"	d	file:
+NOMCX	text.c	49;"	d	file:
+NOMETAFILE	driver.c	35;"	d	file:
+NOMETAFILE	permute.c	36;"	d	file:
+NOMETAFILE	text.c	37;"	d	file:
+NOMINMAX	driver.c	36;"	d	file:
+NOMINMAX	permute.c	37;"	d	file:
+NOMINMAX	text.c	38;"	d	file:
+NOMSG	driver.c	37;"	d	file:
+NOMSG	permute.c	38;"	d	file:
+NOMSG	text.c	39;"	d	file:
+NONE	dss.h	57;"	d
+NOOPENFILE	driver.c	38;"	d	file:
+NOOPENFILE	permute.c	39;"	d	file:
+NOOPENFILE	text.c	40;"	d	file:
+NORASTEROPS	driver.c	39;"	d	file:
+NORASTEROPS	permute.c	40;"	d	file:
+NORASTEROPS	text.c	41;"	d	file:
+NOSCROLL	driver.c	40;"	d	file:
+NOSCROLL	permute.c	41;"	d	file:
+NOSCROLL	text.c	42;"	d	file:
+NOSOUND	driver.c	41;"	d	file:
+NOSOUND	permute.c	42;"	d	file:
+NOSOUND	text.c	43;"	d	file:
+NOSYSMETRICS	driver.c	42;"	d	file:
+NOSYSMETRICS	permute.c	43;"	d	file:
+NOSYSMETRICS	text.c	44;"	d	file:
+NOTEXTMETRIC	driver.c	43;"	d	file:
+NOTEXTMETRIC	permute.c	44;"	d	file:
+NOTEXTMETRIC	text.c	45;"	d	file:
+NOWH	driver.c	44;"	d	file:
+NOWH	permute.c	45;"	d	file:
+NOWH	text.c	46;"	d	file:
+NO_FUNC	driver.c	5;"	d	file:
+NO_LFUNC	driver.c	6;"	d	file:
+NUM_DAYS	shared.h	39;"	d
+NUM_HOLIDAYS	shared.h	41;"	d
+NUM_SEASONS	shared.h	40;"	d
+N_CMNT_LEN	shared.h	5;"	d
+N_CMNT_MAX	shared.h	6;"	d
+N_CMNT_SD	dss.h	585;"	d
+NextRand	rnd.c	/^NextRand(long nSeed)$/;"	f
+NthElement	speed_seed.c	/^NthElement ( long Shift, long *Seed)$/;"	f
+NthElement	speed_seed.c	/^void NthElement (long N, long *StartSeed)$/;"	f
+OBJ	makefile	/^OBJ     = .o$/;"	m
+OBJ1	makefile	/^OBJ1 = build$(OBJ) driver$(OBJ) bm_utils$(OBJ) rnd$(OBJ) print$(OBJ) \\$/;"	m
+OBJ2	makefile	/^OBJ2 = build$(OBJ) bm_utils$(OBJ) qgen$(OBJ) rnd$(OBJ) varsub$(OBJ) \\$/;"	m
+OBJS	makefile	/^OBJS = $(OBJ1) $(OBJ2)$/;"	m
+ONE_STREAM	dss.h	70;"	d
+OPEN_CHECK	dss.h	95;"	d
+ORDER	dss.h	62;"	d
+ORDERS_PER_CUST	dss.h	421;"	d
+ORDER_LINE	dss.h	64;"	d
+OTHER	makefile	/^OTHER=makefile.suite $(SETS) $(DDL) $/;"	m
+OUTPUT	tpcd.h	10;"	d
+O_CKEY_MAX	dss.h	371;"	d
+O_CKEY_MIN	dss.h	370;"	d
+O_CKEY_SD	dss.h	584;"	d
+O_CLRK_FMT	dss.h	376;"	d
+O_CLRK_LEN	shared.h	106;"	d
+O_CLRK_SCL	dss.h	377;"	d
+O_CLRK_SD	dss.h	561;"	d
+O_CLRK_TAG	dss.h	375;"	d
+O_CMNT_LEN	shared.h	107;"	d
+O_CMNT_MAX	shared.h	108;"	d
+O_CMNT_SD	shared.h	129;"	d
+O_CREAT	bm_utils.c	78;"	d	file:
+O_LCNT_MAX	dss.h	379;"	d
+O_LCNT_MIN	dss.h	378;"	d
+O_LCNT_SD	dss.h	587;"	d
+O_ODATE_MAX	dss.h	373;"	d
+O_ODATE_MIN	dss.h	372;"	d
+O_ODATE_SD	dss.h	562;"	d
+O_OPRIO_LEN	shared.h	102;"	d
+O_OPRIO_LEN	shared.h	99;"	d
+O_PRIO_SD	dss.h	582;"	d
+O_RDONLY	bm_utils.c	72;"	d	file:
+O_SHIP_MODE_LEN	shared.h	29;"	d
+O_SHIP_PRIO_LEN	shared.h	30;"	d
+O_SHIP_STRU_LEN	shared.h	28;"	d
+O_SIZE	dss.h	369;"	d
+O_SUPP_SD	dss.h	560;"	d
+O_WRONLY	bm_utils.c	75;"	d	file:
+PART	dss.h	58;"	d
+PART_PSUPP	dss.h	65;"	d
+PART_SUPP_BRIDGE	build.c	35;"	d	file:
+PATCH	dss.h	15;"	d
+PATCH	dss.h	33;"	d
+PATCH	dss.h	40;"	d
+PATH_DFLT	dss.h	483;"	d
+PATH_SEP	config.h	136;"	d
+PATH_SEP	config.h	168;"	d
+PATH_SEP	config.h	53;"	d
+PATH_TAG	dss.h	482;"	d
+PENNIES	dss.h	432;"	d
+PHONE_FMT	dss.h	424;"	d
+PHONE_LEN	shared.h	117;"	d
+PROG1	makefile	/^PROG1 = dbgen$(EXE)$/;"	m
+PROG2	makefile	/^PROG2 = qgen$(EXE)$/;"	m
+PROGS	makefile	/^PROGS = $(PROG1) $(PROG2)$/;"	m
+PROTO	bm_utils.c	89;"	d	file:
+PROTO	bm_utils.c	91;"	d	file:
+PROTO	dss.h	181;"	d
+PROTO	dss.h	183;"	d
+PR_CHR	dss.h	514;"	d
+PR_DATE	dss.h	519;"	d
+PR_DATE	dss.h	523;"	d
+PR_DATE	dss.h	526;"	d
+PR_END	dss.h	516;"	d
+PR_HUGE	dss.h	511;"	d
+PR_INT	dss.h	510;"	d
+PR_KEY	dss.h	512;"	d
+PR_MONEY	dss.h	513;"	d
+PR_STR	dss.h	507;"	d
+PR_STRT	dss.h	515;"	d
+PR_VSTR	dss.h	508;"	d
+PR_VSTR_LAST	dss.h	509;"	d
+PSUPP	dss.h	59;"	d
+PS_CMNT_LEN	shared.h	79;"	d
+PS_CMNT_MAX	shared.h	80;"	d
+PS_CMNT_SD	shared.h	128;"	d
+PS_QTY_MAX	dss.h	356;"	d
+PS_QTY_MIN	dss.h	355;"	d
+PS_QTY_SD	dss.h	558;"	d
+PS_SCST_MAX	dss.h	354;"	d
+PS_SCST_MIN	dss.h	353;"	d
+PS_SCST_SD	dss.h	559;"	d
+PS_SIZE	dss.h	350;"	d
+PS_SKEY_MAX	dss.h	352;"	d
+PS_SKEY_MIN	dss.h	351;"	d
+P_BRND_FMT	dss.h	312;"	d
+P_BRND_LEN	shared.h	50;"	d
+P_BRND_MAX	dss.h	318;"	d
+P_BRND_MIN	dss.h	313;"	d
+P_BRND_SD	dss.h	553;"	d
+P_BRND_TAG	dss.h	311;"	d
+P_CAT_LEN	shared.h	18;"	d
+P_CAT_MAX	shared.h	20;"	d
+P_CAT_MIN	shared.h	19;"	d
+P_CAT_SD	shared.h	21;"	d
+P_CAT_SEED	shared.h	64;"	d
+P_CMNT_LEN	shared.h	62;"	d
+P_CMNT_MAX	shared.h	63;"	d
+P_CMNT_SD	shared.h	127;"	d
+P_CNTR_LEN	shared.h	61;"	d
+P_CNTR_SD	dss.h	556;"	d
+P_COLOR_LEN	shared.h	15;"	d
+P_COLOR_MAX	shared.h	16;"	d
+P_MCST_MAX	dss.h	324;"	d
+P_MCST_MIN	dss.h	323;"	d
+P_MCST_SCL	dss.h	325;"	d
+P_MFG_FMT	dss.h	308;"	d
+P_MFG_LEN	shared.h	14;"	d
+P_MFG_LEN	shared.h	46;"	d
+P_MFG_MAX	dss.h	310;"	d
+P_MFG_MIN	dss.h	309;"	d
+P_MFG_SD	dss.h	552;"	d
+P_MFG_TAG	dss.h	307;"	d
+P_NAME_LEN	shared.h	13;"	d
+P_NAME_LEN	shared.h	45;"	d
+P_NAME_SCL	dss.h	303;"	d
+P_NAME_SCL	dss.h	305;"	d
+P_NAME_SD	dss.h	581;"	d
+P_RCST_MAX	dss.h	327;"	d
+P_RCST_MIN	dss.h	326;"	d
+P_RCST_SCL	dss.h	328;"	d
+P_RCST_SD	dss.h	557;"	d
+P_SIZE	dss.h	301;"	d
+P_SIZE_MAX	dss.h	322;"	d
+P_SIZE_MIN	dss.h	321;"	d
+P_SIZE_SD	dss.h	555;"	d
+P_TYPE_LEN	shared.h	53;"	d
+P_TYPE_LEN	shared.h	57;"	d
+P_TYPE_MAX	shared.h	17;"	d
+P_TYPE_SD	dss.h	554;"	d
+Q11_FRACTION	dss.h	433;"	d
+QD	makefile	/^QD=1.sql 2.sql 3.sql 4.sql 5.sql 6.sql 7.sql 8.sql 9.sql 10.sql \\$/;"	m
+QDIR_DFLT	tpcd.h	31;"	d
+QDIR_TAG	tpcd.h	30;"	d
+QLEN_MAX	tpcd.h	82;"	d
+QSRC	makefile	/^QSRC  = $(FQD) $(VARIANTS)$/;"	m
+QUERIES_PER_SET	tpcd.h	83;"	d
+QUERY	tpcd.h	16;"	d
+RANDOM	dss.h	120;"	d
+REFRESH	tpcd.h	17;"	d
+REGION	dss.h	67;"	d
+REGION_LEN	shared.h	116;"	d
+RELEASE	dss.h	13;"	d
+RELEASE	dss.h	31;"	d
+RELEASE	dss.h	38;"	d
+RND	speed_seed.c	/^typedef signed long RND;$/;"	t	file:
+RND_ModMult	speed_seed.c	/^RND_ModMult ( RND nA, RND nB)$/;"	f	file:
+RND_seed	speed_seed.c	/^RND_seed ( RND Order            )$/;"	f
+RNG_PER_SENT	dss.h	218;"	d
+RPRICE_BRIDGE	build.c	41;"	d	file:
+R_CMNT_LEN	shared.h	7;"	d
+R_CMNT_MAX	shared.h	8;"	d
+R_CMNT_SD	dss.h	586;"	d
+SEED	tpcd.h	19;"	d
+SEED_T	dss.h	/^typedef struct SEED_T {$/;"	s
+SEPARATOR	dss.h	491;"	d
+SEQUENCE	tpcd.h	99;"	d
+SETS	makefile	/^SETS = dists.dss $/;"	m
+SET_DBASE	tpcd.h	42;"	d
+SET_DBASE	tpcd.h	51;"	d
+SET_DBASE	tpcd.h	60;"	d
+SET_DBASE	tpcd.h	69;"	d
+SET_DBASE	tpcd.h	78;"	d
+SET_DIGIT	bcd2.c	25;"	d	file:
+SET_HANDLER	config.h	117;"	d
+SET_HANDLER	config.h	124;"	d
+SET_HANDLER	config.h	155;"	d
+SET_OUTPUT	tpcd.h	40;"	d
+SET_OUTPUT	tpcd.h	49;"	d
+SET_OUTPUT	tpcd.h	58;"	d
+SET_OUTPUT	tpcd.h	67;"	d
+SET_OUTPUT	tpcd.h	76;"	d
+SET_ROWCOUNT	tpcd.h	41;"	d
+SET_ROWCOUNT	tpcd.h	50;"	d
+SET_ROWCOUNT	tpcd.h	59;"	d
+SET_ROWCOUNT	tpcd.h	68;"	d
+SET_ROWCOUNT	tpcd.h	77;"	d
+SIGS_DEFINED	config.h	119;"	d
+SIGS_DEFINED	config.h	135;"	d
+SPARSE_BITS	dss.h	115;"	d
+SPARSE_KEEP	dss.h	116;"	d
+SPAWN	config.h	115;"	d
+SPAWN	config.h	128;"	d
+SPAWN	config.h	131;"	d
+SPAWN	config.h	156;"	d
+SRC	makefile	/^SRC  = $(SRC1) $(SRC2)$/;"	m
+SRC1	makefile	/^SRC1 = build.c driver.c bm_utils.c rnd.c print.c load_stub.c bcd2.c \\$/;"	m
+SRC2	makefile	/^SRC2 = qgen.c varsub.c $/;"	m
+STARTDATE	dss.h	425;"	d
+START_TRAN	tpcd.h	38;"	d
+START_TRAN	tpcd.h	47;"	d
+START_TRAN	tpcd.h	56;"	d
+START_TRAN	tpcd.h	65;"	d
+START_TRAN	tpcd.h	74;"	d
+STDLIB_HAS_GETOPT	config.h	100;"	d
+STDLIB_HAS_GETOPT	config.h	104;"	d
+STDLIB_HAS_GETOPT	config.h	58;"	d
+STDLIB_HAS_GETOPT	config.h	76;"	d
+STDLIB_HAS_GETOPT	config.h	88;"	d
+STDLIB_HAS_GETOPT	config.h	92;"	d
+STDLIB_HAS_GETOPT	config.h	96;"	d
+SUPP	dss.h	60;"	d
+SUPPORT_64BITS	config.h	105;"	d
+SUPPORT_64BITS	config.h	138;"	d
+SUPPORT_64BITS	config.h	63;"	d
+SUPP_PER_PART	dss.h	420;"	d
+S_ABAL_MAX	dss.h	336;"	d
+S_ABAL_MIN	dss.h	335;"	d
+S_ABAL_SD	dss.h	580;"	d
+S_ADDR_LEN	shared.h	69;"	d
+S_ADDR_LEN	shared.h	73;"	d
+S_ADDR_MAX	shared.h	70;"	d
+S_ADDR_MAX	shared.h	74;"	d
+S_ADDR_SD	shared.h	132;"	d
+S_CMNT_BBB	dss.h	338;"	d
+S_CMNT_LEN	shared.h	77;"	d
+S_CMNT_MAX	dss.h	337;"	d
+S_CMNT_MAX	shared.h	78;"	d
+S_CMNT_SD	shared.h	133;"	d
+S_ISFIFO	bm_utils.c	58;"	d	file:
+S_ISREG	bm_utils.c	57;"	d	file:
+S_NAME_FMT	dss.h	334;"	d
+S_NAME_LEN	shared.h	66;"	d
+S_NAME_TAG	dss.h	333;"	d
+S_NATION_NAME_LEN	shared.h	22;"	d
+S_NTRG_SD	dss.h	578;"	d
+S_PHNE_SD	dss.h	579;"	d
+S_REGION_NAME_LEN	shared.h	23;"	d
+S_SIZE	dss.h	332;"	d
+Seed	rnd.h	/^seed_t     Seed[MAX_STREAM + 1] =$/;"	v
+TERMINATE	tpcd.h	22;"	d
+TEST_RES	makefile	/^TEST_RES = O.res L.res c.res s.res P.res S.res n.res r.res$/;"	m
+TEXT	build.c	44;"	d	file:
+TIMING	tpcd.h	14;"	d
+TOTDATE	dss.h	428;"	d
+TPC	dss.h	45;"	d
+TREE_DOC	makefile	/^TREE_DOC=tree.readme tree.changes appendix.readme appendix.version answers.readme queries.readme variants.readme$/;"	m
+TREE_ROOT	makefile	/^TREE_ROOT=\/tmp\/tree$/;"	m
+T_ALPHA_LEN	shared.h	113;"	d
+T_SIZE	dss.h	409;"	d
+T_START_DAY	dss.h	410;"	d
+UNIFORM	dss.h	547;"	d
+UNSET	permute.c	68;"	d	file:
+UPDATE	dss.h	68;"	d
+UPD_PCT	dss.h	429;"	d
+URND	speed_seed.c	/^typedef unsigned long URND;$/;"	t	file:
+UnifInt	rnd.c	/^UnifInt(long nLow, long nHigh, long nStream)$/;"	f
+UnifReal	rnd.c	/^UnifReal(double dLow, double dHigh, long nStream)$/;"	f
+VARIANTS	makefile	/^VARIANTS= 8a.sql 12a.sql 13a.sql 14a.sql 15a.sql $/;"	m
+VERBOSE	tpcd.h	13;"	d
+VERSION	dss.h	12;"	d
+VERSION	dss.h	30;"	d
+VERSION	dss.h	37;"	d
+VRF_CHR	dss.h	542;"	d
+VRF_END	dss.h	544;"	d
+VRF_HUGE	dss.h	536;"	d
+VRF_HUGE	dss.h	538;"	d
+VRF_INT	dss.h	534;"	d
+VRF_MONEY	dss.h	541;"	d
+VRF_STR	dss.h	533;"	d
+VRF_STRT	dss.h	543;"	d
+VSTR_MAX	rnd.h	29;"	d
+VTAG	tpcd.h	28;"	d
+V_STR	build.c	42;"	d	file:
+V_STR_HGH	shared.h	10;"	d
+V_STR_LOW	dss.h	431;"	d
+WAIT	config.h	118;"	d
+WAIT	config.h	129;"	d
+WAIT	config.h	132;"	d
+WAIT	config.h	157;"	d
+WIFEXITED	config.h	146;"	d
+WIFSIGNALED	config.h	147;"	d
+WIFSTOPPED	config.h	148;"	d
+WIN32	config.h	60;"	d
+WIN32_LEAN_AND_MEAN	driver.c	32;"	d	file:
+WIN32_LEAN_AND_MEAN	permute.c	33;"	d	file:
+WIN32_LEAN_AND_MEAN	text.c	34;"	d	file:
+WORD_DIVISOR	bcd2.c	19;"	d	file:
+WORKLOAD	makefile	/^WORKLOAD =SSBM $/;"	m
+WSTOPSIG	config.h	150;"	d
+WTERMSIG	config.h	149;"	d
+_GNU_SOURCE	bm_utils.c	26;"	d	file:
+_INCLUDE_POSIX_SOURCE	config.h	75;"	d
+_POSIX_SOURCE	config.h	80;"	d
+a_rnd	bm_utils.c	/^a_rnd(int min, int max, int column, char *dest)$/;"	f
+acctbal	dsstypes.h	/^    long            acctbal;$/;"	m	struct:__anon17
+acctbal	dsstypes.h	/^    long            acctbal;$/;"	m	struct:__anon8
+address	dsstypes.h	/^    char            address[C_ADDR_MAX + 1];$/;"	m	struct:__anon7
+address	dsstypes.h	/^    char            address[C_ADDR_MAX + 1];$/;"	m	struct:__anon8
+address	dsstypes.h	/^    char            address[S_ADDR_MAX + 1];$/;"	m	struct:__anon16
+address	dsstypes.h	/^    char            address[S_ADDR_MAX + 1];$/;"	m	struct:__anon17
+adjectives	dss.h	/^EXTERN distribution adjectives;$/;"	v
+adverbs	dss.h	/^EXTERN distribution adverbs;$/;"	v
+agg_str	bm_utils.c	/^agg_str(distribution *set, long count, long col, char *dest)$/;"	f
+alen	dsstypes.h	/^    int             alen; $/;"	m	struct:__anon16
+alen	dsstypes.h	/^    int             alen;$/;"	m	struct:__anon17
+alen	dsstypes.h	/^    int             alen;$/;"	m	struct:__anon7
+alen	dsstypes.h	/^    int             alen;$/;"	m	struct:__anon8
+alpha	dsstypes.h	/^    char            alpha[DATE_LEN];$/;"	m	struct:__anon19
+alpha_num	bm_utils.c	/^static char alpha_num[65] =$/;"	v	file:
+articles	dss.h	/^EXTERN distribution articles;$/;"	v
+asc_date	qgen.c	/^char **asc_date;$/;"	v
+auxillaries	dss.h	/^EXTERN distribution auxillaries;$/;"	v
+base	dss.h	/^   long      base;$/;"	m	struct:__anon5
+bcd2_add	bcd2.c	/^bcd2_add(long *bcd_low, long *bcd_high, long addend)$/;"	f
+bcd2_bin	bcd2.c	/^bcd2_bin(long *dest, long bcd)$/;"	f
+bcd2_cmp	bcd2.c	/^bcd2_cmp(long *low1, long *high1, long comp)$/;"	f
+bcd2_div	bcd2.c	/^bcd2_div(long *bcd_low, long *bcd_high, long divisor)$/;"	f
+bcd2_mod	bcd2.c	/^bcd2_mod(long *bcd_low, long *bcd_high, long modulo)$/;"	f
+bcd2_mul	bcd2.c	/^bcd2_mul(long *bcd_low, long *bcd_high, long multiplier)$/;"	f
+bcd2_sub	bcd2.c	/^bcd2_sub(long *bcd_low, long *bcd_high, long subend)$/;"	f
+bin_bcd2	bcd2.c	/^bin_bcd2(long binary, long *low_res, long *high_res)$/;"	f
+boundary	dss.h	/^	long boundary;$/;"	m	struct:SEED_T
+brand	dsstypes.h	/^    char           brand[P_BRND_LEN + 1];$/;"	m	struct:__anon14
+brand	dsstypes.h	/^    char           brand[P_BRND_LEN + 1];$/;"	m	struct:__anon15
+brands	varsub.c	/^long brands[25] = {11,12,13,14,15,21,22,23,24,25,31,32,33,34,35,$/;"	v
+c_mseg_set	dss.h	/^EXTERN distribution c_mseg_set;$/;"	v
+category	dsstypes.h	/^    char           category[P_CAT_LEN + 1];$/;"	m	struct:__anon14
+ccode	varsub.c	/^long ccode[25] = {10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34};$/;"	v
+cdate	dsstypes.h	/^    char            cdate[DATE_LEN];$/;"	m	struct:__anon10
+child	dss.h	/^   int       child;$/;"	m	struct:__anon5
+children	dss.h	/^EXTERN long children;$/;"	v
+city	dsstypes.h	/^    char            city[CITY_FIX +1];$/;"	m	struct:__anon16
+city	dsstypes.h	/^    char            city[CITY_FIX+1];$/;"	m	struct:__anon7
+clen	dsstypes.h	/^    int             clen;$/;"	m	struct:__anon17
+clen	dsstypes.h	/^    int             clen;$/;"	m	struct:__anon20
+clen	dsstypes.h	/^    int             clen;$/;"	m	struct:__anon8
+clen	dsstypes.h	/^    int            clen;$/;"	m	struct:__anon10
+clen	dsstypes.h	/^    int            clen;$/;"	m	struct:__anon12
+clen	dsstypes.h	/^    int            clen;$/;"	m	struct:__anon13
+clen	dsstypes.h	/^    int            clen;$/;"	m	struct:__anon14
+clen	dsstypes.h	/^    int            clen;$/;"	m	struct:__anon15
+clerk	dsstypes.h	/^    char            clerk[O_CLRK_LEN + 1];$/;"	m	struct:__anon11
+clerk	dsstypes.h	/^    char            clerk[O_CLRK_LEN + 1];$/;"	m	struct:__anon12
+close_direct	load_stub.c	/^close_direct(void)$/;"	f
+code	dsstypes.h	/^    long            code;$/;"	m	struct:__anon20
+code_t	dsstypes.h	/^}               code_t;$/;"	t	typeref:struct:__anon20
+color	dsstypes.h	/^    char           color[P_COLOR_MAX + 1];$/;"	m	struct:__anon14
+colors	dss.h	/^EXTERN distribution colors;$/;"	v
+columnar	dss.h	/^EXTERN long columnar;$/;"	v
+comment	dss.h	/^   char     *comment;$/;"	m	struct:__anon5
+comment	dsstypes.h	/^    char            comment[C_CMNT_MAX + 1];$/;"	m	struct:__anon8
+comment	dsstypes.h	/^    char            comment[N_CMNT_MAX + 1];$/;"	m	struct:__anon20
+comment	dsstypes.h	/^    char            comment[O_CMNT_MAX + 1];$/;"	m	struct:__anon12
+comment	dsstypes.h	/^    char            comment[S_CMNT_MAX + 1];$/;"	m	struct:__anon17
+comment	dsstypes.h	/^    char           comment[L_CMNT_MAX + 1];$/;"	m	struct:__anon10
+comment	dsstypes.h	/^    char           comment[PS_CMNT_MAX + 1];$/;"	m	struct:__anon13
+comment	dsstypes.h	/^    char           comment[P_CMNT_MAX + 1];$/;"	m	struct:__anon15
+commit_date	dsstypes.h	/^    char            commit_date[DATE_LEN] ;$/;"	m	struct:__anon9
+container	dsstypes.h	/^    char           container[P_CNTR_LEN + 1];$/;"	m	struct:__anon14
+container	dsstypes.h	/^    char           container[P_CNTR_LEN + 1];$/;"	m	struct:__anon15
+count	dss.h	/^   int      count;$/;"	m	struct:__anon4
+custkey	dsstypes.h	/^    long            custkey;$/;"	m	struct:__anon11
+custkey	dsstypes.h	/^    long            custkey;$/;"	m	struct:__anon12
+custkey	dsstypes.h	/^    long            custkey;$/;"	m	struct:__anon7
+custkey	dsstypes.h	/^    long            custkey;$/;"	m	struct:__anon8
+custkey	dsstypes.h	/^    long            custkey;$/;"	m	struct:__anon9
+customer_t	dsstypes.h	/^}               customer_t;$/;"	t	typeref:struct:__anon7
+customer_t	dsstypes.h	/^}               customer_t;$/;"	t	typeref:struct:__anon8
+dM	rnd.h	/^static double   dM = 2147483647.0;$/;"	v
+d_path	dss.h	/^EXTERN char *d_path;$/;"	v
+date	dsstypes.h	/^   char            date[D_DATE_LEN+1];$/;"	m	struct:__anon18
+date_t	dsstypes.h	/^}      date_t;$/;"	t	typeref:struct:__anon18
+datekey	dsstypes.h	/^   long            datekey;$/;"	m	struct:__anon18
+day	dss.h	/^  int day;$/;"	m	struct:__anon2
+day	dsstypes.h	/^    long            day;$/;"	m	struct:__anon19
+daynuminmonth	dsstypes.h	/^   int             daynuminmonth;$/;"	m	struct:__anon18
+daynuminweek	dsstypes.h	/^   int             daynuminweek;$/;"	m	struct:__anon18
+daynuminyear	dsstypes.h	/^   int             daynuminyear;$/;"	m	struct:__anon18
+dayofweek	dsstypes.h	/^   char            dayofweek[D_DAYWEEK_LEN+1] ;$/;"	m	struct:__anon18
+days	build.c	/^	long      days;$/;"	m	struct:__anon6	file:
+days_in_a_month	build.c	/^int days_in_a_month[12]={31,28,31,30,31,30,31,31,30,31,30,31};$/;"	v
+days_in_a_month_l	build.c	/^int days_in_a_month_l[12]={31,29,31,30,31,30,31,31,30,31,30,31};$/;"	v
+db_name	dss.h	/^EXTERN char *db_name;$/;"	v
+dbg_print	print.c	/^dbg_print(int format, FILE *target, void *data, int len, int sep)$/;"	f
+dbg_text	text.c	/^dbg_text(char *tgt, int min, int max, int sd)$/;"	f
+dcnt	build.c	/^	long      dcnt;$/;"	m	struct:__anon6	file:
+defaults	varsub.c	/^char *defaults[24][11] =$/;"	v
+delete_segment	dss.h	/^EXTERN int delete_segment;$/;"	v
+delete_segments	dss.h	/^EXTERN int delete_segments;$/;"	v
+direct	dss.h	/^EXTERN long direct;$/;"	v
+discount	dsstypes.h	/^    long            discount;$/;"	m	struct:__anon10
+discount	dsstypes.h	/^    long           discount;$/;"	m	struct:__anon9
+distribution	dss.h	/^}         distribution;$/;"	t	typeref:struct:__anon4
+dss_random	rnd.c	/^dss_random(long *tgt, long lower, long upper, long stream)$/;"	f
+dss_time_t	dsstypes.h	/^} dss_time_t;               $/;"	t	typeref:struct:__anon19
+dsscasecmp	bm_utils.c	/^dsscasecmp(char *s1, char *s2)$/;"	f
+dssncasecmp	bm_utils.c	/^dssncasecmp(char *s1, char *s2, int n)$/;"	f
+dump_seeds	rnd.c	/^dump_seeds(int tbl)$/;"	f
+e_str	bm_utils.c	/^e_str(distribution *d, int min, int max, int stream, char *dest)$/;"	f
+end_day	dss.h	/^  int end_day;$/;"	m	struct:__anon1
+end_month	dss.h	/^  int end_month;$/;"	m	struct:__anon1
+env_config	bm_utils.c	/^env_config(char *var, char *dflt)$/;"	f
+eol	permute.c	/^char *eol[2] = {" ", "},"};$/;"	v
+eprice	dsstypes.h	/^    long            eprice;$/;"	m	struct:__anon10
+extended_price	dsstypes.h	/^    long           extended_price;$/;"	m	struct:__anon9
+ez_sparse	build.c	/^ez_sparse(long i, DSS_HUGE *ok, long seq)$/;"	f
+fake_a_rnd	speed_seed.c	/^fake_a_rnd(int min, int max, int column)$/;"	f
+fdopen	bm_utils.c	66;"	d	file:
+flags	tpcd.h	/^EXTERN int flags;$/;"	v
+flt_scale	driver.c	/^double flt_scale;$/;"	v
+flt_scale	qgen.c	/^double flt_scale;$/;"	v
+fnames	dss.h	/^EXTERN long fnames;$/;"	v
+force	dss.h	/^EXTERN long force;$/;"	v
+gen_category	build.c	/^gen_category(char *target, long seed){$/;"	f	file:
+gen_city	build.c	/^int gen_city(char *cityName, char *nationName){$/;"	f
+gen_color	build.c	/^int gen_color(char * source, char * dest){$/;"	f
+gen_holiday_fl	build.c	/^int gen_holiday_fl(char * dest, int month, int day){$/;"	f
+gen_phone	build.c	/^gen_phone(long ind, char *target, long seed)$/;"	f	file:
+gen_rng	dss.h	/^EXTERN int  gen_rng;$/;"	v
+gen_season	build.c	/^int gen_season(char * dest,int month,int day)$/;"	f
+gen_seed	dss.h	/^   long      (*gen_seed)();$/;"	m	struct:__anon5
+gen_sql	dss.h	/^EXTERN int  gen_sql;$/;"	v
+gen_tbl	driver.c	/^gen_tbl (int tnum, long start, long count, long upd_num)$/;"	f
+getopt	bm_utils.c	/^getopt(int ac, char **av, char *opt)$/;"	f
+getpid	config.h	133;"	d
+grammar	dss.h	/^EXTERN distribution grammar;$/;"	v
+hd_cust	load_stub.c	/^hd_cust (FILE *f)$/;"	f
+hd_line	load_stub.c	/^hd_line (FILE *f)$/;"	f
+hd_nation	load_stub.c	/^hd_nation (FILE *f)$/;"	f
+hd_order	load_stub.c	/^hd_order (FILE *f)$/;"	f
+hd_order_line	load_stub.c	/^hd_order_line (FILE *f)$/;"	f
+hd_part	load_stub.c	/^hd_part (FILE *f)$/;"	f
+hd_part_psupp	load_stub.c	/^hd_part_psupp (FILE *f)$/;"	f
+hd_psupp	load_stub.c	/^hd_psupp (FILE *f)$/;"	f
+hd_region	load_stub.c	/^hd_region (FILE *f)$/;"	f
+hd_sparse	build.c	/^hd_sparse(long i, DSS_HUGE *ok, long seq)$/;"	f
+hd_supp	load_stub.c	/^hd_supp (FILE *f)$/;"	f
+header	dss.h	/^   int       (*header) ();$/;"	m	struct:__anon5
+header	dss.h	/^EXTERN long header;$/;"	v
+holiday	dss.h	/^} holiday;$/;"	t	typeref:struct:__anon2
+holidayfl	dsstypes.h	/^   char            holidayfl[2];$/;"	m	struct:__anon18
+holidays	build.c	/^holiday holidays[]={$/;"	v
+ifile	tpcd.h	/^EXTERN char *ifile;$/;"	v
+insert_lineitem_segment	dss.h	/^EXTERN int insert_lineitem_segment;$/;"	v
+insert_orders_segment	dss.h	/^EXTERN int insert_orders_segment;$/;"	v
+insert_segments	dss.h	/^EXTERN int insert_segments;$/;"	v
+is_last_day_in_month	build.c	/^is_last_day_in_month(int year,int month,int day){$/;"	f
+join	dsstypes.h	/^    long            join;$/;"	m	struct:__anon20
+julian	bm_utils.c	/^julian(long date)$/;"	f
+kill_load	driver.c	/^kill_load (void)$/;"	f
+l	dsstypes.h	/^    line_t          l[O_LCNT_MAX];$/;"	m	struct:__anon12
+l_category_set	dss.h	/^EXTERN distribution l_category_set;$/;"	v
+l_instruct_set	dss.h	/^EXTERN distribution l_instruct_set;$/;"	v
+l_rflag_set	dss.h	/^EXTERN distribution l_rflag_set;$/;"	v
+l_smode_set	dss.h	/^EXTERN distribution l_smode_set;$/;"	v
+lastdayinmonthfl	dsstypes.h	/^   char            lastdayinmonthfl[2];$/;"	m	struct:__anon18
+lastdayinweekfl	dsstypes.h	/^   char            lastdayinweekfl[2];$/;"	m	struct:__anon18
+lcnt	dsstypes.h	/^    long            lcnt;$/;"	m	struct:__anon10
+ld_cust	load_stub.c	/^ld_cust (customer_t *cp, int mode)$/;"	f
+ld_date	load_stub.c	/^ld_date (date_t *d, int mode)$/;"	f
+ld_line	load_stub.c	/^ld_line (order_t *p, int mode)$/;"	f
+ld_nation	load_stub.c	/^ld_nation (code_t *cp, int mode)$/;"	f
+ld_order	load_stub.c	/^ld_order (order_t *p, int mode)$/;"	f
+ld_order_line	load_stub.c	/^ld_order_line (order_t *p, int mode)$/;"	f
+ld_part	load_stub.c	/^ld_part (part_t *pp, int mode)$/;"	f
+ld_part_psupp	load_stub.c	/^ld_part_psupp (part_t *p, int mode)$/;"	f
+ld_psupp	load_stub.c	/^ld_psupp (part_t *pp, int mode)$/;"	f
+ld_region	load_stub.c	/^ld_region (code_t *cp, int mode)$/;"	f
+ld_supp	load_stub.c	/^ld_supp (supplier_t *sp, int mode)$/;"	f
+lfile	tpcd.h	/^EXTERN char *lfile;$/;"	v
+line_t	dsstypes.h	/^}               line_t;$/;"	t	typeref:struct:__anon10
+linenumber	dsstypes.h	/^    int             linenumber; \/*integer, constrain to max of 7*\/$/;"	m	struct:__anon9
+lineorder_t	dsstypes.h	/^}  lineorder_t;$/;"	t	typeref:struct:__anon9
+lineorders	dsstypes.h	/^    lineorder_t     lineorders[O_LCNT_MAX];$/;"	m	struct:__anon11
+lines	dsstypes.h	/^    long            lines;$/;"	m	struct:__anon11
+lines	dsstypes.h	/^    long            lines;$/;"	m	struct:__anon12
+list	dss.h	/^   set_member *list;$/;"	m	struct:__anon4
+lnoise	dss.h	/^static char lnoise[4] = {'|', '\/', '-', '\\\\' };$/;"	v
+load_dists	driver.c	/^load_dists (void)$/;"	f
+loader	dss.h	/^   int       (*loader[2]) ();$/;"	m	struct:__anon5
+lstatus	dsstypes.h	/^    char            lstatus[1];$/;"	m	struct:__anon10
+main	bcd2.c	/^main()$/;"	f
+main	driver.c	/^main (int ac, char **av)$/;"	f
+main	permute.c	/^main(int ac, char *av[])$/;"	f
+main	qgen.c	/^main(int ac, char **av)$/;"	f
+main	text.c	/^main()$/;"	f
+max	dss.h	/^   int      max;$/;"	m	struct:__anon4
+mdes	build.c	/^	char     *mdes;$/;"	m	struct:__anon6	file:
+mfgr	dsstypes.h	/^    char           mfgr[P_MFG_LEN + 1];$/;"	m	struct:__anon14
+mfgr	dsstypes.h	/^    char           mfgr[P_MFG_LEN + 1];$/;"	m	struct:__anon15
+minrow	driver.c	/^long rowcnt = 0, minrow = 0, upd_num = 0;$/;"	v
+mk_ascdate	bm_utils.c	/^mk_ascdate(void)$/;"	f
+mk_cust	build.c	/^long mk_cust(long n_cust, customer_t *c)$/;"	f
+mk_cust	build.c	/^mk_cust(long n_cust, customer_t *c)$/;"	f
+mk_date	build.c	/^mk_date(long index,date_t *d)$/;"	f
+mk_nation	build.c	/^		mk_nation(long index, code_t *c)$/;"	f
+mk_order	build.c	/^mk_order(long index, order_t *o, long upd_num)$/;"	f
+mk_part	build.c	/^long mk_part(long index, part_t *p)$/;"	f
+mk_part	build.c	/^mk_part(long index, part_t *p)$/;"	f
+mk_region	build.c	/^		mk_region(long index, code_t *c)$/;"	f
+mk_sparse	build.c	/^mk_sparse (long i, DSS_HUGE *ok, long seq)$/;"	f
+mk_supp	build.c	/^mk_supp(long index, supplier_t *s)$/;"	f
+mk_time	build.c	/^mk_time(long index, dss_time_t *t)$/;"	f
+mktsegment	dsstypes.h	/^    char            mktsegment[MAXAGG_LEN + 1];$/;"	m	struct:__anon7
+mktsegment	dsstypes.h	/^    char            mktsegment[MAXAGG_LEN + 1];$/;"	m	struct:__anon8
+month	dss.h	/^  int month;$/;"	m	struct:__anon2
+month	dsstypes.h	/^    long            month;$/;"	m	struct:__anon19
+month	dsstypes.h	/^   char            month[D_MONTH_LEN+1];$/;"	m	struct:__anon18
+month_names	build.c	/^char * month_names[]={"January","February","March","April",$/;"	v
+monthnuminyear	dsstypes.h	/^   int             monthnuminyear;$/;"	m	struct:__anon18
+months	build.c	/^months[] =$/;"	v	typeref:struct:__anon6
+nA	rnd.h	/^static long     nA = 16807;     \/* the multiplier *\/$/;"	v
+nM	rnd.h	/^static long     nM = 2147483647;\/* the modulus == 2^31 - 1 *\/$/;"	v
+nQ	rnd.h	/^static long     nQ = 127773;    \/* the quotient nM \/ nA *\/$/;"	v
+nR	rnd.h	/^static long     nR = 2836;      \/* the remainder nM % nA *\/$/;"	v
+name	dss.h	/^   char     *name;$/;"	m	struct:__anon5
+name	dss.h	/^  char * name;$/;"	m	struct:__anon1
+name	dss.h	/^  char * name;$/;"	m	struct:__anon2
+name	dsstypes.h	/^    char            name[C_NAME_LEN + 1];$/;"	m	struct:__anon7
+name	dsstypes.h	/^    char            name[C_NAME_LEN + 1];$/;"	m	struct:__anon8
+name	dsstypes.h	/^    char            name[S_NAME_LEN + 1];$/;"	m	struct:__anon16
+name	dsstypes.h	/^    char            name[S_NAME_LEN + 1];$/;"	m	struct:__anon17
+name	dsstypes.h	/^    char           name[P_NAME_LEN + 1];$/;"	m	struct:__anon14
+name	dsstypes.h	/^    char           name[P_NAME_LEN + 1];$/;"	m	struct:__anon15
+name_bits	speed_seed.c	/^long name_bits[MAX_COLOR \/ BITS_PER_LONG];$/;"	v
+nation_code	dsstypes.h	/^    long            nation_code;$/;"	m	struct:__anon17
+nation_code	dsstypes.h	/^    long            nation_code;$/;"	m	struct:__anon8
+nation_key	dsstypes.h	/^    int             nation_key;$/;"	m	struct:__anon16
+nation_key	dsstypes.h	/^    int             nation_key;$/;"	m	struct:__anon7
+nation_name	dsstypes.h	/^    char            nation_name[C_NATION_NAME_LEN+1];$/;"	m	struct:__anon7
+nation_name	dsstypes.h	/^    char            nation_name[S_NATION_NAME_LEN+1];$/;"	m	struct:__anon16
+nations	dss.h	/^EXTERN distribution nations;$/;"	v
+nations2	dss.h	/^EXTERN distribution nations2;$/;"	v
+nlen	dsstypes.h	/^    int             nlen;$/;"	m	struct:__anon7
+nlen	dsstypes.h	/^    int            nlen;$/;"	m	struct:__anon14
+nlen	dsstypes.h	/^    int            nlen;$/;"	m	struct:__anon15
+nouns	dss.h	/^EXTERN distribution nouns;$/;"	v
+np	dss.h	/^EXTERN distribution np;$/;"	v
+o_priority_set	dss.h	/^EXTERN distribution o_priority_set;$/;"	v
+odate	dsstypes.h	/^    char            odate[DATE_LEN];$/;"	m	struct:__anon11
+odate	dsstypes.h	/^    char            odate[DATE_LEN];$/;"	m	struct:__anon12
+ofp	tpcd.h	29;"	d
+okey	dsstypes.h	/^    DSS_HUGE	    *okey;  \/*for clustering line items*\/$/;"	m	struct:__anon9
+okey	dsstypes.h	/^    DSS_HUGE	    *okey; $/;"	m	struct:__anon10
+okey	dsstypes.h	/^    DSS_HUGE	    *okey;$/;"	m	struct:__anon11
+okey	dsstypes.h	/^    DSS_HUGE	    *okey;$/;"	m	struct:__anon12
+open	bm_utils.c	69;"	d	file:
+opriority	dsstypes.h	/^    char            opriority[MAXAGG_LEN + 1];$/;"	m	struct:__anon11
+opriority	dsstypes.h	/^    char            opriority[MAXAGG_LEN + 1];$/;"	m	struct:__anon12
+opriority	dsstypes.h	/^    char            opriority[MAXAGG_LEN + 1];$/;"	m	struct:__anon9
+optarg	bm_utils.c	/^char *optarg = NULL;$/;"	v
+opterr	bm_utils.c	/^int opterr = 0;$/;"	v
+optind	bm_utils.c	/^int optind = 0;$/;"	v
+order_t	dsstypes.h	/^}               order_t;$/;"	t	typeref:struct:__anon12
+order_t	dsstypes.h	/^}   order_t;$/;"	t	typeref:struct:__anon11
+order_totalprice	dsstypes.h	/^    long           order_totalprice;$/;"	m	struct:__anon9
+orderdate	dsstypes.h	/^    char            orderdate[DATE_LEN];$/;"	m	struct:__anon9
+orderstatus	dsstypes.h	/^    char            orderstatus;$/;"	m	struct:__anon12
+osuff	tpcd.h	/^EXTERN char *osuff;$/;"	v
+p_cntr_set	dss.h	/^EXTERN distribution p_cntr_set;$/;"	v
+p_types_set	dss.h	/^EXTERN distribution p_types_set;$/;"	v
+part_t	dsstypes.h	/^}               part_t;$/;"	t	typeref:struct:__anon14
+part_t	dsstypes.h	/^}               part_t;$/;"	t	typeref:struct:__anon15
+partial	driver.c	/^partial (int tbl, int s)$/;"	f
+partkey	dsstypes.h	/^    long            partkey;$/;"	m	struct:__anon10
+partkey	dsstypes.h	/^    long            partkey;$/;"	m	struct:__anon13
+partkey	dsstypes.h	/^    long            partkey;$/;"	m	struct:__anon9
+partkey	dsstypes.h	/^    long           partkey;$/;"	m	struct:__anon14
+partkey	dsstypes.h	/^    long           partkey;$/;"	m	struct:__anon15
+partsupp_t	dsstypes.h	/^}               partsupp_t;$/;"	t	typeref:struct:__anon13
+permutation	permute.h	/^long permutation[41][22] =$/;"	v
+permute	dss.h	/^   long *permute;$/;"	m	struct:__anon4
+permute	permute.c	/^permute(long *a, int c, long s)$/;"	f
+permute_dist	permute.c	/^permute_dist(distribution *d, long stream)$/;"	f
+phone	dsstypes.h	/^    char            phone[PHONE_LEN + 1];$/;"	m	struct:__anon16
+phone	dsstypes.h	/^    char            phone[PHONE_LEN + 1];$/;"	m	struct:__anon17
+phone	dsstypes.h	/^    char            phone[PHONE_LEN + 1];$/;"	m	struct:__anon7
+phone	dsstypes.h	/^    char            phone[PHONE_LEN + 1];$/;"	m	struct:__anon8
+pick_str	bm_utils.c	/^pick_str(distribution *s, int c, char *target)$/;"	f
+pid_t	config.h	123;"	d
+pids	driver.c	/^int *pids;$/;"	v
+pload	driver.c	/^pload (int tbl)$/;"	f
+pr_cust	print.c	/^pr_cust(customer_t *c, int mode)$/;"	f
+pr_date	print.c	/^int pr_date(date_t *d, int mode){$/;"	f
+pr_drange	print.c	/^pr_drange(int tbl, long min, long cnt, long num)$/;"	f
+pr_line	print.c	/^pr_line(order_t *o, int mode)$/;"	f
+pr_nation	print.c	/^pr_nation(code_t *c, int mode)$/;"	f
+pr_order	print.c	/^pr_order(order_t *o, int mode)$/;"	f
+pr_order_line	print.c	/^pr_order_line(order_t *o, int mode)$/;"	f
+pr_part	print.c	/^pr_part(part_t *part, int mode)$/;"	f
+pr_part_psupp	print.c	/^pr_part_psupp(part_t *part, int mode)$/;"	f
+pr_psupp	print.c	/^pr_psupp(part_t *part, int mode)$/;"	f
+pr_region	print.c	/^pr_region(code_t *c, int mode)$/;"	f
+pr_supp	print.c	/^pr_supp(supplier_t *supp, int mode)$/;"	f
+prep_direct	load_stub.c	/^prep_direct(void)$/;"	f
+prepositions	dss.h	/^EXTERN distribution prepositions;$/;"	v
+print_prep	print.c	/^print_prep(int table, int update)$/;"	f
+process_options	driver.c	/^process_options (int count, char **vector)$/;"	f
+process_options	qgen.c	/^process_options(int cnt, char **args)$/;"	f
+prog	qgen.c	/^char *prog;$/;"	v
+q13a	qgen.c	/^distribution q13a, q13b;$/;"	v
+q13b	qgen.c	/^distribution q13a, q13b;$/;"	v
+qnum	qgen.c	/^int qnum;$/;"	v
+qsub	qgen.c	/^qsub(char *qtag, int flags)$/;"	f
+qty	dsstypes.h	/^    long            qty;$/;"	m	struct:__anon13
+quantity	dsstypes.h	/^    long             quantity;$/;"	m	struct:__anon9
+quantity	dsstypes.h	/^    long            quantity;$/;"	m	struct:__anon10
+rdate	dsstypes.h	/^    char            rdate[DATE_LEN];$/;"	m	struct:__anon10
+read_dist	bm_utils.c	/^read_dist(char *path, char *name, distribution *target)$/;"	f
+refresh	dss.h	/^EXTERN int refresh;$/;"	v
+region_key	dsstypes.h	/^    int             region_key;$/;"	m	struct:__anon16
+region_key	dsstypes.h	/^    int             region_key;$/;"	m	struct:__anon7
+region_name	dsstypes.h	/^    char            region_name[C_REGION_NAME_LEN+1];$/;"	m	struct:__anon7
+region_name	dsstypes.h	/^    char            region_name[S_REGION_NAME_LEN+1];$/;"	m	struct:__anon16
+regions	dss.h	/^EXTERN distribution regions;$/;"	v
+resume	dss.h	/^EXTERN int resume;$/;"	v
+retailprice	dsstypes.h	/^    long           retailprice;$/;"	m	struct:__anon15
+revenue	dsstypes.h	/^    long           revenue;$/;"	m	struct:__anon9
+rflag	dsstypes.h	/^    char            rflag[1];$/;"	m	struct:__anon10
+rndm	qgen.c	/^long rndm;$/;"	v
+row_start	rnd.c	/^row_start(int t)	\\$/;"	f
+row_stop	rnd.c	/^row_stop(int t)	\\$/;"	f
+rowcnt	driver.c	/^long rowcnt = 0, minrow = 0, upd_num = 0;$/;"	v
+rowcnt	tpcd.h	/^int rowcnt;$/;"	v
+rowcnt_dflt	tpcd.h	/^int rowcnt_dflt[QUERIES_PER_SET + 1] = $/;"	v
+rpb_routine	build.c	/^rpb_routine(long p)$/;"	f
+s	dsstypes.h	/^    partsupp_t     s[SUPP_PER_PART];$/;"	m	struct:__anon15
+s_cnt	tpcd.h	/^EXTERN int s_cnt;$/;"	v
+scale	dss.h	/^EXTERN long scale;$/;"	v
+scost	dsstypes.h	/^    long            scost;$/;"	m	struct:__anon13
+sd_cust	speed_seed.c	/^sd_cust(int child, long skip_count)$/;"	f
+sd_line	speed_seed.c	/^sd_line(int child, long skip_count)$/;"	f
+sd_order	speed_seed.c	/^sd_order(int child, long skip_count)        $/;"	f
+sd_part	speed_seed.c	/^sd_part(int child, long skip_count)$/;"	f
+sd_psupp	speed_seed.c	/^sd_psupp(int child, long skip_count)$/;"	f
+sd_supp	speed_seed.c	/^sd_supp(int child, long skip_count)$/;"	f
+sdate	dsstypes.h	/^    char            sdate[DATE_LEN];$/;"	m	struct:__anon10
+season	dss.h	/^} season;$/;"	t	typeref:struct:__anon1
+seasons	build.c	/^season seasons[]={$/;"	v
+seed	permute.c	/^long seed;$/;"	v
+seed_t	dss.h	/^	} seed_t;$/;"	t	typeref:struct:SEED_T
+sellingseason	dsstypes.h	/^   char            sellingseason[D_SEASON_LEN + 1];$/;"	m	struct:__anon18
+set_files	driver.c	/^set_files (int i, int pload)$/;"	f
+set_member	dss.h	/^}         set_member;$/;"	t	typeref:struct:__anon3
+set_seeds	dss.h	/^EXTERN int	set_seeds;$/;"	v
+set_state	bm_utils.c	/^set_state(int table, long sf, long procs, long step, long *extra_rows)$/;"	f
+setup	qgen.c	/^setup(void)$/;"	f
+ship_priority	dsstypes.h	/^    long            ship_priority;$/;"	m	struct:__anon9
+shipinstruct	dsstypes.h	/^    char           shipinstruct[MAXAGG_LEN + 1];$/;"	m	struct:__anon10
+shipmode	dsstypes.h	/^    char            shipmode[O_SHIP_MODE_LEN + 1];$/;"	m	struct:__anon9
+shipmode	dsstypes.h	/^    char           shipmode[MAXAGG_LEN + 1];$/;"	m	struct:__anon10
+size	dsstypes.h	/^    long            size;$/;"	m	struct:__anon14
+size	dsstypes.h	/^    long           size;$/;"	m	struct:__anon15
+sizes	varsub.c	/^long sizes[50] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,$/;"	v
+slen	dsstypes.h	/^   int             slen;$/;"	m	struct:__anon18
+snum	qgen.c	/^int snum = -1;$/;"	v
+spawn_args	driver.c	/^char *spawn_args[25];$/;"	v
+spriority	dsstypes.h	/^    int             spriority;$/;"	m	struct:__anon11
+spriority	dsstypes.h	/^    long            spriority;$/;"	m	struct:__anon12
+start_day	dss.h	/^  int start_day;$/;"	m	struct:__anon1
+start_month	dss.h	/^  int start_month;$/;"	m	struct:__anon1
+stat	bm_utils.c	63;"	d	file:
+step	dss.h	/^EXTERN int  step;$/;"	v
+stop_proc	driver.c	/^stop_proc (int signum)$/;"	f
+stream	tpcd.h	/^EXTERN int stream;$/;"	v
+strip_comments	qgen.c	/^strip_comments(char *line)$/;"	f
+supp_cost	dsstypes.h	/^    long           supp_cost;$/;"	m	struct:__anon9
+suppkey	dsstypes.h	/^    long            suppkey;$/;"	m	struct:__anon10
+suppkey	dsstypes.h	/^    long            suppkey;$/;"	m	struct:__anon13
+suppkey	dsstypes.h	/^    long            suppkey;$/;"	m	struct:__anon16
+suppkey	dsstypes.h	/^    long            suppkey;$/;"	m	struct:__anon17
+suppkey	dsstypes.h	/^    long            suppkey;$/;"	m	struct:__anon9
+supplier_t	dsstypes.h	/^}               supplier_t;$/;"	t	typeref:struct:__anon16
+supplier_t	dsstypes.h	/^}               supplier_t;$/;"	t	typeref:struct:__anon17
+table	dss.h	/^	long table;$/;"	m	struct:SEED_T
+table	dss.h	/^EXTERN long table;$/;"	v
+tax	dsstypes.h	/^    long            tax;$/;"	m	struct:__anon10
+tax	dsstypes.h	/^    long           tax;$/;"	m	struct:__anon9
+tbl_open	bm_utils.c	/^tbl_open(int tbl, char *mode)$/;"	f
+tdef	dss.h	/^}         tdef;$/;"	t	typeref:struct:__anon5
+tdefs	driver.c	/^tdef tdefs[] =$/;"	v
+tdefs	permute.c	/^tdef tdefs = { NULL };$/;"	v
+tdefs	qgen.c	/^tdef tdefs = { NULL };$/;"	v
+tdefs	text.c	/^tdef tdefs = { NULL };$/;"	v
+terminators	dss.h	/^EXTERN distribution terminators;$/;"	v
+text	dss.h	/^   char     *text;$/;"	m	struct:__anon3
+text	dsstypes.h	/^    char            *text;$/;"	m	struct:__anon20
+tfile	tpcd.h	/^EXTERN char *tfile;$/;"	v
+timekey	dsstypes.h	/^    long            timekey;$/;"	m	struct:__anon19
+tlen	dsstypes.h	/^    int            tlen;$/;"	m	struct:__anon14
+tlen	dsstypes.h	/^    int            tlen;$/;"	m	struct:__anon15
+totalprice	dsstypes.h	/^    int             totalprice;$/;"	m	struct:__anon11
+totalprice	dsstypes.h	/^    long            totalprice;$/;"	m	struct:__anon12
+txt_np	text.c	/^txt_np(char *dest, int sd) $/;"	f	file:
+txt_sentence	text.c	/^txt_sentence(char *dest, int sd) $/;"	f	file:
+txt_vp	text.c	/^txt_vp(char *dest, int sd) $/;"	f	file:
+type	dsstypes.h	/^    char           type[P_TYPE_LEN + 1];$/;"	m	struct:__anon15
+type	dsstypes.h	/^    char           type[P_TYPE_MAX + 1];$/;"	m	struct:__anon14
+unjulian	bm_utils.c	/^unjulian(long date)$/;"	f
+upd_num	driver.c	/^long rowcnt = 0, minrow = 0, upd_num = 0;$/;"	v
+updates	dss.h	/^EXTERN long updates;$/;"	v
+usage	driver.c	/^usage (void)$/;"	f
+usage	dss.h	/^	long usage;$/;"	m	struct:SEED_T
+usage	qgen.c	/^usage(void)$/;"	f
+validate	dss.h	/^EXTERN int  validate;$/;"	v
+value	dss.h	/^	long value;$/;"	m	struct:SEED_T
+varsub	varsub.c	/^varsub(int qnum, int vnum, int flags)$/;"	f
+verbose	dss.h	/^EXTERN long verbose;$/;"	v
+verbs	dss.h	/^EXTERN distribution verbs;$/;"	v
+verify	dss.h	/^   int       (*verify) ();$/;"	m	struct:__anon5
+vp	dss.h	/^EXTERN distribution vp;$/;"	v
+vrf_cust	print.c	/^vrf_cust(customer_t *c, int mode)$/;"	f
+vrf_date	print.c	/^int vrf_date(date_t * d, int mode)$/;"	f
+vrf_line	print.c	/^vrf_line(order_t *o, int mode)$/;"	f
+vrf_nation	print.c	/^vrf_nation(code_t *c, int mode)$/;"	f
+vrf_order	print.c	/^vrf_order(order_t *o, int mode)$/;"	f
+vrf_order_line	print.c	/^vrf_order_line(order_t *o, int mode)$/;"	f
+vrf_part	print.c	/^vrf_part(part_t *part, int mode)$/;"	f
+vrf_part_psupp	print.c	/^vrf_part_psupp(part_t *part, int mode)$/;"	f
+vrf_psupp	print.c	/^vrf_psupp(part_t *part, int mode)$/;"	f
+vrf_region	print.c	/^vrf_region(code_t *c, int mode)$/;"	f
+vrf_supp	print.c	/^vrf_supp(supplier_t *supp, int mode)$/;"	f
+vtotal	dss.h	/^   unsigned long vtotal;$/;"	m	struct:__anon5
+week	dsstypes.h	/^    long            week;$/;"	m	struct:__anon19
+weekday_names	build.c	/^char * weekday_names[]={"Sunday","Monday","Tuesday","Wednesday","Thursday","Friday","Saturday"};$/;"	v
+weekdayfl	dsstypes.h	/^   char            weekdayfl[2];$/;"	m	struct:__anon18
+weeknuminyear	dsstypes.h	/^   int             weeknuminyear;$/;"	m	struct:__anon18
+weight	dss.h	/^   long      weight;$/;"	m	struct:__anon3
+xxx	history.html	/^<\/ul><li><a name="xxx">Changes as of 01\/19\/95<\/a><ul>$/;"	a
+xxx	history.html	/^<\/ul><li><a name="xxx">Changes as of 02\/21\/95<\/a><ul>$/;"	a
+xxx	history.html	/^<\/ul><li><a name="xxx">Changes as of 03\/13\/95<\/a><ul>$/;"	a
+xxx	history.html	/^<\/ul><li><a name="xxx">Changes as of 05\/08\/95<\/a><ul>$/;"	a
+xxx	history.html	/^<\/ul><li><a name="xxx">Changes as of 09\/01\/95<\/a><ul>$/;"	a
+xxx	history.html	/^<\/ul><li><a name="xxx">Changes as of 10\/09\/92:<\/a><ul>$/;"	a
+xxx	history.html	/^<\/ul><li><a name="xxx">Changes as of 10\/11\/94<\/a><ul>$/;"	a
+xxx	history.html	/^<\/ul><li><a name="xxx">Changes as of 10\/15\/93<\/a><ul>$/;"	a
+xxx	history.html	/^<\/ul><li><a name="xxx">Changes as of 10\/21\/92<\/a><ul>$/;"	a
+xxx	history.html	/^<\/ul><li><a name="xxx">Changes as of 10\/25\/94<\/a><ul>$/;"	a
+xxx	history.html	/^<\/ul><li><a name="xxx">Changes as of 11\/18\/92<\/a><ul>$/;"	a
+xxx	history.html	/^<\/ul><li><a name="xxx">Changes as of 12\/06\/94<\/a><ul>$/;"	a
+xxx	history.html	/^<\/ul><li><a name="xxx">Changes as of 12\/19\/95<\/a><ul>$/;"	a
+xxx	history.html	/^<\/ul><li><a name="xxx">Changes as of 12\/3\/93<\/a><ul>$/;"	a
+xxx	history.html	/^<\/ul><li><a name="xxx">Changes as of 1\/12\/92<\/a><ul>$/;"	a
+xxx	history.html	/^<\/ul><li><a name="xxx">Changes as of 1\/23\/96<\/a><ul>$/;"	a
+xxx	history.html	/^<\/ul><li><a name="xxx">Changes as of 2\/16\/94<\/a><ul>$/;"	a
+xxx	history.html	/^<\/ul><li><a name="xxx">Changes as of 2\/26\/93<\/a><ul>$/;"	a
+xxx	history.html	/^<\/ul><li><a name="xxx">Changes as of 3\/16\/94<\/a><ul>$/;"	a
+xxx	history.html	/^<\/ul><li><a name="xxx">Changes as of 3\/17\/94<\/a><ul>$/;"	a
+xxx	history.html	/^<\/ul><li><a name="xxx">Changes as of 3\/2\/94<\/a><ul>$/;"	a
+xxx	history.html	/^<\/ul><li><a name="xxx">Changes as of 4\/25\/94<\/a><ul>$/;"	a
+xxx	history.html	/^<\/ul><li><a name="xxx">Changes as of 5\/1\/96<\/a><ul>$/;"	a
+xxx	history.html	/^<\/ul><li><a name="xxx">Changes as of 6\/15\/94<\/a><ul>$/;"	a
+xxx	history.html	/^<\/ul><li><a name="xxx">Changes as of 7\/26\/93<\/a><ul>$/;"	a
+xxx	history.html	/^<\/ul><li><a name="xxx">Changes as of 8\/1\/96<\/a><ul>$/;"	a
+xxx	history.html	/^<\/ul><li><a name="xxx">Changes as of 8\/24\/94<\/a><ul>$/;"	a
+xxx	history.html	/^<\/ul><li><a name="xxx">Changes as of 8\/31\/94<\/a><ul>$/;"	a
+xxx	history.html	/^<\/ul><li><a name="xxx">Changes as of 9\/09\/92:<\/a><ul>$/;"	a
+xxx	history.html	/^<\/ul><li><a name="xxx">Changes as of 9\/09\/94<\/a><ul>$/;"	a
+xxx	history.html	/^<\/ul><li><a name="xxx">Changes as of 9\/12\/94<\/a><ul>$/;"	a
+xxx	history.html	/^<\/ul><li><a name="xxx">Changes as of 9\/15\/94<\/a><ul>$/;"	a
+xxx	history.html	/^<\/ul><li><a name="xxx">Changes as of 9\/21\/94<\/a><ul>$/;"	a
+xxx	history.html	/^<\/ul><li><a name="xxx">Changes as of 9\/27\/94<\/a><ul>$/;"	a
+year	dsstypes.h	/^    long            year;$/;"	m	struct:__anon19
+year	dsstypes.h	/^   int             year;$/;"	m	struct:__anon18
+yearmonth	dsstypes.h	/^   char            yearmonth[D_YEARMONTH_LEN+1];$/;"	m	struct:__anon18
+yearmonthnum	dsstypes.h	/^   int             yearmonthnum;$/;"	m	struct:__anon18
+yes_no	bm_utils.c	/^yes_no(char *prompt)$/;"	f
diff --git a/data/ssb/dbgen/text.c b/data/ssb/dbgen/text.c
new file mode 100644
index 0000000..ef4df3c
--- /dev/null
+++ b/data/ssb/dbgen/text.c
@@ -0,0 +1,313 @@
+/* @(#)text.c	2.1.8.1 */
+/*
+ * text.c --- pseudo text generator for use in DBGEN 2.0
+ *
+ * Defined Routines:
+ *		dbg_text() -- select and translate a sentence form
+ */
+
+#ifdef TEST
+#define DECLARER
+#endif /* TEST */
+
+#include "config.h"
+#include <stdlib.h>
+#if (defined(_POSIX_)||!defined(WIN32))		/* Change for Windows NT */
+/*#include <unistd.h>
+#include <sys/wait.h>*/
+#endif /* WIN32 */
+#include <stdio.h>				/* */
+#include <limits.h>
+#include <math.h>
+#include <ctype.h>
+#include <signal.h>
+#include <string.h>
+#include <errno.h>
+#ifdef HP
+#include <strings.h>
+#endif
+#if (defined(WIN32)&&!defined(_POSIX_))
+#include <process.h>
+#pragma warning(disable:4201)
+#pragma warning(disable:4214)
+#pragma warning(disable:4514)
+#define WIN32_LEAN_AND_MEAN
+#define NOATOM
+#define NOGDICAPMASKS
+#define NOMETAFILE
+#define NOMINMAX
+#define NOMSG
+#define NOOPENFILE
+#define NORASTEROPS
+#define NOSCROLL
+#define NOSOUND
+#define NOSYSMETRICS
+#define NOTEXTMETRIC
+#define NOWH
+#define NOCOMM
+#define NOKANJI
+#define NOMCX
+#include <windows.h>
+#pragma warning(default:4201)
+#pragma warning(default:4214)
+#endif
+
+#include "dss.h"
+#include "dsstypes.h"
+
+/*
+ * txt_vp() --
+ *		generate a verb phrase: pick a form from the vp distribution, then
+ *		replace each blank-separated token with a word selected by its first
+ *		letter (D = adverbs, V = verbs, X = auxillaries); a second character
+ *		in a token is emitted right after the word, and one blank follows
+ *		every word
+ *
+ *	dest receives the text (no NUL is stored after the final blank); sd is
+ *	passed unchanged to pick_str()
+ *	Returns: characters generated; Called By: txt_sentence(); Calls: pick_str()
+ */
+static int
+txt_vp(char *dest, int sd)
+{
+	char syntax[MAX_GRAMMAR_LEN + 1],
+		*cptr,
+		*parse_target;
+	distribution *src;
+	int i,
+		res = 0;
+
+	/* strtok() keeps hidden state, so the caller must not be mid-strtok() */
+	pick_str(&vp, sd, &syntax[0]);
+	parse_target = syntax;
+	while ((cptr = strtok(parse_target, " ")) != NULL)
+	{
+		src = NULL;	/* NOTE(review): stays NULL for any other letter */
+		switch(*cptr)
+		{
+		case 'D':
+			src = &adverbs;
+			break;
+		case 'V':
+			src = &verbs;
+			break;
+		case 'X':
+			src = &auxillaries;
+			break;
+		}	/* end of POS switch statement */
+		i = pick_str(src, sd, dest);	/* presumably stores the word at dest */
+		i = strlen(DIST_MEMBER(src, i));
+		dest += i;
+		res += i;
+		if (*(++cptr))	/* miscellaneous filigree, like punctuation */
+		{
+			*dest = *cptr;	/* store first, then advance, as txt_np() does */
+			dest += 1;
+			res += 1;
+		}
+		*dest = ' ';
+		dest++;
+		res++;
+		parse_target = NULL;
+	}	/* end of while loop */
+
+	return(res);
+}
+
+/*
+ * txt_np() --
+ *		build a noun phrase
+ *		1) draw a phrase form from the np distribution
+ *		2) walk its blank-separated tokens
+ *		3) for each token draw a word: A = articles, J = adjectives,
+ *		   D = adverbs, N = nouns
+ *		4) append the token's second character, if any, then a blank
+ *
+ *	Returns: number of characters written to dest
+ *	Called By: txt_sentence(); Calls: pick_str()
+ */
+static int
+txt_np(char *dest, int sd)
+{
+	char form[MAX_GRAMMAR_LEN + 1];
+	char *tok;
+	distribution *pool;
+	int idx;
+	int wlen;
+	int total = 0;
+
+	/* draw the phrase form whose tokens are walked below */
+	pick_str(&np, sd, form);
+	for (tok = strtok(form, " "); tok != NULL; tok = strtok(NULL, " "))
+	{
+		/* first letter of the token names the word list */
+		if (*tok == 'A')
+			pool = &articles;
+		else if (*tok == 'J')
+			pool = &adjectives;
+		else if (*tok == 'D')
+			pool = &adverbs;
+		else if (*tok == 'N')
+			pool = &nouns;
+		/* no list for any other letter */
+		else
+			pool = NULL;
+
+		/* pick_str() is handed dest as its target; its return value is
+		 * used as the member index whose text length is measured */
+		idx = pick_str(pool, sd, dest);
+		wlen = strlen(DIST_MEMBER(pool, idx));
+		dest += wlen;
+		total += wlen;
+
+		/* a second character in the token is punctuation for the word */
+		if (tok[1] != '\0')
+		{
+			*dest++ = tok[1];
+			total++;
+		}
+
+		/* every word is followed by one blank */
+		*dest++ = ' ';
+		total++;
+	}
+
+	/* total counts every character stored, trailing blank included */
+	return total;
+}
+
+/*
+ * txt_sentence() --
+ *		generate a sentence by
+ *		1) selecting a sentence form from the grammar distribution
+ *		2) scanning the form by hand; each code letter appends one piece:
+ *		   V = txt_vp(), N = txt_np(), P = preposition + " the " + txt_np(),
+ *		   T = terminator, handed dest - 1 so it abuts the previous word
+ *		3) storing a NUL after the last piece
+ *
+ *	Returns: res - 1, where res sums the lengths of the appended pieces
+ *	Called By: dbg_text(); Calls: pick_str(), txt_np(), txt_vp()
+ */
+static int
+txt_sentence(char *dest, int sd) 
+{
+	char syntax[MAX_GRAMMAR_LEN + 1],
+		*cptr;
+	int i,
+		res = 0,
+		len = 0;
+
+	/* cptr walks the form by hand because txt_np()/txt_vp() use strtok() */
+	pick_str(&grammar, sd, syntax);
+	cptr = syntax;
+
+next_token:	/* I hate goto's, but can't seem to have parent and child use strtok() */
+	while (*cptr && *cptr == ' ')
+		cptr++;
+	if (*cptr == '\0')
+		goto done;
+	switch(*cptr)
+		{
+		case 'V':
+			len = txt_vp(dest, sd);
+			break;
+		case 'N': 
+			len = txt_np(dest, sd);
+			break;
+		case 'P':
+			i = pick_str(&prepositions, sd, dest);
+			len = strlen(DIST_MEMBER(&prepositions, i));
+			strcpy((dest + len), " the ");
+			len += 5;	/* 5 == strlen(" the ") */
+			len += txt_np(dest + len, sd);
+			break;
+		case 'T':
+			i = pick_str(&terminators, sd, --dest); /*terminators should abut previous word */
+			len = strlen(DIST_MEMBER(&terminators, i));
+			break;
+		}	/* end of POS switch statement */
+		dest += len;	/* len keeps its previous value when no case matched */
+		res += len;
+		cptr++;
+		if (*cptr && *cptr != ' ')	/* miscellaneous filigree, like punctuation */
+		{
+			dest += 1;	/* NOTE(review): advances before storing, unlike txt_np() */
+			res += 1;
+			*dest = *cptr;	/* NOTE(review): cptr is not advanced, so next_token sees this character again */
+		}
+		goto next_token;
+done:
+	*dest = '\0';
+	return(--res);	/* presumably compensates for the --dest in the 'T' case; confirm */
+}
+
+/*
+ * dbg_text() --
+ *		write ELIZA-like sentences of random, bounded total length to tgt,
+ *		truncating the last one as required; returns the characters written
+ */
+int
+dbg_text(char *tgt, int min, int max, int sd)
+{
+	char piece[MAX_SENT_LEN + 1];
+	long goal = 0;
+	int produced = 0;
+	int room;
+	int plen;
+
+	RANDOM(goal, min, max, sd);
+
+	while (produced < goal)
+	{
+		plen = txt_sentence(piece, sd);
+		if (plen < 0)
+			INTERNAL_ERROR("Bad sentence formation");
+		room = goal - produced;
+		if (room < plen + 1)
+		{
+			/* not enough room: keep only the first room characters */
+			piece[room] = '\0';
+			strcpy(tgt, piece);
+			tgt += room;
+			produced += room;
+			continue;
+		}
+
+		/* the whole sentence fits: copy it and follow it with a blank */
+		strcpy(tgt, piece);
+		tgt[plen] = ' ';
+		tgt += plen + 1;
+		produced += plen + 1;
+	}
+	*tgt = '\0';
+	return produced;
+}
+
+#ifdef TEST
+tdef tdefs = { NULL };
+/* stand-alone driver (TEST builds only): load every word list, then print generated text forever */
+int main(void)
+{
+	char prattle[401];	/* dbg_text() is asked for at most 400 characters, plus the NUL */
+
+	read_dist (env_config (DIST_TAG, DIST_DFLT), "nouns", &nouns);
+	read_dist (env_config (DIST_TAG, DIST_DFLT), "verbs", &verbs);
+	read_dist (env_config (DIST_TAG, DIST_DFLT), "adjectives", &adjectives);
+	read_dist (env_config (DIST_TAG, DIST_DFLT), "adverbs", &adverbs);
+	read_dist (env_config (DIST_TAG, DIST_DFLT), "auxillaries", &auxillaries);
+	read_dist (env_config (DIST_TAG, DIST_DFLT), "terminators", &terminators);
+	read_dist (env_config (DIST_TAG, DIST_DFLT), "articles", &articles);
+	read_dist (env_config (DIST_TAG, DIST_DFLT), "prepositions", &prepositions);
+	read_dist (env_config (DIST_TAG, DIST_DFLT), "grammar", &grammar);
+	read_dist (env_config (DIST_TAG, DIST_DFLT), "np", &np);
+	read_dist (env_config (DIST_TAG, DIST_DFLT), "vp", &vp);
+
+	while (1)
+	{
+		dbg_text(&prattle[0], 300, 400, 0);
+		printf("<%s>\n", prattle);
+	}
+
+	return(0);	/* not reached: the loop above never exits */
+}
+#endif /* TEST */
diff --git a/data/ssb/dbgen/text.o b/data/ssb/dbgen/text.o
new file mode 100644
index 0000000000000000000000000000000000000000..d9e94856382f27633fc2edf69a31a7017c0d9ba3
GIT binary patch
literal 4312
zcmbtXU2GIp6u#5lO66y#qCk<N8#9^`?WT*Uh%4!|({dMvra_7o!DhNUr3>5Lb#@Bn
zClIoQbg~W*eIfBd;uC7(gHa+HwIFSSK1kvVfdmW>*jgenBue;O&zYG^FPn}rdXl?y
z?>FB$_uO;NJ)Q1rQClknfiMxsN;0)#C?TD5rsj>#w2_3!93sCiZ;(6W^&PL#!nNgb
z>yBkgvy-$Ke7X);)Ka9n=rFZ6NsFj;f!eR4>Nx($-{3E`E~_r6+Akn5xj5GLHBRE9
za%r+{v?8q9-GXBG3g010U6x~kS|jwpu$dK4v=G(09b0Wax$iW!7b7nwn{#7|xA@Z0
z6kE=#)|Dc-2b;PEj?z=V1ZkkTiWUc|Tqto)6|N}4xOif%FnCrR++8Y_hGx>jX`wJI
zO!UEu7U4`u`mB4x{<(@ZP77z2zqwi7ByX1AlDoclEZq>FxWxr$|FvESE4B7Wb(SjC
za1k1mI^hijDRGu2PSeCaYW+a1QSXR~6ox~Jht5<GGNwqic$7(99-Tpoo1|c2QV@qa
z;T8B4R*HM1T4l-2m(PobdH|!gDYdI`;5ya%%RAjgRk#7%pe&3j;7cvFn*}FHqk%9@
zfPMN3SQ72cj{bV-MNqHd)24#am)zWeJ3(|uwf9Kv6P0j?*aq>yPk>N6Ej3f?H>H@A
zY83100|u<yEcMq_D3<CL>*{1R<eAzVr_XcTI)iyOW${E%qINy*8QWkkX46hgwOdVg
zCBCGhG7i<ISbtLMx_fXh(=88VnhF;n(_mJWoLujK>_V1@RtwZ#CrtFh62n&=PmIJN
zUP4+57F8*T1yCiPAjdD8V&Rkk=?uz4tAw@oJ|SKh74T41V6?Rq3!A?W4vT|r_e!PM
zdUKa}_#)2Q%uTWO{*P7b7NoGoy2M9^dXBp1U{x81N<XR!H^)fW?A3e58iQdX7e*@M
ziMy&$vM!Bpzf&s7!pOura2YeETBY%B#H^8tm4L+y*TJt_<;nDRbMF)lv`$3_6)QoJ
z7M*+l)qDu+x7ZhWXT6#_KdRKWvucC-#?3JeSd(!7YvaOY73+TD{kx?St6}FBje@<R
zTmMrRYCh=I(*0-fzLa08Flo|~;k=$P^-Mw!_hgMe%}ix8gsf_AUJ<VA=<d&${o!ad
zvMkb2AMJOR(Y;HTMjDna@nCd~5Ukv47#QePZ5;&aeHgI-43x**2N8CyBf>znuxR#-
zsw3bL1>CuW@LY%iX$n;zu4oF?911jr!gjDJRQFD$9BMfDkQ`cGn4yGn6}M-GmIES(
z>VN@E&}a%($+Jj1sQ1D^>A;O!|IyZ}(6K<EA>e1tG2{D#t_H~;rRfNPX*zxt5Gy#`
zIV)hIW56lT2TZx~s>C=xAhThGaW0pI08EEJ<%8p`pN{{$55B?&Z}!30_~7e&@C4v<
zAwDcFPW}K(`|wR4e76r?@WESr@Pj`1yFT~_KKLg-_-8)&Ngte;`hXcp5Ocun%H&8y
zZEnDyD9NP~+q&|mLA2ynJz=J{>v@vN_Gj`0Kr_3IXoi_eqyg2E+jXPc!P9z%bb*!z
zx{2OxT|HVVO_JSPx^NfLkxBK$GbS2H<aUxRhSt}o8Dx8oX#E4JbXqe~U@m9qxokd#
z?II7}8GWgYW@ZhvpVSS5^z1NFrcSi(tO3btfODZ>JwF5knyDksM+&U5SIppW%nJ(g
z@$N9biXaH$55r)%fglLtXp7-H5ClPNCJctZhad>@XTf0jF_@zuj(3gW&NmON%l-Tj
zFcieGwKM)aY!D!bhhQ*#BZuStWB5lL{s@Qv>fz|mTn@j+@%gybArS;0kHTQ~>pi^O
z|BW0U-!Xm{$LHgg=lIy87=JH^&*ShT9L~q{L(V?${}&w2$FmjtIS9VSKb~ygH6C7m
zuFf((zDLnGK5zeXj=unS%>GG^&-*#*!~dD%^Y*WB{2I>w4Gzb@0&L%#oE_}#48P0a
zk8}6}?|}H=>t`#6^K}>o9ty_O3%Rq#ev86z){jXfzq8NOy5R|J#d-F!nW3k(h|@VD
zNzK$qq&uG{5hIHWq(>uB5=mtH`rtc*L^4@ZkH9BFz0-cY^_EP3BxhuEx?%1FRWzvg
zcJ)B#(OrN3?{63k;FZDN!%L&L$gl$O*538HPhA8vq%__~+(`{KA&?_pUa*&-F7_-G
z%r(k0Fc9}>2s!d;*}e=f-~Tyyfj0lEeiU@k=295Eu>G;dcwzgqXCu7CIqTz$JLdV1
z{_(=}*fS0<*_`Ly_?PE@1C;Ro@wJL}ZhHD*TyVnT&!D$qg8P^2*WjfCVg21*P34cR
z@m}CojSTRf$7j0}78lGvmIT|M<$t=pQS+R)|B&Yxn!_3`*GHS>xd(`JdPL{Hc?3Ku
Uk2Us$>GaQg{^NHOFXj4w1F)xQsQ>@~

literal 0
HcmV?d00001

diff --git a/data/ssb/dbgen/tpcd.h b/data/ssb/dbgen/tpcd.h
new file mode 100644
index 0000000..31b5f98
--- /dev/null
+++ b/data/ssb/dbgen/tpcd.h
@@ -0,0 +1,103 @@
+/*****************************************************************
+ *  Title: tpcd.h for TPC D
+ *  Sccsid: @(#)tpcd.h	2.1.8.1 
+ *  Description:
+ *  option flag bits, per-DBMS SQL fragments, size limits and shared globals
+ *
+ *****************************************************************
+ */
+#define DFLT            0x0001	/* single-bit option masks start here; varsub() uses defaults[] when DFLT is set */
+#define OUTPUT          0x0002
+#define EXPLAIN         0x0004
+#define DBASE           0x0008
+#define VERBOSE         0x0010
+#define TIMING          0x0020
+#define LOG             0x0040	/* varsub() appends the query number and its parameters to lfile */
+#define QUERY           0x0080
+#define REFRESH         0x0100
+#define ANSI            0x0200
+#define SEED            0x0400
+#define COMMENT         0x0800
+#define INIT            0x1000
+#define TERMINATE       0x2000
+#define DFLT_NUM        0x4000
+
+/*
+ * general defines
+ */
+#define VTAG            ':'          /* flags a variable substitution */
+#define ofp             stdout       /* make the routine a filter */
+#define QDIR_TAG        "DSS_QUERY"  /* variable to point to queries */
+#define QDIR_DFLT       "."          /* and its default */
+
+/*
+ * database portability defines: each group below is compiled only when its macro (DB2, INFORMIX, SQLSERVER, SYBASE, TDAT) is defined
+ */
+#ifdef DB2
+#define GEN_QUERY_PLAN  "SET CURRENT EXPLAIN SNAPSHOT ON;"
+#define START_TRAN      ""
+#define END_TRAN        "COMMIT WORK;"
+#define SET_OUTPUT      ""
+#define SET_ROWCOUNT    "--#SET ROWS_FETCH %d\n"	/* format string with one %d conversion */
+#define SET_DBASE       "CONNECT TO %s ;\n"	/* format string with one %s conversion */
+#endif
+
+#ifdef INFORMIX
+#define GEN_QUERY_PLAN  "SET EXPLAIN ON;"
+#define START_TRAN      "BEGIN WORK;"
+#define END_TRAN        "COMMIT WORK;"
+#define SET_OUTPUT      "OUTPUT TO "
+#define SET_ROWCOUNT    "FIRST %d"
+#define SET_DBASE       "database %s ;\n"
+#endif
+
+#ifdef 	SQLSERVER
+#define GEN_QUERY_PLAN  "set showplan on\nset noexec on\ngo\n"
+#define START_TRAN      "begin transaction\ngo\n"
+#define END_TRAN        "commit transaction\ngo\n"
+#define SET_OUTPUT      ""
+#define SET_ROWCOUNT    "set rowcount %d\ngo\n\n"
+#define SET_DBASE       "use %s\ngo\n"
+#endif
+
+#ifdef 	SYBASE
+#define GEN_QUERY_PLAN  "set showplan on\nset noexec on\ngo\n"
+#define START_TRAN      "begin transaction\ngo\n"
+#define END_TRAN        "commit transaction\ngo\n"
+#define SET_OUTPUT      ""
+#define SET_ROWCOUNT    "set rowcount %d\ngo\n\n"
+#define SET_DBASE       "use %s\ngo\n"
+#endif
+
+#ifdef TDAT
+#define GEN_QUERY_PLAN  "EXPLAIN"
+#define START_TRAN      "BEGIN TRANSACTION"
+#define END_TRAN        "END TRANSACTION"
+#define SET_OUTPUT      ".SET FORMAT OFF\n.EXPORT REPORT file="
+#define SET_ROWCOUNT    ".SET RETCANCEL ON\n.SET RETLIMIT %d\n"
+#define SET_DBASE       ".LOGON %s\n"
+#endif
+
+#define MAX_VARS      8 /* max number of host vars in any query */
+#define QLEN_MAX   2048 /* max length of any query */
+#define QUERIES_PER_SET 22	/* rowcnt_dflt[] below has QUERIES_PER_SET + 1 entries */
+#define MAX_PIDS 50
+/* EXTERN is not defined in this header; it must be provided before inclusion */
+EXTERN int flags;	/* NOTE(review): presumably holds the option bits defined above; confirm */
+EXTERN int s_cnt;
+EXTERN char *osuff;
+EXTERN int stream;
+EXTERN char *lfile;	/* path that varsub() opens in append mode when LOG is set */
+EXTERN char *ifile;
+EXTERN char *tfile;
+
+#define MAX_PERMUTE     41	/* modulus applied to the stream number by SEQUENCE() */
+#ifdef DECLARER	/* the one translation unit that defines DECLARER owns the storage */
+int rowcnt_dflt[QUERIES_PER_SET + 1] = 
+    {-1,-1,100,10,-1,-1,-1,-1,-1,-1,20,-1,-1,-1,-1,-1,-1,-1,100,-1,-1,100,-1};
+int rowcnt;
+#define SEQUENCE(stream, query) permutation[(stream) % MAX_PERMUTE][(query) - 1]	/* arguments parenthesized so expressions index correctly */
+#else
+extern int rowcnt_dflt[];
+extern int rowcnt;
+#endif
diff --git a/data/ssb/dbgen/varsub.c b/data/ssb/dbgen/varsub.c
new file mode 100644
index 0000000..36adf91
--- /dev/null
+++ b/data/ssb/dbgen/varsub.c
@@ -0,0 +1,314 @@
+/* Sccsid:     @(#)varsub.c	2.1.8.3 */
+#include <stdio.h>
+#ifndef _POSIX_SOURCE
+#include <malloc.h>
+#endif /* POSIX_SOURCE */
+#if (defined(_POSIX_)||!defined(WIN32))
+#ifndef DOS
+#include <unistd.h>
+#endif
+#endif /* WIN32 */
+#include <string.h>
+#include "config.h"
+#include "dss.h"
+#include "tpcd.h"
+#ifdef ADHOC
+#include "adhoc.h"
+extern adhoc_t adhocs[];
+#endif /* ADHOC */
+
+#define MAX_PARAM	10		/* maximum number of parameter substitutions in a query */
+
+extern long Seed[];
+extern char **asc_date;
+extern double flt_scale;
+extern distribution q13a, q13b;
+long *permute(long *set, int cnt, long stream);
+
+long brands[25] = {11,12,13,14,15,21,22,23,24,25,31,32,33,34,35,
+			41,42,43,44,45,51,52,53,54,55};	/* not referenced elsewhere in this file */
+long sizes[50] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,
+				21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,
+				41,42,43,44,45,46,47,48,49,50};	/* handed to permute() by varsub() for query 16 */
+long ccode[25] = {10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34};	/* handed to permute() by varsub() for query 22 */
+char *defaults[24][11] =	/* indexed [qnum - 1][vnum - 1]; used by varsub() when DFLT is set, NULL ends a row */
+{
+    {"90",              NULL,                   NULL,
+        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL},  /* 1 */
+    {"15",              "BRASS",                "EUROPE",
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL},  /* 2 */
+    {"BUILDING",        "1995-03-15",           NULL,
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL},  /* 3 */
+    {"1993-07-01",      NULL,                   NULL,
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL},  /* 4 */
+    {"ASIA",            "1994-01-01",           NULL,
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL},  /* 5 */
+    {"1994-01-01",      ".06",                  "24",
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL},  /* 6 */
+    {"FRANCE",          "GERMANY",              NULL,
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL},  /* 7 */
+    {"BRAZIL",          "AMERICA",      "ECONOMY ANODIZED STEEL",
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL},/* 8 */
+    {"green",         NULL,                   NULL,
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL},  /* 9 */
+    {"1993-10-01",      NULL,                   NULL,
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL},  /* 10 */
+    {"GERMANY",         "0.0001",                 NULL,
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL},  /* 11 */
+    {"MAIL",            "SHIP",                 "1994-01-01",
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL},  /* 12 */
+    {"special", "requests",                   NULL,
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL},  /* 13 */
+    {"1995-09-01",      NULL,                   NULL,
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL},  /* 14 */
+    {"1996-01-01",      NULL,                   NULL,
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL},  /* 15 */
+    {"Brand#45",        "MEDIUM POLISHED", "49",
+	"14","23","45","19","3","36","9", NULL}, /* 16 */
+    {"Brand#23",        "MED BOX",               NULL,
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL},  /* 17 */
+    {"300", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL},  /* 18 */
+    {"Brand#12", "Brand#23", "Brand#34", "1", "10", "20", NULL, NULL, NULL, NULL, NULL}, /* 19 */
+    {"forest", "1994-01-01", "CANADA", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL},  /* 20 */
+    {"SAUDI ARABIA", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL},  /* 21 */
+    {"13","31","23", "29", "30", "18", "17", NULL, NULL, NULL, NULL},  /* 22 */
+    {NULL,NULL,NULL,NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL},  /* UF1 */
+    {NULL,NULL,NULL,NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL},  /* UF2 */
+};
+/*
+ * varsub() -- supply the substitution parameters of query qnum
+ *	vnum == 0: unless DFLT is set, generate a fresh parameter set for qnum;
+ *		then, if LOG is set, append qnum and its parameters to lfile
+ *	vnum != 0: write parameter vnum (the default one when DFLT is set) to ofp
+ *	Bad requests are reported on stderr; nothing is returned
+ */
+void
+varsub(int qnum, int vnum, int flags)
+{
+    static char param[11][128];	/* generated parameters; slot 0 is never used */
+    static FILE *lfp = NULL;	/* log stream, opened once and left open */
+	long *lptr;
+    char *ptr;
+    int i = 0, tmp_date;
+    long tmp1, tmp2;
+
+    if (vnum == 0)
+    {
+		if ((flags & DFLT) == 0)
+		{
+			switch(qnum)
+			{
+			case 1:
+				sprintf(param[1], "%ld", (long)UnifInt((long)60,(long)120,(long)qnum));
+				param[2][0] = '\0';
+				break;
+			case 2:
+				sprintf(param[1], "%ld",
+					(long)UnifInt((long)P_SIZE_MIN, (long)P_SIZE_MAX, qnum));
+				pick_str(&p_types_set, qnum, param[3]);
+				ptr = strrchr(param[3], ' ');	/* keep only the last word */
+				ptr = (ptr != NULL) ? ptr + 1 : param[3];
+				strcpy(param[2], ptr);
+				pick_str(&regions, qnum, param[3]);
+				param[4][0] = '\0';
+				break;
+			case 3:
+				pick_str(&c_mseg_set, qnum, param[1]);
+				/* pick a random offset within the month of March and add the
+				 * magic number that positions asc_date at the start of March '95
+				 */
+				tmp_date = UnifInt((long)0, (long)30, (long)qnum);
+				strcpy(param[2], *(asc_date + tmp_date + 1155));
+				param[3][0] = '\0';
+				break;
+			case 4:
+				tmp_date = UnifInt(1,58,qnum);
+				sprintf(param[1],"19%02d-%02d-01", 
+					93 + tmp_date/12, tmp_date%12 + 1);
+				param[2][0] = '\0';
+				break;
+			case 5:
+				pick_str(&regions, qnum, param[1]);
+				tmp_date = UnifInt((long)93,(long)97,(long)qnum);
+				sprintf(param[2], "19%d-01-01", tmp_date);
+				param[3][0] = '\0';
+				break;
+			case 6:
+				tmp_date = UnifInt(93,97,qnum);
+                sprintf(param[1], "19%d-01-01", tmp_date);
+				sprintf(param[2], "0.0%ld", (long)UnifInt(2, 9, qnum));
+				sprintf(param[3], "%ld", (long)UnifInt((long)24, (long)25, (long)qnum));
+				param[4][0] = '\0';
+				break;
+			case 7:
+				tmp_date = pick_str(&nations2, qnum, param[1]);
+				while (pick_str(&nations2, qnum, param[2]) == tmp_date);
+				param[3][0] = '\0';
+				break;
+			case 8:
+				tmp_date = pick_str(&nations2, qnum, param[1]);
+				tmp_date = nations.list[tmp_date].weight;
+				strcpy(param[2], regions.list[tmp_date].text);
+				pick_str(&p_types_set, qnum, param[3]);
+				param[4][0] = '\0';
+				break;
+			case 9:
+				pick_str(&colors, qnum, param[1]);
+				param[2][0] = '\0';
+				break;
+			case 10:
+				tmp_date = UnifInt(1,24,qnum);
+				sprintf(param[1],"19%02d-%02d-01", 
+					93 + tmp_date/12, tmp_date%12 + 1);
+				param[2][0] = '\0';
+				break;
+			case 11:
+				pick_str(&nations2, qnum, param[1]);
+				sprintf(param[2], "%11.10f", Q11_FRACTION / flt_scale );
+				param[3][0] = '\0';
+				break;
+			case 12:
+				tmp_date = pick_str(&l_smode_set, qnum, param[1]);
+				while (tmp_date == pick_str(&l_smode_set, qnum, param[2]));
+				tmp_date = UnifInt(93,97,qnum);
+				sprintf(param[3], "19%d-01-01", tmp_date);
+				param[4][0] = '\0';
+				break;
+			case 13:
+				pick_str(&q13a, qnum, param[1]);
+				pick_str(&q13b, qnum, param[2]);
+				param[3][0] = '\0';
+				break;
+			case 14:
+				tmp_date = UnifInt(1,60,qnum);
+				sprintf(param[1],"19%02d-%02d-01", 
+					93 + tmp_date/12, tmp_date%12 + 1);
+				param[2][0] = '\0';
+				break;
+			case 15:
+				tmp_date = UnifInt(1,58,qnum);
+				sprintf(param[1],"19%02d-%02d-01", 
+					93 + tmp_date/12, tmp_date%12 + 1);
+				param[2][0] = '\0';
+				break;
+			case 16:
+				tmp1 = UnifInt(1, 5, qnum); 
+				tmp2 = UnifInt(1, 5, qnum);
+				sprintf(param[1], "Brand#%ld%ld", tmp1, tmp2);
+				pick_str(&p_types_set, qnum, param[2]);
+				ptr = strrchr(param[2], ' ');	/* drop the last word, if there is one */
+				if (ptr != NULL)
+					*ptr = '\0';
+				lptr = &sizes[0];
+				for (i=3; i <= MAX_PARAM; i++)
+				{
+					sprintf(param[i], "%ld", *permute(lptr,50,qnum) + 1);
+					lptr = (long *)NULL;
+				}
+				break;
+			case 17:
+				tmp1 = UnifInt(1, 5, qnum); 
+				tmp2 = UnifInt(1, 5, qnum);
+				sprintf(param[1], "Brand#%ld%ld", tmp1, tmp2);
+				pick_str(&p_cntr_set, qnum, param[2]);
+				param[3][0] = '\0';
+				break;
+			case 18:
+				sprintf(param[1], "%ld", (long)UnifInt(312, 315, qnum));
+				param[2][0] = '\0';
+				break;
+			case 19:
+				tmp1 = UnifInt(1, 5, qnum); 
+				tmp2 = UnifInt(1, 5, qnum);
+				sprintf(param[1], "Brand#%ld%ld", tmp1, tmp2);
+				tmp1 = UnifInt(1, 5, qnum); 
+				tmp2 = UnifInt(1, 5, qnum);
+				sprintf(param[2], "Brand#%ld%ld", tmp1, tmp2);
+				tmp1 = UnifInt(1, 5, qnum); 
+				tmp2 = UnifInt(1, 5, qnum);
+				sprintf(param[3], "Brand#%ld%ld", tmp1, tmp2);
+				sprintf(param[4], "%ld", (long)UnifInt(1, 10, qnum));
+				sprintf(param[5], "%ld", (long)UnifInt(10, 20, qnum));
+				sprintf(param[6], "%ld", (long)UnifInt(20, 30, qnum));
+				param[7][0] = '\0';
+				break;
+			case 20:
+				pick_str(&colors, qnum, param[1]);
+				tmp_date = UnifInt(93,97,qnum);
+				sprintf(param[2], "19%d-01-01", tmp_date);
+				pick_str(&nations2, qnum, param[3]);
+				param[4][0] = '\0';
+				break;
+			case 21:
+				pick_str(&nations2, qnum, param[1]);
+				param[2][0] = '\0';
+				break;
+			case 22:
+				lptr = &ccode[0];
+				for (i=0; i <= 7; i++)
+				{
+					sprintf(param[i+1], "%ld", 10 + *permute(lptr,25, qnum));
+					lptr = (long *)NULL;
+				}
+				param[8][0] = '\0';
+				break;
+			case 23:
+			case 24:
+                break;
+			default:
+				fprintf(stderr, "No variable definitions available for query %d\n",
+					qnum);
+				return;
+        }
+    }
+
+    if (flags & LOG)
+	{
+        if (lfp == NULL)
+		{
+            lfp = fopen(lfile, "a");
+            OPEN_CHECK(lfp, lfile);
+		}
+        fprintf(lfp, "%d", qnum);
+        for (i=1; i <= 10; i++)	/* one tab-separated field per parameter */
+            if (flags & DFLT)
+			{
+				if (defaults[qnum - 1][i - 1] == NULL)
+					break;
+				else
+					fprintf(lfp, "\t%s", defaults[qnum - 1][i - 1]);
+			}
+            else
+			{
+				if (param[i][0] == '\0')	/* an empty slot ends the list */
+					break;
+				else
+					fprintf(lfp, "\t%s", param[i]);
+			}
+        fprintf(lfp, "\n");	/* runs once, after the loop */
+	}
+    }
+    else
+	{
+        if (flags & DFLT)   
+		{
+            /* to allow -d to work at all scale factors */
+            if (qnum == 11 && vnum == 2)
+                fprintf(ofp, "%11.10f", Q11_FRACTION/flt_scale);
+            else
+                if (defaults[qnum - 1][vnum - 1])
+                    fprintf(ofp, "%s", defaults[qnum - 1][vnum - 1]);
+                else
+					fprintf(stderr, "Bad default request (q: %d, p: %d)\n",
+						qnum, vnum);
+		}
+        else        
+		{
+            if (vnum >= 1 && vnum <= MAX_PARAM)	/* param[] slots 1..MAX_PARAM only */
+                fprintf(ofp, "%s", param[vnum]);
+            else
+				fprintf(stderr, "Bad parameter request (q: %d, p: %d)\n", qnum, vnum);
+		}
+	}
+    return;
+}
diff --git a/data/ssb/dbgen/varsub.o b/data/ssb/dbgen/varsub.o
new file mode 100644
index 0000000000000000000000000000000000000000..f5374552eb2ad0b4d37cf2366ad49e15209d78b7
GIT binary patch
literal 17936
zcmeI34{%lWmB)Vp0*Gu9t0<LJo`hJS=K1mmOwejxNZ@9CAv7U~77UM<7m_t3;k}nO
zxT9tpqNdrjPCJg&TG!d_xE5vY?$nCeZUkB=b?YvxZAW*vyR>CjBI{P!DO4<E&-dPQ
zUJk#!3+(JTJJaC|xxf4S-19rX^E<yk_x|pEJXjN1IVCI0h-8`D%tTJ8F?r#M^Gf%$
z(p+V(H2<A@-Rbb1;dSAr@Y<$Er~f}^4xSl&W8h@&!1I?I6CV6Yxc_KY&xZc-xw#KN
zGs~F4-#Wu{P~M46`+}2o__pyQBdBk*mkzJoN{3_gXKDxcI>Uc}vk{X`SE1qze;Qy9
z4zAKcx<YdD`<1ynZlc<pepkWd?%0Hy#<(W0@Q_v58U6$c_rIIv#NKs=t9O(AwKa5b
zjXJ|O9*zt@hq@!fOQAUSRkArUh0aDx=<o_Wx_DP3!$qWTB7^UY!UQ+PEgu>DwP$BI
zm2LVS7!RaoIsN~Yg|nWMBYo4()s{BdRyY=8zdk!YE;>`KnnnOducN~&^yhi+>~w~_
z{uuhs{<#aOiFS=*`7yFQLf)9s<*<-$zY{y`w7>q`u3bx<eW$M<+_&$8`HNp2p53tT
z7gH7=svdT-s)wsKUUX-U1yuvk6*^QS0HaY<zmX1Z*uoPd1)iUGq>TOJ8EV7y^BVCV
zZeGF5P&=Y;cfNn^Ucrx};M@!a%>~`QmfV(3ok13KOu~Z)BHqN{ff^Y+Jqo4B;3?O|
z>WX8j<I;(Te>Y)ire%qS!oDNZoT=Ufh#8$5>Yh1iV2DSy9jU8D)rF{OuSKVlbVg`m
znNdXMBO6q2eK^r<S}4wO<wS0t+u{?s_joPmb!Sq4rboOu-><CBwyb9C7AT$5Eh8ZQ
z1Ny=v!zdT9T<RE0|JYpI|G{aBeGVMJvip(6ej}>MMKvGkx^eOYswNNBpcf}vQ#*Jd
zGu~G>c<jQ<l3KW7T*p@y?nu@#r~g=1UwNSC)Ir=Y+@-#5_;1mDT!81M=Lzl~?$nG7
zncDunS&_l_ogeNWCvjIVvO95)tL=Zc&t0wSD$exm9m_*2ugKiWjQHYODMJM&bQ5K$
zeC@YX?Z;5<Ei?;WDW`+mfd9>HdTcF>T$$PTj2OPB<3Dzu2Z7914`#+>^}Wd8n7f>1
zzFTB2cL8($bJa74MNW^(y{ct>bYniBD_5Z`3`o%7zqvpm_dbu3x=<h8xiZ%zUHz9Q
zCzeW-*27^0z<;@#v|^IBSG$Za+n`ihp(@4Gh^|z0=8(>L=hoG54tH5Kw`N`Z+>&~B
z|7Lvr0?&?P`&=L0=R(JOuypHm2E9ATz~0<}`^ilB<=He-BK$}>goT)z=00wDouJB2
zjE%lqOsc^dd@C||LTgQ|2~Pj<KBxb{%-V{drxw=?98JCE4AmKD;76%>zGaTT>I}L5
zPK+a`{|7TayD_{WyfJ)FxcU1h_h;qqc$_@!AHTdOyZ`KzRN3&laVPfraLqV98Vt|w
zbB3om^vKRFaE5N(?+lEmD%r3bf#bU--%cmBcMk={>!h(>TJ*w$?_DU~pCPsWv$Gy7
zaAIejeQ(c5Kc(L~L)SjGn9Y5VA<7f+8=FFE=nE3eDhQqmAh;nz+`gg|_eThBU=Z8{
zAXq6OSQH^xz#tgm5Vz5kVo*Y^@e+f_90<lK<XSH=^El1t)ZLuWIc70rzL%H)r`L13
zfK$H&p@!gz6@o1Q1P}5McfeALhhhkx93eBk#NfdV^7|5}0<Sq{A*Y3$-oWW6IbFo*
zVosNE`YBFt<kY>_(|M5dH*p%`w3t)e^E1iCj=PH;PZt^|(*BEWS1CJP#;Lo8(D^dX
z-^{6BQrKdOD+)s;Ek*7jRE%s(QK*>ynNTo9*9(h_gT<j%Q<aEzwiFb$0ClvOsQFwp
zk?h$VjG6lGcxRwB+TIaw3AA=40)@#yG)2edg)OtpWrayo5i-R~O;tm<vC-5tHPo-E
zF;z{qk?PvjtEj$;rA49AB2u8!(xTAKs5;zO8%Dm2T?vMknUXTIvLU>BMU7ci(@+;)
zeHSd=T^ljsx|)XC6=74eqJDLK-Ccq3>iX*1yKAZgjcaRaB4$e>9`E#w7l*iAA*!P;
zOl2FL+BGKG9gnp~J4_<JttXyLCE2M8D3cS**a<HJWlK$6O?7QkU0_Xpq_)wisWxR5
zrnt<Mlp4BLTwzL0=`vHn0V7K4fvWoTrZf~{SX{y<rDcYy2$_<Qp~z9)W<_{)xH@ba
z!%fw-fp9~(3V|St#ioQTQy|Mt@y%v+SAfFb9^Kp#546Nv+dJD+?OmM|_s>x`xD*{h
ztx0STpcks5Ezpbhbff~@%D|#+%jw#Ufo>#=y%hv{-O)sJYdjTCoLAFtL|x*SB5)}J
zmm+W}0+%9iDFT-w@c$cuq80haf4S`1+s&#KE0zZqHEr(cO!WlliMA{lDk|=Av&9dT
zlmtU%iy6BPWB-M{B=%pcRySdPwiitHXR?pXy05`x_2y;Gn>}OZqqLcekdEvTU&dy2
zWlr9gr&Q(4{nFH`oWSt3s+>iCk{!+oJv2R>Q{F$LI;V2#?#pN8loJZ)EFu+Bt;)%(
z%9$CKtp^#IEZ8?rGw;hb`5c9*8NU$1ciB~*qfT6QnKqZ*<^;!6Viws6FzVx?*k^-_
z$o|yZPfxb*?Vq#FW+}j_xBtmb$5}3W`@QV+eqOk@zsokV9~bWJ=O%~H+Ye<M*=LD9
zUa_OMW&bEV%v|<U!dJQ{vMU#A%pI(ccj;QzxS4F@)iZHr9rKzA<eqJCkJF9pQ{}Q3
zujyvJ08ZN|{g~=^ckB`CqCbpdTRu-96Cd#5UnPD;`fY<TmE6Hk`shFF!~e>M<Lyr-
ze-8NY*L?V!KKvIx{2d>D!iRt0!>7`O%;bNr55LNXf1LOgH2RNFJrJxZkoi9PpY-7+
zY{x5Y%ppF(r-4j<-tNO|eE1zce2ouZ=fgMp@ECE#0WZWM;{U`;2X~hmEH)jj-NBFv
zMpn27+E~+RF%IroDbC7pSc=0k9B#&;9ES=VX!nhyTZT5;I4|RBOjBojYi(!BG&d)^
z6YZU;*5+8-y_9QJIo?($O?Pu@dv`q9oQ$VTcYEyK=42{i=&ys`K9GS_BG$d#EuV;Q
z!DijWnztt7TU`BUGS=J@O~p-T)ZNIJ$cc${b#x_?rnMv0oQy>~;-;fHxwWe$?iSlt
zTpBfW*ld#R_r=MDcw%c0S?O+$b*2)oN-Rdo<VH(8k)UAFOB#C9Krd`eyth4NTDyBv
zF<kBH@iIxX8Si<LUf7MiPeM0f6{qjPt~By>toT^th2y(F1o~Jagm(gxfa4yDbpz7t
zCGMRA?_5Ry0p_AVkH!P!%U+VUL*G|j`BTMpz0WC*dd2>W%;7)Y@e2PQbE#MSd_!@(
z3l{x<R~+wRg`Z_^$7d$D!;TN;J;biJfH~~w_$;yYIbWf8K9v!FZc}z}uN1ygalFeG
zzD3zV-10f!q4e=CQ}mOT{*#=4Q1O7$-=XYif4->nQSUqigyeCHOTYZN;@6X&`134t
z#7*1(p5l7k9#D1`C_8T{uH*2p%8ri1QA_{poIk7db-hcteMs<ZCH~){IO>)5)-bon
z!yT5s%$M~R7dtJA2UNZ7%D#@<{fcY<A69nov?c!UR9wgH&y*cKo}ab!C2sqbzOMJj
zN?+IebH#Ps-d1*Wy&o#B?PsGf2`=L?UfOlF;<{Y{=61i7DX#6`rtIkTs$OxN$~?MD
z+0p(tTlz9zIxH^z-m7?lYS;bD?fLSs(#NT+17A|SK-qbUxoziLN*|}vzu&dE__<&4
z0%iY)%x(KWQThu>AJ-wjR6Ia=;U^TwskG}9bLi=Leb&;KewoDvF;4R7PsT|ub0m5m
z%~AR=DgCv;;u43T;)Tk7DRU&+eudI^?en_RsPy$Xd{F7@_2*H=VMqLVLfO&d@M%k5
z)}P%L7dtO2UZC1Fs_g4_9aQ@G_A7Dzd&TjMQ20+3$2Vf(|DibACH?*$b0oT5CoO#$
zx8AStCf~1f@sLD<@qllYQtwsFVNSQ}I;F4uDO6nBS?XhFwc@&e*DE`EoorX!?Oz@z
zNo7aJVY{U-<MttoOaFdF@kJ^Qk1P8+4o@k49fw_t<D00&^93I}FDiZ5k^cGzrLW`j
zq0-mme>QLJ(T{F?D1}_j+#V<SmcEStWlCSyyIJY$dXtLldhb_ubiedl`cm&Vl)kR_
zJ3f4$vV-4JNSt3+T>JAc%8vHuSC+oauM>*rEBmLE9X%e#71z(DbNH21x?gzQ7AU?%
z*<YeK_Px@s3g+m?Pbt1Zaow(d#dSQN@Zo>K9DeF}?onLFbHB2q>-~|XFY)}T;`ptD
zwCkUg9UadjN+0#gyg04&b-SkFp@anf>$uHPT>C$txjiomEqxiEw<~>J?>ZkIQ+9Mb
zKd-p<=Z}>g9nZ%U*Y!T{!w*{aCH{w%zP5i<ac%!YA3g`0MiP5G&u4DOvs7_y|I^Bj
z?yt`%j`-xGB9hN4JKFy)#kKv1EIUuKokx|vw)0KJwVl7Q?Bv5Kk^@R#+xa_7|Ld&(
zrp4vHb5!v{vMcjqOxf4-;*8?j&ly;hNzhDv-<+$s8&6(;7BIK_i+@%zsehL$I|WpS
z__<u!(SBAcebg)W-#Uv+9PUzF+rP)hKK(?*6&Z&zPq$fI>_4bD-ocCi4=Jw4;bYA0
z^>&Zqy5E1S?6~bkL6Uz~9KZ3Cdf!$Yzx5LSzOt|1^PExo_;w-s)6m%@h{H|vC;T$z
za3iF6p3;Y(qJM+pS1MklxVB%WIKC~3ol3=ZyK0!jAKk8c#qrxVvD2#TXg|A^zP9s(
zkN!7(^mqH{zohi>O-B4Vsr2>xwCUJvlOX=3^e6h4Ge>*zTQ}izl)l~<d_wUXl>S1+
z^}ZmaxUP3Ob34v=D1H3)PyD%4aot~0A3NKXKKcdEMvxtfYkvln9UX@!ls<mbD*im9
zxc2AU%8u@Lw{kil{`mWunm@QEd=)Yzd|ONNlO9h0PFM7w=5-Ql0z|mH7nAR)!sR^_
z)?|qAS8+ljaS}d&;E@E}<AnYj%okdG1v->uk(Z?H-^=_)i~o!{<~~H~4e|Z~eE|{v
z67zBor}-)7w_05G1C<tkjh}<7E&dWew>cIs;SPyd{EwKgv3Mc(?^=sL#eBWR4>I3i
z@qgp|CX4@y^O9f~Ae5~P#J=!$=9@Sb-og1cizhh$ZHsT`{2q&cf%EbnUhEGu-vmG+
ze5aQ+Z5ICqbN9FMN?-V1OaEEcKV<Rendf2PkVw5Rcv&;o;@@T7X7N$x-4;K<yw~Ed
zG2d(Pqs-^>yb*ui5&ISwf8@QQ=>LZG4_W%ZV}97;@?V_aw7A4G!1GV+Psa(#B8%rR
z@3r_f%nw_9f!N`Bi7Zr#n9KYYzKnUfr7z!6DlJ~i`i{jT%-2}Fp1HiYK^FcrFyCb9
zH!*Ls_?^tVEq*uiK8tT){)oln%-^*5R^}mIpOA&0+nASIT<lj`{65xqEZ)z2jl~C;
zuebOk%pbA%5c5YZF5_ya#lOP(M=bs|=H<NZB8&Du!@Sbs-(v1qJl*dWf1dT%Tl@v)
zn=Jm<%-byfBJ%+G2Ep{^DemS?Fu8qeD!Q4{RKiQ!<SY^Ihz3*f-joT_r|75&ZcZjm
zFwun!)akzgoSUU@=x!NLg)^=;;b2D>eXdRyh$T~`9_!k=HBR5tgPmQec#wWrRg_Fc
zWA}1?OJ`58J3$|(6RGWNE#B7LieIC9&i~(!0WyIQardSTR`QwrZbmL}I3?@;KOppb
zWtw;=eG9zwJv<DOU~hu5eA`X`zoHwM^D;*{7CgL?IM#)LOD7I^E`d!;&1~WatEYLM
zN{N7+aEd(v{KB)i7AY_HGr1=1m$Z^DWGX)wn<$b8yd-^;_Dgv@XW;ytF-iHIT)rh;
z(mO${wbWB4Q~bNjJuUg(iTGn`Nd0rY%O*-G>bK)3|3R!$3b1~FQ%UgLld1kg%RLQQ
zZ&5#9Er>a(e;YB>&&{PXPb&F8n2Ek{$xDj(qP#qZik`#eF+Q|F7b5Ay>fF7+hZL9B
z4Chi#_~UefxZ3^K%jE;2$f@`(7|%7ZX_w#0<=aG&Q@i{w;+f)KUg;H(I3xbK^e5$o
zXR6yRneUa)<N1UBxh6w-^y#FrLoDxlI*|OK6)-F7t_zvUuc>s4o0q*LeYDH}4_DXF
A2><{9

literal 0
HcmV?d00001

diff --git a/data/ssb/loader/.metadata b/data/ssb/loader/.metadata
new file mode 100644
index 0000000..ab6a9e2
--- /dev/null
+++ b/data/ssb/loader/.metadata
@@ -0,0 +1,913 @@
+(dp0
+S'SUPPLIER'
+p1
+(iystree
+TableSchema
+p2
+(dp3
+S'table_name'
+p4
+g1
+sS'column_list'
+p5
+(lp6
+(iystree
+ColumnSchema
+p7
+(dp8
+S'table_schema'
+p9
+g2
+sS'column_type'
+p10
+S'INTEGER'
+p11
+sS'column_name'
+p12
+S'S_SUPPKEY'
+p13
+sba(iystree
+ColumnSchema
+p14
+(dp15
+g9
+g2
+sS'column_others'
+p16
+S'25'
+p17
+sg10
+S'TEXT'
+p18
+sg12
+S'S_NAME'
+p19
+sba(iystree
+ColumnSchema
+p20
+(dp21
+g9
+g2
+sg16
+S'25'
+p22
+sg10
+S'TEXT'
+p23
+sg12
+S'S_ADDRESS'
+p24
+sba(iystree
+ColumnSchema
+p25
+(dp26
+g9
+g2
+sg16
+S'10'
+p27
+sg10
+S'TEXT'
+p28
+sg12
+S'S_CITY'
+p29
+sba(iystree
+ColumnSchema
+p30
+(dp31
+g9
+g2
+sg16
+S'15'
+p32
+sg10
+S'TEXT'
+p33
+sg12
+S'S_NATION'
+p34
+sba(iystree
+ColumnSchema
+p35
+(dp36
+g9
+g2
+sg16
+S'12'
+p37
+sg10
+S'TEXT'
+p38
+sg12
+S'S_REGION'
+p39
+sba(iystree
+ColumnSchema
+p40
+(dp41
+g9
+g2
+sg16
+S'15'
+p42
+sg10
+S'TEXT'
+p43
+sg12
+S'S_PHONE'
+p44
+sbasS'column_name_list'
+p45
+(lp46
+g13
+ag19
+ag24
+ag29
+ag34
+ag39
+ag44
+asbsS'CUSTOMER'
+p47
+(iystree
+TableSchema
+p48
+(dp49
+g4
+g47
+sg5
+(lp50
+(iystree
+ColumnSchema
+p51
+(dp52
+g9
+g48
+sg10
+S'INTEGER'
+p53
+sg12
+S'C_CUSTKEY'
+p54
+sba(iystree
+ColumnSchema
+p55
+(dp56
+g9
+g48
+sg16
+S'25'
+p57
+sg10
+S'TEXT'
+p58
+sg12
+S'C_NAME'
+p59
+sba(iystree
+ColumnSchema
+p60
+(dp61
+g9
+g48
+sg16
+S'25'
+p62
+sg10
+S'TEXT'
+p63
+sg12
+S'C_ADDRESS'
+p64
+sba(iystree
+ColumnSchema
+p65
+(dp66
+g9
+g48
+sg16
+S'10'
+p67
+sg10
+S'TEXT'
+p68
+sg12
+S'C_CITY'
+p69
+sba(iystree
+ColumnSchema
+p70
+(dp71
+g9
+g48
+sg16
+S'15'
+p72
+sg10
+S'TEXT'
+p73
+sg12
+S'C_NATION'
+p74
+sba(iystree
+ColumnSchema
+p75
+(dp76
+g9
+g48
+sg16
+S'12'
+p77
+sg10
+S'TEXT'
+p78
+sg12
+S'C_REGION'
+p79
+sba(iystree
+ColumnSchema
+p80
+(dp81
+g9
+g48
+sg16
+S'15'
+p82
+sg10
+S'TEXT'
+p83
+sg12
+S'C_PHONE'
+p84
+sba(iystree
+ColumnSchema
+p85
+(dp86
+g9
+g48
+sg16
+S'10'
+p87
+sg10
+S'TEXT'
+p88
+sg12
+S'C_MKTSEGMENT'
+p89
+sbasg45
+(lp90
+g54
+ag59
+ag64
+ag69
+ag74
+ag79
+ag84
+ag89
+asbsS'PART'
+p91
+(iystree
+TableSchema
+p92
+(dp93
+g4
+g91
+sg5
+(lp94
+(iystree
+ColumnSchema
+p95
+(dp96
+g9
+g92
+sg10
+S'INTEGER'
+p97
+sg12
+S'P_PARTKEY'
+p98
+sba(iystree
+ColumnSchema
+p99
+(dp100
+g9
+g92
+sg16
+S'22'
+p101
+sg10
+S'TEXT'
+p102
+sg12
+S'P_NAME'
+p103
+sba(iystree
+ColumnSchema
+p104
+(dp105
+g9
+g92
+sg16
+S'6'
+p106
+sg10
+S'TEXT'
+p107
+sg12
+S'P_MFGR'
+p108
+sba(iystree
+ColumnSchema
+p109
+(dp110
+g9
+g92
+sg16
+S'7'
+p111
+sg10
+S'TEXT'
+p112
+sg12
+S'P_CATEGORY'
+p113
+sba(iystree
+ColumnSchema
+p114
+(dp115
+g9
+g92
+sg16
+S'9'
+p116
+sg10
+S'TEXT'
+p117
+sg12
+S'P_BRAND1'
+p118
+sba(iystree
+ColumnSchema
+p119
+(dp120
+g9
+g92
+sg16
+S'11'
+p121
+sg10
+S'TEXT'
+p122
+sg12
+S'P_COLOR'
+p123
+sba(iystree
+ColumnSchema
+p124
+(dp125
+g9
+g92
+sg16
+S'25'
+p126
+sg10
+S'TEXT'
+p127
+sg12
+S'P_TYPE'
+p128
+sba(iystree
+ColumnSchema
+p129
+(dp130
+g9
+g92
+sg10
+S'INTEGER'
+p131
+sg12
+S'P_SIZE'
+p132
+sba(iystree
+ColumnSchema
+p133
+(dp134
+g9
+g92
+sg16
+S'10'
+p135
+sg10
+S'TEXT'
+p136
+sg12
+S'P_CONTAINER'
+p137
+sbasg45
+(lp138
+g98
+ag103
+ag108
+ag113
+ag118
+ag123
+ag128
+ag132
+ag137
+asbsS'DDATE'
+p139
+(iystree
+TableSchema
+p140
+(dp141
+g4
+g139
+sg5
+(lp142
+(iystree
+ColumnSchema
+p143
+(dp144
+g9
+g140
+sg10
+S'DATE'
+p145
+sg12
+S'D_DATEKEY'
+p146
+sba(iystree
+ColumnSchema
+p147
+(dp148
+g9
+g140
+sg16
+S'18'
+p149
+sg10
+S'TEXT'
+p150
+sg12
+S'D_DATE'
+p151
+sba(iystree
+ColumnSchema
+p152
+(dp153
+g9
+g140
+sg16
+S'8'
+p154
+sg10
+S'TEXT'
+p155
+sg12
+S'D_DAYOFWEEK'
+p156
+sba(iystree
+ColumnSchema
+p157
+(dp158
+g9
+g140
+sg16
+g116
+sg10
+S'TEXT'
+p159
+sg12
+S'D_MONTH'
+p160
+sba(iystree
+ColumnSchema
+p161
+(dp162
+g9
+g140
+sg10
+S'INTEGER'
+p163
+sg12
+S'D_YEAR'
+p164
+sba(iystree
+ColumnSchema
+p165
+(dp166
+g9
+g140
+sg10
+S'INTEGER'
+p167
+sg12
+S'D_YEARMONTHNUM'
+p168
+sba(iystree
+ColumnSchema
+p169
+(dp170
+g9
+g140
+sg16
+g111
+sg10
+S'TEXT'
+p171
+sg12
+S'D_YEARMONTH'
+p172
+sba(iystree
+ColumnSchema
+p173
+(dp174
+g9
+g140
+sg10
+S'INTEGER'
+p175
+sg12
+S'D_DAYNUMINWEEK'
+p176
+sba(iystree
+ColumnSchema
+p177
+(dp178
+g9
+g140
+sg10
+S'INTEGER'
+p179
+sg12
+S'D_DAYNUMINMONTH'
+p180
+sba(iystree
+ColumnSchema
+p181
+(dp182
+g9
+g140
+sg10
+S'INTEGER'
+p183
+sg12
+S'D_DAYNUMINYEAR'
+p184
+sba(iystree
+ColumnSchema
+p185
+(dp186
+g9
+g140
+sg10
+S'INTEGER'
+p187
+sg12
+S'D_MONTHNUMINYEAR'
+p188
+sba(iystree
+ColumnSchema
+p189
+(dp190
+g9
+g140
+sg10
+S'INTEGER'
+p191
+sg12
+S'D_WEEKNUMINYEAR'
+p192
+sba(iystree
+ColumnSchema
+p193
+(dp194
+g9
+g140
+sg16
+S'12'
+p195
+sg10
+S'TEXT'
+p196
+sg12
+S'D_SELLINGSEASON'
+p197
+sba(iystree
+ColumnSchema
+p198
+(dp199
+g9
+g140
+sg16
+S'1'
+p200
+sg10
+S'TEXT'
+p201
+sg12
+S'D_LASTDAYINWEEKFL'
+p202
+sba(iystree
+ColumnSchema
+p203
+(dp204
+g9
+g140
+sg16
+g200
+sg10
+S'TEXT'
+p205
+sg12
+S'D_LASTDAYINMONTHFL'
+p206
+sba(iystree
+ColumnSchema
+p207
+(dp208
+g9
+g140
+sg16
+g200
+sg10
+S'TEXT'
+p209
+sg12
+S'D_HOLIDAYFL'
+p210
+sba(iystree
+ColumnSchema
+p211
+(dp212
+g9
+g140
+sg16
+g200
+sg10
+S'TEXT'
+p213
+sg12
+S'D_WEEKDAYFL'
+p214
+sbasg45
+(lp215
+g146
+ag151
+ag156
+ag160
+ag164
+ag168
+ag172
+ag176
+ag180
+ag184
+ag188
+ag192
+ag197
+ag202
+ag206
+ag210
+ag214
+asbsS'LINEORDER'
+p216
+(iystree
+TableSchema
+p217
+(dp218
+g4
+g216
+sg5
+(lp219
+(iystree
+ColumnSchema
+p220
+(dp221
+g9
+g217
+sg10
+S'INTEGER'
+p222
+sg12
+S'LO_ORDERKEY'
+p223
+sba(iystree
+ColumnSchema
+p224
+(dp225
+g9
+g217
+sg10
+S'INTEGER'
+p226
+sg12
+S'LO_LINENUMBER'
+p227
+sba(iystree
+ColumnSchema
+p228
+(dp229
+g9
+g217
+sg10
+S'INTEGER'
+p230
+sg12
+S'LO_CUSTKEY'
+p231
+sba(iystree
+ColumnSchema
+p232
+(dp233
+g9
+g217
+sg10
+S'INTEGER'
+p234
+sg12
+S'LO_PARTKEY'
+p235
+sba(iystree
+ColumnSchema
+p236
+(dp237
+g9
+g217
+sg10
+S'INTEGER'
+p238
+sg12
+S'LO_SUPPKEY'
+p239
+sba(iystree
+ColumnSchema
+p240
+(dp241
+g9
+g217
+sg10
+S'DATE'
+p242
+sg12
+S'LO_ORDERDATE'
+p243
+sba(iystree
+ColumnSchema
+p244
+(dp245
+g9
+g217
+sg16
+S'16'
+p246
+sg10
+S'TEXT'
+p247
+sg12
+S'LO_ORDERPRIORITY'
+p248
+sba(iystree
+ColumnSchema
+p249
+(dp250
+g9
+g217
+sg16
+g200
+sg10
+S'TEXT'
+p251
+sg12
+S'LO_SHIPPRIORITY'
+p252
+sba(iystree
+ColumnSchema
+p253
+(dp254
+g9
+g217
+sg10
+S'INTEGER'
+p255
+sg12
+S'LO_QUANTITY'
+p256
+sba(iystree
+ColumnSchema
+p257
+(dp258
+g9
+g217
+sg10
+S'INTEGER'
+p259
+sg12
+S'LO_EXTENDEDPRICE'
+p260
+sba(iystree
+ColumnSchema
+p261
+(dp262
+g9
+g217
+sg10
+S'INTEGER'
+p263
+sg12
+S'LO_ORDTOTALPRICE'
+p264
+sba(iystree
+ColumnSchema
+p265
+(dp266
+g9
+g217
+sg10
+S'INTEGER'
+p267
+sg12
+S'LO_DISCOUNT'
+p268
+sba(iystree
+ColumnSchema
+p269
+(dp270
+g9
+g217
+sg10
+S'INTEGER'
+p271
+sg12
+S'LO_REVENUE'
+p272
+sba(iystree
+ColumnSchema
+p273
+(dp274
+g9
+g217
+sg10
+S'INTEGER'
+p275
+sg12
+S'LO_SUPPLYCOST'
+p276
+sba(iystree
+ColumnSchema
+p277
+(dp278
+g9
+g217
+sg10
+S'INTEGER'
+p279
+sg12
+S'LO_TAX'
+p280
+sba(iystree
+ColumnSchema
+p281
+(dp282
+g9
+g217
+sg10
+S'DATE'
+p283
+sg12
+S'LO_COMMITDATE'
+p284
+sba(iystree
+ColumnSchema
+p285
+(dp286
+g9
+g217
+sg16
+S'10'
+p287
+sg10
+S'TEXT'
+p288
+sg12
+S'LO_SHIPMODE'
+p289
+sbasg45
+(lp290
+g223
+ag227
+ag231
+ag235
+ag239
+ag243
+ag248
+ag252
+ag256
+ag260
+ag264
+ag268
+ag272
+ag276
+ag280
+ag284
+ag289
+asbs.
\ No newline at end of file
diff --git a/data/ssb/loader/Makefile b/data/ssb/loader/Makefile
new file mode 100644
index 0000000..ba3503a
--- /dev/null
+++ b/data/ssb/loader/Makefile
@@ -0,0 +1,17 @@
+loader: load_modified.c
+	gcc -o loader load_modified.c
+# Builds load.c as gpuDBLoader.
+original_loader: load.c
+	gcc -o gpuDBLoader load.c
+# Builds columnSort.c as columnSort.
+sort: columnSort.c
+	gcc -o columnSort columnSort.c -std=c99
+# Builds rle.c as rleCompression.
+rle: rle.c
+	gcc -std=c99 rle.c -o rleCompression
+# Builds dict.c as dictCompression.
+dict: dict.c
+	gcc -std=c99 dict.c -o dictCompression
+# Removes object files and every binary built above, including loader.
+clean:
+	rm -rf *.o loader gpuDBLoader columnSort rleCompression dictCompression
diff --git a/data/ssb/loader/columnSort.c b/data/ssb/loader/columnSort.c
new file mode 100644
index 0000000..3f2a704
--- /dev/null
+++ b/data/ssb/loader/columnSort.c
@@ -0,0 +1,302 @@
+ 
+/*
+   Copyright (c) 2012-2013 The Ohio State University.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include "../include/common.h"
+
+/*
+ * @file columnSort.c
+ * Sort foreign key columns in LINEORDER table.
+ */
+
+struct sortObject{	/* one entry of the column being sorted */
+	int key;	/* column value the entries are ordered by */
+	int id;		/* row index before sorting; main() sets it to i and later uses it to fetch that row */
+};
+
+/* Sorts obj[start .. start+num-1] by ascending key using selection sort (despite the
+ * name): every pass swaps the smallest remaining entry into the current slot. */
+static void bubbleSort(struct sortObject *obj, int start,int num){
+	const int stop = start + num;	/* one past the last slot to sort */
+	for(int slot = start; slot + 1 < stop; slot++){
+		int best = slot;	/* index of the smallest key seen in obj[slot .. stop-1] */
+		for(int probe = slot + 1; probe < stop; probe++){
+			if(obj[probe].key < obj[best].key)
+				best = probe;
+		}
+		/* exchange the minimum with the current slot (harmless when best == slot) */
+		struct sortObject smallest = obj[best];
+		obj[best] = obj[slot];
+		obj[slot] = smallest;
+	}
+}
+
+/*
+ * start to middle-1(inclusive) is the first part
+ * middle to end is the second part
+ * start, middle, and end are all array indexes
+ */
+
+/* Sorts obj[start..end]: each half is sorted on its own (recursively when it holds more
+ * than 1000 entries, with bubbleSort otherwise) and the two halves are then merged. */
+static void mergeSort(struct sortObject *obj, int start, int middle, int end){
+	const int leftCount = middle - start;		/* size of obj[start .. middle-1] */
+	const int rightCount = end - middle + 1;	/* size of obj[middle .. end] */
+	const int total = end - start + 1;
+
+	/* order the left half */
+	if(leftCount <= 1000)
+		bubbleSort(obj, start, leftCount);
+	else
+		mergeSort(obj, start, start + leftCount/2, middle-1);
+
+	/* order the right half */
+	if(rightCount <= 1000)
+		bubbleSort(obj, middle, rightCount);
+	else
+		mergeSort(obj, middle, middle + rightCount/2, end);
+
+	struct sortObject *merged = (struct sortObject *) malloc(sizeof(struct sortObject) * total);
+	if(merged == NULL){
+		printf("Malloc failed in merge sort. Not enough memory.\n");
+		exit(-1);
+	}
+
+	/* repeatedly move the smaller head; on equal keys the right-half entry goes first */
+	int left = start, right = middle, out = 0;
+	while(left < middle && right <= end){
+		if(obj[left].key < obj[right].key)
+			merged[out++] = obj[left++];
+		else
+			merged[out++] = obj[right++];
+	}
+
+	/* drain whichever half still has entries */
+	for(; left < middle; left++)
+		merged[out++] = obj[left];
+	for(; right <= end; right++)
+		merged[out++] = obj[right];
+
+	/* copy the merged run back over the input range */
+	memcpy(&obj[start], merged, sizeof(struct sortObject) * total);
+
+	free(merged);
+
+}
+
+/* Sorts all num entries of obj by key: splits the array at num/2 and lets
+ * mergeSort() order and merge obj[0 .. num/2-1] and obj[num/2 .. num-1]. */
+static void primarySort(struct sortObject * obj, int num){
+	const int lastIndex = num - 1;
+	mergeSort(obj, 0, num/2, lastIndex);
+}
+
+/*
+ * Input:
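+ * 	@inputPrefix: path prefix of the input column files; column i is read from "<inputPrefix>i".
+ * 	@outputPrefix: path prefix of the sorted column files; column i is written to "<outputPrefix>i".
+ *	@index:	the index of the column whose values determine the new row order.
+ *	@columnNum: the largest column index; columns 0 through columnNum (inclusive) are rewritten.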
+ *
+ * Prerequisite:
+ * 	The memory is large enough to hold each column.	
+ */
+
+/* Reorders every column file of a table by the sorted values of one key column. */
+int main(int argc, char **argv){
+	if(argc != 5){
+		printf("./columnSort inputPrefix outputPrefix index columnNum\n");
+		exit(-1);
+	}
+
+	/* Column paths are "<prefix><index>"; 12 covers the widest "%d" text plus the NUL. */
+	char buf[4096] = {0};
+	if(strlen(argv[1]) + 12 > sizeof(buf) || strlen(argv[2]) + 12 > sizeof(buf)){
+		printf("inputPrefix or outputPrefix is too long\n");
+		exit(-1);
+	}
+	int inFd;
+	struct columnHeader header;
+	int primaryIndex = atoi(argv[3]);
+	int largestIndex = atoi(argv[4]);
+	snprintf(buf,sizeof(buf),"%s%d",argv[1],primaryIndex);
+	inFd = open(buf, O_RDONLY);
+	if(inFd == -1){
+		printf("Failed to open the primaryIndex column\n");
+		exit(-1);
+	}
+
+	read(inFd, &header ,sizeof(struct columnHeader));
+	if(header.format != UNCOMPRESSED){
+		printf("Cannot sort compressed data\n");
+		exit(-1);
+	}
+
+	long size = header.totalTupleNum * sizeof(int);
+	long tupleNum = header.totalTupleNum;
+	int blockTotal = header.blockTotal;
+
+	char * raw = (char *) malloc(size);
+	if(!raw){
+		printf("Malloc failed. Not enough memory\n");
+		exit(-1);
+	}
+
+	long offset = sizeof(struct columnHeader);
+	long inOffset = 0;
+	size = header.tupleNum * sizeof(int);
+	/* first block: its header was already consumed by the read() above */
+	char *outTable =(char *) mmap(0,size,PROT_READ,MAP_SHARED,inFd, offset);
+
+	memcpy(raw + inOffset, outTable, size);
+	munmap(outTable,size);
+	offset += size;
+	inOffset += size;
+
+	for(int i=1;i<blockTotal;i++){
+		lseek(inFd, offset,SEEK_SET);
+		/* every further block starts with its own columnHeader */
+		read(inFd,&header,sizeof(struct columnHeader));
+		offset += sizeof(struct columnHeader);
+		size = header.tupleNum * sizeof(int);
+		outTable =(char *) mmap(0,size,PROT_READ,MAP_SHARED,inFd, offset);
+
+		memcpy(raw + inOffset, outTable, size);
+		munmap(outTable,size);
+
+		offset += size;
+		inOffset += size;
+	}
+
+	close(inFd);
+
+	struct sortObject * obj = (struct sortObject *) malloc(sizeof(struct sortObject ) * tupleNum);
+
+	if(!obj){
+		printf("Malloc failed. Not enough memory!\n");
+		exit(-1);
+	}
+	/* remember each key's original row so the other columns can follow the same order */
+	for(int i=0;i<tupleNum;i++){
+		obj[i].key = ((int *)raw)[i];
+		obj[i].id = i;
+	}
+
+	free(raw);
+
+	primarySort(obj,tupleNum);
+
+	for(int i=0;i<= largestIndex;i++){
+		/* load column i completely, skipping the per-block headers */
+		snprintf(buf,sizeof(buf),"%s%d",argv[1],i);
+		inFd = open(buf,O_RDONLY);
+		if(inFd == -1){
+			printf("Failed to open input column\n");
+			exit(-1);
+		}
+		size = lseek(inFd,0,SEEK_END);
+		int tupleSize = (size - blockTotal*sizeof(struct columnHeader))/tupleNum;
+
+		raw = (char *) malloc(size);
+		if(!raw){
+			printf("Malloc failed when trying to write the new result. Not enough memory !\n");
+			exit(-1);
+		}
+
+		inOffset = 0;
+		offset = 0;
+
+		for(int j=0;j<blockTotal;j++){
+			lseek(inFd, offset, SEEK_SET);
+			read(inFd, &header,sizeof(struct columnHeader));
+			offset += sizeof(struct columnHeader);
+			size = header.tupleNum * tupleSize;
+			outTable = (char *) mmap(0,size ,PROT_READ,MAP_SHARED,inFd, offset);
+
+			memcpy(raw+inOffset, outTable,size);
+			munmap(outTable,size);
+
+			offset += size;
+			inOffset += size;
+		}
+
+		close(inFd);
+
+		snprintf(buf,sizeof(buf),"%s%d",argv[2],i);
+		int outFd = open(buf, O_RDWR|O_CREAT,S_IRWXU|S_IRUSR);
+		if(outFd == -1){
+			printf("Failed to create output column\n");
+			exit(-1);
+		}
+
+		header.totalTupleNum = tupleNum;
+		header.blockId = 0;
+		header.blockTotal = blockTotal;
+		header.format = UNCOMPRESSED;
+
+		long tupleUnit, tupleRemain, tupleCount;
+		tupleRemain = tupleNum;
+		tupleCount = 0;
+
+		if(tupleNum > BLOCKNUM)
+			tupleUnit = BLOCKNUM;
+		else
+			tupleUnit = tupleNum;
+
+		header.tupleNum = tupleUnit;
+		write(outFd, &header, sizeof(struct columnHeader));
+
+		for(int j=0;j<tupleNum;j++){
+
+			int writeHeader = 0;
+			tupleCount ++;
+			/* block is full: start the next one with a fresh header */
+			if(tupleCount > BLOCKNUM){
+				tupleCount = 1;
+				tupleRemain -= BLOCKNUM;
+				if(tupleRemain > BLOCKNUM)
+					tupleUnit = BLOCKNUM;
+				else
+					tupleUnit = tupleRemain;
+				header.tupleNum = tupleUnit;
+				header.blockId ++;
+				writeHeader = 1;
+			}
+
+			if(writeHeader == 1){
+				write(outFd,&header, sizeof(struct columnHeader));
+			}
+			/* widen before multiplying: id*tupleSize can exceed INT_MAX on large tables */
+			int id = obj[j].id;
+			write(outFd, raw+(long)id*tupleSize, tupleSize);
+		}
+
+		free(raw);
+		close(outFd);
+	}
+
+	free(obj);
+	return 0;
+}
diff --git a/data/ssb/loader/convert.py b/data/ssb/loader/convert.py
new file mode 100644
index 0000000..db6e772
--- /dev/null
+++ b/data/ssb/loader/convert.py
@@ -0,0 +1,106 @@
+nations = """ALGERIA
+ARGENTINA
+BRAZIL
+CANADA
+EGYPT
+ETHIOPIA
+FRANCE
+GERMANY
+INDIA
+INDONESIA
+IRAN
+IRAQ
+JAPAN
+JORDAN
+KENYA
+MOROCCO
+MOZAMBIQUE
+PERU
+CHINA
+ROMANIA
+SAUDI ARABIA
+VIETNAM
+RUSSIA
+UNITED KINGDOM
+UNITED STATES"""
+# 25 nations. A name's position in this list is the integer code written for it
+# by the conversion below, so reordering the list changes the generated codes.
+nations = nations.split('\n')
+
+
+import argparse
+
+# 5 regions, encoded by list position in the same way as the nations
+regions = """AFRICA
+AMERICA
+ASIA
+EUROPE
+MIDDLE EAST"""
+regions = regions.split('\n')
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description = 'convert')
+    parser.add_argument('data_directory', type=str, help='Data Directory')
+    args = parser.parse_args()
+
+    data_dir = args.data_directory
+    # process suppliers
+    lines = open(data_dir + 'supplier.tbl').readlines()
+    o = []
+    for line in lines:
+        try:
+            parts = line.split('|')
+            parts[4] = str(nations.index(parts[4]))
+            parts[5] = str(regions.index(parts[5]))
+            parts[3] = str(int(parts[4]) * 10 + int(parts[3][-1]))
+            o.append('|'.join(parts))
+        except:
+            print(line)
+            break
+
+    f = open(data_dir + 'supplier.tbl.p','w')
+    for line in o:
+        f.write(line)
+    f.close()
+
+    # process customers
+    lines = open(data_dir + 'customer.tbl').readlines()
+    o = []
+    for line in lines:
+        try:
+            parts = line.split('|')
+            parts[4] = str(nations.index(parts[4]))
+            parts[5] = str(regions.index(parts[5]))
+            parts[3] = str(int(parts[4]) * 10 + int(parts[3][-1]))
+            o.append('|'.join(parts))
+        except:
+            print(line)
+            break
+
+    f = open(data_dir + 'customer.tbl.p','w')
+    for line in o:
+        f.write(line)
+    f.close()
+
+    # process parts
+    lines = open(data_dir + 'part.tbl').readlines()
+    o = []
+    for line in lines:
+        try:
+            parts = line.split('|')
+            parts[2] = int(parts[2].split('#')[-1]) - 1
+            parts[3] = parts[2] * 5 + ((int(parts[3].split('#')[-1]) % 10) - 1)
+            parts[4] = parts[3] * 40 + ((int(parts[4].split('#')[-1][2:])) - 1)
+            parts[2] = str(parts[2])
+            parts[3] = str(parts[3])
+            parts[4] = str(parts[4])
+            o.append('|'.join(parts))
+        except:
+            print(line)
+            break
+
+    f = open(data_dir + 'part.tbl.p','w')
+    for line in o:
+        f.write(line)
+    f.close()
+
diff --git a/data/ssb/loader/convert_old.py b/data/ssb/loader/convert_old.py
new file mode 100644
index 0000000..a66bbd2
--- /dev/null
+++ b/data/ssb/loader/convert_old.py
@@ -0,0 +1,102 @@
+import sys
+import argparse
+# Nation names; a name's position in the list is the integer code written for it.
+nations = """ALGERIA
+ARGENTINA
+BRAZIL
+CANADA
+EGYPT
+ETHIOPIA
+FRANCE
+GERMANY
+INDIA
+INDONESIA
+IRAN
+IRAQ
+JAPAN
+JORDAN
+KENYA
+MOROCCO
+MOZAMBIQUE
+PERU
+CHINA
+ROMANIA
+SAUDI ARABIA
+VIETNAM
+RUSSIA
+UNITED KINGDOM
+UNITED STATES
+"""
+nations = nations.split('\n')
+# NOTE: both literals end with a newline, so split('\n') also yields a trailing '' entry.
+regions = """AFRICA
+AMERICA
+ASIA
+EUROPE
+MIDDLE EAST
+"""
+regions = regions.split('\n')
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description = 'convert')
+    parser.add_argument('data_directory', type=str, help='Data Directory')
+    args = parser.parse_args()
+
+    data_dir = args.data_directory
+    # process suppliers
+    lines = open(data_dir + 'supplier.tbl').readlines()
+    o = []
+    for line in lines:
+        try:
+            parts = line.split('|')
+            parts[4] = str(nations.index(parts[4]))
+            parts[5] = str(regions.index(parts[5]))
+            parts[3] = str(int(parts[4]) * 10 + int(parts[3][-1]))
+            o.append('|'.join(parts))
+        except:
+            print(line)
+            break
+
+    f = open(data_dir + 'supplier.tbl.p','w')
+    for line in o:
+        f.write(line)
+    f.close()
+
+    # process customers
+    lines = open(data_dir + 'customer.tbl').readlines()
+    o = []
+    for line in lines:
+        try:
+            parts = line.split('|')
+            parts[4] = str(nations.index(parts[4]))
+            parts[5] = str(regions.index(parts[5]))
+            parts[3] = str(int(parts[4]) * 10 + int(parts[3][-1]))
+            o.append('|'.join(parts))
+        except:
+            print(line)
+            break
+
+    f = open(data_dir + 'customer.tbl.p','w')
+    for line in o:
+        f.write(line)
+    f.close()
+
+    # process parts
+    lines = open(data_dir + 'part.tbl').readlines()
+    o = []
+    for line in lines:
+        try:
+            parts = line.split('|')
+            parts[2] = parts[2].split('#')[-1]
+            parts[3] = parts[3].split('#')[-1]
+            parts[4] = parts[4].split('#')[-1]
+            o.append('|'.join(parts))
+        except:
+            print(line)
+            break
+
+    f = open(data_dir + 'part.tbl.p','w')
+    for line in o:
+        f.write(line)
+    f.close()
+
diff --git a/data/ssb/loader/dict.c b/data/ssb/loader/dict.c
new file mode 100644
index 0000000..a084cb0
--- /dev/null
+++ b/data/ssb/loader/dict.c
@@ -0,0 +1,235 @@
+
+/*
+   Copyright (c) 2012-2013 The Ohio State University.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <string.h>
+#include "../include/common.h"
+
+#define	HHSIZE	(1024*1024)
+
+/*
+ * @file dict.c
+ * Compress lineorder columns used in the Star Schema Benchmark using dictionary encoding.
+ */
+
+/* Dictionary-encodes each block of an uncompressed int column file into argv[2]. */
+int main(int argc, char ** argv){
+	int res = 0;
+
+	if(argc !=3 ){
+		printf("Usage: dictCompression inputColumn outputColumn\n");
+		exit(-1);
+	}
+
+	struct columnHeader header;
+	struct dictHeader dHeader;
+
+	int inFd, outFd;
+	long size, tupleNum;
+	int numOfDistinct = 0;
+
+	inFd = open(argv[1],O_RDONLY);
+
+	if(inFd == -1){
+		printf("Failed to open the input column\n");
+		exit(-1);
+	}
+
+	read(inFd, &header, sizeof(struct columnHeader));
+
+	if(header.format != UNCOMPRESSED){
+		printf("The column has already been compressed. Nested Compression not supported yet\n");
+		exit(-1);
+	}
+
+	/* inFd stays open for the whole loop: every block is read through it */
+	long tupleOffset = 0;
+	int blockTotal = header.blockTotal;
+	long offset = 0;
+
+	for(int j=0;j<blockTotal;j++){
+		offset = j* sizeof(struct columnHeader) + tupleOffset * sizeof(int);
+		lseek(inFd,offset,SEEK_SET);
+		read(inFd,&header, sizeof(struct columnHeader));
+		header.format = DICT;
+		offset += sizeof(struct columnHeader);
+
+		tupleNum = header.tupleNum;
+		size = tupleNum * sizeof(int);
+		char * content = (char *) malloc(size);
+		if(!content){
+			printf("Failed to allocate memory to accomodate the column\n");
+			exit(-1);
+		}
+		char *table = (char *) mmap(0,size,PROT_READ,MAP_SHARED,inFd,offset);
+		if(table == MAP_FAILED){
+			printf("Failed to map the input column\n");
+			exit(-1);
+		}
+		memcpy(content, table, size);
+		munmap(table,size);
+		numOfDistinct = 0;	/* the hash table below is rebuilt from scratch for every block */
+		static int hashTable[HHSIZE];	/* HHSIZE ints: static keeps it off the stack */
+		memset(hashTable,-1,sizeof(int) * HHSIZE);
+		for(int i=0;i<tupleNum;i++){
+			/* NOTE(review): a negative key yields a negative hKey, and -1 doubles as "empty" */
+			int key = ((int *)content)[i];
+			int hKey = key % HHSIZE;
+
+			if(hashTable[hKey] == -1){
+				numOfDistinct ++;
+				hashTable[hKey] = key;
+			}else{
+				if(hashTable[hKey] == key)
+					continue;
+
+				int j = 1;
+				while(hashTable[hKey] != -1 && hashTable[hKey] != key){
+					hKey = key % (HHSIZE + j*111) % HHSIZE;
+					j = j+1;
+				}
+
+				if(hashTable[hKey] == -1){
+					hashTable[hKey] = key;
+					numOfDistinct ++;
+				}
+			}
+		}
+
+		if(numOfDistinct > MAX_DICT_NUM)
+			goto END;
+
+		int numOfBits =1 ;
+
+		while((1 << numOfBits) < numOfDistinct){
+			numOfBits ++;
+		}
+		/* round the code width up to whole bytes */
+		while(numOfBits % 8 !=0)
+			numOfBits ++;
+
+		if(numOfBits >= sizeof(int) * 8)
+			goto END;
+		/* stride = how many codes are packed into one output int */
+		int stride = sizeof(int) * 8 / numOfBits;
+
+		if(stride <= 1)
+			goto END;
+
+		dHeader.dictNum = numOfDistinct;
+		dHeader.bitNum = numOfBits;
+
+		int * result = (int *) malloc(sizeof(int) * numOfDistinct);
+		if(!result){
+			printf("failed to allocate memory for result hash\n");
+			exit(-1);
+		}
+
+		memset(result, -1, sizeof(int) * numOfDistinct);
+		/* re-hash the distinct keys into a table of exactly numOfDistinct slots */
+		for(int i=0; i<HHSIZE;i++){
+			if(hashTable[i] == -1)
+				continue;
+
+			int key = hashTable[i];
+			int hKey = key % numOfDistinct;
+			if( result[hKey] == -1){
+				result[hKey] = key;
+			}else{
+				int j = 1;
+				while(result[hKey] !=-1){
+					hKey = key % (numOfDistinct + 111*j) % numOfDistinct;
+					j++;
+				}
+				result[hKey] = key;
+			}
+		}
+
+		for(int i=0;i<numOfDistinct;i++){
+			dHeader.hash[i] = result[i];
+		}
+
+		/* The first block starts the output afresh and later blocks are appended to it. */
+		/* O_CREAT requires the explicit mode argument. */
+		outFd = open(argv[2],O_RDWR|O_CREAT|(j == 0 ? O_TRUNC : O_APPEND),S_IRUSR|S_IWUSR);
+		if(outFd == -1){
+			printf("Failed to create output column\n");
+			exit(-1);
+		}
+
+		long outOffset = 0;
+		int * tmp = (int *) malloc(sizeof(int) * tupleNum);
+		if(!tmp){
+			printf("failed to allocate memory for the encoded column\n");
+			exit(-1);
+		}
+		for(int i=0; i<tupleNum; i+= stride){
+			int outInt = 0;
+			for(int k=0;k<stride;k++){
+				if((i+k) >= tupleNum)
+					break;
+
+				int key = ((int *)content)[k+i];
+				int hKey = key % numOfDistinct;
+				/* follows the same probe sequence used when the key was inserted into result */
+				int j = 1;
+				while(result[hKey] != key){
+					hKey = key % (numOfDistinct + 111 * j) % numOfDistinct;
+					j++;
+				}
+
+				hKey = hKey & 0xFFFF;
+				memcpy((char *)(&outInt) + k*dHeader.bitNum/8, &hKey, dHeader.bitNum/8);
+
+			}
+
+			tmp[outOffset] = outInt;
+			outOffset += 1;
+
+		}
+
+		long blockSize = (4095 + sizeof(struct dictHeader) + outOffset * sizeof(int)) /4096 * 4096;
+		header.blockSize = blockSize;
+
+		write(outFd, &header, sizeof(struct columnHeader));
+		write(outFd, &dHeader, sizeof(struct dictHeader));
+		write(outFd,tmp,outOffset*sizeof(int));
+		/* zero-pad so that dictHeader plus the codes fill blockSize (a multiple of 4096) */
+		char buf[4096];
+		memset(buf,0,sizeof(buf));
+		int paddingSize = blockSize - sizeof(struct dictHeader) - outOffset * sizeof(int);
+		write(outFd, buf, paddingSize);
+
+		close(outFd);
+		free(content);
+		free(tmp);
+		free(result);
+
+		tupleOffset += tupleNum;
+	}
+
+END:
+	close(inFd);
+	return res;
+}
diff --git a/data/ssb/loader/include/common.h b/data/ssb/loader/include/common.h
new file mode 100644
index 0000000..3f27214
--- /dev/null
+++ b/data/ssb/loader/include/common.h
@@ -0,0 +1,245 @@
+/*
+    Copyright (c) 2012-2013 The Ohio State University.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+#ifndef __SSB_COMMON__
+#define __SSB_COMMON__
+
+#define BILLION     1000000000
+#define BLOCKNUM    (100*1024*1024)   /* max tuples per column block; the loader starts a new block beyond this */
+#define HSIZE 131072                  /* 2^17 */
+
+#define CHECK_POINTER(p)   do {                     \
+    if(p == NULL){                                  \
+        perror("Failed to allocate host memory");   \
+        exit(-1);                                   \
+    }} while(0)   /* perror + exit(-1) when p is NULL; p is expanded unparenthesized, so pass a plain pointer expression */
+
+#define NP2(n)              do {                    \
+    (n)--;                                          \
+    (n) |= (n) >> 1;                                \
+    (n) |= (n) >> 2;                                \
+    (n) |= (n) >> 4;                                \
+    (n) |= (n) >> 8;                                \
+    (n) |= (n) >> 16;                               \
+    (n)++; } while (0)   /* in place: rounds the positive integer lvalue n (evaluated several times) up to a power of two; shifts cover 32 bits */
+
+
+enum {   /* anonymous: every constant below shares one consecutive numbering that starts at RLE = 0 */
+
+/* data format */
+    RLE = 0,
+    DICT,
+    DELTA,
+    UNCOMPRESSED,
+
+/* data type supported in schema */
+    INT,
+    FLOAT,
+    STRING,
+
+/* supported relation in exp */
+    EQ,
+    GTH,
+    LTH,
+    GEQ,
+    LEQ,
+    NOT_EQ,
+
+/* for where condition */
+    AND ,
+    OR,
+    EXP,
+    EXPSUB,                 /* the where exp is a math exp, and the column is correlated */
+
+/* supported groupby function */
+    MIN,
+    MAX,
+    COUNT,
+    SUM,
+    AVG,
+
+/* supported math operation */
+    PLUS,
+    MINUS,
+    MULTIPLY,
+    DIVIDE,
+
+/* op type for mathExp */
+    COLUMN,
+    CONS,
+
+/* data position */
+    GPU,
+    MEM,
+    PINNED,
+    UVA,
+    MMAP,
+    DISK,
+    TMPFILE,
+
+/* order by sequence */
+    ASC,
+    DESC,
+
+    NOOP   /* last constant of the shared numbering */
+};
+
+/* header of each block in the column */ 
+
+struct columnHeader{    /* the loader writes this struct as raw bytes in front of every block */
+    long totalTupleNum; /* the total number of tuples in this column */
+    long tupleNum;      /* the number of tuples in this block */
+    long blockSize;     /* the size of the block in bytes, not counting this header */
+    int blockTotal;     /* the total number of blocks that this column is divided into */
+    int blockId;        /* the block id of the current block */
+    int format;         /* the format of the current block */
+    char padding[4060]; /* reserved for further use; brings sizeof to 4096 when long is 8 bytes */
+};
+
+/*
+ * If the column is compressed using the dictionary encoding method,
+ * a dictHeader follows the columnHeader.
+ * The size of the dictHeader varies depending on the size of the dictionary.
+ */
+
+#define MAX_DICT_NUM    30000   /* capacity of dictHeader.hash */
+
+struct dictHeader{
+    int dictNum;                /* number of distinct values in the dictionary */ 
+    int bitNum;                 /* the number of bits used to compress the data */
+    int hash[MAX_DICT_NUM];     /* the hash table to store the dictionaries; fixed capacity, so sizeof(struct dictHeader) is constant */
+};
+
+struct rleHeader{
+    int dictNum;    /* NOTE(review): undocumented and unused in the visible code; presumably an entry count for RLE blocks — confirm */
+};
+
+struct whereExp{
+    int index;          /* NOTE(review): presumably the index of the column being tested — confirm */
+    int relation;       /* presumably one of the relation constants EQ .. NOT_EQ — confirm */
+    char content[32];   /* presumably the raw bytes of the comparison operand — confirm */
+};
+
+struct whereCondition{
+    int andOr;                      /* NOTE(review): presumably AND or OR from the enum above — confirm */
+    int nested;                     /* NOTE(review): presumably non-zero when con holds a nested condition — confirm */
+    int nestedRel;                  /* NOTE(review): meaning not visible here — confirm against the users of this struct */
+    int expNum;                     /* presumably the number of entries in exp — confirm */
+    struct whereExp *exp;
+    struct whereCondition * con;
+};
+
+struct scanNode{
+    struct tableNode *tn ;          /* the tableNode to be scanned */
+    int hasWhere;                   /* whether the node has where condition */
+    int outputNum;                  /* the number of attributes that will be projected */
+    int *outputIndex;               /* the index of projected attributes in the tableNode */
+    int whereAttrNum;               /* the number of attributes in the where condition */
+    int * whereIndex;               /* the index of each col in the table */
+    struct whereCondition * filter; /* the where condition */
+    int keepInGpu;                  /* whether all the results should be kept in GPU memory or not */
+
+};
+
+/*
+ * For dedup, we currently only support integers
+ */
+
+struct dedupNode{
+    struct tableNode *tn;           /* presumably the table that owns the column — confirm */
+    int index;                      /* the index of the column that needs to be deduped */
+};
+
+struct vecNode{
+    struct tableNode *tn;           /* NOTE(review): undocumented; presumably the table being processed — confirm */
+    int index;                      /* NOTE(review): presumably a column index into tn — confirm */
+};
+
+struct mathExp {
+    int op;             /* the math operation */
+    int opNum;          /* the number of operands */
+
+    long exp;           /* if the opNum is 2, this field stores pointer that points to the two operands whose type is mathExp; relies on long being as wide as a pointer */
+
+/* when opNum is 1 */
+    int opType;         /* whether it is a regular column or a constant */
+    int opValue;        /* it is the index of the column or the value of the constant */
+};
+
+struct tableNode{
+    int totalAttr;          /* the total number of attributes */
+    long tupleNum;          /* the number of tuples in the relation */
+    int tupleSize;          /* the size of each tuple */
+    int * attrType;         /* the type of each attribute */
+    int * attrSize;         /* the size of each attribute */
+    int * attrTotalSize;    /* the total size of each attribute */
+    int * attrIndex;        /* the index of each attribute in the table */
+    char **content;         /* the actual content of each attribute, organized by columns */
+    int * dataPos;          /* the position of the data, whether in disk, memory or GPU global memory */
+    int * dataFormat;       /* the format of each column */
+
+};
+
+struct groupByExp{
+    int func;               /* the group by function */
+    struct mathExp exp;     /* the math exp the function is applied to; embedded by value */
+};
+
+struct groupByNode{
+    struct tableNode * table;   /* the table node to be grouped by */
+
+    int groupByColNum;          /* the number of columns that will be grouped on */
+    int * groupByIndex;         /* the index of the columns that will be grouped on, -1 means group by a constant */
+    int * groupByType;          /* the type of the group by column */
+    int * groupBySize;          /* the size of the group by column */
+
+    int outputAttrNum;          /* the number of output attributes */
+    int *attrType;              /* the type of each output attribute */
+    int *attrSize;              /* the size of each output attribute */
+    struct groupByExp * gbExp;  /* the group by expression */
+
+    int tupleSize;              /* the size of the tuple in the join result (NOTE(review): probably means the group by result — confirm) */
+
+    int * keepInGpu;            /* whether the results should be kept in gpu */
+
+};
+
+struct sortRecord{              /* a (key, position) pair of unsigned ints */
+    unsigned int key;           /* the key to be sorted */
+    unsigned int pos;           /* the position of the corresponding record */
+};
+
+struct orderByNode{
+    struct tableNode * table;   /* the table node to be ordered by */
+
+    int orderByNum;             /* the number of columns that will be ordered on */
+    int *orderBySeq;            /* asc or desc (presumably the ASC / DESC enum constants — confirm) */
+    int *orderByIndex;          /* the index of each order by column in the table */
+
+};
+
+struct materializeNode{         /* holds nothing but the table pointer */
+    struct tableNode * table;   /* the table node to be materialized */
+};
+
+struct statistic{   /* NOTE(review): undocumented; presumably accumulated timings, units not visible here — confirm */
+    float kernel;   /* presumably time spent in GPU kernels — confirm */
+    float pcie;     /* presumably time spent in host/device transfers — confirm */
+    float total;    /* presumably overall time — confirm */
+};
+
+
+#endif
diff --git a/data/ssb/loader/include/schema.h b/data/ssb/loader/include/schema.h
new file mode 100644
index 0000000..3fa83fd
--- /dev/null
+++ b/data/ssb/loader/include/schema.h
@@ -0,0 +1,77 @@
+/* This file is generated by code_gen.py */
+#ifndef __SCHEMA_H__
+#define __SCHEMA_H__
+	struct supplier {	/* one supplier row; the loader writes each field to its own column file */
+		int s_suppkey;
+		char s_name[25];	/* char fields are fixed-width: the loader writes exactly sizeof(field) bytes per tuple */
+		char s_address[25];
+		char s_city[10];
+		char s_nation[15];
+		char s_region[12];
+		char s_phone[15];
+	};
+
+	struct customer {	/* one customer row; the loader writes each field to its own column file */
+		int c_custkey;
+		char c_name[25];	/* char fields are fixed-width: the loader writes exactly sizeof(field) bytes per tuple */
+		char c_address[25];
+		char c_city[10];
+		char c_nation[15];
+		char c_region[12];
+		char c_phone[15];
+		char c_mktsegment[10];
+	};
+
+	struct part {	/* one part row; the loader writes each field to its own column file */
+		int p_partkey;
+		char p_name[22];	/* char fields are fixed-width: the loader writes exactly sizeof(field) bytes per tuple */
+		char p_mfgr[6];
+		char p_category[7];
+		char p_brand1[9];
+		char p_color[11];
+		char p_type[25];
+		int p_size;
+		char p_container[10];
+	};
+
+	struct ddate {	/* one date-dimension row; the loader writes each field to its own column file */
+		int d_datekey;
+		char d_date[18];	/* char fields are fixed-width: the loader writes exactly sizeof(field) bytes per tuple */
+		char d_dayofweek[8];
+		char d_month[9];
+		int d_year;
+		int d_yearmonthnum;
+		char d_yearmonth[7];
+		int d_daynuminweek;
+		int d_daynuminmonth;
+		int d_daynuminyear;
+		int d_monthnuminyear;
+		int d_weeknuminyear;
+		char d_sellingseason[12];
+		char d_lastdayinweekfl[1];	/* the four flag fields are a single byte each: no room for a NUL terminator */
+		char d_lastdayinmonthfl[1];
+		char d_holidayfl[1];
+		char d_weekdayfl[1];
+	};
+
+	struct lineorder {	/* one lineorder (fact table) row; presumably loaded column by column like the other tables — confirm */
+		int lo_orderkey;
+		int lo_linenumber;
+		int lo_custkey;
+		int lo_partkey;
+		int lo_suppkey;
+		int lo_orderdate;
+		char lo_orderpriority[16];
+		char lo_shippriority[1];	/* a single byte: no room for a NUL terminator */
+		int lo_quantity;
+		int lo_extendedprice;
+		int lo_ordtotalprice;
+		int lo_discount;
+		int lo_revenue;
+		int lo_supplycost;
+		int lo_tax;
+		int lo_commitdate;
+		char lo_shipmode[10];
+	};
+
+#endif
diff --git a/data/ssb/loader/load.c b/data/ssb/loader/load.c
new file mode 100644
index 0000000..eaadeb6
--- /dev/null
+++ b/data/ssb/loader/load.c
@@ -0,0 +1,1091 @@
+/* This file is generated by code_gen.py */
+#define _FILE_OFFSET_BITS       64
+#define _LARGEFILE_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <error.h>
+#include <unistd.h>
+#include <string.h>
+#include <getopt.h>
+#include <linux/limits.h>
+#include "../include/schema.h"
+#include "../include/common.h"
+
+#define CHECK_POINTER(p) do {\
+  if(p == NULL){   \
+    perror("Failed to allocate host memory");    \
+    exit(-1);  \
+  }} while(0)   /* repeats the definition from common.h; the two must stay token-identical */
+static char delimiter = '|';   /* field separator the table loaders below split each input line on */
+/* Split the delimiter-separated supplier text read from fp into 7 binary column files
+ * named <outName>0 .. <outName>6. Every block of at most BLOCKNUM tuples is preceded by
+ * a struct columnHeader; string columns are stored fixed-width and zero-padded.
+ * Calls exit(-1) if an output file cannot be opened. */
+void supplier (FILE *fp, char *outName){
+
+  struct supplier tmp;
+  char data [1024] = {0};
+  char buf[1024] = {0};
+  int count = 0, i = 0,prev = 0;
+  long tupleCount =0, tupleRemain = 0, tupleUnit = 0;
+  FILE * out[7];
+
+  for(i=0;i<7;i++){
+    char path[PATH_MAX] = {0};
+    snprintf(path,sizeof(path),"%s%d",outName,i);   /* bounded: outName comes from the caller */
+    out[i] = fopen(path, "w");
+    if(!out[i]){
+      printf("Failed to open %s\n",path);
+      exit(-1);
+    }
+  }
+
+  struct columnHeader header = {0};   /* zeroed so the padding written to disk is not stack garbage */
+  long tupleNum = 0;
+  while(fgets(buf,sizeof(buf),fp) !=NULL)   /* first pass: count the input lines (one tuple per line) */
+    tupleNum ++;
+
+  header.totalTupleNum = tupleNum;
+  tupleRemain = tupleNum;
+  tupleUnit = (tupleNum > BLOCKNUM) ? BLOCKNUM : tupleNum;
+  header.tupleNum = tupleUnit;
+  header.format = UNCOMPRESSED;
+  header.blockId = 0;
+  header.blockTotal = (tupleNum + BLOCKNUM -1) / BLOCKNUM ;
+  fseek(fp,0,SEEK_SET);
+  header.blockSize = header.tupleNum * sizeof(int);
+  fwrite(&header, sizeof(struct columnHeader), 1, out[0]);
+  header.blockSize = header.tupleNum * 25;
+  fwrite(&header, sizeof(struct columnHeader), 1, out[1]);
+  header.blockSize = header.tupleNum * 25;
+  fwrite(&header, sizeof(struct columnHeader), 1, out[2]);
+  header.blockSize = header.tupleNum * 10;
+  fwrite(&header, sizeof(struct columnHeader), 1, out[3]);
+  header.blockSize = header.tupleNum * 15;
+  fwrite(&header, sizeof(struct columnHeader), 1, out[4]);
+  header.blockSize = header.tupleNum * 12;
+  fwrite(&header, sizeof(struct columnHeader), 1, out[5]);
+  header.blockSize = header.tupleNum * 15;
+  fwrite(&header, sizeof(struct columnHeader), 1, out[6]);
+  while(fgets(buf,sizeof(buf),fp)!= NULL){
+    int writeHeader = 0;
+    tupleCount ++;
+    if(tupleCount > BLOCKNUM){   /* current block is full: start the next one */
+      tupleCount = 1;
+      tupleRemain -= BLOCKNUM;
+      tupleUnit = (tupleRemain > BLOCKNUM) ? BLOCKNUM : tupleRemain;
+      header.tupleNum = tupleUnit;
+      header.blockId ++;
+      writeHeader = 1;
+    }
+    for(i = 0, prev = 0,count=0; buf[i] !='\n' && buf[i] != '\0';i++){   /* also stop at NUL: the last line may lack '\n' */
+      if (buf[i] == delimiter){
+        memset(data,0,sizeof(data));
+        strncpy(data,buf+prev,i-prev);
+        prev = i+1;
+        switch(count){
+           case 0:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * sizeof(int);
+              fwrite(&header,sizeof(struct columnHeader),1,out[0]);
+            }
+            tmp.s_suppkey = strtol(data,NULL,10);
+            fwrite(&(tmp.s_suppkey),sizeof(int),1,out[0]);
+            break;
+           case 1:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * 25;
+              fwrite(&header,sizeof(struct columnHeader),1,out[1]);
+            }
+            memcpy(tmp.s_name,data,sizeof(tmp.s_name));   /* data is zero-filled: copies the field zero-padded, cut to the column width */
+            fwrite(&(tmp.s_name),sizeof(tmp.s_name), 1, out[1]);
+            break;
+           case 2:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * 25;
+              fwrite(&header,sizeof(struct columnHeader),1,out[2]);
+            }
+            memcpy(tmp.s_address,data,sizeof(tmp.s_address));
+            fwrite(&(tmp.s_address),sizeof(tmp.s_address), 1, out[2]);
+            break;
+           case 3:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * 10;
+              fwrite(&header,sizeof(struct columnHeader),1,out[3]);
+            }
+            memcpy(tmp.s_city,data,sizeof(tmp.s_city));
+            fwrite(&(tmp.s_city),sizeof(tmp.s_city), 1, out[3]);
+            break;
+           case 4:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * 15;
+              fwrite(&header,sizeof(struct columnHeader),1,out[4]);
+            }
+            memcpy(tmp.s_nation,data,sizeof(tmp.s_nation));
+            fwrite(&(tmp.s_nation),sizeof(tmp.s_nation), 1, out[4]);
+            break;
+           case 5:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * 12;
+              fwrite(&header,sizeof(struct columnHeader),1,out[5]);
+            }
+            memcpy(tmp.s_region,data,sizeof(tmp.s_region));
+            fwrite(&(tmp.s_region),sizeof(tmp.s_region), 1, out[5]);
+            break;
+           case 6:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * 15;
+              fwrite(&header,sizeof(struct columnHeader),1,out[6]);
+            }
+            memcpy(tmp.s_phone,data,sizeof(tmp.s_phone));
+            fwrite(&(tmp.s_phone),sizeof(tmp.s_phone), 1, out[6]);
+            break;
+        }
+        count++;
+      }
+    }
+    if(count == 6){   /* last field when the line has no trailing delimiter */
+      if(writeHeader == 1){
+        header.blockSize = header.tupleNum * 15;
+        fwrite(&header,sizeof(struct columnHeader),1,out[6]);
+      }
+      memset(data,0,sizeof(data));
+      strncpy(data,buf+prev,i-prev);
+      memcpy(tmp.s_phone,data,sizeof(tmp.s_phone));
+      fwrite(&(tmp.s_phone),sizeof(tmp.s_phone), 1, out[6]);
+    }
+  }
+
+  for(i=0;i<7;i++){
+    fclose(out[i]);
+  }
+
+}
+
+/* Split the delimiter-separated customer text read from fp into 8 binary column files
+ * named <outName>0 .. <outName>7. Every block of at most BLOCKNUM tuples is preceded by
+ * a struct columnHeader; string columns are stored fixed-width and zero-padded.
+ * Calls exit(-1) if an output file cannot be opened. */
+void customer (FILE *fp, char *outName){
+
+  struct customer tmp;
+  char data [1024] = {0};
+  char buf[1024] = {0};
+  int count = 0, i = 0,prev = 0;
+  long tupleCount =0, tupleRemain = 0, tupleUnit = 0;
+  FILE * out[8];
+
+  for(i=0;i<8;i++){
+    char path[PATH_MAX] = {0};
+    snprintf(path,sizeof(path),"%s%d",outName,i);   /* bounded: outName comes from the caller */
+    out[i] = fopen(path, "w");
+    if(!out[i]){
+      printf("Failed to open %s\n",path);
+      exit(-1);
+    }
+  }
+
+  struct columnHeader header = {0};   /* zeroed so the padding written to disk is not stack garbage */
+  long tupleNum = 0;
+  while(fgets(buf,sizeof(buf),fp) !=NULL)   /* first pass: count the input lines (one tuple per line) */
+    tupleNum ++;
+
+  header.totalTupleNum = tupleNum;
+  tupleRemain = tupleNum;
+  tupleUnit = (tupleNum > BLOCKNUM) ? BLOCKNUM : tupleNum;
+  header.tupleNum = tupleUnit;
+  header.format = UNCOMPRESSED;
+  header.blockId = 0;
+  header.blockTotal = (tupleNum + BLOCKNUM -1) / BLOCKNUM ;
+  fseek(fp,0,SEEK_SET);
+  header.blockSize = header.tupleNum * sizeof(int);
+  fwrite(&header, sizeof(struct columnHeader), 1, out[0]);
+  header.blockSize = header.tupleNum * 25;
+  fwrite(&header, sizeof(struct columnHeader), 1, out[1]);
+  header.blockSize = header.tupleNum * 25;
+  fwrite(&header, sizeof(struct columnHeader), 1, out[2]);
+  header.blockSize = header.tupleNum * 10;
+  fwrite(&header, sizeof(struct columnHeader), 1, out[3]);
+  header.blockSize = header.tupleNum * 15;
+  fwrite(&header, sizeof(struct columnHeader), 1, out[4]);
+  header.blockSize = header.tupleNum * 12;
+  fwrite(&header, sizeof(struct columnHeader), 1, out[5]);
+  header.blockSize = header.tupleNum * 15;
+  fwrite(&header, sizeof(struct columnHeader), 1, out[6]);
+  header.blockSize = header.tupleNum * 10;
+  fwrite(&header, sizeof(struct columnHeader), 1, out[7]);
+  while(fgets(buf,sizeof(buf),fp)!= NULL){
+    int writeHeader = 0;
+    tupleCount ++;
+    if(tupleCount > BLOCKNUM){   /* current block is full: start the next one */
+      tupleCount = 1;
+      tupleRemain -= BLOCKNUM;
+      tupleUnit = (tupleRemain > BLOCKNUM) ? BLOCKNUM : tupleRemain;
+      header.tupleNum = tupleUnit;
+      header.blockId ++;
+      writeHeader = 1;
+    }
+    for(i = 0, prev = 0,count=0; buf[i] !='\n' && buf[i] != '\0';i++){   /* also stop at NUL: the last line may lack '\n' */
+      if (buf[i] == delimiter){
+        memset(data,0,sizeof(data));
+        strncpy(data,buf+prev,i-prev);
+        prev = i+1;
+        switch(count){
+           case 0:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * sizeof(int);
+              fwrite(&header,sizeof(struct columnHeader),1,out[0]);
+            }
+            tmp.c_custkey = strtol(data,NULL,10);
+            fwrite(&(tmp.c_custkey),sizeof(int),1,out[0]);
+            break;
+           case 1:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * 25;
+              fwrite(&header,sizeof(struct columnHeader),1,out[1]);
+            }
+            memcpy(tmp.c_name,data,sizeof(tmp.c_name));   /* data is zero-filled: copies the field zero-padded, cut to the column width */
+            fwrite(&(tmp.c_name),sizeof(tmp.c_name), 1, out[1]);
+            break;
+           case 2:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * 25;
+              fwrite(&header,sizeof(struct columnHeader),1,out[2]);
+            }
+            memcpy(tmp.c_address,data,sizeof(tmp.c_address));
+            fwrite(&(tmp.c_address),sizeof(tmp.c_address), 1, out[2]);
+            break;
+           case 3:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * 10;
+              fwrite(&header,sizeof(struct columnHeader),1,out[3]);
+            }
+            memcpy(tmp.c_city,data,sizeof(tmp.c_city));
+            fwrite(&(tmp.c_city),sizeof(tmp.c_city), 1, out[3]);
+            break;
+           case 4:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * 15;
+              fwrite(&header,sizeof(struct columnHeader),1,out[4]);
+            }
+            memcpy(tmp.c_nation,data,sizeof(tmp.c_nation));
+            fwrite(&(tmp.c_nation),sizeof(tmp.c_nation), 1, out[4]);
+            break;
+           case 5:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * 12;
+              fwrite(&header,sizeof(struct columnHeader),1,out[5]);
+            }
+            memcpy(tmp.c_region,data,sizeof(tmp.c_region));
+            fwrite(&(tmp.c_region),sizeof(tmp.c_region), 1, out[5]);
+            break;
+           case 6:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * 15;
+              fwrite(&header,sizeof(struct columnHeader),1,out[6]);
+            }
+            memcpy(tmp.c_phone,data,sizeof(tmp.c_phone));
+            fwrite(&(tmp.c_phone),sizeof(tmp.c_phone), 1, out[6]);
+            break;
+           case 7:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * 10;
+              fwrite(&header,sizeof(struct columnHeader),1,out[7]);
+            }
+            memcpy(tmp.c_mktsegment,data,sizeof(tmp.c_mktsegment));   /* strcpy of a full-width value wrote a NUL past the end of tmp */
+            fwrite(&(tmp.c_mktsegment),sizeof(tmp.c_mktsegment), 1, out[7]);
+            break;
+        }
+        count++;
+      }
+    }
+    if(count == 7){   /* last field when the line has no trailing delimiter */
+      if(writeHeader == 1){
+        header.blockSize = header.tupleNum * 10;
+        fwrite(&header,sizeof(struct columnHeader),1,out[7]);
+      }
+      memset(data,0,sizeof(data));
+      strncpy(data,buf+prev,i-prev);
+      memcpy(tmp.c_mktsegment,data,sizeof(tmp.c_mktsegment));
+      fwrite(&(tmp.c_mktsegment),sizeof(tmp.c_mktsegment), 1, out[7]);
+    }
+  }
+
+  for(i=0;i<8;i++){
+    fclose(out[i]);
+  }
+
+}
+
+/* Split the delimiter-separated part text read from fp into 9 binary column files
+ * named <outName>0 .. <outName>8. Every block of at most BLOCKNUM tuples is preceded by
+ * a struct columnHeader; string columns are stored fixed-width and zero-padded.
+ * Calls exit(-1) if an output file cannot be opened. */
+void part (FILE *fp, char *outName){
+
+  struct part tmp;
+  char data [1024] = {0};
+  char buf[1024] = {0};
+  int count = 0, i = 0,prev = 0;
+  long tupleCount =0, tupleRemain = 0, tupleUnit = 0;
+  FILE * out[9];
+
+  for(i=0;i<9;i++){
+    char path[PATH_MAX] = {0};
+    snprintf(path,sizeof(path),"%s%d",outName,i);   /* bounded: outName comes from the caller */
+    out[i] = fopen(path, "w");
+    if(!out[i]){
+      printf("Failed to open %s\n",path);
+      exit(-1);
+    }
+  }
+
+  struct columnHeader header = {0};   /* zeroed so the padding written to disk is not stack garbage */
+  long tupleNum = 0;
+  while(fgets(buf,sizeof(buf),fp) !=NULL)   /* first pass: count the input lines (one tuple per line) */
+    tupleNum ++;
+
+  header.totalTupleNum = tupleNum;
+  tupleRemain = tupleNum;
+  tupleUnit = (tupleNum > BLOCKNUM) ? BLOCKNUM : tupleNum;
+  header.tupleNum = tupleUnit;
+  header.format = UNCOMPRESSED;
+  header.blockId = 0;
+  header.blockTotal = (tupleNum + BLOCKNUM -1) / BLOCKNUM ;
+  fseek(fp,0,SEEK_SET);
+  header.blockSize = header.tupleNum * sizeof(int);
+  fwrite(&header, sizeof(struct columnHeader), 1, out[0]);
+  header.blockSize = header.tupleNum * 22;
+  fwrite(&header, sizeof(struct columnHeader), 1, out[1]);
+  header.blockSize = header.tupleNum * 6;
+  fwrite(&header, sizeof(struct columnHeader), 1, out[2]);
+  header.blockSize = header.tupleNum * 7;
+  fwrite(&header, sizeof(struct columnHeader), 1, out[3]);
+  header.blockSize = header.tupleNum * 9;
+  fwrite(&header, sizeof(struct columnHeader), 1, out[4]);
+  header.blockSize = header.tupleNum * 11;
+  fwrite(&header, sizeof(struct columnHeader), 1, out[5]);
+  header.blockSize = header.tupleNum * 25;
+  fwrite(&header, sizeof(struct columnHeader), 1, out[6]);
+  header.blockSize = header.tupleNum * sizeof(int);
+  fwrite(&header, sizeof(struct columnHeader), 1, out[7]);
+  header.blockSize = header.tupleNum * 10;
+  fwrite(&header, sizeof(struct columnHeader), 1, out[8]);
+  while(fgets(buf,sizeof(buf),fp)!= NULL){
+    int writeHeader = 0;
+    tupleCount ++;
+    if(tupleCount > BLOCKNUM){   /* current block is full: start the next one */
+      tupleCount = 1;
+      tupleRemain -= BLOCKNUM;
+      tupleUnit = (tupleRemain > BLOCKNUM) ? BLOCKNUM : tupleRemain;
+      header.tupleNum = tupleUnit;
+      header.blockId ++;
+      writeHeader = 1;
+    }
+    for(i = 0, prev = 0,count=0; buf[i] !='\n' && buf[i] != '\0';i++){   /* also stop at NUL: the last line may lack '\n' */
+      if (buf[i] == delimiter){
+        memset(data,0,sizeof(data));
+        strncpy(data,buf+prev,i-prev);
+        prev = i+1;
+        switch(count){
+           case 0:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * sizeof(int);
+              fwrite(&header,sizeof(struct columnHeader),1,out[0]);
+            }
+            tmp.p_partkey = strtol(data,NULL,10);
+            fwrite(&(tmp.p_partkey),sizeof(int),1,out[0]);
+            break;
+           case 1:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * 22;
+              fwrite(&header,sizeof(struct columnHeader),1,out[1]);
+            }
+            memcpy(tmp.p_name,data,sizeof(tmp.p_name));   /* data is zero-filled: copies the field zero-padded, cut to the column width */
+            fwrite(&(tmp.p_name),sizeof(tmp.p_name), 1, out[1]);
+            break;
+           case 2:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * 6;
+              fwrite(&header,sizeof(struct columnHeader),1,out[2]);
+            }
+            memcpy(tmp.p_mfgr,data,sizeof(tmp.p_mfgr));
+            fwrite(&(tmp.p_mfgr),sizeof(tmp.p_mfgr), 1, out[2]);
+            break;
+           case 3:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * 7;
+              fwrite(&header,sizeof(struct columnHeader),1,out[3]);
+            }
+            memcpy(tmp.p_category,data,sizeof(tmp.p_category));
+            fwrite(&(tmp.p_category),sizeof(tmp.p_category), 1, out[3]);
+            break;
+           case 4:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * 9;
+              fwrite(&header,sizeof(struct columnHeader),1,out[4]);
+            }
+            memcpy(tmp.p_brand1,data,sizeof(tmp.p_brand1));
+            fwrite(&(tmp.p_brand1),sizeof(tmp.p_brand1), 1, out[4]);
+            break;
+           case 5:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * 11;
+              fwrite(&header,sizeof(struct columnHeader),1,out[5]);
+            }
+            memcpy(tmp.p_color,data,sizeof(tmp.p_color));
+            fwrite(&(tmp.p_color),sizeof(tmp.p_color), 1, out[5]);
+            break;
+           case 6:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * 25;
+              fwrite(&header,sizeof(struct columnHeader),1,out[6]);
+            }
+            memcpy(tmp.p_type,data,sizeof(tmp.p_type));
+            fwrite(&(tmp.p_type),sizeof(tmp.p_type), 1, out[6]);
+            break;
+           case 7:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * sizeof(int);
+              fwrite(&header,sizeof(struct columnHeader),1,out[7]);
+            }
+            tmp.p_size = strtol(data,NULL,10);
+            fwrite(&(tmp.p_size),sizeof(int),1,out[7]);
+            break;
+           case 8:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * 10;
+              fwrite(&header,sizeof(struct columnHeader),1,out[8]);
+            }
+            memcpy(tmp.p_container,data,sizeof(tmp.p_container));
+            fwrite(&(tmp.p_container),sizeof(tmp.p_container), 1, out[8]);
+            break;
+        }
+        count++;
+      }
+    }
+    if(count == 8){   /* last field when the line has no trailing delimiter */
+      if(writeHeader == 1){
+        header.blockSize = header.tupleNum * 10;
+        fwrite(&header,sizeof(struct columnHeader),1,out[8]);
+      }
+      memset(data,0,sizeof(data));
+      strncpy(data,buf+prev,i-prev);
+      memcpy(tmp.p_container,data,sizeof(tmp.p_container));
+      fwrite(&(tmp.p_container),sizeof(tmp.p_container), 1, out[8]);
+    }
+  }
+
+  for(i=0;i<9;i++){
+    fclose(out[i]);
+  }
+
+}
+
+void ddate (FILE *fp, char *outName){
+
+  struct ddate tmp;
+  char data [1024] = {0};
+  char buf[1024] = {0};
+  int count = 0, i = 0,prev = 0;
+  long tupleCount =0, tupleRemain = 0, tupleUnit = 0;
+  FILE * out[17];
+
+  for(i=0;i<17;i++){
+    char path[PATH_MAX] = {0};
+    sprintf(path,"%s%d",outName,i);
+    out[i] = fopen(path, "w");
+    if(!out[i]){
+      printf("Failed to open %s\n",path);
+      exit(-1);
+    }
+  }
+
+  struct columnHeader header;
+  long tupleNum = 0;
+  while(fgets(buf,sizeof(buf),fp) !=NULL)
+    tupleNum ++;
+
+  header.totalTupleNum = tupleNum;
+  tupleRemain = tupleNum;
+  if(tupleNum > BLOCKNUM)
+    tupleUnit = BLOCKNUM;
+  else
+    tupleUnit = tupleNum;
+  header.tupleNum = tupleUnit;
+  header.format = UNCOMPRESSED;
+  header.blockId = 0;
+  header.blockTotal = (tupleNum + BLOCKNUM -1) / BLOCKNUM ;
+  fseek(fp,0,SEEK_SET);
+  header.blockSize = header.tupleNum * sizeof(int);
+  fwrite(&header, sizeof(struct columnHeader), 1, out[0]);
+  header.blockSize = header.tupleNum * 18;
+  fwrite(&header, sizeof(struct columnHeader), 1, out[1]);
+  header.blockSize = header.tupleNum * 8;
+  fwrite(&header, sizeof(struct columnHeader), 1, out[2]);
+  header.blockSize = header.tupleNum * 9;
+  fwrite(&header, sizeof(struct columnHeader), 1, out[3]);
+  header.blockSize = header.tupleNum * sizeof(int);
+  fwrite(&header, sizeof(struct columnHeader), 1, out[4]);
+  header.blockSize = header.tupleNum * sizeof(int);
+  fwrite(&header, sizeof(struct columnHeader), 1, out[5]);
+  header.blockSize = header.tupleNum * 7;
+  fwrite(&header, sizeof(struct columnHeader), 1, out[6]);
+  header.blockSize = header.tupleNum * sizeof(int);
+  fwrite(&header, sizeof(struct columnHeader), 1, out[7]);
+  header.blockSize = header.tupleNum * sizeof(int);
+  fwrite(&header, sizeof(struct columnHeader), 1, out[8]);
+  header.blockSize = header.tupleNum * sizeof(int);
+  fwrite(&header, sizeof(struct columnHeader), 1, out[9]);
+  header.blockSize = header.tupleNum * sizeof(int);
+  fwrite(&header, sizeof(struct columnHeader), 1, out[10]);
+  header.blockSize = header.tupleNum * sizeof(int);
+  fwrite(&header, sizeof(struct columnHeader), 1, out[11]);
+  header.blockSize = header.tupleNum * 12;
+  fwrite(&header, sizeof(struct columnHeader), 1, out[12]);
+  header.blockSize = header.tupleNum * 1;
+  fwrite(&header, sizeof(struct columnHeader), 1, out[13]);
+  header.blockSize = header.tupleNum * 1;
+  fwrite(&header, sizeof(struct columnHeader), 1, out[14]);
+  header.blockSize = header.tupleNum * 1;
+  fwrite(&header, sizeof(struct columnHeader), 1, out[15]);
+  header.blockSize = header.tupleNum * 1;
+  fwrite(&header, sizeof(struct columnHeader), 1, out[16]);
+  while(fgets(buf,sizeof(buf),fp)!= NULL){
+    int writeHeader = 0;
+    tupleCount ++;
+    if(tupleCount > BLOCKNUM){
+      tupleCount = 1;
+      tupleRemain -= BLOCKNUM;
+      if (tupleRemain > BLOCKNUM)
+        tupleUnit = BLOCKNUM;
+      else
+        tupleUnit = tupleRemain;
+      header.tupleNum = tupleUnit;
+      header.blockId ++;
+      writeHeader = 1;
+    }
+    for(i = 0, prev = 0,count=0; buf[i] !='\n';i++){
+      if (buf[i] == delimiter){
+        memset(data,0,sizeof(data));
+        strncpy(data,buf+prev,i-prev);
+        prev = i+1;
+        switch(count){
+           case 0:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * sizeof(int);
+              fwrite(&header,sizeof(struct columnHeader),1,out[0]);
+            }
+            tmp.d_datekey = strtol(data,NULL,10);
+            fwrite(&(tmp.d_datekey),sizeof(int),1,out[0]);
+            break;
+           case 1:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * 18;
+              fwrite(&header,sizeof(struct columnHeader),1,out[1]);
+            }
+            strcpy(tmp.d_date,data);
+            fwrite(&(tmp.d_date),sizeof(tmp.d_date), 1, out[1]);
+            break;
+           case 2:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * 8;
+              fwrite(&header,sizeof(struct columnHeader),1,out[2]);
+            }
+            strcpy(tmp.d_dayofweek,data);
+            fwrite(&(tmp.d_dayofweek),sizeof(tmp.d_dayofweek), 1, out[2]);
+            break;
+           case 3:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * 9;
+              fwrite(&header,sizeof(struct columnHeader),1,out[3]);
+            }
+            strcpy(tmp.d_month,data);
+            fwrite(&(tmp.d_month),sizeof(tmp.d_month), 1, out[3]);
+            break;
+           case 4:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * sizeof(int);
+              fwrite(&header,sizeof(struct columnHeader),1,out[4]);
+            }
+            tmp.d_year = strtol(data,NULL,10);
+            fwrite(&(tmp.d_year),sizeof(int),1,out[4]);
+            break;
+           case 5:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * sizeof(int);
+              fwrite(&header,sizeof(struct columnHeader),1,out[5]);
+            }
+            tmp.d_yearmonthnum = strtol(data,NULL,10);
+            fwrite(&(tmp.d_yearmonthnum),sizeof(int),1,out[5]);
+            break;
+           case 6:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * 7;
+              fwrite(&header,sizeof(struct columnHeader),1,out[6]);
+            }
+            strcpy(tmp.d_yearmonth,data);
+            fwrite(&(tmp.d_yearmonth),sizeof(tmp.d_yearmonth), 1, out[6]);
+            break;
+           case 7:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * sizeof(int);
+              fwrite(&header,sizeof(struct columnHeader),1,out[7]);
+            }
+            tmp.d_daynuminweek = strtol(data,NULL,10);
+            fwrite(&(tmp.d_daynuminweek),sizeof(int),1,out[7]);
+            break;
+           case 8:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * sizeof(int);
+              fwrite(&header,sizeof(struct columnHeader),1,out[8]);
+            }
+            tmp.d_daynuminmonth = strtol(data,NULL,10);
+            fwrite(&(tmp.d_daynuminmonth),sizeof(int),1,out[8]);
+            break;
+           case 9:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * sizeof(int);
+              fwrite(&header,sizeof(struct columnHeader),1,out[9]);
+            }
+            tmp.d_daynuminyear = strtol(data,NULL,10);
+            fwrite(&(tmp.d_daynuminyear),sizeof(int),1,out[9]);
+            break;
+           case 10:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * sizeof(int);
+              fwrite(&header,sizeof(struct columnHeader),1,out[10]);
+            }
+            tmp.d_monthnuminyear = strtol(data,NULL,10);
+            fwrite(&(tmp.d_monthnuminyear),sizeof(int),1,out[10]);
+            break;
+           case 11:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * sizeof(int);
+              fwrite(&header,sizeof(struct columnHeader),1,out[11]);
+            }
+            tmp.d_weeknuminyear = strtol(data,NULL,10);
+            fwrite(&(tmp.d_weeknuminyear),sizeof(int),1,out[11]);
+            break;
+           case 12:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * 12;
+              fwrite(&header,sizeof(struct columnHeader),1,out[12]);
+            }
+            strcpy(tmp.d_sellingseason,data);
+            fwrite(&(tmp.d_sellingseason),sizeof(tmp.d_sellingseason), 1, out[12]);
+            break;
+           case 13:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * 1;
+              fwrite(&header,sizeof(struct columnHeader),1,out[13]);
+            }
+            strcpy(tmp.d_lastdayinweekfl,data);
+            fwrite(&(tmp.d_lastdayinweekfl),sizeof(tmp.d_lastdayinweekfl), 1, out[13]);
+            break;
+           case 14:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * 1;
+              fwrite(&header,sizeof(struct columnHeader),1,out[14]);
+            }
+            strcpy(tmp.d_lastdayinmonthfl,data);
+            fwrite(&(tmp.d_lastdayinmonthfl),sizeof(tmp.d_lastdayinmonthfl), 1, out[14]);
+            break;
+           case 15:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * 1;
+              fwrite(&header,sizeof(struct columnHeader),1,out[15]);
+            }
+            strcpy(tmp.d_holidayfl,data);
+            fwrite(&(tmp.d_holidayfl),sizeof(tmp.d_holidayfl), 1, out[15]);
+            break;
+           case 16:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * 1;
+              fwrite(&header,sizeof(struct columnHeader),1,out[16]);
+            }
+            strcpy(tmp.d_weekdayfl,data);
+            fwrite(&(tmp.d_weekdayfl),sizeof(tmp.d_weekdayfl), 1, out[16]);
+            break;
+        }
+        count++;
+      }
+    }
+    if(count == 16){
+      if(writeHeader == 1){
+        header.blockSize = header.tupleNum * 1;
+        fwrite(&header,sizeof(struct columnHeader),1,out[16]);
+      }
+      strncpy(tmp.d_weekdayfl,buf+prev,i-prev);
+      fwrite(&(tmp.d_weekdayfl),sizeof(tmp.d_weekdayfl), 1, out[16]);
+    }
+  }
+
+  for(i=0;i<17;i++){
+    fclose(out[i]);
+  }
+
+}
+
+/*
+ * Converts the delimiter-separated LINEORDER text file `fp` into 17 binary
+ * column files named "<outName>0" .. "<outName>16", one per attribute.
+ *
+ * The input is read twice: once to count its lines and once to convert them.
+ * Tuples are grouped into blocks of at most BLOCKNUM; every block of every
+ * column is preceded by a struct columnHeader. Columns 6, 7 and 16
+ * (lo_orderpriority, lo_shippriority, lo_shipmode) are stored as fixed-width,
+ * zero-padded byte fields (over-long values are truncated to the field
+ * width); all other columns are parsed with strtol() and stored as binary
+ * ints.
+ *
+ * A line may or may not end with a trailing delimiter. Fields beyond column
+ * 16 are ignored. On a line with fewer fields only the delimiter-terminated
+ * fields are written.
+ *
+ * Exits the process with status -1 if an output file cannot be opened.
+ */
+void lineorder (FILE *fp, char *outName){
+
+  enum { COLUMNS = 17 };
+  struct lineorder tmp; /* only used through sizeof() to obtain text field widths */
+  char data [1024] = {0};
+  char buf[1024] = {0};
+  int count = 0, i = 0,prev = 0;
+  long tupleCount =0, tupleRemain = 0, tupleUnit = 0;
+  FILE * out[COLUMNS];
+  size_t textWidth[COLUMNS] = {0}; /* bytes stored per tuple; 0 marks a binary int column */
+  size_t headerWidth[COLUMNS];     /* per-tuple width multiplied into header.blockSize */
+
+  for(i=0;i<COLUMNS;i++)
+    headerWidth[i] = sizeof(int);
+  textWidth[6]  = sizeof(tmp.lo_orderpriority); headerWidth[6]  = 16;
+  textWidth[7]  = sizeof(tmp.lo_shippriority);  headerWidth[7]  = 1;
+  textWidth[16] = sizeof(tmp.lo_shipmode);      headerWidth[16] = 10;
+
+  for(i=0;i<COLUMNS;i++){
+    char path[PATH_MAX] = {0};
+    snprintf(path,sizeof(path),"%s%d",outName,i);
+    out[i] = fopen(path, "w");
+    if(!out[i]){
+      printf("Failed to open %s\n",path);
+      exit(-1);
+    }
+  }
+
+  /* Zero the header first so that no indeterminate bytes reach the files. */
+  struct columnHeader header;
+  memset(&header,0,sizeof(header));
+
+  /* First pass: count the tuples so the headers can carry the totals. */
+  long tupleNum = 0;
+  while(fgets(buf,sizeof(buf),fp) !=NULL)
+    tupleNum ++;
+
+  header.totalTupleNum = tupleNum;
+  tupleRemain = tupleNum;
+  if(tupleNum > BLOCKNUM)
+    tupleUnit = BLOCKNUM;
+  else
+    tupleUnit = tupleNum;
+  header.tupleNum = tupleUnit;
+  header.format = UNCOMPRESSED;
+  header.blockId = 0;
+  header.blockTotal = (tupleNum + BLOCKNUM -1) / BLOCKNUM ;
+  fseek(fp,0,SEEK_SET);
+
+  /* Header of the first block of every column. */
+  for(i=0;i<COLUMNS;i++){
+    header.blockSize = header.tupleNum * headerWidth[i];
+    fwrite(&header, sizeof(struct columnHeader), 1, out[i]);
+  }
+
+  /* Second pass: split every line into fields and append them to the columns. */
+  while(fgets(buf,sizeof(buf),fp)!= NULL){
+    int writeHeader = 0;
+    tupleCount ++;
+    if(tupleCount > BLOCKNUM){
+      /* The current block is full: start the next one with a fresh header. */
+      tupleCount = 1;
+      tupleRemain -= BLOCKNUM;
+      if (tupleRemain > BLOCKNUM)
+        tupleUnit = BLOCKNUM;
+      else
+        tupleUnit = tupleRemain;
+      header.tupleNum = tupleUnit;
+      header.blockId ++;
+      writeHeader = 1;
+    }
+    for(i = 0, prev = 0,count=0; ;i++){
+      /* Stop on '\0' as well: the last line may lack a newline, and fgets()
+       * cuts over-long lines without one. */
+      int atEnd = (buf[i] == '\n' || buf[i] == '\0');
+      if(!atEnd && buf[i] != delimiter)
+        continue;
+
+      /* A field ends at a delimiter; the end of the line only terminates a
+       * field when it is the last column (no trailing delimiter). */
+      if(count < COLUMNS && (!atEnd || count == COLUMNS - 1)){
+        size_t len = (size_t)(i - prev);
+        if(writeHeader == 1){
+          header.blockSize = header.tupleNum * headerWidth[count];
+          fwrite(&header,sizeof(struct columnHeader),1,out[count]);
+        }
+        memset(data,0,sizeof(data));
+        if(textWidth[count] != 0){
+          /* Fixed-width text: truncate to the field width, zero-pad the rest.
+           * Assumes every text field is narrower than data[]. */
+          if(len > textWidth[count])
+            len = textWidth[count];
+          memcpy(data,buf+prev,len);
+          fwrite(data,textWidth[count],1,out[count]);
+        }else{
+          /* len < sizeof(data) because buf has the same size, so data stays
+           * NUL-terminated for strtol(). */
+          int value;
+          memcpy(data,buf+prev,len);
+          value = strtol(data,NULL,10);
+          fwrite(&value,sizeof(int),1,out[count]);
+        }
+      }
+
+      if(atEnd)
+        break;
+      prev = i+1;
+      count++;
+    }
+  }
+
+  for(i=0;i<COLUMNS;i++){
+    fclose(out[i]);
+  }
+
+}
+
+/*
+ * Command-line driver of the loader.
+ *
+ *   --supplier | --customer | --part | --ddate | --lineorder <file>
+ *                      convert the given text file into column files
+ *   --delimiter <c>    field separator (first character of the argument)
+ *   --datadir <dir>    directory in which the column files are created
+ *
+ * The options are scanned twice. The first pass only picks up --datadir, so
+ * it applies regardless of its position. The second pass handles the other
+ * options in command-line order; consequently --delimiter only affects the
+ * tables named after it.
+ *
+ * Input files are opened before changing into the data directory, so relative
+ * input paths are resolved against the original working directory.
+ *
+ * Returns 0 on success; exits with status -1 on any failure.
+ */
+int main(int argc, char ** argv){
+
+  FILE * in = NULL;
+  int table;
+  int setPath = 0;
+  char path[PATH_MAX] = {0};
+  char cwd[PATH_MAX] = {0};
+
+  int long_index;
+  /* getopt_long() requires the option table to end with an all-zero entry. */
+  struct option long_options[] = {
+    {"supplier",required_argument,0,'0'},
+    {"customer",required_argument,0,'1'},
+    {"part",required_argument,0,'2'},
+    {"ddate",required_argument,0,'3'},
+    {"lineorder",required_argument,0,'4'},
+    {"delimiter",required_argument,0,'5'},
+    {"datadir",required_argument,0,'6'},
+    {0,0,0,0}
+  };
+
+  while((table=getopt_long(argc,argv,"",long_options,&long_index))!=-1){
+    switch(table){
+      case '6':
+        if(strlen(optarg) >= sizeof(path)){
+          printf("Data directory path is too long: %s\n",optarg);
+          exit(-1);
+        }
+        setPath = 1;
+        strcpy(path,optarg);
+        break;
+    }
+  }
+
+  optind=1;
+
+  /* The original directory is only needed when we chdir() away from it. */
+  if(setPath == 1 && getcwd(cwd,PATH_MAX) == NULL){
+    perror("getcwd");
+    exit(-1);
+  }
+
+  while((table=getopt_long(argc,argv,"",long_options,&long_index))!=-1){
+    switch(table){
+      case '0':
+      case '1':
+      case '2':
+      case '3':
+      case '4':
+        in = fopen(optarg,"r");
+        if(!in){
+          printf("Failed to open %s\n",optarg);
+          exit(-1);
+        }
+        /* Refuse to continue if the data directory cannot be entered;
+         * otherwise the column files would land in the wrong place. */
+        if (setPath == 1 && chdir(path) != 0){
+          perror(path);
+          exit(-1);
+        }
+        switch(table){
+          case '0':
+            supplier(in,"SUPPLIER");
+            break;
+          case '1':
+            customer(in,"CUSTOMER");
+            break;
+          case '2':
+            part(in,"PART");
+            break;
+          case '3':
+            ddate(in,"DDATE");
+            break;
+          case '4':
+            lineorder(in,"LINEORDER");
+            break;
+        }
+        /* Go back so that the next relative input path still resolves. */
+        if (setPath == 1 && chdir(cwd) != 0){
+          perror(cwd);
+          exit(-1);
+        }
+        fclose(in);
+        break;
+      case '5':
+        delimiter = optarg[0];
+        break;
+    }
+  }
+
+  return 0;
+}
+
diff --git a/data/ssb/loader/load_modified.c b/data/ssb/loader/load_modified.c
new file mode 100644
index 0000000..5c180ed
--- /dev/null
+++ b/data/ssb/loader/load_modified.c
@@ -0,0 +1,1096 @@
+/* This file is generated by code_gen.py */
+#define _FILE_OFFSET_BITS       64
+#define _LARGEFILE_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <error.h>
+#include <unistd.h>
+#include <string.h>
+#include <getopt.h>
+#include <linux/limits.h>
+#include "include/schema.h"
+#include "include/common.h"
+
+/*
+ * Aborts the program with status -1 (after reporting via perror) when the
+ * pointer expression `p` is NULL. The argument is parenthesised so that any
+ * expression can be passed safely.
+ */
+#define CHECK_POINTER(p) do {\
+  if((p) == NULL){   \
+    perror("Failed to allocate host memory");    \
+    exit(-1);  \
+  }} while(0)
+
+static char delimiter = '|';
+
+/*
+ * Converts the delimiter-separated SUPPLIER text file `fp` into 7 binary
+ * column files named "<outName>0" .. "<outName>6", one per attribute.
+ *
+ * Columns 0, 3, 4 and 5 (s_suppkey, s_city, s_nation, s_region) are parsed
+ * with strtol() and stored as binary ints. Columns 1, 2 and 6 (s_name,
+ * s_address, s_phone) are stored as fixed-width, zero-padded byte fields
+ * (over-long values are truncated to the field width).
+ *
+ * No struct columnHeader is written in front of the first block of a column;
+ * a header is only emitted when a new block starts after BLOCKNUM tuples.
+ * NOTE(review): that asymmetry, and the header widths 10/15/12 for columns
+ * 3-5 although they are stored as ints, are kept unchanged - presumably the
+ * consumers expect header-less files and tables smaller than BLOCKNUM; confirm.
+ * NOTE(review): s_city/s_nation/s_region presumably arrive already encoded as
+ * integers in the input; confirm against the data generator.
+ *
+ * A line may or may not end with a trailing delimiter. Fields beyond column 6
+ * are ignored. On a line with fewer fields only the delimiter-terminated
+ * fields are written.
+ *
+ * Exits the process with status -1 if an output file cannot be opened.
+ */
+void supplier (FILE *fp, char *outName){
+  enum { COLUMNS = 7 };
+  struct supplier tmp; /* only used through sizeof() to obtain text field widths */
+  char data [1024] = {0};
+  char buf[1024] = {0};
+  int count = 0, i = 0,prev = 0;
+  long tupleCount =0, tupleRemain = 0, tupleUnit = 0;
+  FILE * out[COLUMNS];
+  /* Bytes stored per tuple for text columns; 0 marks a binary int column. */
+  const size_t textWidth[COLUMNS] = {
+    0, sizeof(tmp.s_name), sizeof(tmp.s_address), 0, 0, 0, sizeof(tmp.s_phone)
+  };
+  /* Per-tuple width multiplied into header.blockSize. */
+  const size_t headerWidth[COLUMNS] = { sizeof(int), 25, 25, 10, 15, 12, 15 };
+
+  for(i=0;i<COLUMNS;i++){
+    char path[PATH_MAX] = {0};
+    snprintf(path,sizeof(path),"%s%d",outName,i);
+    out[i] = fopen(path, "w");
+    if(!out[i]){
+      printf("Failed to open %s\n",path);
+      exit(-1);
+    }
+  }
+
+  /* Zero the header first so that no indeterminate bytes reach the files. */
+  struct columnHeader header;
+  memset(&header,0,sizeof(header));
+
+  /* First pass: count the tuples so the headers can carry the totals. */
+  long tupleNum = 0;
+  while(fgets(buf,sizeof(buf),fp) !=NULL)
+    tupleNum ++;
+
+  header.totalTupleNum = tupleNum;
+  tupleRemain = tupleNum;
+  if(tupleNum > BLOCKNUM)
+    tupleUnit = BLOCKNUM;
+  else
+    tupleUnit = tupleNum;
+  header.tupleNum = tupleUnit;
+  header.format = UNCOMPRESSED;
+  header.blockId = 0;
+  header.blockTotal = (tupleNum + BLOCKNUM -1) / BLOCKNUM ;
+  fseek(fp,0,SEEK_SET);
+
+  /* Second pass: split every line into fields and append them to the columns. */
+  while(fgets(buf,sizeof(buf),fp)!= NULL){
+    int writeHeader = 0;
+    tupleCount ++;
+    if(tupleCount > BLOCKNUM){
+      /* The current block is full: start the next one with a header. */
+      tupleCount = 1;
+      tupleRemain -= BLOCKNUM;
+      if (tupleRemain > BLOCKNUM)
+        tupleUnit = BLOCKNUM;
+      else
+        tupleUnit = tupleRemain;
+      header.tupleNum = tupleUnit;
+      header.blockId ++;
+      writeHeader = 1;
+    }
+    for(i = 0, prev = 0,count=0; ;i++){
+      /* Stop on '\0' as well: the last line may lack a newline, and fgets()
+       * cuts over-long lines without one. */
+      int atEnd = (buf[i] == '\n' || buf[i] == '\0');
+      if(!atEnd && buf[i] != delimiter)
+        continue;
+
+      /* A field ends at a delimiter; the end of the line only terminates a
+       * field when it is the last column (no trailing delimiter). */
+      if(count < COLUMNS && (!atEnd || count == COLUMNS - 1)){
+        size_t len = (size_t)(i - prev);
+        if(writeHeader == 1){
+          header.blockSize = header.tupleNum * headerWidth[count];
+          fwrite(&header,sizeof(struct columnHeader),1,out[count]);
+        }
+        memset(data,0,sizeof(data));
+        if(textWidth[count] != 0){
+          /* Fixed-width text: truncate to the field width, zero-pad the rest.
+           * Assumes every text field is narrower than data[]. */
+          if(len > textWidth[count])
+            len = textWidth[count];
+          memcpy(data,buf+prev,len);
+          fwrite(data,textWidth[count],1,out[count]);
+        }else{
+          /* len < sizeof(data) because buf has the same size, so data stays
+           * NUL-terminated for strtol(). */
+          int value;
+          memcpy(data,buf+prev,len);
+          value = strtol(data,NULL,10);
+          fwrite(&value,sizeof(int),1,out[count]);
+        }
+      }
+
+      if(atEnd)
+        break;
+      prev = i+1;
+      count++;
+    }
+  }
+
+  for(i=0;i<COLUMNS;i++){
+    fclose(out[i]);
+  }
+
+}
+
+/*
+ * Converts the delimiter-separated CUSTOMER text file `fp` into 8 binary
+ * column files named "<outName>0" .. "<outName>7", one per attribute.
+ *
+ * Columns 0, 3, 4 and 5 (c_custkey, c_city, c_nation, c_region) are parsed
+ * with strtol() and stored as binary ints. Columns 1, 2, 6 and 7 (c_name,
+ * c_address, c_phone, c_mktsegment) are stored as fixed-width, zero-padded
+ * byte fields (over-long values are truncated to the field width).
+ *
+ * No struct columnHeader is written in front of the first block of a column;
+ * a header is only emitted when a new block starts after BLOCKNUM tuples.
+ * NOTE(review): that asymmetry, and the header widths 10/15/12 for columns
+ * 3-5 although they are stored as ints, are kept unchanged - presumably the
+ * consumers expect header-less files and tables smaller than BLOCKNUM; confirm.
+ * NOTE(review): c_city/c_nation/c_region presumably arrive already encoded as
+ * integers in the input; confirm against the data generator.
+ *
+ * A line may or may not end with a trailing delimiter. Fields beyond column 7
+ * are ignored. On a line with fewer fields only the delimiter-terminated
+ * fields are written.
+ *
+ * Exits the process with status -1 if an output file cannot be opened.
+ */
+void customer (FILE *fp, char *outName){
+  enum { COLUMNS = 8 };
+  struct customer tmp; /* only used through sizeof() to obtain text field widths */
+  char data [1024] = {0};
+  char buf[1024] = {0};
+  int count = 0, i = 0,prev = 0;
+  long tupleCount =0, tupleRemain = 0, tupleUnit = 0;
+  FILE * out[COLUMNS];
+  /* Bytes stored per tuple for text columns; 0 marks a binary int column. */
+  const size_t textWidth[COLUMNS] = {
+    0, sizeof(tmp.c_name), sizeof(tmp.c_address), 0, 0, 0,
+    sizeof(tmp.c_phone), sizeof(tmp.c_mktsegment)
+  };
+  /* Per-tuple width multiplied into header.blockSize. */
+  const size_t headerWidth[COLUMNS] = { sizeof(int), 25, 25, 10, 15, 12, 15, 10 };
+
+  for(i=0;i<COLUMNS;i++){
+    char path[PATH_MAX] = {0};
+    snprintf(path,sizeof(path),"%s%d",outName,i);
+    out[i] = fopen(path, "w");
+    if(!out[i]){
+      printf("Failed to open %s\n",path);
+      exit(-1);
+    }
+  }
+
+  /* Zero the header first so that no indeterminate bytes reach the files. */
+  struct columnHeader header;
+  memset(&header,0,sizeof(header));
+
+  /* First pass: count the tuples so the headers can carry the totals. */
+  long tupleNum = 0;
+  while(fgets(buf,sizeof(buf),fp) !=NULL)
+    tupleNum ++;
+
+  header.totalTupleNum = tupleNum;
+  tupleRemain = tupleNum;
+  if(tupleNum > BLOCKNUM)
+    tupleUnit = BLOCKNUM;
+  else
+    tupleUnit = tupleNum;
+  header.tupleNum = tupleUnit;
+  header.format = UNCOMPRESSED;
+  header.blockId = 0;
+  header.blockTotal = (tupleNum + BLOCKNUM -1) / BLOCKNUM ;
+  fseek(fp,0,SEEK_SET);
+
+  /* Second pass: split every line into fields and append them to the columns. */
+  while(fgets(buf,sizeof(buf),fp)!= NULL){
+    int writeHeader = 0;
+    tupleCount ++;
+    if(tupleCount > BLOCKNUM){
+      /* The current block is full: start the next one with a header. */
+      tupleCount = 1;
+      tupleRemain -= BLOCKNUM;
+      if (tupleRemain > BLOCKNUM)
+        tupleUnit = BLOCKNUM;
+      else
+        tupleUnit = tupleRemain;
+      header.tupleNum = tupleUnit;
+      header.blockId ++;
+      writeHeader = 1;
+    }
+    for(i = 0, prev = 0,count=0; ;i++){
+      /* Stop on '\0' as well: the last line may lack a newline, and fgets()
+       * cuts over-long lines without one. */
+      int atEnd = (buf[i] == '\n' || buf[i] == '\0');
+      if(!atEnd && buf[i] != delimiter)
+        continue;
+
+      /* A field ends at a delimiter; the end of the line only terminates a
+       * field when it is the last column (no trailing delimiter). */
+      if(count < COLUMNS && (!atEnd || count == COLUMNS - 1)){
+        size_t len = (size_t)(i - prev);
+        if(writeHeader == 1){
+          header.blockSize = header.tupleNum * headerWidth[count];
+          fwrite(&header,sizeof(struct columnHeader),1,out[count]);
+        }
+        memset(data,0,sizeof(data));
+        if(textWidth[count] != 0){
+          /* Fixed-width text: truncate to the field width, zero-pad the rest.
+           * Assumes every text field is narrower than data[]. */
+          if(len > textWidth[count])
+            len = textWidth[count];
+          memcpy(data,buf+prev,len);
+          fwrite(data,textWidth[count],1,out[count]);
+        }else{
+          /* len < sizeof(data) because buf has the same size, so data stays
+           * NUL-terminated for strtol(). */
+          int value;
+          memcpy(data,buf+prev,len);
+          value = strtol(data,NULL,10);
+          fwrite(&value,sizeof(int),1,out[count]);
+        }
+      }
+
+      if(atEnd)
+        break;
+      prev = i+1;
+      count++;
+    }
+  }
+
+  for(i=0;i<COLUMNS;i++){
+    fclose(out[i]);
+  }
+
+}
+
+/*
+ * Converts the delimiter-separated PART text file `fp` into 9 binary column
+ * files named "<outName>0" .. "<outName>8", one per attribute.
+ *
+ * Columns 0, 2, 3, 4 and 7 (p_partkey, p_mfgr, p_category, p_brand1, p_size)
+ * are parsed with strtol() and stored as binary ints. Columns 1, 5, 6 and 8
+ * (p_name, p_color, p_type, p_container) are stored as fixed-width,
+ * zero-padded byte fields (over-long values are truncated to the field width).
+ *
+ * No struct columnHeader is written in front of the first block of a column;
+ * a header is only emitted when a new block starts after BLOCKNUM tuples.
+ * NOTE(review): that asymmetry, and the header widths 6/7/9 for columns 2-4
+ * although they are stored as ints, are kept unchanged - presumably the
+ * consumers expect header-less files and tables smaller than BLOCKNUM; confirm.
+ * NOTE(review): p_mfgr/p_category/p_brand1 presumably arrive already encoded
+ * as integers in the input; confirm against the data generator.
+ *
+ * A line may or may not end with a trailing delimiter. Fields beyond column 8
+ * are ignored. On a line with fewer fields only the delimiter-terminated
+ * fields are written.
+ *
+ * Exits the process with status -1 if an output file cannot be opened.
+ */
+void part (FILE *fp, char *outName){
+  enum { COLUMNS = 9 };
+  struct part tmp; /* only used through sizeof() to obtain text field widths */
+  char data [1024] = {0};
+  char buf[1024] = {0};
+  int count = 0, i = 0,prev = 0;
+  long tupleCount =0, tupleRemain = 0, tupleUnit = 0;
+  FILE * out[COLUMNS];
+  /* Bytes stored per tuple for text columns; 0 marks a binary int column. */
+  const size_t textWidth[COLUMNS] = {
+    0, sizeof(tmp.p_name), 0, 0, 0,
+    sizeof(tmp.p_color), sizeof(tmp.p_type), 0, sizeof(tmp.p_container)
+  };
+  /* Per-tuple width multiplied into header.blockSize. */
+  const size_t headerWidth[COLUMNS] = {
+    sizeof(int), 22, 6, 7, 9, 11, 25, sizeof(int), 10
+  };
+
+  for(i=0;i<COLUMNS;i++){
+    char path[PATH_MAX] = {0};
+    snprintf(path,sizeof(path),"%s%d",outName,i);
+    out[i] = fopen(path, "w");
+    if(!out[i]){
+      printf("Failed to open %s\n",path);
+      exit(-1);
+    }
+  }
+
+  /* Zero the header first so that no indeterminate bytes reach the files. */
+  struct columnHeader header;
+  memset(&header,0,sizeof(header));
+
+  /* First pass: count the tuples so the headers can carry the totals. */
+  long tupleNum = 0;
+  while(fgets(buf,sizeof(buf),fp) !=NULL)
+    tupleNum ++;
+
+  header.totalTupleNum = tupleNum;
+  tupleRemain = tupleNum;
+  if(tupleNum > BLOCKNUM)
+    tupleUnit = BLOCKNUM;
+  else
+    tupleUnit = tupleNum;
+  header.tupleNum = tupleUnit;
+  header.format = UNCOMPRESSED;
+  header.blockId = 0;
+  header.blockTotal = (tupleNum + BLOCKNUM -1) / BLOCKNUM ;
+  fseek(fp,0,SEEK_SET);
+
+  /* Second pass: split every line into fields and append them to the columns. */
+  while(fgets(buf,sizeof(buf),fp)!= NULL){
+    int writeHeader = 0;
+    tupleCount ++;
+    if(tupleCount > BLOCKNUM){
+      /* The current block is full: start the next one with a header. */
+      tupleCount = 1;
+      tupleRemain -= BLOCKNUM;
+      if (tupleRemain > BLOCKNUM)
+        tupleUnit = BLOCKNUM;
+      else
+        tupleUnit = tupleRemain;
+      header.tupleNum = tupleUnit;
+      header.blockId ++;
+      writeHeader = 1;
+    }
+    for(i = 0, prev = 0,count=0; ;i++){
+      /* Stop on '\0' as well: the last line may lack a newline, and fgets()
+       * cuts over-long lines without one. */
+      int atEnd = (buf[i] == '\n' || buf[i] == '\0');
+      if(!atEnd && buf[i] != delimiter)
+        continue;
+
+      /* A field ends at a delimiter; the end of the line only terminates a
+       * field when it is the last column (no trailing delimiter). */
+      if(count < COLUMNS && (!atEnd || count == COLUMNS - 1)){
+        size_t len = (size_t)(i - prev);
+        if(writeHeader == 1){
+          header.blockSize = header.tupleNum * headerWidth[count];
+          fwrite(&header,sizeof(struct columnHeader),1,out[count]);
+        }
+        memset(data,0,sizeof(data));
+        if(textWidth[count] != 0){
+          /* Fixed-width text: truncate to the field width, zero-pad the rest.
+           * Assumes every text field is narrower than data[]. */
+          if(len > textWidth[count])
+            len = textWidth[count];
+          memcpy(data,buf+prev,len);
+          fwrite(data,textWidth[count],1,out[count]);
+        }else{
+          /* len < sizeof(data) because buf has the same size, so data stays
+           * NUL-terminated for strtol(). */
+          int value;
+          memcpy(data,buf+prev,len);
+          value = strtol(data,NULL,10);
+          fwrite(&value,sizeof(int),1,out[count]);
+        }
+      }
+
+      if(atEnd)
+        break;
+      prev = i+1;
+      count++;
+    }
+  }
+
+  for(i=0;i<COLUMNS;i++){
+    fclose(out[i]);
+  }
+
+}
+
+/*
+ * Split the delimiter-separated ddate table read from fp into 17 binary
+ * column files named <outName>0 .. <outName>16, one file per attribute.
+ *
+ * fp      - readable, seekable stream over the text table; it is scanned
+ *           twice (once to count the tuples, once to convert them).
+ * outName - prefix of the output file names; the column index is appended.
+ *
+ * Integer attributes are written as raw ints; string attributes are written
+ * as fixed-width records of sizeof(field) bytes, zero-padded after the text.
+ *
+ * A columnHeader is emitted only when a new block starts after BLOCKNUM
+ * tuples. Exits the process if an output file cannot be opened.
+ */
+void ddate (FILE *fp, char *outName){
+  struct ddate tmp;
+  char data [1024] = {0};
+  char buf[1024] = {0};
+  int count = 0, i = 0,prev = 0;
+  long tupleCount =0, tupleRemain = 0, tupleUnit = 0;
+  FILE * out[17];
+
+  for(i=0;i<17;i++){
+    char path[PATH_MAX] = {0};
+    snprintf(path,sizeof(path),"%s%d",outName,i);
+    out[i] = fopen(path, "w");
+    if(!out[i]){
+      printf("Failed to open %s\n",path);
+      exit(-1);
+    }
+  }
+
+  struct columnHeader header;
+  long tupleNum = 0;
+  /* First pass: count the tuples; the block bookkeeping below depends on it. */
+  while(fgets(buf,sizeof(buf),fp) !=NULL)
+    tupleNum ++;
+
+  header.totalTupleNum = tupleNum;
+  tupleRemain = tupleNum;
+  if(tupleNum > BLOCKNUM)
+    tupleUnit = BLOCKNUM;
+  else
+    tupleUnit = tupleNum;
+  header.tupleNum = tupleUnit;
+  header.format = UNCOMPRESSED;
+  header.blockId = 0;
+  header.blockTotal = (tupleNum + BLOCKNUM -1) / BLOCKNUM ;
+  fseek(fp,0,SEEK_SET);
+  /*
+   * No header is written ahead of block 0: writeHeader is only raised inside
+   * the loop below, once tupleCount exceeds BLOCKNUM.
+   * NOTE(review): rle.c and soa.c begin by reading a columnHeader at offset 0
+   * of a column file - confirm which layout the downstream readers expect.
+   */
+  while(fgets(buf,sizeof(buf),fp)!= NULL){
+    int writeHeader = 0;
+    tupleCount ++;
+    if(tupleCount > BLOCKNUM){
+      tupleCount = 1;
+      tupleRemain -= BLOCKNUM;
+      if (tupleRemain > BLOCKNUM)
+        tupleUnit = BLOCKNUM;
+      else
+        tupleUnit = tupleRemain;
+      header.tupleNum = tupleUnit;
+      header.blockId ++;
+      writeHeader = 1;
+    }
+    /*
+     * Stop at the terminating NUL as well as at '\n': the last line of a file
+     * may lack a newline, and scanning past the NUL would parse stale bytes
+     * or run off the end of buf.
+     */
+    for(i = 0, prev = 0,count=0; buf[i] !='\n' && buf[i] != '\0';i++){
+      if (buf[i] == delimiter){
+        memset(data,0,sizeof(data));
+        strncpy(data,buf+prev,i-prev);
+        prev = i+1;
+        /*
+         * String attributes are copied with strncpy bounded by the field size:
+         * it cannot overrun the field and it zero-fills the unused tail, so no
+         * stale or uninitialized bytes of tmp reach the column file.
+         */
+        switch(count){
+           case 0:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * sizeof(int);
+              fwrite(&header,sizeof(struct columnHeader),1,out[0]);
+            }
+            tmp.d_datekey = strtol(data,NULL,10);
+            fwrite(&(tmp.d_datekey),sizeof(int),1,out[0]);
+            break;
+           case 1:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * 18;
+              fwrite(&header,sizeof(struct columnHeader),1,out[1]);
+            }
+            strncpy(tmp.d_date,data,sizeof(tmp.d_date));
+            fwrite(&(tmp.d_date),sizeof(tmp.d_date), 1, out[1]);
+            break;
+           case 2:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * 8;
+              fwrite(&header,sizeof(struct columnHeader),1,out[2]);
+            }
+            strncpy(tmp.d_dayofweek,data,sizeof(tmp.d_dayofweek));
+            fwrite(&(tmp.d_dayofweek),sizeof(tmp.d_dayofweek), 1, out[2]);
+            break;
+           case 3:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * 9;
+              fwrite(&header,sizeof(struct columnHeader),1,out[3]);
+            }
+            strncpy(tmp.d_month,data,sizeof(tmp.d_month));
+            fwrite(&(tmp.d_month),sizeof(tmp.d_month), 1, out[3]);
+            break;
+           case 4:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * sizeof(int);
+              fwrite(&header,sizeof(struct columnHeader),1,out[4]);
+            }
+            tmp.d_year = strtol(data,NULL,10);
+            fwrite(&(tmp.d_year),sizeof(int),1,out[4]);
+            break;
+           case 5:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * sizeof(int);
+              fwrite(&header,sizeof(struct columnHeader),1,out[5]);
+            }
+            tmp.d_yearmonthnum = strtol(data,NULL,10);
+            fwrite(&(tmp.d_yearmonthnum),sizeof(int),1,out[5]);
+            break;
+           case 6:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * 7;
+              fwrite(&header,sizeof(struct columnHeader),1,out[6]);
+            }
+            strncpy(tmp.d_yearmonth,data,sizeof(tmp.d_yearmonth));
+            fwrite(&(tmp.d_yearmonth),sizeof(tmp.d_yearmonth), 1, out[6]);
+            break;
+           case 7:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * sizeof(int);
+              fwrite(&header,sizeof(struct columnHeader),1,out[7]);
+            }
+            tmp.d_daynuminweek = strtol(data,NULL,10);
+            fwrite(&(tmp.d_daynuminweek),sizeof(int),1,out[7]);
+            break;
+           case 8:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * sizeof(int);
+              fwrite(&header,sizeof(struct columnHeader),1,out[8]);
+            }
+            tmp.d_daynuminmonth = strtol(data,NULL,10);
+            fwrite(&(tmp.d_daynuminmonth),sizeof(int),1,out[8]);
+            break;
+           case 9:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * sizeof(int);
+              fwrite(&header,sizeof(struct columnHeader),1,out[9]);
+            }
+            tmp.d_daynuminyear = strtol(data,NULL,10);
+            fwrite(&(tmp.d_daynuminyear),sizeof(int),1,out[9]);
+            break;
+           case 10:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * sizeof(int);
+              fwrite(&header,sizeof(struct columnHeader),1,out[10]);
+            }
+            tmp.d_monthnuminyear = strtol(data,NULL,10);
+            fwrite(&(tmp.d_monthnuminyear),sizeof(int),1,out[10]);
+            break;
+           case 11:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * sizeof(int);
+              fwrite(&header,sizeof(struct columnHeader),1,out[11]);
+            }
+            tmp.d_weeknuminyear = strtol(data,NULL,10);
+            fwrite(&(tmp.d_weeknuminyear),sizeof(int),1,out[11]);
+            break;
+           case 12:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * 12;
+              fwrite(&header,sizeof(struct columnHeader),1,out[12]);
+            }
+            strncpy(tmp.d_sellingseason,data,sizeof(tmp.d_sellingseason));
+            fwrite(&(tmp.d_sellingseason),sizeof(tmp.d_sellingseason), 1, out[12]);
+            break;
+           case 13:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * 1;
+              fwrite(&header,sizeof(struct columnHeader),1,out[13]);
+            }
+            strncpy(tmp.d_lastdayinweekfl,data,sizeof(tmp.d_lastdayinweekfl));
+            fwrite(&(tmp.d_lastdayinweekfl),sizeof(tmp.d_lastdayinweekfl), 1, out[13]);
+            break;
+           case 14:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * 1;
+              fwrite(&header,sizeof(struct columnHeader),1,out[14]);
+            }
+            strncpy(tmp.d_lastdayinmonthfl,data,sizeof(tmp.d_lastdayinmonthfl));
+            fwrite(&(tmp.d_lastdayinmonthfl),sizeof(tmp.d_lastdayinmonthfl), 1, out[14]);
+            break;
+           case 15:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * 1;
+              fwrite(&header,sizeof(struct columnHeader),1,out[15]);
+            }
+            strncpy(tmp.d_holidayfl,data,sizeof(tmp.d_holidayfl));
+            fwrite(&(tmp.d_holidayfl),sizeof(tmp.d_holidayfl), 1, out[15]);
+            break;
+           case 16:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * 1;
+              fwrite(&header,sizeof(struct columnHeader),1,out[16]);
+            }
+            strncpy(tmp.d_weekdayfl,data,sizeof(tmp.d_weekdayfl));
+            fwrite(&(tmp.d_weekdayfl),sizeof(tmp.d_weekdayfl), 1, out[16]);
+            break;
+        }
+        count++;
+      }
+    }
+    /* Last attribute of a line that does not end with a trailing delimiter. */
+    if(count == 16){
+      if(writeHeader == 1){
+        header.blockSize = header.tupleNum * 1;
+        fwrite(&header,sizeof(struct columnHeader),1,out[16]);
+      }
+      memset(data,0,sizeof(data));
+      strncpy(data,buf+prev,i-prev);
+      strncpy(tmp.d_weekdayfl,data,sizeof(tmp.d_weekdayfl));
+      fwrite(&(tmp.d_weekdayfl),sizeof(tmp.d_weekdayfl), 1, out[16]);
+    }
+  }
+
+  for(i=0;i<17;i++){
+    fclose(out[i]);
+  }
+
+}
+
+/*
+ * Split the delimiter-separated lineorder table read from fp into 17 binary
+ * column files named <outName>0 .. <outName>16, one file per attribute.
+ *
+ * fp      - readable, seekable stream over the text table; it is scanned
+ *           twice (once to count the tuples, once to convert them).
+ * outName - prefix of the output file names; the column index is appended.
+ *
+ * Integer attributes are written as raw ints; string attributes are written
+ * as fixed-width records of sizeof(field) bytes, zero-padded after the text.
+ *
+ * A columnHeader is emitted only when a new block starts after BLOCKNUM
+ * tuples. Exits the process if an output file cannot be opened.
+ */
+void lineorder (FILE *fp, char *outName){
+  struct lineorder tmp;
+  char data [1024] = {0};
+  char buf[1024] = {0};
+  int count = 0, i = 0,prev = 0;
+  long tupleCount =0, tupleRemain = 0, tupleUnit = 0;
+  FILE * out[17];
+
+  for(i=0;i<17;i++){
+    char path[PATH_MAX] = {0};
+    snprintf(path,sizeof(path),"%s%d",outName,i);
+    out[i] = fopen(path, "w");
+    if(!out[i]){
+      printf("Failed to open %s\n",path);
+      exit(-1);
+    }
+  }
+
+  struct columnHeader header;
+  long tupleNum = 0;
+  /* First pass: count the tuples; the block bookkeeping below depends on it. */
+  while(fgets(buf,sizeof(buf),fp) !=NULL)
+    tupleNum ++;
+
+  header.totalTupleNum = tupleNum;
+  tupleRemain = tupleNum;
+  if(tupleNum > BLOCKNUM)
+    tupleUnit = BLOCKNUM;
+  else
+    tupleUnit = tupleNum;
+  header.tupleNum = tupleUnit;
+  header.format = UNCOMPRESSED;
+  header.blockId = 0;
+  header.blockTotal = (tupleNum + BLOCKNUM -1) / BLOCKNUM ;
+  fseek(fp,0,SEEK_SET);
+  /*
+   * No header is written ahead of block 0: writeHeader is only raised inside
+   * the loop below, once tupleCount exceeds BLOCKNUM.
+   * NOTE(review): rle.c and soa.c begin by reading a columnHeader at offset 0
+   * of a column file - confirm which layout the downstream readers expect.
+   */
+  while(fgets(buf,sizeof(buf),fp)!= NULL){
+    int writeHeader = 0;
+    tupleCount ++;
+    if(tupleCount > BLOCKNUM){
+      tupleCount = 1;
+      tupleRemain -= BLOCKNUM;
+      if (tupleRemain > BLOCKNUM)
+        tupleUnit = BLOCKNUM;
+      else
+        tupleUnit = tupleRemain;
+      header.tupleNum = tupleUnit;
+      header.blockId ++;
+      writeHeader = 1;
+    }
+    /*
+     * Stop at the terminating NUL as well as at '\n': the last line of a file
+     * may lack a newline, and scanning past the NUL would parse stale bytes
+     * or run off the end of buf.
+     */
+    for(i = 0, prev = 0,count=0; buf[i] !='\n' && buf[i] != '\0';i++){
+      if (buf[i] == delimiter){
+        memset(data,0,sizeof(data));
+        strncpy(data,buf+prev,i-prev);
+        prev = i+1;
+        /*
+         * String attributes are copied with strncpy bounded by the field size:
+         * it cannot overrun the field and it zero-fills the unused tail, so no
+         * stale or uninitialized bytes of tmp reach the column file.
+         */
+        switch(count){
+           case 0:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * sizeof(int);
+              fwrite(&header,sizeof(struct columnHeader),1,out[0]);
+            }
+            tmp.lo_orderkey = strtol(data,NULL,10);
+            fwrite(&(tmp.lo_orderkey),sizeof(int),1,out[0]);
+            break;
+           case 1:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * sizeof(int);
+              fwrite(&header,sizeof(struct columnHeader),1,out[1]);
+            }
+            tmp.lo_linenumber = strtol(data,NULL,10);
+            fwrite(&(tmp.lo_linenumber),sizeof(int),1,out[1]);
+            break;
+           case 2:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * sizeof(int);
+              fwrite(&header,sizeof(struct columnHeader),1,out[2]);
+            }
+            tmp.lo_custkey = strtol(data,NULL,10);
+            fwrite(&(tmp.lo_custkey),sizeof(int),1,out[2]);
+            break;
+           case 3:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * sizeof(int);
+              fwrite(&header,sizeof(struct columnHeader),1,out[3]);
+            }
+            tmp.lo_partkey = strtol(data,NULL,10);
+            fwrite(&(tmp.lo_partkey),sizeof(int),1,out[3]);
+            break;
+           case 4:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * sizeof(int);
+              fwrite(&header,sizeof(struct columnHeader),1,out[4]);
+            }
+            tmp.lo_suppkey = strtol(data,NULL,10);
+            fwrite(&(tmp.lo_suppkey),sizeof(int),1,out[4]);
+            break;
+           case 5:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * sizeof(int);
+              fwrite(&header,sizeof(struct columnHeader),1,out[5]);
+            }
+            tmp.lo_orderdate = strtol(data,NULL,10);
+            fwrite(&(tmp.lo_orderdate),sizeof(int),1,out[5]);
+            break;
+           case 6:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * 16;
+              fwrite(&header,sizeof(struct columnHeader),1,out[6]);
+            }
+            strncpy(tmp.lo_orderpriority,data,sizeof(tmp.lo_orderpriority));
+            fwrite(&(tmp.lo_orderpriority),sizeof(tmp.lo_orderpriority), 1, out[6]);
+            break;
+           case 7:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * 1;
+              fwrite(&header,sizeof(struct columnHeader),1,out[7]);
+            }
+            strncpy(tmp.lo_shippriority,data,sizeof(tmp.lo_shippriority));
+            fwrite(&(tmp.lo_shippriority),sizeof(tmp.lo_shippriority), 1, out[7]);
+            break;
+           case 8:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * sizeof(int);
+              fwrite(&header,sizeof(struct columnHeader),1,out[8]);
+            }
+            tmp.lo_quantity = strtol(data,NULL,10);
+            fwrite(&(tmp.lo_quantity),sizeof(int),1,out[8]);
+            break;
+           case 9:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * sizeof(int);
+              fwrite(&header,sizeof(struct columnHeader),1,out[9]);
+            }
+            tmp.lo_extendedprice = strtol(data,NULL,10);
+            fwrite(&(tmp.lo_extendedprice),sizeof(int),1,out[9]);
+            break;
+           case 10:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * sizeof(int);
+              fwrite(&header,sizeof(struct columnHeader),1,out[10]);
+            }
+            tmp.lo_ordtotalprice = strtol(data,NULL,10);
+            fwrite(&(tmp.lo_ordtotalprice),sizeof(int),1,out[10]);
+            break;
+           case 11:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * sizeof(int);
+              fwrite(&header,sizeof(struct columnHeader),1,out[11]);
+            }
+            tmp.lo_discount = strtol(data,NULL,10);
+            fwrite(&(tmp.lo_discount),sizeof(int),1,out[11]);
+            break;
+           case 12:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * sizeof(int);
+              fwrite(&header,sizeof(struct columnHeader),1,out[12]);
+            }
+            tmp.lo_revenue = strtol(data,NULL,10);
+            fwrite(&(tmp.lo_revenue),sizeof(int),1,out[12]);
+            break;
+           case 13:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * sizeof(int);
+              fwrite(&header,sizeof(struct columnHeader),1,out[13]);
+            }
+            tmp.lo_supplycost = strtol(data,NULL,10);
+            fwrite(&(tmp.lo_supplycost),sizeof(int),1,out[13]);
+            break;
+           case 14:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * sizeof(int);
+              fwrite(&header,sizeof(struct columnHeader),1,out[14]);
+            }
+            tmp.lo_tax = strtol(data,NULL,10);
+            fwrite(&(tmp.lo_tax),sizeof(int),1,out[14]);
+            break;
+           case 15:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * sizeof(int);
+              fwrite(&header,sizeof(struct columnHeader),1,out[15]);
+            }
+            tmp.lo_commitdate = strtol(data,NULL,10);
+            fwrite(&(tmp.lo_commitdate),sizeof(int),1,out[15]);
+            break;
+           case 16:
+            if(writeHeader == 1){
+              header.blockSize = header.tupleNum * 10;
+              fwrite(&header,sizeof(struct columnHeader),1,out[16]);
+            }
+            strncpy(tmp.lo_shipmode,data,sizeof(tmp.lo_shipmode));
+            fwrite(&(tmp.lo_shipmode),sizeof(tmp.lo_shipmode), 1, out[16]);
+            break;
+        }
+        count++;
+      }
+    }
+    /* Last attribute of a line that does not end with a trailing delimiter. */
+    if(count == 16){
+      if(writeHeader == 1){
+        header.blockSize = header.tupleNum * 10;
+        fwrite(&header,sizeof(struct columnHeader),1,out[16]);
+      }
+      memset(data,0,sizeof(data));
+      strncpy(data,buf+prev,i-prev);
+      strncpy(tmp.lo_shipmode,data,sizeof(tmp.lo_shipmode));
+      fwrite(&(tmp.lo_shipmode),sizeof(tmp.lo_shipmode), 1, out[16]);
+    }
+  }
+
+  for(i=0;i<17;i++){
+    fclose(out[i]);
+  }
+
+}
+
+/* Open the text table at path for reading; report and exit on failure. */
+static FILE *openInputOrExit(const char *path){
+  FILE *fp = fopen(path,"r");
+  if(!fp){
+    printf("Failed to open %s\n",path);
+    exit(-1);
+  }
+  return fp;
+}
+
+/* chdir() to dir; report and exit rather than write files somewhere else. */
+static void chdirOrExit(const char *dir){
+  if(chdir(dir) != 0){
+    printf("Failed to change directory to %s\n",dir);
+    exit(-1);
+  }
+}
+
+/*
+ * Command-line driver: converts the text tables named on the command line
+ * into binary column files. Every option takes one argument:
+ *
+ *   --supplier, --customer, --part, --ddate, --lineorder  input table file
+ *   --delimiter  field separator; only its first character is used
+ *   --datadir    directory in which the column files are created
+ *
+ * --datadir is collected in a first pass, so it may appear anywhere.
+ * --delimiter is applied in command-line order, so it only affects the
+ * tables listed after it. Input files are opened before changing into the
+ * data directory, so relative input paths refer to the starting directory.
+ */
+int main(int argc, char ** argv){
+  FILE * in = NULL;
+  int table;
+  int setPath = 0;
+  char path[PATH_MAX];
+  char cwd[PATH_MAX];
+
+  int long_index;
+  /* getopt_long() requires the array to end with an all-zero element. */
+  struct option long_options[] = {
+    {"supplier",required_argument,0,'0'},
+    {"customer",required_argument,0,'1'},
+    {"part",required_argument,0,'2'},
+    {"ddate",required_argument,0,'3'},
+    {"lineorder",required_argument,0,'4'},
+    {"delimiter",required_argument,0,'5'},
+    {"datadir",required_argument,0,'6'},
+    {0,0,0,0}
+  };
+
+  while((table=getopt_long(argc,argv,"",long_options,&long_index))!=-1){
+    switch(table){
+      case '6':
+        if(strlen(optarg) >= sizeof(path)){
+          printf("Data directory path is too long: %s\n",optarg);
+          exit(-1);
+        }
+        setPath = 1;
+        strcpy(path,optarg);
+        break;
+    }
+  }
+
+  optind=1;
+
+  if(setPath == 1 && getcwd(cwd,PATH_MAX) == NULL){
+    printf("Failed to get the current directory\n");
+    exit(-1);
+  }
+  while((table=getopt_long(argc,argv,"",long_options,&long_index))!=-1){
+    switch(table){
+      case '0':
+        in = openInputOrExit(optarg);
+        if (setPath == 1) chdirOrExit(path);
+        supplier(in,"SUPPLIER");
+        if (setPath == 1) chdirOrExit(cwd);
+        fclose(in);
+        break;
+      case '1':
+        in = openInputOrExit(optarg);
+        if (setPath == 1) chdirOrExit(path);
+        customer(in,"CUSTOMER");
+        if (setPath == 1) chdirOrExit(cwd);
+        fclose(in);
+        break;
+      case '2':
+        in = openInputOrExit(optarg);
+        if (setPath == 1) chdirOrExit(path);
+        part(in,"PART");
+        if (setPath == 1) chdirOrExit(cwd);
+        fclose(in);
+        break;
+      case '3':
+        in = openInputOrExit(optarg);
+        if (setPath == 1) chdirOrExit(path);
+        ddate(in,"DDATE");
+        if (setPath == 1) chdirOrExit(cwd);
+        fclose(in);
+        break;
+      case '4':
+        in = openInputOrExit(optarg);
+        if (setPath == 1) chdirOrExit(path);
+        lineorder(in,"LINEORDER");
+        if (setPath == 1) chdirOrExit(cwd);
+        fclose(in);
+        break;
+      case '5':
+        delimiter = optarg[0];
+        break;
+    }
+  }
+
+  return 0;
+}
+
diff --git a/data/ssb/loader/rle.c b/data/ssb/loader/rle.c
new file mode 100644
index 0000000..403f74b
--- /dev/null
+++ b/data/ssb/loader/rle.c
@@ -0,0 +1,192 @@
+/*
+   Copyright (c) 2012-2013 The Ohio State University.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+#include <string.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include "../include/common.h"
+
+/*
+ * @file rle.c
+ * Compress a sorted foreign key column in LINEORDER table using Run Length encoding.
+ *
+ * Input:
+ * 	@inputColumn: the column to be compressed using RLE.
+ * 	@outputColumn: the name of the compressed column.
+ */
+
+/* Read exactly len bytes from fd into buf; report and exit on a short read. */
+static void readOrExit(int fd, void *buf, size_t len){
+	if(read(fd, buf, len) != (ssize_t)len){
+		printf("Failed to read input column\n");
+		exit(-1);
+	}
+}
+
+/* Write exactly len bytes from buf to fd; report and exit on a short write. */
+static void writeOrExit(int fd, const void *buf, size_t len){
+	if(write(fd, buf, len) != (ssize_t)len){
+		printf("Failed to write output column\n");
+		exit(-1);
+	}
+}
+
+/*
+ * Usage: <program> inputColumn outputColumn
+ *
+ * For every block of the input column, writes to the output column a
+ * columnHeader (format RLE), an rleHeader and three int arrays - run values,
+ * run lengths and run start positions. The rleHeader and the arrays are
+ * zero-padded together up to a multiple of 4096 bytes.
+ */
+int main(int argc, char ** argv){
+
+	if(argc != 3){
+		printf("./rleCompresssion inputColumn outputColumn\n");
+		exit(-1);
+	}
+
+	int inFd = open(argv[1],O_RDONLY);
+	if(inFd == -1){
+		printf("Failed to open input file\n");
+		exit(-1);
+	}
+
+	/* O_CREAT needs the mode argument; otherwise the permission bits are indeterminate. */
+	int outFd = open(argv[2],O_RDWR|O_CREAT,0644);
+	if(outFd == -1){
+		printf("Failed to create output column\n");
+		exit(-1);
+	}
+
+	struct columnHeader header;
+	readOrExit(inFd, &header, sizeof(struct columnHeader));
+
+	int blockTotal = header.blockTotal;
+
+	long tupleOffset = 0;
+	long offset = 0;
+
+	for(int i=0;i<blockTotal;i++){
+		offset = i*sizeof(struct columnHeader) + tupleOffset * sizeof(int);
+		if(lseek(inFd,offset,SEEK_SET) == -1){
+			printf("Failed to seek in input column\n");
+			exit(-1);
+		}
+		readOrExit(inFd, &header, sizeof(struct columnHeader));
+		long tupleNum = header.tupleNum;
+		if(tupleNum <= 0){
+			printf("Invalid tuple count in block %d\n",i);
+			exit(-1);
+		}
+		size_t size = (size_t)tupleNum * sizeof(int);
+		int *content = (int *) malloc(size);
+		if(!content){
+			printf("Failed to allocate memory\n");
+			exit(-1);
+		}
+		/*
+		 * The file position is now just past the block header, so the tuples
+		 * are read directly. mmap() is not usable here: it needs a page-aligned
+		 * file offset, which a position right after a header generally is not.
+		 */
+		readOrExit(inFd, content, size);
+
+		tupleOffset += tupleNum;
+
+		header.blockId = i;
+		header.format = RLE;
+
+		/* Zero the header so anything not set below is not written as garbage. */
+		struct rleHeader rheader;
+		memset(&rheader, 0, sizeof(rheader));
+
+		/* First pass: count the runs of equal adjacent values. */
+		int distinct = 1;
+		int prev = content[0], curr;
+
+		for(long j=1;j<tupleNum;j++){
+			curr = content[j];
+			if(curr == prev){
+				continue;
+			}
+			distinct ++;
+			prev = curr;
+		}
+
+		rheader.dictNum = distinct;
+		int * dictValue = (int *)malloc(sizeof(int) * distinct);
+		int * dictCount = (int *)malloc(sizeof(int) * distinct);
+		int * dictPos = (int *)malloc(sizeof(int) * distinct);
+		if(!dictPos || !dictCount || !dictValue){
+			printf("Failed to allocate memory\n");
+			exit(-1);
+		}
+
+		/* Second pass: record value, start position and length of each run. */
+		prev = content[0];
+		int count = 1;
+		int pos = 0;
+		int k=0;
+		for(long j =1; j<tupleNum; j++){
+			curr = content[j];
+			if(curr == prev){
+				count ++;
+				continue;
+			}
+			dictValue[k] = prev;
+			dictPos[k] = pos;
+			dictCount[k] = count;
+			pos += count;
+			k++;
+			prev = curr;
+			count = 1;
+		}
+		dictValue[k] = prev;
+		dictPos[k] = pos;
+		dictCount[k] = count;
+
+		long blockSize = (4095+sizeof(struct rleHeader) + sizeof(int) * 3 *distinct)/4096 * 4096;
+		char padding[4096];
+		int padSize = blockSize - sizeof(struct rleHeader) - sizeof(int) * 3* distinct;
+
+		memset(padding,0,sizeof(padding));
+		header.blockSize = blockSize;
+
+		writeOrExit(outFd, &header, sizeof(struct columnHeader));
+		writeOrExit(outFd, &rheader, sizeof(struct rleHeader));
+		writeOrExit(outFd, dictValue, sizeof(int)*distinct);
+		writeOrExit(outFd, dictCount, sizeof(int)*distinct);
+		writeOrExit(outFd, dictPos, sizeof(int)*distinct);
+		writeOrExit(outFd,padding,padSize);
+
+		free(content);
+		free(dictValue);
+		free(dictPos);
+		free(dictCount);
+	}
+
+	/* Both descriptors stay open across all blocks and are closed once here. */
+	close(inFd);
+	close(outFd);
+	return 0;
+}
diff --git a/data/ssb/loader/soa.c b/data/ssb/loader/soa.c
new file mode 100644
index 0000000..d9b617c
--- /dev/null
+++ b/data/ssb/loader/soa.c
@@ -0,0 +1,135 @@
+/*
+   Copyright (c) 2012-2013 The Ohio State University.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include "../include/common.h"
+
+/*
+ * Transform a column of fixed-width strings from AOS to SOA layout.
+ * Currently the column must be stored in an uncompressed format.
+ */
+
+/*
+ * Usage: ./soa columnName columnWidth
+ *
+ * Rewrites every block of the column file in place: the tupleNum records of
+ * columnWidth bytes are transposed so that byte k of record j ends up at
+ * position k*tupleNum + j of the block.
+ */
+int main(int argc, char **argv){
+	int fd;
+	char * buf;
+
+	if (argc != 3){
+		printf("./soa columnName columnWidth\n");
+		exit(-1);
+	}
+
+	int columnWidth = atoi(argv[2]);
+	if (columnWidth <= 0){
+		printf("Invalid column width: %s\n", argv[2]);
+		exit(-1);
+	}
+
+	fd = open(argv[1], O_RDWR);
+	if (fd == -1){
+		printf("Failed to open %s\n", argv[1]);
+		exit(-1);
+	}
+
+	struct columnHeader header;
+
+	if (read(fd, &header, sizeof(struct columnHeader)) != (ssize_t)sizeof(struct columnHeader)){
+		printf("Failed to read column header\n");
+		exit(-1);
+	}
+
+	if (header.format != UNCOMPRESSED){
+		printf("Compressed columns are not supported yet!\n");
+		exit(-1);
+	}
+
+	int blockTotal = header.blockTotal;
+
+	/* One record-sized scratch buffer is reused for every block. */
+	char  * tmp = (char *) malloc(columnWidth +1);
+	if (!tmp){
+		printf("Failed to allocate memory\n");
+		exit(-1);
+	}
+
+	long offset = 0;
+	long tupleOffset = 0;
+	for(int i=0;i<blockTotal;i++){
+		offset = i* sizeof(struct columnHeader) + tupleOffset * columnWidth;
+
+		lseek(fd, offset, SEEK_SET);
+
+		offset += sizeof(struct columnHeader);
+		if (read(fd, &header, sizeof(struct columnHeader)) != (ssize_t)sizeof(struct columnHeader)){
+			printf("Failed to read column header\n");
+			exit(-1);
+		}
+		int blockSize = header.blockSize;
+		int tupleNum = header.tupleNum;
+
+		/*
+		 * buf is filled up to index columnWidth*tupleNum - 1 and written back
+		 * as blockSize bytes, so the two sizes have to agree.
+		 */
+		if (tupleNum < 0 || (long)tupleNum * columnWidth != (long)blockSize){
+			printf("Block %d: blockSize does not match tupleNum * columnWidth\n", i);
+			exit(-1);
+		}
+		buf = (char *) malloc(blockSize);
+		if (!buf && blockSize > 0){
+			printf("Failed to allocate memory\n");
+			exit(-1);
+		}
+
+		for(int j=0;j<tupleNum;j++){
+			memset(tmp, 0, columnWidth + 1);
+			if (read(fd,tmp, columnWidth) != columnWidth){
+				printf("Failed to read column data\n");
+				exit(-1);
+			}
+
+			for(int k=0;k<columnWidth;k++){
+				int pos = k*tupleNum + j;
+				buf[pos] = tmp[k];
+			}
+		}
+
+		lseek(fd,offset,SEEK_SET);
+		if (write(fd,buf,blockSize) != blockSize){
+			printf("Failed to write column data\n");
+			exit(-1);
+		}
+		free(buf);
+		tupleOffset += tupleNum;
+	}
+
+	free(tmp);
+	close(fd);
+	return 0;
+}
diff --git a/data/ssb/loader/sort.py b/data/ssb/loader/sort.py
new file mode 100644
index 0000000..e955b85
--- /dev/null
+++ b/data/ssb/loader/sort.py
@@ -0,0 +1,43 @@
+"""Sort lineorder.tbl by (LO_ORDERDATE, LO_CUSTKEY) with DuckDB and write it to lineorder.tbl.s."""
+import argparse
+import os
+import duckdb
+
+CREATE_LINEORDER_TABLE = '''
+        CREATE TABLE lineorder (
+        LO_ORDERKEY UINT32, 
+        LO_LINENUMBER  UINT8,  
+        LO_CUSTKEY  UINT32,  
+        LO_PARTKEY  UINT32,
+        LO_SUPPKEY  UINT32, 
+        LO_ORDERDATE   INT32,  
+        LO_ORDERPRIORITY   string, 
+        LO_SHIPPRIORITY   UINT8,  
+        LO_QUANTITY   INT32,  
+        LO_EXTENDEDPRICE   INT32, 
+        LO_ORDTOTALPRICE   UINT32,  
+        LO_DISCOUNT   INT32,  
+        LO_REVENUE   UINT32, 
+        LO_SUPPLYCOST   UINT32, 
+        LO_TAX   UINT8,  
+        LO_COMMITDATE   UINT64, 
+        LO_SHIPMODE   string);
+        COPY lineorder FROM '{0}' WITH (HEADER false, DELIMITER '|');
+        '''
+
+SAVE_SORTED_ORDRERDATE_CUSTKEY = '''
+COPY (SELECT * FROM lineorder ORDER BY LO_ORDERDATE, LO_CUSTKEY ASC) TO '{0}' (HEADER false, DELIMITER '|');
+'''
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='convert')
+    parser.add_argument('data_directory', type=str, help='Data Directory')
+    args = parser.parse_args()
+    data_dir = args.data_directory
+    # os.path.join() accepts the directory with or without a trailing separator.
+    input_path = os.path.join(data_dir, 'lineorder.tbl')
+    output_path = os.path.join(data_dir, 'lineorder.tbl.s')
+    con = duckdb.connect()
+    con.sql(CREATE_LINEORDER_TABLE.format(input_path))
+    con.sql(SAVE_SORTED_ORDRERDATE_CUSTKEY.format(output_path))
+    con.close()
diff --git a/data/ssb/loader/sort_other_way.py b/data/ssb/loader/sort_other_way.py
new file mode 100644
index 0000000..cfe35cd
--- /dev/null
+++ b/data/ssb/loader/sort_other_way.py
@@ -0,0 +1,43 @@
+"""Sort lineorder.tbl by (LO_CUSTKEY, LO_ORDERDATE) with DuckDB and write it to lineorder.tbl.s."""
+import argparse
+import os
+import duckdb
+
+CREATE_LINEORDER_TABLE = '''
+        CREATE TABLE lineorder (
+        LO_ORDERKEY UINT32, 
+        LO_LINENUMBER  UINT8,  
+        LO_CUSTKEY  UINT32,  
+        LO_PARTKEY  UINT32,
+        LO_SUPPKEY  UINT32, 
+        LO_ORDERDATE   INT32,  
+        LO_ORDERPRIORITY   string, 
+        LO_SHIPPRIORITY   UINT8,  
+        LO_QUANTITY   INT32,  
+        LO_EXTENDEDPRICE   INT32, 
+        LO_ORDTOTALPRICE   UINT32,  
+        LO_DISCOUNT   INT32,  
+        LO_REVENUE   UINT32, 
+        LO_SUPPLYCOST   UINT32, 
+        LO_TAX   UINT8,  
+        LO_COMMITDATE   UINT64, 
+        LO_SHIPMODE   string);
+        COPY lineorder FROM '{0}' WITH (HEADER false, DELIMITER '|');
+        '''
+
+SAVE_SORTED_ORDRERDATE_CUSTKEY = '''
+COPY (SELECT * FROM lineorder ORDER BY LO_CUSTKEY, LO_ORDERDATE  ASC) TO '{0}' (HEADER false, DELIMITER '|');
+'''
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='convert')
+    parser.add_argument('data_directory', type=str, help='Data Directory')
+    args = parser.parse_args()
+    data_dir = args.data_directory
+    # os.path.join() accepts the directory with or without a trailing separator.
+    input_path = os.path.join(data_dir, 'lineorder.tbl')
+    output_path = os.path.join(data_dir, 'lineorder.tbl.s')
+    con = duckdb.connect()
+    con.sql(CREATE_LINEORDER_TABLE.format(input_path))
+    con.sql(SAVE_SORTED_ORDRERDATE_CUSTKEY.format(output_path))
+    con.close()
diff --git a/data/ssb/queries/original/load.sql b/data/ssb/queries/original/load.sql
new file mode 100644
index 0000000..d18b2ea
--- /dev/null
+++ b/data/ssb/queries/original/load.sql
@@ -0,0 +1,5 @@
+copy customer from '/big_fast_drive/anil/dbops/test/ssb/data/s1/customer.tbl' delimiter '|'; -- NOTE: all paths below are absolute and machine-specific; edit before use
+copy ddate from '/big_fast_drive/anil/dbops/test/ssb/data/s1/date.tbl' delimiter '|';
+copy lineorder from '/big_fast_drive/anil/dbops/test/ssb/data/s1/lineorder.tbl' delimiter '|';
+copy part from '/big_fast_drive/anil/dbops/test/ssb/data/s1/part.tbl' delimiter '|';
+copy supplier from '/big_fast_drive/anil/dbops/test/ssb/data/s1/supplier.tbl' delimiter '|';
diff --git a/data/ssb/queries/original/q11.sql b/data/ssb/queries/original/q11.sql
new file mode 100644
index 0000000..eef0175
--- /dev/null
+++ b/data/ssb/queries/original/q11.sql
@@ -0,0 +1,7 @@
+-- SSB Q1.1: sum of lo_extendedprice*lo_discount over orders dated in 1993 with discount 1-3 and quantity < 25.
+select sum(lo_extendedprice*lo_discount) as revenue
+from lineorder, ddate
+where lo_orderdate = d_datekey
+and d_year = 1993
+and lo_discount between 1 and 3
+and lo_quantity < 25
diff --git a/data/ssb/queries/original/q12.sql b/data/ssb/queries/original/q12.sql
new file mode 100644
index 0000000..71594e3
--- /dev/null
+++ b/data/ssb/queries/original/q12.sql
@@ -0,0 +1,7 @@
+-- SSB Q1.2: sum of lo_extendedprice*lo_discount for d_yearmonthnum 199401, discount 4-6, quantity 26-35.
+select sum(lo_extendedprice*lo_discount) as revenue
+from lineorder, ddate
+where lo_orderdate = d_datekey
+and d_yearmonthnum = 199401
+and lo_discount between 4 and 6
+and lo_quantity between 26 and 35
diff --git a/data/ssb/queries/original/q13.sql b/data/ssb/queries/original/q13.sql
new file mode 100644
index 0000000..76bdd25
--- /dev/null
+++ b/data/ssb/queries/original/q13.sql
@@ -0,0 +1,8 @@
+-- SSB Q1.3: sum of lo_extendedprice*lo_discount for week 6 of 1994, discount 5-7, quantity 26-35.
+select sum(lo_extendedprice*lo_discount) as revenue
+from lineorder, ddate
+where lo_orderdate = d_datekey
+and d_weeknuminyear = 6
+and d_year = 1994
+and lo_discount between 5 and 7
+and lo_quantity between 26 and 35
diff --git a/data/ssb/queries/original/q21.sql b/data/ssb/queries/original/q21.sql
new file mode 100644
index 0000000..08b6ce5
--- /dev/null
+++ b/data/ssb/queries/original/q21.sql
@@ -0,0 +1,10 @@
+-- SSB Q2.1: revenue per (d_year, p_brand1) for parts in category MFGR#12 supplied from region AMERICA.
+select sum(lo_revenue), d_year, p_brand1
+from lineorder, ddate, part, supplier
+where lo_orderdate = d_datekey
+and lo_partkey = p_partkey
+and lo_suppkey = s_suppkey
+and p_category = 'MFGR#12'
+and s_region = 'AMERICA'
+group by d_year, p_brand1
+order by d_year, p_brand1
diff --git a/data/ssb/queries/original/q22.sql b/data/ssb/queries/original/q22.sql
new file mode 100644
index 0000000..fd8f88e
--- /dev/null
+++ b/data/ssb/queries/original/q22.sql
@@ -0,0 +1,11 @@
+-- SSB Q2.2: revenue per (d_year, p_brand1) for brands MFGR#2221..MFGR#2228 supplied from region ASIA.
+select sum(lo_revenue), d_year, p_brand1
+from lineorder, ddate, part, supplier
+where lo_orderdate = d_datekey
+and lo_partkey = p_partkey
+and lo_suppkey = s_suppkey
+and p_brand1 between
+'MFGR#2221' and 'MFGR#2228'
+and s_region = 'ASIA'
+group by d_year, p_brand1
+order by d_year, p_brand1
diff --git a/data/ssb/queries/original/q23.sql b/data/ssb/queries/original/q23.sql
new file mode 100644
index 0000000..09298a1
--- /dev/null
+++ b/data/ssb/queries/original/q23.sql
@@ -0,0 +1,10 @@
+-- SSB Q2.3: revenue per (d_year, p_brand1) for the single brand MFGR#2221 supplied from region EUROPE.
+select sum(lo_revenue), d_year, p_brand1
+from lineorder, ddate, part, supplier
+where lo_orderdate = d_datekey
+and lo_partkey = p_partkey
+and lo_suppkey = s_suppkey
+and p_brand1 = 'MFGR#2221'
+and s_region = 'EUROPE'
+group by d_year, p_brand1
+order by d_year, p_brand1
diff --git a/data/ssb/queries/original/q31.sql b/data/ssb/queries/original/q31.sql
new file mode 100644
index 0000000..e1c68dc
--- /dev/null
+++ b/data/ssb/queries/original/q31.sql
@@ -0,0 +1,10 @@
+-- SSB Q3.1: revenue per (c_nation, s_nation, d_year) for customers and suppliers in ASIA, years 1992-1997.
+select c_nation, s_nation, d_year, sum(lo_revenue)
+as revenue from customer, lineorder, supplier, ddate
+where lo_custkey = c_custkey
+and lo_suppkey = s_suppkey
+and lo_orderdate = d_datekey
+and c_region = 'ASIA' and s_region = 'ASIA'
+and d_year >= 1992 and d_year <= 1997
+group by c_nation, s_nation, d_year
+order by d_year asc, revenue desc
diff --git a/data/ssb/queries/original/q32.sql b/data/ssb/queries/original/q32.sql
new file mode 100644
index 0000000..6ef81b4
--- /dev/null
+++ b/data/ssb/queries/original/q32.sql
@@ -0,0 +1,11 @@
+-- SSB Q3.2: revenue per (c_city, s_city, d_year) for customers and suppliers in UNITED STATES, years 1992-1997.
+select c_city, s_city, d_year, sum(lo_revenue) as revenue
+from customer, lineorder, supplier, ddate
+where lo_custkey = c_custkey
+and lo_suppkey = s_suppkey
+and lo_orderdate = d_datekey
+and c_nation = 'UNITED STATES'
+and s_nation = 'UNITED STATES'
+and d_year >= 1992 and d_year <= 1997
+group by c_city, s_city, d_year
+order by d_year asc, revenue desc
diff --git a/data/ssb/queries/original/q33.sql b/data/ssb/queries/original/q33.sql
new file mode 100644
index 0000000..a68f030
--- /dev/null
+++ b/data/ssb/queries/original/q33.sql
@@ -0,0 +1,13 @@
+-- SSB Q3.3: revenue per (c_city, s_city, d_year) where both cities are 'UNITED KI1' or 'UNITED KI5', years 1992-1997.
+select c_city, s_city, d_year, sum(lo_revenue) as revenue
+from customer, lineorder, supplier, ddate
+where lo_custkey = c_custkey
+and lo_suppkey = s_suppkey
+and lo_orderdate = d_datekey
+and (c_city='UNITED KI1'
+or c_city='UNITED KI5')
+and (s_city='UNITED KI1'
+or s_city='UNITED KI5')
+and d_year >= 1992 and d_year <= 1997
+group by c_city, s_city, d_year
+order by d_year asc, revenue desc
diff --git a/data/ssb/queries/original/q34.sql b/data/ssb/queries/original/q34.sql
new file mode 100644
index 0000000..dfd5a77
--- /dev/null
+++ b/data/ssb/queries/original/q34.sql
@@ -0,0 +1,13 @@
+-- SSB Q3.4: revenue per (c_city, s_city, d_year) where both cities are 'UNITED KI1' or 'UNITED KI5', d_yearmonth 'Dec1997'.
+select c_city, s_city, d_year, sum(lo_revenue) as revenue
+from customer, lineorder, supplier, ddate
+where lo_custkey = c_custkey
+and lo_suppkey = s_suppkey
+and lo_orderdate = d_datekey
+and (c_city='UNITED KI1' or
+c_city='UNITED KI5')
+and (s_city='UNITED KI1' or
+s_city='UNITED KI5')
+and d_yearmonth = 'Dec1997'
+group by c_city, s_city, d_year
+order by d_year asc, revenue desc
diff --git a/data/ssb/queries/original/q41.sql b/data/ssb/queries/original/q41.sql
new file mode 100644
index 0000000..0c8f747
--- /dev/null
+++ b/data/ssb/queries/original/q41.sql
@@ -0,0 +1,12 @@
+-- SSB Q4.1: profit (lo_revenue - lo_supplycost) per (d_year, c_nation); AMERICA customers and suppliers, MFGR#1 or MFGR#2 parts.
+select d_year, c_nation, sum(lo_revenue - lo_supplycost) as profit
+from ddate, customer, supplier, part, lineorder
+where lo_custkey = c_custkey
+and lo_suppkey = s_suppkey
+and lo_partkey = p_partkey
+and lo_orderdate = d_datekey
+and c_region = 'AMERICA'
+and s_region = 'AMERICA'
+and (p_mfgr = 'MFGR#1' or p_mfgr = 'MFGR#2')
+group by d_year, c_nation
+order by d_year, c_nation
diff --git a/data/ssb/queries/original/q42.sql b/data/ssb/queries/original/q42.sql
new file mode 100644
index 0000000..923e4d9
--- /dev/null
+++ b/data/ssb/queries/original/q42.sql
@@ -0,0 +1,14 @@
+-- SSB Q4.2: profit per (d_year, s_nation, p_category) for 1997/1998; AMERICA customers and suppliers, MFGR#1 or MFGR#2 parts.
+select d_year, s_nation, p_category, sum(lo_revenue - lo_supplycost) as profit
+from ddate, customer, supplier, part, lineorder
+where lo_custkey = c_custkey
+and lo_suppkey = s_suppkey
+and lo_partkey = p_partkey
+and lo_orderdate = d_datekey
+and c_region = 'AMERICA'
+and s_region = 'AMERICA'
+and (d_year = 1997 or d_year = 1998)
+and (p_mfgr = 'MFGR#1'
+or p_mfgr = 'MFGR#2')
+group by d_year, s_nation, p_category
+order by d_year, s_nation, p_category
diff --git a/data/ssb/queries/original/q43.sql b/data/ssb/queries/original/q43.sql
new file mode 100644
index 0000000..8492176
--- /dev/null
+++ b/data/ssb/queries/original/q43.sql
@@ -0,0 +1,13 @@
+-- SSB Q4.3: profit per (d_year, s_city, p_brand1) for 1997/1998; AMERICA customers, UNITED STATES suppliers, category MFGR#14.
+select d_year, s_city, p_brand1, sum(lo_revenue - lo_supplycost) as profit
+from ddate, customer, supplier, part, lineorder
+where lo_custkey = c_custkey
+and lo_suppkey = s_suppkey
+and lo_partkey = p_partkey
+and lo_orderdate = d_datekey
+and c_region = 'AMERICA'
+and s_nation = 'UNITED STATES'
+and (d_year = 1997 or d_year = 1998)
+and p_category = 'MFGR#14'
+group by d_year, s_city, p_brand1
+order by d_year, s_city, p_brand1
diff --git a/data/ssb/queries/original/schema.sql b/data/ssb/queries/original/schema.sql
new file mode 100644
index 0000000..9c54ddb
--- /dev/null
+++ b/data/ssb/queries/original/schema.sql
@@ -0,0 +1,77 @@
+create table lineorder ( -- fact table; one row per order line, keyed by (lo_orderkey, lo_linenumber)
+   lo_orderkey integer not null,
+   lo_linenumber integer not null,
+   lo_custkey integer not null,
+   lo_partkey integer not null,
+   lo_suppkey integer not null,
+   lo_orderdate integer not null,
+   lo_orderpriority char(15) not null,
+   lo_shippriority char(1) not null,
+   lo_quantity integer not null,
+   lo_extendedprice float not null,
+   lo_ordtotalprice float not null,
+   lo_discount float not null,
+   lo_revenue float not null,
+   lo_supplycost float not null,
+   lo_tax integer not null,
+   lo_commitdate integer not null,
+   lo_shipmode char(10) not null,
+   primary key (lo_orderkey,lo_linenumber)
+);
+
+create table part ( -- part dimension; the queries join it via lo_partkey = p_partkey
+   p_partkey integer not null,
+   p_name varchar(22) not null,
+   p_mfgr char(6) not null,
+   p_category char(7) not null,
+   p_brand1 char(9) not null,
+   p_color varchar(11) not null,
+   p_type varchar(25) not null,
+   p_size integer not null,
+   p_container char(10) not null,
+   primary key (p_partkey)
+);
+
+create table supplier ( -- supplier dimension; the queries join it via lo_suppkey = s_suppkey
+   s_suppkey integer not null,
+   s_name char(25) not null,
+   s_address varchar(25) not null,
+   s_city char(10) not null,
+   s_nation char(15) not null,
+   s_region char(12) not null,
+   s_phone char(15) not null,
+   primary key (s_suppkey)
+);
+
+create table customer ( -- customer dimension; the queries join it via lo_custkey = c_custkey
+   c_custkey integer not null,
+   c_name varchar(25) not null,
+   c_address varchar(25) not null,
+   c_city char(10) not null,
+   c_nation char(15) not null,
+   c_region char(12) not null,
+   c_phone char(15) not null,
+   c_mktsegment char(10) not null,
+   primary key (c_custkey)
+);
+
+create table ddate ( -- date dimension; the queries join it via lo_orderdate = d_datekey
+   d_datekey integer not null,
+   d_date char(18) not null,
+   d_dayofweek char(9) not null,
+   d_month char(9) not null,
+   d_year integer not null,
+   d_yearmonthnum integer not null,
+   d_yearmonth char(7) not null,
+   d_daynuminweek integer not null,
+   d_daynuminmonth integer not null,
+   d_daynuminyear integer not null,
+   d_monthnuminyear integer not null,
+   d_weeknuminyear integer not null,
+   d_sellingseason varchar(12) not null,
+   d_lastdayinweekfl integer not null,
+   d_lastdayinmonthfl integer not null,
+   d_holidayfl integer not null,
+   d_weekdayfl integer not null,
+   primary key (d_datekey)
+);
diff --git a/data/ssb/queries/transformed/load.sql b/data/ssb/queries/transformed/load.sql
new file mode 100644
index 0000000..cd1d642
--- /dev/null
+++ b/data/ssb/queries/transformed/load.sql
@@ -0,0 +1,5 @@
+copy customer from '/big_fast_drive/anil/dbops/test/ssb/data/s1/customer.tbl.p' delimiter '|'; -- NOTE: absolute, machine-specific paths; customer/part/supplier are read from the .tbl.p files
+copy ddate from '/big_fast_drive/anil/dbops/test/ssb/data/s1/date.tbl' delimiter '|';
+copy lineorder from '/big_fast_drive/anil/dbops/test/ssb/data/s1/lineorder.tbl' delimiter '|';
+copy part from '/big_fast_drive/anil/dbops/test/ssb/data/s1/part.tbl.p' delimiter '|';
+copy supplier from '/big_fast_drive/anil/dbops/test/ssb/data/s1/supplier.tbl.p' delimiter '|';
diff --git a/data/ssb/queries/transformed/p1.sql b/data/ssb/queries/transformed/p1.sql
new file mode 100644
index 0000000..e97b8cc
--- /dev/null
+++ b/data/ssb/queries/transformed/p1.sql
@@ -0,0 +1,10 @@
+select d_year,c_nation, COUNT(*) as profit -- counts joined rows per (d_year, c_nation); the alias is "profit" although the value is a row count
+from lineorder,supplier,customer,part,ddate
+where lo_custkey = c_custkey
+and lo_suppkey = s_suppkey
+and lo_partkey = p_partkey
+and lo_orderdate = d_datekey
+and c_region = 1
+and s_region = 1
+and (p_mfgr = 0 or p_mfgr = 1)
+group by d_year,c_nation;
diff --git a/data/ssb/queries/transformed/q11.sql b/data/ssb/queries/transformed/q11.sql
new file mode 100644
index 0000000..035a6cc
--- /dev/null
+++ b/data/ssb/queries/transformed/q11.sql
@@ -0,0 +1,6 @@
+-- SSB Q1.1 on lineorder alone: the d_year = 1993 filter is expressed as a lo_orderdate range covering 1993 only.
+select sum(lo_extendedprice * lo_discount) as revenue
+from lineorder
+where lo_orderdate >= 19930101 and lo_orderdate <= 19931231 and lo_discount>=1
+and lo_discount<=3
+and lo_quantity<25;
diff --git a/data/ssb/queries/transformed/q12.sql b/data/ssb/queries/transformed/q12.sql
new file mode 100644
index 0000000..1f2fe71
--- /dev/null
+++ b/data/ssb/queries/transformed/q12.sql
@@ -0,0 +1,7 @@
+-- SSB Q1.2 on lineorder alone: the date filter is expressed as the lo_orderdate range 19940101-19940131.
+select sum(lo_extendedprice * lo_discount) as revenue
+from lineorder
+where lo_orderdate >= 19940101 and lo_orderdate <= 19940131
+and lo_discount>=4 and lo_discount<=6
+and lo_quantity>=26
+and lo_quantity<=35;
diff --git a/data/ssb/queries/transformed/q13.sql b/data/ssb/queries/transformed/q13.sql
new file mode 100644
index 0000000..875351a
--- /dev/null
+++ b/data/ssb/queries/transformed/q13.sql
@@ -0,0 +1,9 @@
+-- SSB Q1.3 on lineorder alone: the date filter is expressed as the lo_orderdate range 19940204-19940210.
+select sum(lo_extendedprice * lo_discount) as revenue
+from lineorder
+where lo_orderdate >= 19940204
+and lo_orderdate <= 19940210
+and lo_discount>=5
+and lo_discount<=7
+and lo_quantity>=26
+and lo_quantity<=35;
diff --git a/data/ssb/queries/transformed/q21.sql b/data/ssb/queries/transformed/q21.sql
new file mode 100644
index 0000000..d882016
--- /dev/null
+++ b/data/ssb/queries/transformed/q21.sql
@@ -0,0 +1,9 @@
+-- SSB Q2.1 against the integer-coded schema (p_category = 1, s_region = 1); result is not ordered.
+select sum(lo_revenue),d_year,p_brand1
+from lineorder,part,supplier,ddate
+where lo_orderdate = d_datekey
+and lo_partkey = p_partkey
+and lo_suppkey = s_suppkey
+and p_category = 1
+and s_region = 1
+group by d_year,p_brand1;
diff --git a/data/ssb/queries/transformed/q22.sql b/data/ssb/queries/transformed/q22.sql
new file mode 100644
index 0000000..17e02aa
--- /dev/null
+++ b/data/ssb/queries/transformed/q22.sql
@@ -0,0 +1,10 @@
+-- SSB Q2.2 against the integer-coded schema (p_brand1 in 260..267, s_region = 2); result is not ordered.
+select sum(lo_revenue),d_year,p_brand1
+from lineorder, part, supplier,ddate
+where lo_orderdate = d_datekey
+and lo_partkey = p_partkey
+and lo_suppkey = s_suppkey
+and p_brand1 >= 260
+and p_brand1 <= 267
+and s_region = 2
+group by d_year,p_brand1;
diff --git a/data/ssb/queries/transformed/q23.sql b/data/ssb/queries/transformed/q23.sql
new file mode 100644
index 0000000..0e89828
--- /dev/null
+++ b/data/ssb/queries/transformed/q23.sql
@@ -0,0 +1,9 @@
+-- SSB Q2.3 against the integer-coded schema (p_brand1 = 260, s_region = 3); result is not ordered.
+select sum(lo_revenue),d_year,p_brand1
+from lineorder,part,supplier,ddate
+where lo_orderdate = d_datekey
+and lo_partkey = p_partkey
+and lo_suppkey = s_suppkey
+and p_brand1 = 260
+and s_region = 3
+group by d_year,p_brand1;
diff --git a/data/ssb/queries/transformed/q31.sql b/data/ssb/queries/transformed/q31.sql
new file mode 100644
index 0000000..0085c62
--- /dev/null
+++ b/data/ssb/queries/transformed/q31.sql
@@ -0,0 +1,10 @@
+-- SSB Q3.1 against the integer-coded schema (c_region = 2, s_region = 2, years 1992-1997); result is not ordered.
+select c_nation,s_nation,d_year,sum(lo_revenue) as revenue
+from lineorder,customer, supplier,ddate
+where lo_custkey = c_custkey
+and lo_suppkey = s_suppkey
+and lo_orderdate = d_datekey
+and c_region = 2
+and s_region = 2
+and d_year >= 1992 and d_year <= 1997
+group by c_nation,s_nation,d_year;
diff --git a/data/ssb/queries/transformed/q32.sql b/data/ssb/queries/transformed/q32.sql
new file mode 100644
index 0000000..2f16545
--- /dev/null
+++ b/data/ssb/queries/transformed/q32.sql
@@ -0,0 +1,10 @@
+-- SSB Q3.2 against the integer-coded schema (c_nation = 24, s_nation = 24, years 1992-1997); result is not ordered.
+select c_city,s_city,d_year,sum(lo_revenue) as revenue
+from lineorder,customer,supplier,ddate
+where lo_custkey = c_custkey
+and lo_suppkey = s_suppkey
+and lo_orderdate = d_datekey
+and c_nation = 24
+and s_nation = 24
+and d_year >=1992 and d_year <= 1997
+group by c_city,s_city,d_year;
diff --git a/data/ssb/queries/transformed/q33.sql b/data/ssb/queries/transformed/q33.sql
new file mode 100644
index 0000000..367695c
--- /dev/null
+++ b/data/ssb/queries/transformed/q33.sql
@@ -0,0 +1,10 @@
+-- SSB Q3.3 against the integer-coded schema (city codes 231 or 235 on both sides, years 1992-1997); result is not ordered.
+select c_city,s_city,d_year,sum(lo_revenue) as revenue
+from lineorder,customer,supplier,ddate
+where lo_custkey = c_custkey
+and lo_suppkey = s_suppkey
+and lo_orderdate = d_datekey
+and (c_city = 231 or c_city = 235)
+and (s_city = 231 or s_city = 235)
+and d_year >=1992 and d_year <= 1997
+group by c_city,s_city,d_year;
diff --git a/data/ssb/queries/transformed/q34.sql b/data/ssb/queries/transformed/q34.sql
new file mode 100644
index 0000000..cf68e8d
--- /dev/null
+++ b/data/ssb/queries/transformed/q34.sql
@@ -0,0 +1,10 @@
+-- SSB Q3.4 against the integer-coded schema (city codes 231 or 235, d_yearmonthnum = 199712); result is not ordered.
+select c_city,s_city,d_year,sum(lo_revenue) as revenue
+from lineorder,customer,supplier,ddate
+where lo_custkey = c_custkey
+and lo_suppkey = s_suppkey
+and lo_orderdate = d_datekey
+and (c_city = 231 or c_city = 235)
+and (s_city = 231 or s_city = 235)
+and d_yearmonthnum = 199712
+group by c_city,s_city,d_year;
diff --git a/data/ssb/queries/transformed/q41.sql b/data/ssb/queries/transformed/q41.sql
new file mode 100644
index 0000000..26f7768
--- /dev/null
+++ b/data/ssb/queries/transformed/q41.sql
@@ -0,0 +1,11 @@
+-- SSB Q4.1 against the integer-coded schema (c_region = 1, s_region = 1, p_mfgr 0 or 1); result is not ordered.
+select d_year,c_nation,sum(lo_revenue-lo_supplycost) as profit
+from lineorder,supplier,customer,part,ddate
+where lo_custkey = c_custkey
+and lo_suppkey = s_suppkey
+and lo_partkey = p_partkey
+and lo_orderdate = d_datekey
+and c_region = 1
+and s_region = 1
+and (p_mfgr = 0 or p_mfgr = 1)
+group by d_year,c_nation;
diff --git a/data/ssb/queries/transformed/q42.sql b/data/ssb/queries/transformed/q42.sql
new file mode 100644
index 0000000..b82ea33
--- /dev/null
+++ b/data/ssb/queries/transformed/q42.sql
@@ -0,0 +1,12 @@
+-- SSB Q4.2 against the integer-coded schema (c_region = 1, s_region = 1, p_mfgr 0 or 1, years 1997/1998); result is not ordered.
+select d_year,s_nation,p_category,sum(lo_revenue-lo_supplycost) as profit
+from lineorder,customer,supplier,part,ddate
+where lo_custkey = c_custkey
+and lo_suppkey = s_suppkey
+and lo_partkey = p_partkey
+and lo_orderdate = d_datekey
+and c_region = 1
+and s_region = 1
+and (d_year = 1997 or d_year = 1998)
+and (p_mfgr = 0 or p_mfgr = 1)
+group by d_year,s_nation, p_category;
diff --git a/data/ssb/queries/transformed/q43.sql b/data/ssb/queries/transformed/q43.sql
new file mode 100644
index 0000000..c9e5c4a
--- /dev/null
+++ b/data/ssb/queries/transformed/q43.sql
@@ -0,0 +1,12 @@
+-- SSB Q4.3 against the integer-coded schema (c_region = 1, s_nation = 24, p_category = 3, years 1997/1998); result is not ordered.
+select d_year,s_city,p_brand1,sum(lo_revenue-lo_supplycost) as profit
+from lineorder,supplier,customer,part,ddate
+where lo_custkey = c_custkey
+and lo_suppkey = s_suppkey
+and lo_partkey = p_partkey
+and lo_orderdate = d_datekey
+and c_region = 1
+and s_nation = 24
+and (d_year = 1997 or d_year = 1998)
+and p_category = 3
+group by d_year,s_city,p_brand1;
diff --git a/data/ssb/queries/transformed/schema.sql b/data/ssb/queries/transformed/schema.sql
new file mode 100644
index 0000000..d0f9695
--- /dev/null
+++ b/data/ssb/queries/transformed/schema.sql
@@ -0,0 +1,72 @@
+create table lineorder ( -- fact table; one row per order line (no primary key declared in this schema)
+   lo_orderkey integer not null,
+   lo_linenumber integer not null,
+   lo_custkey integer not null,
+   lo_partkey integer not null,
+   lo_suppkey integer not null,
+   lo_orderdate integer not null,
+   lo_orderpriority char(15) not null,
+   lo_shippriority char(1) not null,
+   lo_quantity integer not null,
+   lo_extendedprice float not null,
+   lo_ordtotalprice float not null,
+   lo_discount float not null,
+   lo_revenue float not null,
+   lo_supplycost float not null,
+   lo_tax integer not null,
+   lo_commitdate integer not null,
+   lo_shipmode char(10) not null
+);
+
+create table part ( -- part dimension; p_mfgr, p_category and p_brand1 are stored as integer codes here
+   p_partkey integer not null,
+   p_name varchar(22) not null,
+   p_mfgr integer not null,
+   p_category integer not null,
+   p_brand1 integer not null,
+   p_color varchar(11) not null,
+   p_type varchar(25) not null,
+   p_size integer not null,
+   p_container char(10) not null
+);
+
+create table supplier ( -- supplier dimension; s_city, s_nation and s_region are stored as integer codes here
+   s_suppkey integer not null,
+   s_name char(25) not null,
+   s_address varchar(25) not null,
+   s_city integer not null,
+   s_nation integer not null,
+   s_region integer not null,
+   s_phone char(15) not null
+);
+
+create table customer ( -- customer dimension; c_city, c_nation and c_region are stored as integer codes here
+   c_custkey integer not null,
+   c_name varchar(25) not null,
+   c_address varchar(25) not null,
+   c_city integer not null,
+   c_nation integer not null,
+   c_region integer not null,
+   c_phone char(15) not null,
+   c_mktsegment char(10) not null
+);
+
+create table ddate ( -- date dimension; the queries join it via lo_orderdate = d_datekey
+   d_datekey integer not null,
+   d_date char(18) not null,
+   d_dayofweek char(9) not null,
+   d_month char(9) not null,
+   d_year integer not null,
+   d_yearmonthnum integer not null,
+   d_yearmonth char(7) not null,
+   d_daynuminweek integer not null,
+   d_daynuminmonth integer not null,
+   d_daynuminyear integer not null,
+   d_monthnuminyear integer not null,
+   d_weeknuminyear integer not null,
+   d_sellingseason varchar(12) not null,
+   d_lastdayinweekfl integer not null,
+   d_lastdayinmonthfl integer not null,
+   d_holidayfl integer not null,
+   d_weekdayfl integer not null
+);
diff --git a/data/ssb/queries/transformed/schema_no_pk.sql b/data/ssb/queries/transformed/schema_no_pk.sql
new file mode 100644
index 0000000..d0f9695
--- /dev/null
+++ b/data/ssb/queries/transformed/schema_no_pk.sql
@@ -0,0 +1,72 @@
+create table lineorder ( -- fact table; one row per order line (no primary key declared in this schema)
+   lo_orderkey integer not null,
+   lo_linenumber integer not null,
+   lo_custkey integer not null,
+   lo_partkey integer not null,
+   lo_suppkey integer not null,
+   lo_orderdate integer not null,
+   lo_orderpriority char(15) not null,
+   lo_shippriority char(1) not null,
+   lo_quantity integer not null,
+   lo_extendedprice float not null,
+   lo_ordtotalprice float not null,
+   lo_discount float not null,
+   lo_revenue float not null,
+   lo_supplycost float not null,
+   lo_tax integer not null,
+   lo_commitdate integer not null,
+   lo_shipmode char(10) not null
+);
+
+create table part ( -- part dimension; p_mfgr, p_category and p_brand1 are stored as integer codes here
+   p_partkey integer not null,
+   p_name varchar(22) not null,
+   p_mfgr integer not null,
+   p_category integer not null,
+   p_brand1 integer not null,
+   p_color varchar(11) not null,
+   p_type varchar(25) not null,
+   p_size integer not null,
+   p_container char(10) not null
+);
+
+create table supplier ( -- supplier dimension; s_city, s_nation and s_region are stored as integer codes here
+   s_suppkey integer not null,
+   s_name char(25) not null,
+   s_address varchar(25) not null,
+   s_city integer not null,
+   s_nation integer not null,
+   s_region integer not null,
+   s_phone char(15) not null
+);
+
+create table customer ( -- customer dimension; c_city, c_nation and c_region are stored as integer codes here
+   c_custkey integer not null,
+   c_name varchar(25) not null,
+   c_address varchar(25) not null,
+   c_city integer not null,
+   c_nation integer not null,
+   c_region integer not null,
+   c_phone char(15) not null,
+   c_mktsegment char(10) not null
+);
+
+create table ddate ( -- date dimension; the queries join it via lo_orderdate = d_datekey
+   d_datekey integer not null,
+   d_date char(18) not null,
+   d_dayofweek char(9) not null,
+   d_month char(9) not null,
+   d_year integer not null,
+   d_yearmonthnum integer not null,
+   d_yearmonth char(7) not null,
+   d_daynuminweek integer not null,
+   d_daynuminmonth integer not null,
+   d_daynuminyear integer not null,
+   d_monthnuminyear integer not null,
+   d_weeknuminyear integer not null,
+   d_sellingseason varchar(12) not null,
+   d_lastdayinweekfl integer not null,
+   d_lastdayinmonthfl integer not null,
+   d_holidayfl integer not null,
+   d_weekdayfl integer not null
+);
diff --git a/data/util.py b/data/util.py
new file mode 100755
index 0000000..59bc558
--- /dev/null
+++ b/data/util.py
@@ -0,0 +1,80 @@
+import argparse
+import os
+
+
+class cd:
+    """Context manager for changing the current working directory"""
+
+    def __init__(self, newPath):
+        self.newPath = os.path.expanduser(newPath)
+
+    def __enter__(self):
+        self.savedPath = os.getcwd()
+        os.chdir(self.newPath)
+
+    def __exit__(self, etype, value, traceback):
+        os.chdir(self.savedPath)
+
+
+def gen_data(dataset, scale_factor):
+    path = './' + dataset + '/dbgen/'
+    with cd(path):
+        os.system('rm -rf *.tbl')
+        os.system('./dbgen -s %d -T a' % scale_factor)
+        os.system('mkdir -p ../data/s%d' % scale_factor)
+        os.system('mv *.tbl ../data/s%d/' % scale_factor)
+
+
+def transform(dataset, scale_factor):
+    path = './' + dataset + '/loader/'
+    ip = '../data/s%d/' % scale_factor
+    op = '../data/s%d_columnar/' % scale_factor
+    with cd(path):
+        os.system('mkdir -p %s' % op)
+        os.system('python3 convert.py ../data/s%d/' % scale_factor)
+        os.system(
+            './loader --lineorder %s/lineorder.tbl --ddate %s/date.tbl --customer %s/customer.tbl.p --supplier %s/supplier.tbl.p --part %s/part.tbl.p --datadir %s' % (
+                ip, ip, ip, ip, ip, op))
+
+
+def sort(dataset, scale_factor):
+    path = './' + dataset + '/loader/'
+    ip = '../data/s%d/' % scale_factor
+    op = '../data/s%d_columnar_sorted/' % scale_factor
+    with cd(path):
+        os.system('mkdir -p %s' % op)
+        os.system('python3 convert.py ../data/s%d/' % scale_factor)
+        os.system('python3 sort.py ../data/s%d/' % scale_factor)
+        os.system(
+            './loader --lineorder %s/lineorder.tbl.s --ddate %s/date.tbl --customer %s/customer.tbl.p --supplier %s/supplier.tbl.p --part %s/part.tbl.p --datadir %s' % (
+                ip, ip, ip, ip, ip, op))
+
+
+def sort_other_way(dataset, scale_factor):
+    path = './' + dataset + '/loader/'
+    ip = '../data/s%d/' % scale_factor
+    op = '../data/s%d_columnar_sorted_other_way/' % scale_factor
+    with cd(path):
+        os.system('mkdir -p %s' % op)
+        os.system('python3 convert.py ../data/s%d/' % scale_factor)
+        os.system('python3 sort_other_way.py ../data/s%d/' % scale_factor)
+        os.system(
+            './loader --lineorder %s/lineorder.tbl.s --ddate %s/date.tbl --customer %s/customer.tbl.p --supplier %s/supplier.tbl.p --part %s/part.tbl.p --datadir %s' % (
+                ip, ip, ip, ip, ip, op))
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='data gen')
+    parser.add_argument('dataset', type=str, choices=['ssb'])
+    parser.add_argument('scale_factor', type=int)
+    parser.add_argument('action', type=str, choices=['gen', 'transform', 'sort', 'sort_other_way'])
+    args = parser.parse_args()
+
+    if args.action == 'gen':
+        gen_data(args.dataset, args.scale_factor)
+    elif args.action == 'transform':
+        transform(args.dataset, args.scale_factor)
+    elif args.action == 'sort':
+        sort(args.dataset, args.scale_factor)
+    elif args.action == 'sort_other_way':
+        sort_other_way(args.dataset, args.scale_factor)
diff --git a/fastlanes/CMakeLists.txt b/fastlanes/CMakeLists.txt
new file mode 100644
index 0000000..06b89c9
--- /dev/null
+++ b/fastlanes/CMakeLists.txt
@@ -0,0 +1,8 @@
+# Generated Code : added first, before the src and example directories -----------------------------------------------
+add_subdirectory(generated)
+
+# Source : -------------------------------------------------------------------------------------------------------------
+add_subdirectory(src)
+
+# Example : defines the fastlanes_bench_bitpack and fastlanes_bench_delta executables ---------------------------------
+add_subdirectory(example)
diff --git a/fastlanes/example/CMakeLists.txt b/fastlanes/example/CMakeLists.txt
new file mode 100644
index 0000000..bf59ccd
--- /dev/null
+++ b/fastlanes/example/CMakeLists.txt
@@ -0,0 +1,6 @@
+add_executable(fastlanes_bench_bitpack fastlanes_bench_bitpack.cu) # host pack / GPU unpack round-trip example
+target_link_libraries(fastlanes_bench_bitpack PUBLIC fastlanes_gpu)
+
+# Delta example; links against the same fastlanes_gpu target as the bit-packing example.
+add_executable(fastlanes_bench_delta fastlanes_bench_delta.cu)
+target_link_libraries(fastlanes_bench_delta PUBLIC fastlanes_gpu)
diff --git a/fastlanes/example/fastlanes_bench_bitpack.cu b/fastlanes/example/fastlanes_bench_bitpack.cu
new file mode 100644
index 0000000..001d5af
--- /dev/null
+++ b/fastlanes/example/fastlanes_bench_bitpack.cu
@@ -0,0 +1,86 @@
+#include "fastlanes.cuh"
+#include "debug.hpp"
+#include "fls_gen/pack/pack.hpp"
+#include "fls_gen/unpack/unpack.cuh"
+#include <iostream>
+
+int main() {
+	/* Packs a constant-valued column on the host, unpacks it with unpack_global on the GPU and verifies it. */
+	auto bitwidth = 13;
+
+	/* Prints the success marker of the step that just finished, then the banner of the next step. */
+	auto next_step = [](const char* title) {
+		std::cout << fastlanes::debug::green << "-- successful ! " << fastlanes::debug::def << '\n';
+		std::cout << "------------------------------------ \n" << title;
+	};
+
+	/* Init */
+	std::cout << "------------------------------------ \n";
+	std::cout << "-- Init :  \n";
+	CUDA_SAFE_CALL(cudaDeviceSynchronize());
+
+	const uint64_t warp_sz         = 32;
+	const uint64_t n_vec           = 256 * 1024;
+	const uint64_t vec_sz          = 1024;
+	const uint64_t n_tup           = vec_sz * n_vec;
+	const uint64_t v_blc_sz        = 1;
+	const uint64_t n_blc           = n_vec / v_blc_sz;
+	const uint64_t n_trd           = v_blc_sz * warp_sz;
+	auto*          h_org_arr       = new uint32_t[n_tup];
+	auto*          h_encoded_data  = new uint32_t[n_tup];
+	uint64_t       encoded_arr_bsz = n_tup * sizeof(int);
+	uint32_t*      d_decoded_arr   = nullptr;
+	auto*          h_decoded_arr   = new uint32_t[n_tup];
+	CUDA_SAFE_CALL(cudaMalloc((void**)&d_decoded_arr, sizeof(uint32_t) * n_tup));
+	uint32_t mask = (1 << bitwidth) - 1;
+
+	static_assert(n_tup % n_trd == 0, "");
+	next_step("-- Generate : \n");
+	/* Every tuple gets the same value (5), masked to `bitwidth` bits; the data is not random. */
+	for (uint64_t i = 0; i < n_tup; i++) {
+		h_org_arr[i] = 5 & mask;
+	}
+
+	next_step("-- Encode :  \n");
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	next_step("-- Load encoded data into GPU : \n");
+	auto* d_encoded_arr = fastlanes::gpu::load_arr(h_encoded_data, encoded_arr_bsz);
+	CUDA_SAFE_CALL(cudaDeviceSynchronize());
+
+	next_step("-- Decode : \n");
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaDeviceSynchronize());
+
+	next_step("-- Copy data to host :  \n");
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	CUDA_SAFE_CALL(cudaDeviceSynchronize());
+
+	next_step("-- Test :  \n");
+	int status = 0; /* becomes -1 on the first mismatch; also the process exit code */
+	for (uint64_t i = 0; i < n_tup; i++) {
+		if (h_org_arr[i] != h_decoded_arr[i]) {
+			std::cout << fastlanes::debug::red << "-- ERROR: idx | " << i << " : " << h_org_arr[i]
+			          << " != " << h_decoded_arr[i] << fastlanes::debug::def << '\n';
+			status = -1;
+			break;
+		}
+	}
+	if (status == 0) {
+		std::cout << fastlanes::debug::green << "-- successful ! " << fastlanes::debug::def << '\n';
+	}
+
+	/* Release the buffers allocated directly in this function. */
+	/* NOTE(review): d_encoded_arr comes from load_arr; release it according to that helper's contract. */
+	CUDA_SAFE_CALL(cudaFree(d_decoded_arr));
+	delete[] h_org_arr;
+	delete[] h_encoded_data;
+	delete[] h_decoded_arr;
+	return status;
+}
diff --git a/fastlanes/example/fastlanes_bench_delta.cu b/fastlanes/example/fastlanes_bench_delta.cu
new file mode 100644
index 0000000..13e3a48
--- /dev/null
+++ b/fastlanes/example/fastlanes_bench_delta.cu
@@ -0,0 +1,194 @@
+#include "fastlanes.cuh"
+#include "debug.hpp"
+#include "fls_gen/pack/pack.hpp"
+#include "fls_gen/rsum/rsum.cuh"
+#include "fls_gen/transpose/transpose.hpp"
+#include "fls_gen/unrsum/unrsum.hpp"
+#include <cstring>
+
+// Unpacks one 3-bit-packed vector of 1024 values per block (32 values per thread) into shared memory,
+// then passes the shared tile, this block's output slice and `base` to d_rsum_32 (not defined in this file).
+__global__  void bfr_3bw_32ow_32crw_1uf_krl_v0(uint32_t* in, uint32_t* out, uint32_t* base) {
+	uint32_t trd_idx = threadIdx.x;
+	uint32_t blc_idx = blockIdx.x;
+	in               = in + ((blc_idx * 3) << 5); // 3 * 32 = 96 packed words per block
+	out              = out + (blc_idx << 10); // 1024 output values per block
+	trd_idx          = trd_idx % 32; // lane 0..31; each lane reads the words at offsets trd_idx, +32 and +64
+	uint32_t r_0; // current packed input word
+	uint32_t r_1; // value being assembled
+	__shared__ uint32_t sm_arr[1024]; // value k of lane t is stored at sm_arr[t + k * 32]
+	r_0                        = *(in + (0 * 32) + (trd_idx * 1) + 0); // packed word 0: values 0..9 and the low 2 bits of value 10
+	r_1                        = (r_0) & ((1ULL << 3) - 1);
+	sm_arr[trd_idx + (0 * 32)] = r_1;
+	r_1                        = (r_0 >> 3) & ((1ULL << 3) - 1);
+	sm_arr[trd_idx + (1 * 32)] = r_1;
+	r_1                        = (r_0 >> 6) & ((1ULL << 3) - 1);
+	sm_arr[trd_idx + (2 * 32)] = r_1;
+	r_1                        = (r_0 >> 9) & ((1ULL << 3) - 1);
+	sm_arr[trd_idx + (3 * 32)] = r_1;
+	r_1                        = (r_0 >> 12) & ((1ULL << 3) - 1);
+	sm_arr[trd_idx + (4 * 32)] = r_1;
+	r_1                        = (r_0 >> 15) & ((1ULL << 3) - 1);
+	sm_arr[trd_idx + (5 * 32)] = r_1;
+	r_1                        = (r_0 >> 18) & ((1ULL << 3) - 1);
+	sm_arr[trd_idx + (6 * 32)] = r_1;
+	r_1                        = (r_0 >> 21) & ((1ULL << 3) - 1);
+	sm_arr[trd_idx + (7 * 32)] = r_1;
+	r_1                        = (r_0 >> 24) & ((1ULL << 3) - 1);
+	sm_arr[trd_idx + (8 * 32)] = r_1;
+	r_1                        = (r_0 >> 27) & ((1ULL << 3) - 1);
+	sm_arr[trd_idx + (9 * 32)] = r_1;
+	r_1                        = (r_0 >> 30) & ((1ULL << 2) - 1); // low 2 bits of value 10 (top of word 0)
+	r_0                        = *(in + (0 * 32) + (trd_idx * 1) + 32); // packed word 1
+	r_1 |= ((r_0) & ((1ULL << 1) - 1)) << 2; // bit 0 of word 1 is the high bit of value 10
+	sm_arr[trd_idx + (10 * 32)] = r_1;
+	r_1                         = (r_0 >> 1) & ((1ULL << 3) - 1);
+	sm_arr[trd_idx + (11 * 32)] = r_1;
+	r_1                         = (r_0 >> 4) & ((1ULL << 3) - 1);
+	sm_arr[trd_idx + (12 * 32)] = r_1;
+	r_1                         = (r_0 >> 7) & ((1ULL << 3) - 1);
+	sm_arr[trd_idx + (13 * 32)] = r_1;
+	r_1                         = (r_0 >> 10) & ((1ULL << 3) - 1);
+	sm_arr[trd_idx + (14 * 32)] = r_1;
+	r_1                         = (r_0 >> 13) & ((1ULL << 3) - 1);
+	sm_arr[trd_idx + (15 * 32)] = r_1;
+	r_1                         = (r_0 >> 16) & ((1ULL << 3) - 1);
+	sm_arr[trd_idx + (16 * 32)] = r_1;
+	r_1                         = (r_0 >> 19) & ((1ULL << 3) - 1);
+	sm_arr[trd_idx + (17 * 32)] = r_1;
+	r_1                         = (r_0 >> 22) & ((1ULL << 3) - 1);
+	sm_arr[trd_idx + (18 * 32)] = r_1;
+	r_1                         = (r_0 >> 25) & ((1ULL << 3) - 1);
+	sm_arr[trd_idx + (19 * 32)] = r_1;
+	r_1                         = (r_0 >> 28) & ((1ULL << 3) - 1);
+	sm_arr[trd_idx + (20 * 32)] = r_1;
+	r_1                         = (r_0 >> 31) & ((1ULL << 1) - 1); // low bit of value 21 (top of word 1)
+	r_0                         = *(in + (0 * 32) + (trd_idx * 1) + 64); // packed word 2
+	r_1 |= ((r_0) & ((1ULL << 2) - 1)) << 1; // low 2 bits of word 2 are the high bits of value 21
+	sm_arr[trd_idx + (21 * 32)] = r_1;
+	r_1                         = (r_0 >> 2) & ((1ULL << 3) - 1);
+	sm_arr[trd_idx + (22 * 32)] = r_1;
+	r_1                         = (r_0 >> 5) & ((1ULL << 3) - 1);
+	sm_arr[trd_idx + (23 * 32)] = r_1;
+	r_1                         = (r_0 >> 8) & ((1ULL << 3) - 1);
+	sm_arr[trd_idx + (24 * 32)] = r_1;
+	r_1                         = (r_0 >> 11) & ((1ULL << 3) - 1);
+	sm_arr[trd_idx + (25 * 32)] = r_1;
+	r_1                         = (r_0 >> 14) & ((1ULL << 3) - 1);
+	sm_arr[trd_idx + (26 * 32)] = r_1;
+	r_1                         = (r_0 >> 17) & ((1ULL << 3) - 1);
+	sm_arr[trd_idx + (27 * 32)] = r_1;
+	r_1                         = (r_0 >> 20) & ((1ULL << 3) - 1);
+	sm_arr[trd_idx + (28 * 32)] = r_1;
+	r_1                         = (r_0 >> 23) & ((1ULL << 3) - 1);
+	sm_arr[trd_idx + (29 * 32)] = r_1;
+	r_1                         = (r_0 >> 26) & ((1ULL << 3) - 1);
+	sm_arr[trd_idx + (30 * 32)] = r_1;
+	r_1                         = (r_0 >> 29) & ((1ULL << 3) - 1);
+	sm_arr[trd_idx + (31 * 32)] = r_1;
+	// NOTE(review): no __syncthreads() and no per-block offset on `base` here; confirm d_rsum_32 accounts for both.
+	d_rsum_32(sm_arr, out, base);
+}
+
+// Delta round-trip demo: transpose + unrsum + 3-bit pack each vector on the host, decode on the GPU, verify.
+int main() {
+	/* Init */
+	std::cout << "------------------------------------ \n";
+	std::cout << "-- Init :  \n";
+	CUDA_SAFE_CALL(cudaDeviceSynchronize());
+
+	const uint64_t warp_sz          = 32;
+	const uint64_t n_vec            = 256 * 1024;
+	const uint64_t vec_sz           = 1024;
+	const uint64_t n_tup            = vec_sz * n_vec;
+	const uint64_t v_blc_sz         = 1;
+	const uint64_t n_blc            = n_vec / v_blc_sz;
+	const uint64_t n_trd            = v_blc_sz * warp_sz;
+	auto*          h_org_arr        = new uint32_t[n_tup];
+	auto*          h_encoded_data   = new uint32_t[n_tup];
+	auto*          h_decoded_arr    = new uint32_t[n_tup];
+	auto*          h_transposed_arr = new uint32_t[vec_sz];
+	auto*          h_unrsummed_arr  = new uint32_t[vec_sz];
+	auto*          h_base_arr       = new uint32_t[32 * n_vec];
+	uint64_t       encoded_arr_bsz  = n_tup * sizeof(int);
+	uint32_t*      d_base_arr       = nullptr;
+	uint32_t*      d_decoded_arr    = nullptr;
+	uint32_t*      d_encoded_arr    = nullptr;
+	uint8_t        num_bits         = 3;
+
+	CUDA_SAFE_CALL(cudaMalloc((void**)&d_decoded_arr, sizeof(uint32_t) * n_tup));
+
+	static_assert(n_tup % n_trd == 0, "");
+	std::cout << fastlanes::debug::green << "-- successful ! " << fastlanes::debug::def << '\n';
+	std::cout << "------------------------------------ \n";
+	std::cout << "-- Generate : \n";
+	FLS_SHOW(n_vec)
+	FLS_SHOW(n_tup)
+
+	/* fill every vector with the same 0..1023 ramp (deterministic, not random). */
+	for (uint64_t i = 0; i < n_tup; i++) {
+		h_org_arr[i] = i % vec_sz;
+	}
+
+	std::cout << fastlanes::debug::green << "-- successful ! " << fastlanes::debug::def << '\n';
+	std::cout << "------------------------------------ \n";
+	std::cout << "-- Encode :  \n";
+
+	auto in_als   = h_org_arr;
+	auto out_als  = h_encoded_data;
+	auto base_als = h_base_arr;
+
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::transpose::fallback::scalar::transpose_i(in_als, h_transposed_arr);
+
+		generated::unrsum::fallback::scalar::unrsum(h_transposed_arr, h_unrsummed_arr);
+
+		std::memcpy(base_als, h_transposed_arr, sizeof(uint32_t) * 32); // first 32 transposed values are this vector's bases
+
+		generated::pack::fallback::scalar::pack(h_unrsummed_arr, out_als, num_bits);
+
+		in_als   = in_als + vec_sz;
+		out_als  = out_als + (num_bits * vec_sz / 32);
+		base_als = base_als + 32;
+	}
+
+	std::cout << fastlanes::debug::green << "-- successful ! " << fastlanes::debug::def << '\n';
+	std::cout << "------------------------------------ \n";
+	std::cout << "-- Load encoded data into GPU : \n";
+
+	d_encoded_arr = fastlanes::gpu::load_arr(h_encoded_data, encoded_arr_bsz);
+	d_base_arr    = fastlanes::gpu::load_arr(h_base_arr, 32 * n_vec * sizeof(uint32_t));
+	CUDA_SAFE_CALL(cudaDeviceSynchronize());
+
+	std::cout << fastlanes::debug::green << "-- successful ! " << fastlanes::debug::def << '\n';
+	std::cout << "------------------------------------ \n";
+	std::cout << "-- Decode : \n";
+
+	bfr_3bw_32ow_32crw_1uf_krl_v0<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, d_base_arr);
+	CUDA_SAFE_CALL(cudaDeviceSynchronize());
+
+	std::cout << fastlanes::debug::green << "-- successful ! " << fastlanes::debug::def << '\n';
+	std::cout << "------------------------------------ \n";
+	std::cout << "-- Copy data to host :  \n";
+
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	CUDA_SAFE_CALL(cudaDeviceSynchronize());
+
+	std::cout << fastlanes::debug::green << "-- successful ! " << fastlanes::debug::def << '\n';
+	std::cout << "------------------------------------ \n";
+	std::cout << "-- Test :  \n";
+
+	/* All vectors carry the same values, so the transposed copy of the last encoded
+	 * vector, still held in h_transposed_arr, is what every decoded vector is compared
+	 * against; the error message reports that same expected value. */
+	for (uint64_t idx = 0; idx < n_tup; idx++) {
+		const uint32_t expected = h_transposed_arr[idx % vec_sz];
+		if (expected != h_decoded_arr[idx]) {
+			std::cout << fastlanes::debug::red << "-- ERROR: idx | " << idx << " : " << expected
+			          << " != " << h_decoded_arr[idx] << fastlanes::debug::def << '\n';
+			return -1;
+		}
+	}
+
+	std::cout << fastlanes::debug::green << "-- successful ! " << fastlanes::debug::def << '\n';
+}
diff --git a/fastlanes/generate.py b/fastlanes/generate.py
new file mode 100644
index 0000000..a0fdd75
--- /dev/null
+++ b/fastlanes/generate.py
@@ -0,0 +1,15 @@
+from fls_gen.generate_bitpack_lib import *
+from fls_gen.tools import *
+
+
+def main():
+    dir_path = "./generated/"
+
+    creat_if_not_exist(dir_path)
+    clear_prev_generation()
+    generate_bitpack_lib()
+    clang_format()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/fastlanes/generated/CMakeLists.txt b/fastlanes/generated/CMakeLists.txt
new file mode 100644
index 0000000..424be87
--- /dev/null
+++ b/fastlanes/generated/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(cuda)
\ No newline at end of file
diff --git a/fastlanes/generated/cuda/CMakeLists.txt b/fastlanes/generated/cuda/CMakeLists.txt
new file mode 100644
index 0000000..73384d5
--- /dev/null
+++ b/fastlanes/generated/cuda/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_subdirectory(normal_t32_uf1)
+add_subdirectory(fused_t32_uf1)
diff --git a/fastlanes/generated/cuda/fused_t32_uf1/CMakeLists.txt b/fastlanes/generated/cuda/fused_t32_uf1/CMakeLists.txt
new file mode 100644
index 0000000..bffbf54
--- /dev/null
+++ b/fastlanes/generated/cuda/fused_t32_uf1/CMakeLists.txt
@@ -0,0 +1,38 @@
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/pack.cmake")  # each fragment below is optional: it is included only when the file exists
+    include(${CMAKE_CURRENT_SOURCE_DIR}/pack.cmake)
+else()  # empty branch: nothing happens when the fragment is missing (same for every block below)
+endif()
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/unpack.cmake")
+    include(${CMAKE_CURRENT_SOURCE_DIR}/unpack.cmake)
+else()
+endif()
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/unffor.cmake")
+    include(${CMAKE_CURRENT_SOURCE_DIR}/unffor.cmake)
+else()
+endif()
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/ffor.cmake")
+    include(${CMAKE_CURRENT_SOURCE_DIR}/ffor.cmake)
+else()
+endif()
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/unrsum.cmake")
+    include(${CMAKE_CURRENT_SOURCE_DIR}/unrsum.cmake)
+else()
+endif()
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/rsum.cmake")
+    include(${CMAKE_CURRENT_SOURCE_DIR}/rsum.cmake)
+else()
+endif()
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/untranspose.cmake")
+    include(${CMAKE_CURRENT_SOURCE_DIR}/untranspose.cmake)
+else()
+endif()
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/transpose.cmake")
+    include(${CMAKE_CURRENT_SOURCE_DIR}/transpose.cmake)
+else()
+endif()
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/falp.cmake")
+    include(${CMAKE_CURRENT_SOURCE_DIR}/falp.cmake)
+else()
+endif()
+set(FLS_GENERATED_OBJECT_FILES  # re-export the list to the parent scope; NOTE(review): presumably the fragments above append to it
+        ${FLS_GENERATED_OBJECT_FILES} PARENT_SCOPE)
diff --git a/fastlanes/generated/cuda/fused_t32_uf1/cuda_fused_t32_1024_uf1_unpack_bench.cu b/fastlanes/generated/cuda/fused_t32_uf1/cuda_fused_t32_1024_uf1_unpack_bench.cu
new file mode 100644
index 0000000..79d79f8
--- /dev/null
+++ b/fastlanes/generated/cuda/fused_t32_uf1/cuda_fused_t32_1024_uf1_unpack_bench.cu
@@ -0,0 +1,1047 @@
+// generated!
+
+#include "fastlanes.cuh"
+#include "fls_gen/pack/pack.hpp"
+#include "fls_gen/unpack/unpack_fused.cuh"
+#include <iostream>
+
+const uint64_t warp_sz         = 32;
+const uint64_t n_vec           = 256 * 1024; // number of vectors processed per bench
+const uint64_t vec_sz          = 1024; // values per vector
+const uint64_t n_tup           = vec_sz * n_vec; // total number of values
+const uint64_t v_blc_sz        = 1; // presumably vectors per thread block; TODO confirm
+const uint64_t n_blc           = n_vec / v_blc_sz; // grid size used by the kernel launches in this file
+const uint64_t n_trd           = v_blc_sz * warp_sz; // block size used by the kernel launches in this file
+auto*          h_org_arr       = new uint32_t[n_tup]; // host: input values, refilled by every bench
+auto*          h_encoded_data  = new uint32_t[n_tup]; // host: packed data; one 32-bit word reserved per value
+uint64_t       encoded_arr_bsz = n_tup * sizeof(int); // byte size passed to load_to_gpu by every bench
+uint32_t*      d_decoded_arr   = nullptr; // device output; NOTE(review): no allocation visible here - confirm it is set before a bench runs
+auto*          h_decoded_arr   = new uint32_t[n_tup]; // host: copy of the device output used for the comparison
+
+/* Round-trip check for bitwidth 0: pack on the host, unpack on the GPU, compare, print the verdict. */
+static void bench0_unpack_0bw_32ow_32crw_1uf() {
+	auto bitwidth = 0;
+	if (bitwidth == 32) { bitwidth = 31; };
+	/* generate random numbers. */
+	for (uint64_t i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1 << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+
+	/* stop at the first mismatch, but fall through so CLEANUP also runs on failure. */
+	bool matches = true;
+	for (uint64_t i = 0; matches && i < n_tup; i++) {
+		matches = (h_org_arr[i] == h_decoded_arr[i]);
+	}
+	std::cout << bitwidth << (matches ? " succes!" : " failed!") << std::endl;
+
+	CLEANUP(d_encoded_arr);
+}
+/* Round-trip check for bitwidth 1: pack on the host, unpack on the GPU, compare, print the verdict. */
+static void bench1_unpack_1bw_32ow_32crw_1uf() {
+	auto bitwidth = 1;
+	if (bitwidth == 32) { bitwidth = 31; };
+	/* generate random numbers. */
+	for (uint64_t i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1 << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+
+	/* stop at the first mismatch, but fall through so CLEANUP also runs on failure. */
+	bool matches = true;
+	for (uint64_t i = 0; matches && i < n_tup; i++) {
+		matches = (h_org_arr[i] == h_decoded_arr[i]);
+	}
+	std::cout << bitwidth << (matches ? " succes!" : " failed!") << std::endl;
+
+	CLEANUP(d_encoded_arr);
+}
+/* Round-trip check for bitwidth 2: pack on the host, unpack on the GPU, compare, print the verdict. */
+static void bench2_unpack_2bw_32ow_32crw_1uf() {
+	auto bitwidth = 2;
+	if (bitwidth == 32) { bitwidth = 31; };
+	/* generate random numbers. */
+	for (uint64_t i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1 << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+
+	/* stop at the first mismatch, but fall through so CLEANUP also runs on failure. */
+	bool matches = true;
+	for (uint64_t i = 0; matches && i < n_tup; i++) {
+		matches = (h_org_arr[i] == h_decoded_arr[i]);
+	}
+	std::cout << bitwidth << (matches ? " succes!" : " failed!") << std::endl;
+
+	CLEANUP(d_encoded_arr);
+}
+/* Round-trip check for bitwidth 3: pack on the host, unpack on the GPU, compare, print the verdict. */
+static void bench3_unpack_3bw_32ow_32crw_1uf() {
+	auto bitwidth = 3;
+	if (bitwidth == 32) { bitwidth = 31; };
+	/* generate random numbers. */
+	for (uint64_t i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1 << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+
+	/* stop at the first mismatch, but fall through so CLEANUP also runs on failure. */
+	bool matches = true;
+	for (uint64_t i = 0; matches && i < n_tup; i++) {
+		matches = (h_org_arr[i] == h_decoded_arr[i]);
+	}
+	std::cout << bitwidth << (matches ? " succes!" : " failed!") << std::endl;
+
+	CLEANUP(d_encoded_arr);
+}
+/* Round-trip check for bitwidth 4: pack on the host, unpack on the GPU, compare, print the verdict. */
+static void bench4_unpack_4bw_32ow_32crw_1uf() {
+	auto bitwidth = 4;
+	if (bitwidth == 32) { bitwidth = 31; };
+	/* generate random numbers. */
+	for (uint64_t i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1 << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+
+	/* stop at the first mismatch, but fall through so CLEANUP also runs on failure. */
+	bool matches = true;
+	for (uint64_t i = 0; matches && i < n_tup; i++) {
+		matches = (h_org_arr[i] == h_decoded_arr[i]);
+	}
+	std::cout << bitwidth << (matches ? " succes!" : " failed!") << std::endl;
+
+	CLEANUP(d_encoded_arr);
+}
+/* Round-trip check for bitwidth 5: pack on the host, unpack on the GPU, compare, print the verdict. */
+static void bench5_unpack_5bw_32ow_32crw_1uf() {
+	auto bitwidth = 5;
+	if (bitwidth == 32) { bitwidth = 31; };
+	/* generate random numbers. */
+	for (uint64_t i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1 << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+
+	/* stop at the first mismatch, but fall through so CLEANUP also runs on failure. */
+	bool matches = true;
+	for (uint64_t i = 0; matches && i < n_tup; i++) {
+		matches = (h_org_arr[i] == h_decoded_arr[i]);
+	}
+	std::cout << bitwidth << (matches ? " succes!" : " failed!") << std::endl;
+
+	CLEANUP(d_encoded_arr);
+}
+/* Round-trip check for bitwidth 6: pack on the host, unpack on the GPU, compare, print the verdict. */
+static void bench6_unpack_6bw_32ow_32crw_1uf() {
+	auto bitwidth = 6;
+	if (bitwidth == 32) { bitwidth = 31; };
+	/* generate random numbers. */
+	for (uint64_t i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1 << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+
+	/* stop at the first mismatch, but fall through so CLEANUP also runs on failure. */
+	bool matches = true;
+	for (uint64_t i = 0; matches && i < n_tup; i++) {
+		matches = (h_org_arr[i] == h_decoded_arr[i]);
+	}
+	std::cout << bitwidth << (matches ? " succes!" : " failed!") << std::endl;
+
+	CLEANUP(d_encoded_arr);
+}
+/* Round-trip check for bitwidth 7: pack on the host, unpack on the GPU, compare, print the verdict. */
+static void bench7_unpack_7bw_32ow_32crw_1uf() {
+	auto bitwidth = 7;
+	if (bitwidth == 32) { bitwidth = 31; };
+	/* generate random numbers. */
+	for (uint64_t i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1 << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+
+	/* stop at the first mismatch, but fall through so CLEANUP also runs on failure. */
+	bool matches = true;
+	for (uint64_t i = 0; matches && i < n_tup; i++) {
+		matches = (h_org_arr[i] == h_decoded_arr[i]);
+	}
+	std::cout << bitwidth << (matches ? " succes!" : " failed!") << std::endl;
+
+	CLEANUP(d_encoded_arr);
+}
+/* Round-trip check for bitwidth 8: pack on the host, unpack on the GPU, compare, print the verdict. */
+static void bench8_unpack_8bw_32ow_32crw_1uf() {
+	auto bitwidth = 8;
+	if (bitwidth == 32) { bitwidth = 31; };
+	/* generate random numbers. */
+	for (uint64_t i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1 << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+
+	/* stop at the first mismatch, but fall through so CLEANUP also runs on failure. */
+	bool matches = true;
+	for (uint64_t i = 0; matches && i < n_tup; i++) {
+		matches = (h_org_arr[i] == h_decoded_arr[i]);
+	}
+	std::cout << bitwidth << (matches ? " succes!" : " failed!") << std::endl;
+
+	CLEANUP(d_encoded_arr);
+}
+/* Round-trip check for bitwidth 9: pack on the host, unpack on the GPU, compare, print the verdict. */
+static void bench9_unpack_9bw_32ow_32crw_1uf() {
+	auto bitwidth = 9;
+	if (bitwidth == 32) { bitwidth = 31; };
+	/* generate random numbers. */
+	for (uint64_t i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1 << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+
+	/* stop at the first mismatch, but fall through so CLEANUP also runs on failure. */
+	bool matches = true;
+	for (uint64_t i = 0; matches && i < n_tup; i++) {
+		matches = (h_org_arr[i] == h_decoded_arr[i]);
+	}
+	std::cout << bitwidth << (matches ? " succes!" : " failed!") << std::endl;
+
+	CLEANUP(d_encoded_arr);
+}
+/* Round-trip check for bitwidth 10: pack on the host, unpack on the GPU, compare, print the verdict. */
+static void bench10_unpack_10bw_32ow_32crw_1uf() {
+	auto bitwidth = 10;
+	if (bitwidth == 32) { bitwidth = 31; };
+	/* generate random numbers. */
+	for (uint64_t i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1 << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+
+	/* stop at the first mismatch, but fall through so CLEANUP also runs on failure. */
+	bool matches = true;
+	for (uint64_t i = 0; matches && i < n_tup; i++) {
+		matches = (h_org_arr[i] == h_decoded_arr[i]);
+	}
+	std::cout << bitwidth << (matches ? " succes!" : " failed!") << std::endl;
+
+	CLEANUP(d_encoded_arr);
+}
+/* Round-trip check for bitwidth 11: pack on the host, unpack on the GPU, compare, print the verdict. */
+static void bench11_unpack_11bw_32ow_32crw_1uf() {
+	auto bitwidth = 11;
+	if (bitwidth == 32) { bitwidth = 31; };
+	/* generate random numbers. */
+	for (uint64_t i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1 << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+
+	/* stop at the first mismatch, but fall through so CLEANUP also runs on failure. */
+	bool matches = true;
+	for (uint64_t i = 0; matches && i < n_tup; i++) {
+		matches = (h_org_arr[i] == h_decoded_arr[i]);
+	}
+	std::cout << bitwidth << (matches ? " succes!" : " failed!") << std::endl;
+
+	CLEANUP(d_encoded_arr);
+}
+/* Round-trip check for bitwidth 12: pack on the host, unpack on the GPU, compare, print the verdict. */
+static void bench12_unpack_12bw_32ow_32crw_1uf() {
+	auto bitwidth = 12;
+	if (bitwidth == 32) { bitwidth = 31; };
+	/* generate random numbers. */
+	for (uint64_t i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1 << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+
+	/* stop at the first mismatch, but fall through so CLEANUP also runs on failure. */
+	bool matches = true;
+	for (uint64_t i = 0; matches && i < n_tup; i++) {
+		matches = (h_org_arr[i] == h_decoded_arr[i]);
+	}
+	std::cout << bitwidth << (matches ? " succes!" : " failed!") << std::endl;
+
+	CLEANUP(d_encoded_arr);
+}
+/* Round-trip check for bitwidth 13: pack on the host, unpack on the GPU, compare, print the verdict. */
+static void bench13_unpack_13bw_32ow_32crw_1uf() {
+	auto bitwidth = 13;
+	if (bitwidth == 32) { bitwidth = 31; };
+	/* generate random numbers. */
+	for (uint64_t i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1 << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+
+	/* stop at the first mismatch, but fall through so CLEANUP also runs on failure. */
+	bool matches = true;
+	for (uint64_t i = 0; matches && i < n_tup; i++) {
+		matches = (h_org_arr[i] == h_decoded_arr[i]);
+	}
+	std::cout << bitwidth << (matches ? " succes!" : " failed!") << std::endl;
+
+	CLEANUP(d_encoded_arr);
+}
+/* Round-trip check for bitwidth 14: pack on the host, unpack on the GPU, compare, print the verdict. */
+static void bench14_unpack_14bw_32ow_32crw_1uf() {
+	auto bitwidth = 14;
+	if (bitwidth == 32) { bitwidth = 31; };
+	/* generate random numbers. */
+	for (uint64_t i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1 << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+
+	/* stop at the first mismatch, but fall through so CLEANUP also runs on failure. */
+	bool matches = true;
+	for (uint64_t i = 0; matches && i < n_tup; i++) {
+		matches = (h_org_arr[i] == h_decoded_arr[i]);
+	}
+	std::cout << bitwidth << (matches ? " succes!" : " failed!") << std::endl;
+
+	CLEANUP(d_encoded_arr);
+}
+/* Round-trip check for bitwidth 15: pack on the host, unpack on the GPU, compare, print the verdict. */
+static void bench15_unpack_15bw_32ow_32crw_1uf() {
+	auto bitwidth = 15;
+	if (bitwidth == 32) { bitwidth = 31; };
+	/* generate random numbers. */
+	for (uint64_t i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1 << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+
+	/* stop at the first mismatch, but fall through so CLEANUP also runs on failure. */
+	bool matches = true;
+	for (uint64_t i = 0; matches && i < n_tup; i++) {
+		matches = (h_org_arr[i] == h_decoded_arr[i]);
+	}
+	std::cout << bitwidth << (matches ? " succes!" : " failed!") << std::endl;
+
+	CLEANUP(d_encoded_arr);
+}
+/* Round-trip check for bitwidth 16: pack on the host, unpack on the GPU, compare, print the verdict. */
+static void bench16_unpack_16bw_32ow_32crw_1uf() {
+	auto bitwidth = 16;
+	if (bitwidth == 32) { bitwidth = 31; };
+	/* generate random numbers. */
+	for (uint64_t i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1 << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+
+	/* stop at the first mismatch, but fall through so CLEANUP also runs on failure. */
+	bool matches = true;
+	for (uint64_t i = 0; matches && i < n_tup; i++) {
+		matches = (h_org_arr[i] == h_decoded_arr[i]);
+	}
+	std::cout << bitwidth << (matches ? " succes!" : " failed!") << std::endl;
+
+	CLEANUP(d_encoded_arr);
+}
+/* Round-trip check for bitwidth 17: pack on the host, unpack on the GPU, compare, print the verdict. */
+static void bench17_unpack_17bw_32ow_32crw_1uf() {
+	auto bitwidth = 17;
+	if (bitwidth == 32) { bitwidth = 31; };
+	/* generate random numbers. */
+	for (uint64_t i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1 << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+
+	/* stop at the first mismatch, but fall through so CLEANUP also runs on failure. */
+	bool matches = true;
+	for (uint64_t i = 0; matches && i < n_tup; i++) {
+		matches = (h_org_arr[i] == h_decoded_arr[i]);
+	}
+	std::cout << bitwidth << (matches ? " succes!" : " failed!") << std::endl;
+
+	CLEANUP(d_encoded_arr);
+}
+/* Round-trip check for bitwidth 18: pack on the host, unpack on the GPU, compare, print the verdict. */
+static void bench18_unpack_18bw_32ow_32crw_1uf() {
+	auto bitwidth = 18;
+	if (bitwidth == 32) { bitwidth = 31; };
+	/* generate random numbers. */
+	for (uint64_t i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1 << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+
+	/* stop at the first mismatch, but fall through so CLEANUP also runs on failure. */
+	bool matches = true;
+	for (uint64_t i = 0; matches && i < n_tup; i++) {
+		matches = (h_org_arr[i] == h_decoded_arr[i]);
+	}
+	std::cout << bitwidth << (matches ? " succes!" : " failed!") << std::endl;
+
+	CLEANUP(d_encoded_arr);
+}
+/* Round-trip check for bitwidth 19: pack on the host, unpack on the GPU, compare, print the verdict. */
+static void bench19_unpack_19bw_32ow_32crw_1uf() {
+	auto bitwidth = 19;
+	if (bitwidth == 32) { bitwidth = 31; };
+	/* generate random numbers. */
+	for (uint64_t i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1 << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+
+	/* stop at the first mismatch, but fall through so CLEANUP also runs on failure. */
+	bool matches = true;
+	for (uint64_t i = 0; matches && i < n_tup; i++) {
+		matches = (h_org_arr[i] == h_decoded_arr[i]);
+	}
+	std::cout << bitwidth << (matches ? " succes!" : " failed!") << std::endl;
+
+	CLEANUP(d_encoded_arr);
+}
+/* Round-trip check for bit width 20: fills h_org_arr with n_tup random values below 2^20, packs
+ * them on the host, unpacks with the unpack_global kernel and prints "20 succes!" or "20 failed!". */
+static void bench20_unpack_20bw_32ow_32crw_1uf() {
+	auto bitwidth = 20;
+	/* generate random numbers. */
+	for (int i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1 << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+
+	/* No early return on a mismatch, so that CLEANUP(d_encoded_arr) below always runs. */
+	bool ok = true;
+	for (int i = 0; ok && i < n_tup; i++) {
+		ok = (h_org_arr[i] == h_decoded_arr[i]);
+	}
+	std::cout << bitwidth << (ok ? " succes!" : " failed!") << std::endl;
+
+	CLEANUP(d_encoded_arr);
+}
+/* Round-trip check for bit width 21: fills h_org_arr with n_tup random values below 2^21, packs
+ * them on the host, unpacks with the unpack_global kernel and prints "21 succes!" or "21 failed!". */
+static void bench21_unpack_21bw_32ow_32crw_1uf() {
+	auto bitwidth = 21;
+	/* generate random numbers. */
+	for (int i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1 << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+
+	/* No early return on a mismatch, so that CLEANUP(d_encoded_arr) below always runs. */
+	bool ok = true;
+	for (int i = 0; ok && i < n_tup; i++) {
+		ok = (h_org_arr[i] == h_decoded_arr[i]);
+	}
+	std::cout << bitwidth << (ok ? " succes!" : " failed!") << std::endl;
+
+	CLEANUP(d_encoded_arr);
+}
+/* Round-trip check for bit width 22: fills h_org_arr with n_tup random values below 2^22, packs
+ * them on the host, unpacks with the unpack_global kernel and prints "22 succes!" or "22 failed!". */
+static void bench22_unpack_22bw_32ow_32crw_1uf() {
+	auto bitwidth = 22;
+	/* generate random numbers. */
+	for (int i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1 << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+
+	/* No early return on a mismatch, so that CLEANUP(d_encoded_arr) below always runs. */
+	bool ok = true;
+	for (int i = 0; ok && i < n_tup; i++) {
+		ok = (h_org_arr[i] == h_decoded_arr[i]);
+	}
+	std::cout << bitwidth << (ok ? " succes!" : " failed!") << std::endl;
+
+	CLEANUP(d_encoded_arr);
+}
+/* Round-trip check for bit width 23: fills h_org_arr with n_tup random values below 2^23, packs
+ * them on the host, unpacks with the unpack_global kernel and prints "23 succes!" or "23 failed!". */
+static void bench23_unpack_23bw_32ow_32crw_1uf() {
+	auto bitwidth = 23;
+	/* generate random numbers. */
+	for (int i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1 << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+
+	/* No early return on a mismatch, so that CLEANUP(d_encoded_arr) below always runs. */
+	bool ok = true;
+	for (int i = 0; ok && i < n_tup; i++) {
+		ok = (h_org_arr[i] == h_decoded_arr[i]);
+	}
+	std::cout << bitwidth << (ok ? " succes!" : " failed!") << std::endl;
+
+	CLEANUP(d_encoded_arr);
+}
+/* Round-trip check for bit width 24: fills h_org_arr with n_tup random values below 2^24, packs
+ * them on the host, unpacks with the unpack_global kernel and prints "24 succes!" or "24 failed!". */
+static void bench24_unpack_24bw_32ow_32crw_1uf() {
+	auto bitwidth = 24;
+	/* generate random numbers. */
+	for (int i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1 << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+
+	/* No early return on a mismatch, so that CLEANUP(d_encoded_arr) below always runs. */
+	bool ok = true;
+	for (int i = 0; ok && i < n_tup; i++) {
+		ok = (h_org_arr[i] == h_decoded_arr[i]);
+	}
+	std::cout << bitwidth << (ok ? " succes!" : " failed!") << std::endl;
+
+	CLEANUP(d_encoded_arr);
+}
+/* Round-trip check for bit width 25: fills h_org_arr with n_tup random values below 2^25, packs
+ * them on the host, unpacks with the unpack_global kernel and prints "25 succes!" or "25 failed!". */
+static void bench25_unpack_25bw_32ow_32crw_1uf() {
+	auto bitwidth = 25;
+	/* generate random numbers. */
+	for (int i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1 << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+
+	/* No early return on a mismatch, so that CLEANUP(d_encoded_arr) below always runs. */
+	bool ok = true;
+	for (int i = 0; ok && i < n_tup; i++) {
+		ok = (h_org_arr[i] == h_decoded_arr[i]);
+	}
+	std::cout << bitwidth << (ok ? " succes!" : " failed!") << std::endl;
+
+	CLEANUP(d_encoded_arr);
+}
+/* Round-trip check for bit width 26: fills h_org_arr with n_tup random values below 2^26, packs
+ * them on the host, unpacks with the unpack_global kernel and prints "26 succes!" or "26 failed!". */
+static void bench26_unpack_26bw_32ow_32crw_1uf() {
+	auto bitwidth = 26;
+	/* generate random numbers. */
+	for (int i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1 << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+
+	/* No early return on a mismatch, so that CLEANUP(d_encoded_arr) below always runs. */
+	bool ok = true;
+	for (int i = 0; ok && i < n_tup; i++) {
+		ok = (h_org_arr[i] == h_decoded_arr[i]);
+	}
+	std::cout << bitwidth << (ok ? " succes!" : " failed!") << std::endl;
+
+	CLEANUP(d_encoded_arr);
+}
+/* Round-trip check for bit width 27: fills h_org_arr with n_tup random values below 2^27, packs
+ * them on the host, unpacks with the unpack_global kernel and prints "27 succes!" or "27 failed!". */
+static void bench27_unpack_27bw_32ow_32crw_1uf() {
+	auto bitwidth = 27;
+	/* generate random numbers. */
+	for (int i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1 << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+
+	/* No early return on a mismatch, so that CLEANUP(d_encoded_arr) below always runs. */
+	bool ok = true;
+	for (int i = 0; ok && i < n_tup; i++) {
+		ok = (h_org_arr[i] == h_decoded_arr[i]);
+	}
+	std::cout << bitwidth << (ok ? " succes!" : " failed!") << std::endl;
+
+	CLEANUP(d_encoded_arr);
+}
+/* Round-trip check for bit width 28: fills h_org_arr with n_tup random values below 2^28, packs
+ * them on the host, unpacks with the unpack_global kernel and prints "28 succes!" or "28 failed!". */
+static void bench28_unpack_28bw_32ow_32crw_1uf() {
+	auto bitwidth = 28;
+	/* generate random numbers. */
+	for (int i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1 << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+
+	/* No early return on a mismatch, so that CLEANUP(d_encoded_arr) below always runs. */
+	bool ok = true;
+	for (int i = 0; ok && i < n_tup; i++) {
+		ok = (h_org_arr[i] == h_decoded_arr[i]);
+	}
+	std::cout << bitwidth << (ok ? " succes!" : " failed!") << std::endl;
+
+	CLEANUP(d_encoded_arr);
+}
+/* Round-trip check for bit width 29: fills h_org_arr with n_tup random values below 2^29, packs
+ * them on the host, unpacks with the unpack_global kernel and prints "29 succes!" or "29 failed!". */
+static void bench29_unpack_29bw_32ow_32crw_1uf() {
+	auto bitwidth = 29;
+	/* generate random numbers. */
+	for (int i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1 << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+
+	/* No early return on a mismatch, so that CLEANUP(d_encoded_arr) below always runs. */
+	bool ok = true;
+	for (int i = 0; ok && i < n_tup; i++) {
+		ok = (h_org_arr[i] == h_decoded_arr[i]);
+	}
+	std::cout << bitwidth << (ok ? " succes!" : " failed!") << std::endl;
+
+	CLEANUP(d_encoded_arr);
+}
+/* Round-trip check for bit width 30: fills h_org_arr with n_tup random values below 2^30, packs
+ * them on the host, unpacks with the unpack_global kernel and prints "30 succes!" or "30 failed!". */
+static void bench30_unpack_30bw_32ow_32crw_1uf() {
+	auto bitwidth = 30;
+	/* generate random numbers. */
+	for (int i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1 << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+
+	/* No early return on a mismatch, so that CLEANUP(d_encoded_arr) below always runs. */
+	bool ok = true;
+	for (int i = 0; ok && i < n_tup; i++) {
+		ok = (h_org_arr[i] == h_decoded_arr[i]);
+	}
+	std::cout << bitwidth << (ok ? " succes!" : " failed!") << std::endl;
+
+	CLEANUP(d_encoded_arr);
+}
+/* Round-trip check for bit width 31: fills h_org_arr with n_tup random values below 2^31, packs
+ * them on the host, unpacks with the unpack_global kernel and prints "31 succes!" or "31 failed!". */
+static void bench31_unpack_31bw_32ow_32crw_1uf() {
+	auto bitwidth = 31;
+	/* generate random numbers; the shift is unsigned because 1 << 31 does not fit in a signed int. */
+	for (int i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1U << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+
+	/* No early return on a mismatch, so that CLEANUP(d_encoded_arr) below always runs. */
+	bool ok = true;
+	for (int i = 0; ok && i < n_tup; i++) {
+		ok = (h_org_arr[i] == h_decoded_arr[i]);
+	}
+	std::cout << bitwidth << (ok ? " succes!" : " failed!") << std::endl;
+
+	CLEANUP(d_encoded_arr);
+}
+/* Round-trip check labelled 32 bits; bitwidth is clamped to 31 below, so it runs and reports as 31. */
+static void bench32_unpack_32bw_32ow_32crw_1uf() {
+	auto bitwidth = 32;
+	if (bitwidth == 32) { bitwidth = 31; }
+	/* generate random numbers; the shift is unsigned because 1 << 31 does not fit in a signed int. */
+	for (int i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1U << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+
+	/* No early return on a mismatch, so that CLEANUP(d_encoded_arr) below always runs. */
+	bool ok = true;
+	for (int i = 0; ok && i < n_tup; i++) {
+		ok = (h_org_arr[i] == h_decoded_arr[i]);
+	}
+	std::cout << bitwidth << (ok ? " succes!" : " failed!") << std::endl;
+
+	CLEANUP(d_encoded_arr);
+}
+/* Runs the pack/unpack round-trip check for every bit width from 0 to 32, in ascending
+ * order. d_decoded_arr, the device output buffer used by the checks, is allocated here
+ * before the first check and freed again after the last one, so the allocation does not
+ * outlive the run. */
+void benchmark_all() {
+	/* The checks, listed in execution order. */
+	using bench_fn = void (*)();
+	static const bench_fn benches[] = {
+		bench0_unpack_0bw_32ow_32crw_1uf, bench1_unpack_1bw_32ow_32crw_1uf,
+		bench2_unpack_2bw_32ow_32crw_1uf, bench3_unpack_3bw_32ow_32crw_1uf,
+		bench4_unpack_4bw_32ow_32crw_1uf, bench5_unpack_5bw_32ow_32crw_1uf,
+		bench6_unpack_6bw_32ow_32crw_1uf, bench7_unpack_7bw_32ow_32crw_1uf,
+		bench8_unpack_8bw_32ow_32crw_1uf, bench9_unpack_9bw_32ow_32crw_1uf,
+		bench10_unpack_10bw_32ow_32crw_1uf, bench11_unpack_11bw_32ow_32crw_1uf,
+		bench12_unpack_12bw_32ow_32crw_1uf, bench13_unpack_13bw_32ow_32crw_1uf,
+		bench14_unpack_14bw_32ow_32crw_1uf, bench15_unpack_15bw_32ow_32crw_1uf,
+		bench16_unpack_16bw_32ow_32crw_1uf, bench17_unpack_17bw_32ow_32crw_1uf,
+		bench18_unpack_18bw_32ow_32crw_1uf, bench19_unpack_19bw_32ow_32crw_1uf,
+		bench20_unpack_20bw_32ow_32crw_1uf, bench21_unpack_21bw_32ow_32crw_1uf,
+		bench22_unpack_22bw_32ow_32crw_1uf, bench23_unpack_23bw_32ow_32crw_1uf,
+		bench24_unpack_24bw_32ow_32crw_1uf, bench25_unpack_25bw_32ow_32crw_1uf,
+		bench26_unpack_26bw_32ow_32crw_1uf, bench27_unpack_27bw_32ow_32crw_1uf,
+		bench28_unpack_28bw_32ow_32crw_1uf, bench29_unpack_29bw_32ow_32crw_1uf,
+		bench30_unpack_30bw_32ow_32crw_1uf, bench31_unpack_31bw_32ow_32crw_1uf,
+		bench32_unpack_32bw_32ow_32crw_1uf,
+	};
+
+	CUDA_SAFE_CALL(cudaMalloc((void**)&d_decoded_arr, sizeof(uint32_t) * n_tup));
+	for (const bench_fn bench : benches) {
+		bench();
+	}
+
+	/* Nothing reads the buffer after the last check. */
+	CUDA_SAFE_CALL(cudaFree(d_decoded_arr));
+	d_decoded_arr = nullptr;
+}
+int main() { benchmark_all(); return 0; } /* entry point: run every bit-width check, then exit with status 0 */
diff --git a/fastlanes/generated/cuda/fused_t32_uf1/cuda_fused_t32_1024_uf1_unpack_helper.hpp b/fastlanes/generated/cuda/fused_t32_uf1/cuda_fused_t32_1024_uf1_unpack_helper.hpp
new file mode 100644
index 0000000..588df7e
--- /dev/null
+++ b/fastlanes/generated/cuda/fused_t32_uf1/cuda_fused_t32_1024_uf1_unpack_helper.hpp
@@ -0,0 +1,2894 @@
+// generated!
+#include "fls_gen/unpack/unpack.hpp"
+namespace helper {
+uint32_t rand_arr_0_b0_w32_arr[1024] = {
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+};
+uint32_t rand_arr_1_b1_w32_arr[1024] = {
+    1UL, 1UL, 0UL, 0UL, 0UL, 1UL, 1UL, 0UL, 1UL, 1UL, 0UL, 0UL, 1UL, 0UL, 0UL, 0UL, 0UL, 1UL, 0UL, 0UL, 0UL, 1UL, 0UL,
+    0UL, 1UL, 0UL, 1UL, 1UL, 1UL, 1UL, 1UL, 0UL, 0UL, 0UL, 1UL, 0UL, 1UL, 0UL, 1UL, 0UL, 1UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 1UL, 0UL, 1UL, 1UL, 0UL, 1UL, 0UL, 1UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 1UL, 1UL, 1UL, 1UL, 1UL, 0UL, 1UL,
+    0UL, 1UL, 1UL, 0UL, 1UL, 1UL, 1UL, 1UL, 0UL, 0UL, 0UL, 0UL, 1UL, 0UL, 0UL, 0UL, 1UL, 0UL, 1UL, 0UL, 1UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 1UL, 0UL, 1UL, 0UL, 0UL, 0UL, 0UL, 1UL, 0UL, 1UL, 1UL, 0UL, 0UL, 0UL, 0UL, 1UL, 1UL, 1UL, 1UL, 1UL,
+    0UL, 1UL, 1UL, 0UL, 1UL, 1UL, 0UL, 1UL, 1UL, 0UL, 1UL, 0UL, 1UL, 0UL, 0UL, 0UL, 1UL, 0UL, 0UL, 1UL, 0UL, 1UL, 1UL,
+    1UL, 0UL, 1UL, 1UL, 0UL, 0UL, 1UL, 1UL, 0UL, 1UL, 0UL, 1UL, 0UL, 0UL, 0UL, 1UL, 0UL, 1UL, 1UL, 1UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 1UL, 1UL, 0UL, 1UL, 0UL, 1UL, 1UL, 1UL, 1UL, 0UL, 0UL, 1UL, 0UL, 1UL, 1UL, 1UL, 1UL, 0UL, 0UL, 0UL, 0UL,
+    1UL, 1UL, 0UL, 1UL, 0UL, 1UL, 0UL, 1UL, 1UL, 0UL, 1UL, 1UL, 0UL, 1UL, 0UL, 1UL, 1UL, 0UL, 1UL, 0UL, 1UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 1UL, 0UL, 0UL, 0UL, 0UL, 0UL, 1UL, 1UL, 0UL, 0UL, 1UL, 1UL, 1UL, 1UL, 1UL, 1UL, 1UL, 0UL, 1UL, 1UL,
+    0UL, 0UL, 1UL, 0UL, 1UL, 0UL, 1UL, 1UL, 0UL, 1UL, 0UL, 0UL, 1UL, 1UL, 0UL, 1UL, 0UL, 0UL, 1UL, 1UL, 0UL, 0UL, 1UL,
+    1UL, 1UL, 1UL, 1UL, 0UL, 0UL, 1UL, 0UL, 0UL, 1UL, 1UL, 1UL, 1UL, 0UL, 0UL, 0UL, 1UL, 0UL, 1UL, 1UL, 1UL, 1UL, 1UL,
+    0UL, 0UL, 1UL, 0UL, 0UL, 0UL, 1UL, 1UL, 1UL, 1UL, 1UL, 0UL, 1UL, 1UL, 0UL, 1UL, 0UL, 0UL, 0UL, 1UL, 1UL, 1UL, 0UL,
+    0UL, 0UL, 0UL, 1UL, 0UL, 1UL, 1UL, 0UL, 1UL, 1UL, 0UL, 1UL, 1UL, 0UL, 0UL, 0UL, 0UL, 1UL, 1UL, 1UL, 0UL, 1UL, 0UL,
+    1UL, 0UL, 0UL, 1UL, 1UL, 1UL, 0UL, 1UL, 1UL, 1UL, 1UL, 0UL, 0UL, 1UL, 0UL, 1UL, 0UL, 1UL, 1UL, 0UL, 0UL, 1UL, 0UL,
+    1UL, 0UL, 1UL, 1UL, 1UL, 0UL, 1UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 1UL, 1UL, 1UL, 0UL, 1UL, 0UL, 1UL, 1UL,
+    0UL, 1UL, 1UL, 0UL, 0UL, 1UL, 0UL, 0UL, 0UL, 1UL, 1UL, 1UL, 0UL, 1UL, 0UL, 1UL, 1UL, 0UL, 1UL, 1UL, 0UL, 0UL, 1UL,
+    1UL, 1UL, 1UL, 1UL, 1UL, 1UL, 0UL, 0UL, 0UL, 1UL, 1UL, 0UL, 0UL, 0UL, 1UL, 1UL, 0UL, 1UL, 1UL, 1UL, 1UL, 0UL, 0UL,
+    1UL, 0UL, 0UL, 1UL, 1UL, 0UL, 1UL, 1UL, 1UL, 1UL, 1UL, 1UL, 0UL, 0UL, 0UL, 0UL, 1UL, 0UL, 0UL, 0UL, 0UL, 0UL, 1UL,
+    1UL, 1UL, 1UL, 1UL, 0UL, 1UL, 1UL, 0UL, 1UL, 0UL, 1UL, 0UL, 1UL, 1UL, 0UL, 1UL, 0UL, 1UL, 0UL, 0UL, 1UL, 0UL, 1UL,
+    1UL, 0UL, 0UL, 1UL, 0UL, 0UL, 1UL, 0UL, 1UL, 0UL, 0UL, 1UL, 1UL, 1UL, 0UL, 1UL, 1UL, 0UL, 1UL, 1UL, 0UL, 1UL, 1UL,
+    0UL, 1UL, 0UL, 0UL, 0UL, 0UL, 1UL, 0UL, 0UL, 0UL, 1UL, 1UL, 1UL, 0UL, 0UL, 1UL, 1UL, 1UL, 1UL, 0UL, 0UL, 1UL, 0UL,
+    1UL, 0UL, 1UL, 0UL, 1UL, 0UL, 1UL, 1UL, 0UL, 1UL, 0UL, 0UL, 0UL, 1UL, 1UL, 1UL, 1UL, 0UL, 0UL, 1UL, 0UL, 0UL, 0UL,
+    1UL, 0UL, 0UL, 1UL, 1UL, 0UL, 1UL, 0UL, 0UL, 1UL, 1UL, 1UL, 1UL, 1UL, 0UL, 1UL, 0UL, 0UL, 0UL, 0UL, 0UL, 1UL, 0UL,
+    1UL, 0UL, 0UL, 0UL, 1UL, 0UL, 1UL, 1UL, 0UL, 0UL, 1UL, 0UL, 1UL, 0UL, 1UL, 1UL, 0UL, 1UL, 0UL, 0UL, 1UL, 1UL, 0UL,
+    0UL, 1UL, 1UL, 0UL, 0UL, 0UL, 0UL, 1UL, 0UL, 1UL, 0UL, 0UL, 0UL, 0UL, 0UL, 1UL, 0UL, 1UL, 1UL, 1UL, 0UL, 1UL, 0UL,
+    1UL, 0UL, 1UL, 0UL, 1UL, 1UL, 0UL, 0UL, 1UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 1UL, 1UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 1UL, 1UL, 1UL, 0UL, 0UL, 1UL, 0UL, 0UL, 1UL, 1UL, 0UL, 1UL, 0UL, 1UL, 0UL, 1UL, 0UL, 1UL, 0UL, 1UL, 1UL, 1UL,
+    1UL, 1UL, 0UL, 1UL, 1UL, 0UL, 1UL, 0UL, 1UL, 1UL, 1UL, 1UL, 0UL, 0UL, 0UL, 1UL, 0UL, 1UL, 1UL, 1UL, 1UL, 1UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 1UL, 0UL, 0UL, 0UL, 1UL, 0UL, 1UL, 1UL, 0UL, 1UL, 1UL, 0UL, 1UL, 0UL, 0UL, 0UL, 1UL, 0UL, 1UL,
+    1UL, 1UL, 1UL, 1UL, 0UL, 0UL, 1UL, 1UL, 0UL, 1UL, 1UL, 1UL, 0UL, 1UL, 0UL, 1UL, 1UL, 1UL, 0UL, 1UL, 0UL, 1UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 1UL, 1UL, 1UL, 1UL, 0UL, 0UL, 1UL, 0UL, 1UL, 0UL, 0UL, 1UL, 1UL, 1UL, 0UL, 1UL, 1UL, 0UL, 0UL,
+    1UL, 0UL, 0UL, 1UL, 1UL, 0UL, 0UL, 1UL, 0UL, 0UL, 1UL, 1UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 1UL, 1UL, 1UL, 0UL, 1UL,
+    1UL, 1UL, 1UL, 0UL, 1UL, 0UL, 0UL, 0UL, 1UL, 1UL, 0UL, 0UL, 0UL, 1UL, 1UL, 0UL, 0UL, 1UL, 1UL, 1UL, 0UL, 1UL, 1UL,
+    1UL, 0UL, 0UL, 1UL, 1UL, 0UL, 0UL, 0UL, 1UL, 1UL, 0UL, 1UL, 0UL, 1UL, 0UL, 0UL, 1UL, 1UL, 0UL, 0UL, 1UL, 1UL, 1UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 1UL, 1UL, 0UL, 0UL, 1UL, 1UL, 1UL, 1UL, 0UL, 1UL, 0UL, 1UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 1UL, 1UL, 0UL, 0UL, 1UL, 0UL, 0UL, 1UL, 1UL, 1UL, 1UL, 1UL, 0UL, 1UL, 0UL, 1UL, 1UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 1UL, 1UL, 1UL, 0UL, 0UL, 0UL, 1UL, 0UL, 1UL, 1UL, 1UL, 1UL, 0UL, 0UL, 1UL, 1UL, 1UL, 1UL, 0UL, 1UL, 0UL,
+    0UL, 0UL, 0UL, 1UL, 0UL, 0UL, 0UL, 1UL, 0UL, 1UL, 1UL, 0UL, 1UL, 0UL, 0UL, 1UL, 1UL, 1UL, 1UL, 0UL, 0UL, 0UL, 1UL,
+    0UL, 1UL, 1UL, 1UL, 0UL, 1UL, 0UL, 0UL, 1UL, 1UL, 1UL, 1UL, 1UL, 1UL, 0UL, 1UL, 0UL, 1UL, 1UL, 0UL, 1UL, 1UL, 0UL,
+    1UL, 1UL, 1UL, 0UL, 1UL, 1UL, 0UL, 0UL, 1UL, 0UL, 0UL, 0UL, 1UL, 0UL, 0UL, 1UL, 1UL, 1UL, 0UL, 0UL, 1UL, 0UL, 0UL,
+    1UL, 1UL, 1UL, 1UL, 0UL, 0UL, 0UL, 1UL, 0UL, 1UL, 1UL, 1UL, 1UL, 1UL, 1UL, 1UL, 1UL, 1UL, 1UL, 0UL, 1UL, 0UL, 1UL,
+    1UL, 1UL, 1UL, 0UL, 0UL, 1UL, 1UL, 0UL, 1UL, 0UL, 1UL, 1UL, 1UL, 0UL, 1UL, 1UL, 0UL, 0UL, 0UL, 1UL, 1UL, 0UL, 0UL,
+    1UL, 0UL, 1UL, 1UL, 1UL, 0UL, 0UL, 0UL, 1UL, 0UL, 1UL, 1UL, 1UL, 0UL, 1UL, 1UL, 0UL, 1UL, 1UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 1UL, 0UL, 0UL, 0UL, 1UL, 0UL, 0UL, 1UL, 0UL, 0UL, 0UL,
+};
+uint32_t rand_arr_2_b2_w32_arr[1024] = {
+    1UL, 1UL, 0UL, 1UL, 2UL, 1UL, 0UL, 1UL, 0UL, 1UL, 1UL, 1UL, 0UL, 2UL, 2UL, 3UL, 3UL, 2UL, 3UL, 0UL, 2UL, 2UL, 3UL,
+    2UL, 0UL, 1UL, 0UL, 0UL, 3UL, 2UL, 3UL, 3UL, 3UL, 2UL, 1UL, 3UL, 1UL, 2UL, 1UL, 1UL, 0UL, 2UL, 2UL, 0UL, 0UL, 3UL,
+    2UL, 1UL, 3UL, 2UL, 1UL, 0UL, 3UL, 3UL, 2UL, 2UL, 3UL, 1UL, 3UL, 2UL, 1UL, 2UL, 3UL, 0UL, 0UL, 1UL, 1UL, 1UL, 2UL,
+    1UL, 2UL, 1UL, 3UL, 3UL, 0UL, 1UL, 1UL, 2UL, 0UL, 2UL, 0UL, 2UL, 3UL, 1UL, 2UL, 3UL, 0UL, 1UL, 1UL, 1UL, 1UL, 1UL,
+    2UL, 2UL, 3UL, 0UL, 2UL, 1UL, 0UL, 0UL, 2UL, 3UL, 1UL, 3UL, 0UL, 2UL, 2UL, 0UL, 0UL, 2UL, 3UL, 3UL, 0UL, 2UL, 2UL,
+    2UL, 0UL, 0UL, 0UL, 0UL, 2UL, 1UL, 3UL, 2UL, 0UL, 2UL, 3UL, 2UL, 1UL, 2UL, 2UL, 2UL, 2UL, 0UL, 1UL, 3UL, 2UL, 0UL,
+    2UL, 0UL, 3UL, 3UL, 3UL, 1UL, 3UL, 3UL, 1UL, 0UL, 1UL, 1UL, 1UL, 3UL, 2UL, 0UL, 1UL, 3UL, 3UL, 3UL, 2UL, 0UL, 1UL,
+    3UL, 0UL, 1UL, 3UL, 0UL, 0UL, 2UL, 0UL, 3UL, 0UL, 0UL, 3UL, 1UL, 0UL, 3UL, 0UL, 1UL, 1UL, 2UL, 2UL, 1UL, 1UL, 0UL,
+    3UL, 1UL, 2UL, 0UL, 0UL, 3UL, 2UL, 1UL, 1UL, 1UL, 1UL, 3UL, 0UL, 3UL, 0UL, 0UL, 1UL, 2UL, 0UL, 2UL, 0UL, 1UL, 2UL,
+    3UL, 2UL, 0UL, 2UL, 1UL, 0UL, 3UL, 1UL, 1UL, 2UL, 3UL, 3UL, 0UL, 2UL, 1UL, 3UL, 2UL, 1UL, 2UL, 0UL, 3UL, 0UL, 0UL,
+    3UL, 2UL, 1UL, 2UL, 0UL, 0UL, 0UL, 1UL, 2UL, 1UL, 3UL, 2UL, 1UL, 2UL, 2UL, 2UL, 1UL, 2UL, 0UL, 1UL, 1UL, 0UL, 0UL,
+    0UL, 3UL, 2UL, 1UL, 3UL, 3UL, 1UL, 1UL, 3UL, 3UL, 2UL, 3UL, 1UL, 2UL, 2UL, 3UL, 1UL, 1UL, 2UL, 1UL, 0UL, 0UL, 1UL,
+    0UL, 2UL, 3UL, 3UL, 2UL, 3UL, 3UL, 0UL, 0UL, 2UL, 0UL, 2UL, 3UL, 2UL, 2UL, 3UL, 0UL, 0UL, 2UL, 1UL, 0UL, 0UL, 2UL,
+    0UL, 1UL, 3UL, 0UL, 0UL, 3UL, 1UL, 2UL, 3UL, 2UL, 0UL, 2UL, 2UL, 3UL, 3UL, 3UL, 2UL, 0UL, 1UL, 2UL, 1UL, 3UL, 3UL,
+    3UL, 1UL, 3UL, 1UL, 1UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 3UL, 2UL, 0UL, 1UL, 0UL, 1UL, 3UL, 2UL, 2UL, 1UL, 2UL,
+    0UL, 3UL, 1UL, 1UL, 2UL, 0UL, 2UL, 2UL, 0UL, 3UL, 0UL, 2UL, 1UL, 1UL, 2UL, 3UL, 0UL, 3UL, 0UL, 3UL, 3UL, 1UL, 2UL,
+    2UL, 0UL, 2UL, 0UL, 1UL, 0UL, 0UL, 0UL, 3UL, 3UL, 1UL, 2UL, 0UL, 0UL, 0UL, 1UL, 1UL, 0UL, 3UL, 2UL, 1UL, 2UL, 1UL,
+    1UL, 2UL, 0UL, 1UL, 2UL, 3UL, 1UL, 3UL, 2UL, 3UL, 0UL, 2UL, 3UL, 1UL, 2UL, 2UL, 2UL, 2UL, 2UL, 3UL, 0UL, 1UL, 1UL,
+    3UL, 3UL, 2UL, 2UL, 2UL, 2UL, 1UL, 3UL, 0UL, 2UL, 0UL, 3UL, 1UL, 2UL, 1UL, 3UL, 0UL, 0UL, 3UL, 3UL, 1UL, 1UL, 3UL,
+    3UL, 3UL, 1UL, 1UL, 2UL, 2UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 3UL, 0UL, 3UL, 1UL, 0UL, 0UL, 2UL, 3UL, 2UL, 2UL, 1UL,
+    1UL, 0UL, 3UL, 1UL, 1UL, 2UL, 2UL, 3UL, 3UL, 2UL, 0UL, 2UL, 1UL, 0UL, 3UL, 0UL, 0UL, 0UL, 1UL, 0UL, 3UL, 0UL, 1UL,
+    2UL, 2UL, 2UL, 0UL, 0UL, 0UL, 0UL, 1UL, 1UL, 0UL, 2UL, 0UL, 2UL, 0UL, 1UL, 1UL, 2UL, 3UL, 1UL, 3UL, 2UL, 0UL, 2UL,
+    3UL, 1UL, 0UL, 0UL, 3UL, 1UL, 1UL, 1UL, 0UL, 3UL, 2UL, 0UL, 1UL, 0UL, 1UL, 0UL, 1UL, 2UL, 0UL, 3UL, 0UL, 1UL, 3UL,
+    1UL, 2UL, 0UL, 2UL, 0UL, 0UL, 0UL, 2UL, 3UL, 2UL, 1UL, 0UL, 3UL, 1UL, 0UL, 0UL, 1UL, 2UL, 2UL, 2UL, 0UL, 1UL, 0UL,
+    2UL, 0UL, 3UL, 2UL, 3UL, 3UL, 2UL, 0UL, 1UL, 2UL, 2UL, 0UL, 2UL, 3UL, 3UL, 3UL, 2UL, 0UL, 0UL, 0UL, 0UL, 2UL, 2UL,
+    1UL, 2UL, 1UL, 3UL, 2UL, 0UL, 0UL, 2UL, 0UL, 2UL, 0UL, 1UL, 2UL, 0UL, 3UL, 1UL, 3UL, 0UL, 3UL, 0UL, 3UL, 0UL, 0UL,
+    1UL, 1UL, 3UL, 2UL, 0UL, 2UL, 1UL, 2UL, 3UL, 1UL, 2UL, 3UL, 2UL, 1UL, 2UL, 3UL, 2UL, 0UL, 2UL, 1UL, 2UL, 2UL, 2UL,
+    3UL, 2UL, 0UL, 3UL, 0UL, 3UL, 0UL, 1UL, 2UL, 2UL, 3UL, 3UL, 1UL, 1UL, 1UL, 3UL, 0UL, 1UL, 3UL, 0UL, 1UL, 0UL, 0UL,
+    1UL, 1UL, 0UL, 1UL, 1UL, 0UL, 0UL, 3UL, 3UL, 1UL, 3UL, 0UL, 0UL, 2UL, 0UL, 2UL, 2UL, 1UL, 1UL, 0UL, 0UL, 2UL, 1UL,
+    3UL, 0UL, 1UL, 3UL, 2UL, 3UL, 3UL, 2UL, 2UL, 1UL, 2UL, 3UL, 1UL, 3UL, 1UL, 3UL, 2UL, 1UL, 0UL, 3UL, 2UL, 3UL, 3UL,
+    1UL, 0UL, 1UL, 0UL, 3UL, 2UL, 0UL, 1UL, 2UL, 3UL, 0UL, 0UL, 1UL, 2UL, 1UL, 2UL, 3UL, 0UL, 3UL, 3UL, 1UL, 1UL, 1UL,
+    3UL, 3UL, 2UL, 0UL, 3UL, 2UL, 2UL, 2UL, 1UL, 2UL, 2UL, 0UL, 0UL, 1UL, 1UL, 3UL, 0UL, 1UL, 2UL, 2UL, 3UL, 1UL, 3UL,
+    1UL, 1UL, 0UL, 2UL, 3UL, 1UL, 2UL, 0UL, 2UL, 1UL, 3UL, 1UL, 2UL, 2UL, 2UL, 2UL, 0UL, 2UL, 1UL, 3UL, 2UL, 1UL, 0UL,
+    1UL, 0UL, 0UL, 2UL, 3UL, 2UL, 0UL, 0UL, 2UL, 2UL, 0UL, 3UL, 0UL, 3UL, 2UL, 0UL, 2UL, 3UL, 1UL, 0UL, 1UL, 0UL, 0UL,
+    1UL, 2UL, 3UL, 1UL, 0UL, 3UL, 1UL, 3UL, 3UL, 1UL, 3UL, 1UL, 0UL, 0UL, 1UL, 0UL, 0UL, 0UL, 3UL, 2UL, 1UL, 3UL, 3UL,
+    3UL, 0UL, 1UL, 2UL, 2UL, 0UL, 3UL, 2UL, 1UL, 3UL, 3UL, 2UL, 3UL, 3UL, 1UL, 0UL, 2UL, 0UL, 0UL, 0UL, 1UL, 2UL, 3UL,
+    1UL, 2UL, 2UL, 1UL, 3UL, 1UL, 3UL, 2UL, 1UL, 3UL, 2UL, 2UL, 0UL, 2UL, 2UL, 3UL, 3UL, 1UL, 3UL, 2UL, 2UL, 2UL, 3UL,
+    0UL, 1UL, 3UL, 3UL, 3UL, 2UL, 2UL, 0UL, 0UL, 3UL, 0UL, 0UL, 2UL, 0UL, 3UL, 1UL, 1UL, 0UL, 1UL, 0UL, 3UL, 0UL, 0UL,
+    3UL, 0UL, 0UL, 2UL, 2UL, 0UL, 2UL, 3UL, 2UL, 2UL, 3UL, 1UL, 1UL, 2UL, 3UL, 1UL, 1UL, 0UL, 2UL, 1UL, 0UL, 3UL, 1UL,
+    2UL, 2UL, 2UL, 2UL, 1UL, 0UL, 0UL, 2UL, 0UL, 3UL, 2UL, 3UL, 2UL, 2UL, 2UL, 0UL, 2UL, 3UL, 1UL, 2UL, 1UL, 0UL, 3UL,
+    2UL, 3UL, 0UL, 2UL, 0UL, 2UL, 2UL, 3UL, 3UL, 0UL, 0UL, 2UL, 1UL, 1UL, 3UL, 0UL, 2UL, 1UL, 2UL, 2UL, 3UL, 1UL, 1UL,
+    2UL, 2UL, 0UL, 3UL, 3UL, 0UL, 1UL, 1UL, 1UL, 0UL, 2UL, 3UL, 2UL, 0UL, 3UL, 0UL, 2UL, 1UL, 3UL, 0UL, 1UL, 0UL, 1UL,
+    2UL, 2UL, 0UL, 2UL, 2UL, 1UL, 0UL, 3UL, 0UL, 3UL, 3UL, 0UL, 3UL, 2UL, 1UL, 1UL, 3UL, 2UL, 2UL, 1UL, 3UL, 1UL, 1UL,
+    2UL, 0UL, 3UL, 2UL, 3UL, 2UL, 2UL, 1UL, 1UL, 0UL, 1UL, 0UL, 3UL, 2UL, 1UL, 3UL, 2UL, 0UL, 2UL, 2UL, 3UL, 0UL, 2UL,
+    1UL, 1UL, 1UL, 1UL, 2UL, 3UL, 2UL, 2UL, 2UL, 2UL, 0UL, 3UL,
+};
+uint32_t rand_arr_3_b3_w32_arr[1024] = {
+    1UL, 4UL, 3UL, 1UL, 0UL, 2UL, 6UL, 3UL, 0UL, 6UL, 6UL, 4UL, 6UL, 2UL, 6UL, 5UL, 7UL, 0UL, 7UL, 2UL, 1UL, 5UL, 3UL,
+    0UL, 4UL, 2UL, 0UL, 0UL, 1UL, 0UL, 5UL, 4UL, 6UL, 4UL, 3UL, 6UL, 6UL, 6UL, 7UL, 7UL, 1UL, 3UL, 1UL, 6UL, 6UL, 5UL,
+    5UL, 0UL, 7UL, 0UL, 2UL, 2UL, 4UL, 4UL, 2UL, 4UL, 4UL, 6UL, 0UL, 0UL, 7UL, 6UL, 3UL, 2UL, 3UL, 4UL, 7UL, 7UL, 6UL,
+    7UL, 6UL, 2UL, 6UL, 2UL, 0UL, 3UL, 3UL, 2UL, 4UL, 3UL, 2UL, 4UL, 7UL, 6UL, 3UL, 6UL, 2UL, 0UL, 3UL, 1UL, 6UL, 1UL,
+    2UL, 2UL, 0UL, 1UL, 3UL, 1UL, 5UL, 0UL, 7UL, 4UL, 4UL, 7UL, 0UL, 0UL, 2UL, 4UL, 3UL, 3UL, 5UL, 2UL, 4UL, 4UL, 5UL,
+    4UL, 4UL, 1UL, 1UL, 3UL, 3UL, 4UL, 3UL, 7UL, 4UL, 5UL, 3UL, 0UL, 2UL, 3UL, 6UL, 1UL, 0UL, 1UL, 5UL, 6UL, 3UL, 6UL,
+    2UL, 5UL, 7UL, 7UL, 4UL, 3UL, 2UL, 5UL, 7UL, 0UL, 1UL, 5UL, 6UL, 4UL, 6UL, 5UL, 2UL, 3UL, 1UL, 3UL, 3UL, 3UL, 3UL,
+    3UL, 0UL, 1UL, 5UL, 7UL, 6UL, 5UL, 1UL, 1UL, 2UL, 0UL, 3UL, 1UL, 5UL, 6UL, 6UL, 5UL, 2UL, 7UL, 5UL, 2UL, 1UL, 1UL,
+    4UL, 1UL, 5UL, 0UL, 6UL, 5UL, 1UL, 5UL, 0UL, 5UL, 0UL, 3UL, 0UL, 1UL, 2UL, 3UL, 7UL, 6UL, 1UL, 4UL, 5UL, 1UL, 1UL,
+    4UL, 2UL, 1UL, 0UL, 5UL, 1UL, 2UL, 2UL, 5UL, 3UL, 3UL, 6UL, 4UL, 3UL, 2UL, 2UL, 0UL, 1UL, 5UL, 5UL, 0UL, 7UL, 0UL,
+    0UL, 1UL, 7UL, 2UL, 4UL, 5UL, 7UL, 4UL, 5UL, 7UL, 7UL, 2UL, 4UL, 4UL, 7UL, 3UL, 5UL, 6UL, 3UL, 0UL, 2UL, 2UL, 4UL,
+    3UL, 2UL, 6UL, 0UL, 6UL, 5UL, 2UL, 4UL, 4UL, 2UL, 1UL, 6UL, 2UL, 6UL, 7UL, 5UL, 6UL, 6UL, 6UL, 0UL, 7UL, 4UL, 1UL,
+    7UL, 7UL, 0UL, 4UL, 2UL, 1UL, 2UL, 4UL, 0UL, 3UL, 4UL, 4UL, 3UL, 3UL, 1UL, 6UL, 1UL, 3UL, 4UL, 7UL, 3UL, 4UL, 4UL,
+    3UL, 7UL, 5UL, 7UL, 7UL, 7UL, 7UL, 4UL, 3UL, 6UL, 5UL, 0UL, 5UL, 5UL, 7UL, 5UL, 3UL, 0UL, 6UL, 6UL, 1UL, 5UL, 5UL,
+    3UL, 1UL, 1UL, 4UL, 5UL, 5UL, 2UL, 2UL, 6UL, 7UL, 0UL, 2UL, 5UL, 6UL, 7UL, 5UL, 1UL, 2UL, 4UL, 1UL, 7UL, 3UL, 2UL,
+    1UL, 4UL, 7UL, 4UL, 4UL, 4UL, 0UL, 6UL, 6UL, 2UL, 5UL, 1UL, 7UL, 5UL, 6UL, 1UL, 7UL, 6UL, 3UL, 7UL, 7UL, 0UL, 6UL,
+    6UL, 5UL, 0UL, 3UL, 0UL, 1UL, 2UL, 5UL, 2UL, 4UL, 3UL, 6UL, 6UL, 1UL, 4UL, 5UL, 6UL, 7UL, 2UL, 4UL, 3UL, 6UL, 4UL,
+    0UL, 7UL, 5UL, 6UL, 4UL, 6UL, 4UL, 2UL, 5UL, 3UL, 0UL, 1UL, 4UL, 0UL, 6UL, 4UL, 6UL, 4UL, 5UL, 7UL, 6UL, 6UL, 6UL,
+    0UL, 2UL, 6UL, 1UL, 4UL, 7UL, 1UL, 5UL, 3UL, 4UL, 7UL, 0UL, 3UL, 6UL, 4UL, 3UL, 5UL, 7UL, 0UL, 1UL, 6UL, 6UL, 7UL,
+    5UL, 6UL, 5UL, 3UL, 7UL, 4UL, 3UL, 6UL, 6UL, 2UL, 0UL, 1UL, 3UL, 1UL, 3UL, 7UL, 1UL, 5UL, 0UL, 6UL, 2UL, 4UL, 7UL,
+    6UL, 5UL, 2UL, 3UL, 1UL, 0UL, 0UL, 7UL, 1UL, 1UL, 6UL, 7UL, 5UL, 3UL, 0UL, 6UL, 7UL, 2UL, 4UL, 6UL, 1UL, 6UL, 4UL,
+    2UL, 1UL, 6UL, 0UL, 3UL, 1UL, 2UL, 0UL, 0UL, 3UL, 2UL, 4UL, 5UL, 4UL, 5UL, 0UL, 5UL, 4UL, 5UL, 7UL, 4UL, 5UL, 6UL,
+    6UL, 6UL, 2UL, 1UL, 5UL, 3UL, 0UL, 0UL, 3UL, 2UL, 4UL, 3UL, 2UL, 7UL, 2UL, 3UL, 3UL, 6UL, 7UL, 4UL, 0UL, 6UL, 3UL,
+    1UL, 7UL, 1UL, 3UL, 7UL, 3UL, 1UL, 6UL, 6UL, 0UL, 0UL, 5UL, 4UL, 0UL, 1UL, 3UL, 4UL, 2UL, 0UL, 5UL, 4UL, 6UL, 4UL,
+    7UL, 2UL, 7UL, 7UL, 1UL, 6UL, 2UL, 4UL, 0UL, 6UL, 1UL, 6UL, 3UL, 5UL, 6UL, 7UL, 7UL, 6UL, 2UL, 5UL, 4UL, 4UL, 6UL,
+    1UL, 7UL, 6UL, 7UL, 5UL, 5UL, 2UL, 0UL, 3UL, 6UL, 5UL, 1UL, 2UL, 2UL, 3UL, 5UL, 0UL, 3UL, 2UL, 3UL, 1UL, 3UL, 3UL,
+    0UL, 3UL, 7UL, 3UL, 1UL, 2UL, 6UL, 2UL, 5UL, 5UL, 1UL, 1UL, 6UL, 0UL, 6UL, 1UL, 2UL, 5UL, 4UL, 7UL, 0UL, 5UL, 3UL,
+    2UL, 2UL, 3UL, 7UL, 1UL, 7UL, 6UL, 7UL, 5UL, 1UL, 5UL, 3UL, 0UL, 4UL, 4UL, 2UL, 5UL, 0UL, 5UL, 6UL, 7UL, 5UL, 7UL,
+    6UL, 6UL, 0UL, 1UL, 5UL, 5UL, 7UL, 4UL, 1UL, 3UL, 0UL, 4UL, 3UL, 1UL, 5UL, 4UL, 3UL, 2UL, 3UL, 0UL, 4UL, 4UL, 0UL,
+    3UL, 6UL, 0UL, 3UL, 6UL, 7UL, 1UL, 1UL, 5UL, 6UL, 1UL, 5UL, 2UL, 4UL, 3UL, 5UL, 0UL, 3UL, 5UL, 7UL, 0UL, 5UL, 5UL,
+    6UL, 4UL, 5UL, 2UL, 0UL, 1UL, 7UL, 0UL, 0UL, 5UL, 3UL, 5UL, 5UL, 3UL, 6UL, 2UL, 4UL, 3UL, 5UL, 7UL, 2UL, 5UL, 3UL,
+    7UL, 6UL, 5UL, 4UL, 1UL, 0UL, 5UL, 7UL, 5UL, 1UL, 5UL, 3UL, 3UL, 3UL, 6UL, 7UL, 1UL, 7UL, 2UL, 2UL, 3UL, 1UL, 6UL,
+    3UL, 3UL, 6UL, 4UL, 2UL, 1UL, 2UL, 0UL, 3UL, 6UL, 2UL, 5UL, 5UL, 2UL, 3UL, 3UL, 0UL, 1UL, 0UL, 6UL, 2UL, 4UL, 0UL,
+    1UL, 3UL, 3UL, 0UL, 6UL, 6UL, 2UL, 3UL, 4UL, 6UL, 3UL, 4UL, 4UL, 5UL, 0UL, 0UL, 1UL, 4UL, 3UL, 1UL, 7UL, 3UL, 3UL,
+    3UL, 7UL, 1UL, 2UL, 5UL, 3UL, 4UL, 2UL, 5UL, 6UL, 2UL, 5UL, 2UL, 1UL, 3UL, 0UL, 5UL, 2UL, 5UL, 5UL, 6UL, 5UL, 3UL,
+    3UL, 4UL, 5UL, 0UL, 5UL, 3UL, 3UL, 5UL, 0UL, 0UL, 7UL, 0UL, 6UL, 2UL, 4UL, 1UL, 4UL, 0UL, 7UL, 7UL, 2UL, 6UL, 0UL,
+    6UL, 7UL, 7UL, 1UL, 5UL, 7UL, 4UL, 2UL, 5UL, 2UL, 1UL, 5UL, 5UL, 0UL, 5UL, 3UL, 5UL, 7UL, 7UL, 7UL, 3UL, 2UL, 5UL,
+    4UL, 2UL, 7UL, 0UL, 5UL, 3UL, 4UL, 6UL, 0UL, 4UL, 0UL, 4UL, 0UL, 1UL, 2UL, 5UL, 4UL, 0UL, 3UL, 0UL, 4UL, 6UL, 6UL,
+    2UL, 0UL, 4UL, 3UL, 7UL, 0UL, 5UL, 7UL, 2UL, 6UL, 5UL, 3UL, 6UL, 1UL, 3UL, 0UL, 0UL, 4UL, 7UL, 2UL, 1UL, 2UL, 2UL,
+    3UL, 7UL, 7UL, 0UL, 2UL, 2UL, 5UL, 1UL, 2UL, 3UL, 2UL, 2UL, 1UL, 1UL, 6UL, 2UL, 2UL, 6UL, 1UL, 7UL, 2UL, 4UL, 2UL,
+    1UL, 2UL, 6UL, 4UL, 0UL, 0UL, 6UL, 0UL, 5UL, 4UL, 7UL, 5UL, 5UL, 5UL, 2UL, 1UL, 1UL, 4UL, 7UL, 1UL, 1UL, 1UL, 4UL,
+    7UL, 1UL, 7UL, 2UL, 6UL, 3UL, 2UL, 5UL, 2UL, 5UL, 1UL, 2UL, 2UL, 5UL, 0UL, 1UL, 0UL, 0UL, 7UL, 2UL, 5UL, 7UL, 2UL,
+    5UL, 7UL, 1UL, 5UL, 0UL, 1UL, 7UL, 6UL, 2UL, 3UL, 2UL, 5UL, 2UL, 0UL, 3UL, 3UL, 2UL, 3UL, 1UL, 6UL, 5UL, 6UL, 0UL,
+    3UL, 4UL, 3UL, 3UL, 1UL, 4UL, 0UL, 2UL, 6UL, 0UL, 6UL, 1UL, 6UL, 4UL, 2UL, 3UL, 5UL, 1UL, 5UL, 5UL, 5UL, 1UL, 3UL,
+    6UL, 1UL, 5UL, 0UL, 6UL, 5UL, 1UL, 1UL, 6UL, 5UL, 3UL, 0UL,
+};
+uint32_t rand_arr_4_b4_w32_arr[1024] = {
+    1UL,  8UL,  13UL, 5UL,  5UL,  1UL,  1UL,  0UL,  5UL,  14UL, 3UL,  1UL,  14UL, 11UL, 4UL,  3UL,  3UL,  5UL,  1UL,
+    8UL,  11UL, 4UL,  8UL,  2UL,  4UL,  5UL,  9UL,  11UL, 11UL, 1UL,  9UL,  0UL,  5UL,  14UL, 8UL,  13UL, 6UL,  10UL,
+    1UL,  7UL,  1UL,  4UL,  4UL,  15UL, 11UL, 13UL, 6UL,  8UL,  12UL, 0UL,  0UL,  2UL,  11UL, 1UL,  7UL,  5UL,  7UL,
+    0UL,  5UL,  4UL,  14UL, 0UL,  11UL, 3UL,  14UL, 12UL, 2UL,  12UL, 9UL,  9UL,  1UL,  0UL,  7UL,  6UL,  8UL,  3UL,
+    3UL,  11UL, 15UL, 13UL, 2UL,  5UL,  1UL,  0UL,  15UL, 9UL,  3UL,  4UL,  1UL,  2UL,  15UL, 8UL,  9UL,  1UL,  10UL,
+    11UL, 10UL, 5UL,  8UL,  7UL,  0UL,  11UL, 6UL,  4UL,  0UL,  3UL,  0UL,  1UL,  5UL,  8UL,  15UL, 8UL,  13UL, 3UL,
+    6UL,  2UL,  6UL,  0UL,  4UL,  8UL,  2UL,  3UL,  10UL, 1UL,  1UL,  4UL,  8UL,  11UL, 1UL,  7UL,  5UL,  9UL,  10UL,
+    2UL,  1UL,  0UL,  6UL,  8UL,  7UL,  0UL,  5UL,  10UL, 4UL,  0UL,  0UL,  2UL,  2UL,  0UL,  10UL, 9UL,  12UL, 7UL,
+    12UL, 7UL,  3UL,  9UL,  12UL, 6UL,  4UL,  15UL, 2UL,  13UL, 7UL,  8UL,  6UL,  11UL, 2UL,  7UL,  9UL,  14UL, 8UL,
+    7UL,  4UL,  6UL,  15UL, 3UL,  7UL,  15UL, 2UL,  7UL,  15UL, 9UL,  12UL, 4UL,  15UL, 5UL,  10UL, 4UL,  5UL,  13UL,
+    6UL,  4UL,  14UL, 13UL, 2UL,  4UL,  8UL,  5UL,  8UL,  13UL, 15UL, 14UL, 9UL,  1UL,  14UL, 13UL, 7UL,  8UL,  8UL,
+    6UL,  12UL, 2UL,  12UL, 10UL, 11UL, 1UL,  2UL,  7UL,  2UL,  10UL, 4UL,  15UL, 5UL,  5UL,  12UL, 6UL,  2UL,  11UL,
+    4UL,  14UL, 8UL,  4UL,  11UL, 12UL, 11UL, 8UL,  10UL, 5UL,  7UL,  15UL, 15UL, 2UL,  5UL,  2UL,  4UL,  9UL,  3UL,
+    15UL, 0UL,  13UL, 7UL,  3UL,  10UL, 0UL,  5UL,  4UL,  0UL,  7UL,  13UL, 6UL,  4UL,  12UL, 6UL,  10UL, 4UL,  15UL,
+    7UL,  1UL,  13UL, 13UL, 12UL, 11UL, 10UL, 8UL,  4UL,  7UL,  11UL, 9UL,  0UL,  13UL, 0UL,  2UL,  1UL,  7UL,  2UL,
+    14UL, 12UL, 1UL,  1UL,  6UL,  3UL,  2UL,  1UL,  0UL,  13UL, 6UL,  4UL,  13UL, 3UL,  10UL, 13UL, 2UL,  7UL,  11UL,
+    7UL,  12UL, 7UL,  15UL, 15UL, 15UL, 10UL, 9UL,  5UL,  5UL,  8UL,  4UL,  8UL,  13UL, 7UL,  1UL,  6UL,  13UL, 7UL,
+    3UL,  1UL,  5UL,  1UL,  4UL,  5UL,  1UL,  5UL,  2UL,  2UL,  0UL,  11UL, 11UL, 4UL,  11UL, 6UL,  0UL,  9UL,  3UL,
+    4UL,  1UL,  1UL,  4UL,  12UL, 2UL,  8UL,  5UL,  14UL, 1UL,  5UL,  7UL,  11UL, 3UL,  3UL,  15UL, 0UL,  6UL,  4UL,
+    6UL,  11UL, 6UL,  3UL,  11UL, 4UL,  15UL, 5UL,  3UL,  13UL, 3UL,  9UL,  5UL,  6UL,  15UL, 15UL, 8UL,  11UL, 3UL,
+    3UL,  3UL,  2UL,  4UL,  14UL, 3UL,  13UL, 4UL,  4UL,  6UL,  2UL,  5UL,  9UL,  7UL,  13UL, 1UL,  2UL,  11UL, 3UL,
+    12UL, 10UL, 13UL, 6UL,  15UL, 10UL, 1UL,  11UL, 6UL,  4UL,  0UL,  1UL,  7UL,  15UL, 8UL,  2UL,  9UL,  3UL,  9UL,
+    0UL,  7UL,  15UL, 11UL, 8UL,  15UL, 7UL,  7UL,  10UL, 1UL,  9UL,  10UL, 2UL,  13UL, 4UL,  13UL, 8UL,  7UL,  4UL,
+    15UL, 6UL,  9UL,  14UL, 2UL,  6UL,  8UL,  15UL, 2UL,  7UL,  15UL, 4UL,  7UL,  15UL, 6UL,  11UL, 2UL,  8UL,  4UL,
+    5UL,  0UL,  14UL, 7UL,  14UL, 9UL,  11UL, 3UL,  2UL,  0UL,  11UL, 7UL,  5UL,  8UL,  5UL,  15UL, 12UL, 14UL, 14UL,
+    0UL,  0UL,  5UL,  6UL,  9UL,  6UL,  1UL,  7UL,  7UL,  10UL, 1UL,  7UL,  14UL, 7UL,  1UL,  9UL,  0UL,  12UL, 14UL,
+    13UL, 12UL, 0UL,  11UL, 1UL,  14UL, 13UL, 14UL, 5UL,  12UL, 13UL, 14UL, 8UL,  2UL,  8UL,  3UL,  0UL,  14UL, 5UL,
+    2UL,  11UL, 11UL, 2UL,  9UL,  6UL,  11UL, 10UL, 3UL,  5UL,  3UL,  0UL,  10UL, 6UL,  9UL,  10UL, 10UL, 1UL,  2UL,
+    8UL,  6UL,  4UL,  8UL,  13UL, 12UL, 0UL,  15UL, 14UL, 11UL, 10UL, 11UL, 5UL,  0UL,  4UL,  0UL,  1UL,  11UL, 4UL,
+    4UL,  2UL,  2UL,  12UL, 3UL,  8UL,  2UL,  1UL,  9UL,  8UL,  7UL,  1UL,  6UL,  4UL,  7UL,  0UL,  1UL,  10UL, 9UL,
+    7UL,  2UL,  15UL, 7UL,  1UL,  1UL,  6UL,  14UL, 14UL, 8UL,  6UL,  12UL, 5UL,  1UL,  5UL,  1UL,  8UL,  5UL,  9UL,
+    5UL,  14UL, 8UL,  0UL,  3UL,  3UL,  14UL, 8UL,  15UL, 15UL, 6UL,  9UL,  6UL,  6UL,  1UL,  2UL,  5UL,  7UL,  7UL,
+    7UL,  0UL,  12UL, 1UL,  2UL,  6UL,  13UL, 11UL, 4UL,  3UL,  5UL,  8UL,  15UL, 1UL,  10UL, 2UL,  14UL, 12UL, 10UL,
+    14UL, 12UL, 12UL, 0UL,  11UL, 7UL,  10UL, 13UL, 3UL,  4UL,  2UL,  2UL,  13UL, 15UL, 4UL,  7UL,  5UL,  14UL, 0UL,
+    4UL,  13UL, 1UL,  7UL,  5UL,  8UL,  13UL, 9UL,  12UL, 10UL, 7UL,  15UL, 6UL,  2UL,  0UL,  13UL, 12UL, 4UL,  1UL,
+    14UL, 6UL,  2UL,  15UL, 4UL,  15UL, 2UL,  1UL,  8UL,  5UL,  10UL, 1UL,  7UL,  6UL,  14UL, 4UL,  12UL, 4UL,  3UL,
+    5UL,  15UL, 11UL, 0UL,  5UL,  7UL,  6UL,  5UL,  0UL,  4UL,  14UL, 10UL, 8UL,  2UL,  3UL,  15UL, 6UL,  10UL, 1UL,
+    4UL,  11UL, 6UL,  8UL,  1UL,  8UL,  5UL,  12UL, 11UL, 6UL,  12UL, 1UL,  10UL, 7UL,  14UL, 7UL,  6UL,  8UL,  0UL,
+    5UL,  7UL,  15UL, 0UL,  11UL, 3UL,  4UL,  11UL, 15UL, 6UL,  7UL,  4UL,  1UL,  9UL,  14UL, 13UL, 12UL, 10UL, 11UL,
+    12UL, 5UL,  2UL,  15UL, 14UL, 9UL,  6UL,  12UL, 6UL,  8UL,  9UL,  5UL,  5UL,  13UL, 7UL,  11UL, 0UL,  7UL,  3UL,
+    15UL, 9UL,  9UL,  6UL,  8UL,  12UL, 5UL,  8UL,  8UL,  9UL,  2UL,  6UL,  5UL,  7UL,  10UL, 6UL,  8UL,  2UL,  15UL,
+    11UL, 8UL,  6UL,  3UL,  14UL, 4UL,  8UL,  1UL,  4UL,  6UL,  15UL, 0UL,  15UL, 9UL,  3UL,  13UL, 0UL,  11UL, 2UL,
+    4UL,  0UL,  1UL,  3UL,  7UL,  12UL, 2UL,  0UL,  7UL,  14UL, 2UL,  7UL,  3UL,  4UL,  5UL,  4UL,  2UL,  10UL, 5UL,
+    10UL, 7UL,  15UL, 3UL,  9UL,  7UL,  12UL, 12UL, 10UL, 7UL,  10UL, 7UL,  15UL, 3UL,  9UL,  9UL,  2UL,  14UL, 0UL,
+    13UL, 7UL,  8UL,  4UL,  13UL, 7UL,  14UL, 10UL, 15UL, 10UL, 2UL,  3UL,  8UL,  11UL, 13UL, 14UL, 12UL, 3UL,  7UL,
+    12UL, 7UL,  4UL,  7UL,  13UL, 4UL,  7UL,  11UL, 0UL,  6UL,  4UL,  8UL,  3UL,  15UL, 3UL,  6UL,  15UL, 12UL, 8UL,
+    2UL,  4UL,  4UL,  14UL, 7UL,  12UL, 0UL,  8UL,  2UL,  5UL,  4UL,  4UL,  2UL,  4UL,  8UL,  2UL,  10UL, 15UL, 1UL,
+    6UL,  3UL,  7UL,  14UL, 6UL,  11UL, 12UL, 13UL, 6UL,  12UL, 10UL, 5UL,  9UL,  9UL,  9UL,  6UL,  13UL, 12UL, 13UL,
+    11UL, 9UL,  13UL, 10UL, 14UL, 3UL,  9UL,  2UL,  15UL, 0UL,  4UL,  14UL, 13UL, 11UL, 0UL,  12UL, 14UL, 13UL, 9UL,
+    10UL, 11UL, 11UL, 8UL,  8UL,  1UL,  0UL,  5UL,  9UL,  13UL, 15UL, 3UL,  14UL, 14UL, 13UL, 7UL,  4UL,  10UL, 0UL,
+    8UL,  12UL, 5UL,  14UL, 10UL, 2UL,  8UL,  10UL, 3UL,  4UL,  4UL,  13UL, 1UL,  11UL, 5UL,  10UL, 11UL, 12UL, 0UL,
+    3UL,  13UL, 15UL, 3UL,  13UL, 0UL,  10UL, 15UL, 5UL,  3UL,  1UL,  4UL,  6UL,  11UL, 7UL,  15UL, 5UL,  11UL, 12UL,
+    2UL,  14UL, 8UL,  3UL,  13UL, 5UL,  13UL, 15UL, 1UL,  3UL,  15UL, 5UL,  9UL,  2UL,  10UL, 12UL, 9UL,  14UL, 13UL,
+    3UL,  0UL,  4UL,  9UL,  11UL, 11UL, 4UL,  5UL,  5UL,  10UL, 2UL,  13UL, 11UL, 8UL,  5UL,  8UL,  14UL,
+};
+uint32_t rand_arr_5_b5_w32_arr[1024] = {
+    8UL,  17UL, 10UL, 10UL, 15UL, 26UL, 25UL, 8UL,  29UL, 29UL, 21UL, 6UL,  22UL, 3UL,  4UL,  0UL,  19UL, 16UL, 4UL,
+    27UL, 22UL, 25UL, 18UL, 24UL, 13UL, 14UL, 15UL, 19UL, 15UL, 15UL, 4UL,  28UL, 9UL,  14UL, 19UL, 8UL,  10UL, 23UL,
+    22UL, 9UL,  25UL, 4UL,  12UL, 16UL, 6UL,  5UL,  27UL, 23UL, 6UL,  16UL, 29UL, 17UL, 19UL, 0UL,  22UL, 13UL, 30UL,
+    0UL,  5UL,  20UL, 9UL,  0UL,  0UL,  19UL, 11UL, 28UL, 12UL, 5UL,  15UL, 7UL,  13UL, 18UL, 21UL, 25UL, 27UL, 7UL,
+    21UL, 26UL, 1UL,  13UL, 15UL, 3UL,  18UL, 3UL,  25UL, 29UL, 12UL, 13UL, 25UL, 21UL, 23UL, 10UL, 12UL, 3UL,  27UL,
+    2UL,  10UL, 21UL, 3UL,  9UL,  1UL,  24UL, 25UL, 2UL,  30UL, 18UL, 20UL, 14UL, 8UL,  12UL, 5UL,  0UL,  8UL,  30UL,
+    23UL, 21UL, 1UL,  31UL, 23UL, 29UL, 4UL,  9UL,  7UL,  10UL, 7UL,  16UL, 0UL,  30UL, 10UL, 22UL, 7UL,  22UL, 14UL,
+    25UL, 16UL, 10UL, 11UL, 31UL, 16UL, 17UL, 8UL,  20UL, 30UL, 0UL,  15UL, 1UL,  20UL, 3UL,  23UL, 30UL, 26UL, 30UL,
+    0UL,  14UL, 28UL, 1UL,  17UL, 30UL, 26UL, 24UL, 23UL, 27UL, 30UL, 22UL, 3UL,  3UL,  12UL, 13UL, 9UL,  24UL, 25UL,
+    26UL, 5UL,  1UL,  5UL,  7UL,  20UL, 14UL, 13UL, 26UL, 6UL,  26UL, 31UL, 24UL, 1UL,  10UL, 6UL,  2UL,  22UL, 30UL,
+    4UL,  28UL, 5UL,  6UL,  6UL,  12UL, 10UL, 20UL, 28UL, 27UL, 4UL,  21UL, 30UL, 1UL,  7UL,  11UL, 2UL,  18UL, 20UL,
+    4UL,  23UL, 25UL, 6UL,  22UL, 31UL, 0UL,  11UL, 1UL,  6UL,  15UL, 7UL,  28UL, 29UL, 4UL,  28UL, 18UL, 28UL, 24UL,
+    19UL, 2UL,  5UL,  30UL, 20UL, 29UL, 28UL, 0UL,  15UL, 14UL, 22UL, 23UL, 2UL,  20UL, 15UL, 5UL,  20UL, 25UL, 17UL,
+    26UL, 8UL,  6UL,  14UL, 8UL,  5UL,  28UL, 6UL,  25UL, 10UL, 12UL, 4UL,  2UL,  10UL, 19UL, 20UL, 12UL, 0UL,  7UL,
+    10UL, 11UL, 0UL,  6UL,  2UL,  17UL, 15UL, 26UL, 15UL, 14UL, 31UL, 25UL, 21UL, 28UL, 17UL, 2UL,  6UL,  28UL, 1UL,
+    7UL,  12UL, 14UL, 10UL, 13UL, 23UL, 4UL,  28UL, 1UL,  15UL, 8UL,  26UL, 16UL, 1UL,  21UL, 13UL, 20UL, 7UL,  1UL,
+    4UL,  25UL, 25UL, 19UL, 12UL, 22UL, 15UL, 12UL, 22UL, 11UL, 19UL, 23UL, 25UL, 9UL,  7UL,  16UL, 24UL, 25UL, 27UL,
+    21UL, 20UL, 10UL, 20UL, 16UL, 6UL,  14UL, 31UL, 19UL, 18UL, 20UL, 25UL, 3UL,  29UL, 18UL, 3UL,  5UL,  16UL, 23UL,
+    3UL,  12UL, 15UL, 3UL,  2UL,  31UL, 15UL, 10UL, 17UL, 30UL, 29UL, 21UL, 21UL, 4UL,  5UL,  29UL, 3UL,  26UL, 25UL,
+    15UL, 7UL,  27UL, 14UL, 26UL, 20UL, 6UL,  14UL, 24UL, 4UL,  16UL, 18UL, 16UL, 13UL, 29UL, 20UL, 2UL,  1UL,  4UL,
+    19UL, 8UL,  14UL, 27UL, 15UL, 31UL, 22UL, 4UL,  14UL, 8UL,  1UL,  2UL,  27UL, 2UL,  21UL, 30UL, 28UL, 18UL, 3UL,
+    4UL,  14UL, 27UL, 27UL, 9UL,  2UL,  0UL,  26UL, 31UL, 28UL, 13UL, 22UL, 17UL, 10UL, 19UL, 19UL, 1UL,  4UL,  12UL,
+    17UL, 30UL, 9UL,  23UL, 21UL, 29UL, 27UL, 28UL, 15UL, 21UL, 30UL, 17UL, 9UL,  13UL, 20UL, 26UL, 18UL, 19UL, 7UL,
+    20UL, 27UL, 25UL, 7UL,  19UL, 19UL, 30UL, 14UL, 3UL,  20UL, 2UL,  31UL, 7UL,  9UL,  7UL,  24UL, 20UL, 26UL, 21UL,
+    20UL, 10UL, 29UL, 12UL, 5UL,  21UL, 17UL, 21UL, 21UL, 0UL,  31UL, 13UL, 22UL, 30UL, 31UL, 23UL, 29UL, 23UL, 19UL,
+    31UL, 9UL,  20UL, 5UL,  3UL,  31UL, 28UL, 1UL,  15UL, 4UL,  18UL, 31UL, 21UL, 6UL,  9UL,  11UL, 12UL, 23UL, 23UL,
+    18UL, 17UL, 25UL, 28UL, 6UL,  3UL,  3UL,  29UL, 21UL, 17UL, 6UL,  21UL, 27UL, 26UL, 12UL, 24UL, 17UL, 21UL, 10UL,
+    16UL, 8UL,  1UL,  14UL, 18UL, 31UL, 14UL, 13UL, 7UL,  14UL, 1UL,  22UL, 24UL, 10UL, 31UL, 8UL,  18UL, 8UL,  16UL,
+    22UL, 22UL, 10UL, 12UL, 29UL, 13UL, 19UL, 27UL, 29UL, 14UL, 1UL,  10UL, 24UL, 6UL,  10UL, 26UL, 30UL, 8UL,  0UL,
+    5UL,  27UL, 20UL, 9UL,  24UL, 7UL,  17UL, 13UL, 30UL, 28UL, 24UL, 16UL, 20UL, 8UL,  11UL, 27UL, 6UL,  5UL,  6UL,
+    31UL, 16UL, 0UL,  26UL, 20UL, 1UL,  7UL,  22UL, 27UL, 14UL, 31UL, 12UL, 27UL, 0UL,  29UL, 20UL, 1UL,  27UL, 17UL,
+    25UL, 31UL, 12UL, 5UL,  20UL, 31UL, 7UL,  3UL,  3UL,  15UL, 15UL, 29UL, 28UL, 13UL, 10UL, 31UL, 14UL, 23UL, 0UL,
+    31UL, 9UL,  29UL, 11UL, 8UL,  9UL,  9UL,  29UL, 22UL, 7UL,  18UL, 18UL, 8UL,  30UL, 21UL, 19UL, 18UL, 0UL,  15UL,
+    29UL, 31UL, 26UL, 13UL, 31UL, 16UL, 16UL, 8UL,  25UL, 10UL, 27UL, 14UL, 21UL, 0UL,  25UL, 27UL, 7UL,  22UL, 5UL,
+    28UL, 0UL,  6UL,  15UL, 12UL, 30UL, 5UL,  7UL,  10UL, 13UL, 2UL,  25UL, 23UL, 14UL, 26UL, 22UL, 8UL,  8UL,  8UL,
+    3UL,  25UL, 31UL, 5UL,  19UL, 28UL, 19UL, 11UL, 21UL, 23UL, 13UL, 21UL, 16UL, 26UL, 11UL, 26UL, 15UL, 25UL, 23UL,
+    16UL, 28UL, 6UL,  24UL, 9UL,  24UL, 31UL, 8UL,  18UL, 23UL, 23UL, 24UL, 28UL, 1UL,  19UL, 27UL, 27UL, 25UL, 1UL,
+    6UL,  9UL,  8UL,  22UL, 10UL, 18UL, 21UL, 12UL, 24UL, 26UL, 7UL,  7UL,  4UL,  23UL, 31UL, 19UL, 16UL, 30UL, 4UL,
+    26UL, 19UL, 14UL, 1UL,  16UL, 31UL, 12UL, 5UL,  2UL,  7UL,  17UL, 23UL, 18UL, 29UL, 1UL,  4UL,  7UL,  12UL, 25UL,
+    8UL,  18UL, 28UL, 4UL,  29UL, 4UL,  26UL, 27UL, 1UL,  0UL,  9UL,  23UL, 15UL, 4UL,  29UL, 31UL, 25UL, 9UL,  9UL,
+    28UL, 27UL, 30UL, 26UL, 16UL, 3UL,  21UL, 29UL, 4UL,  4UL,  27UL, 4UL,  28UL, 5UL,  4UL,  26UL, 23UL, 12UL, 4UL,
+    9UL,  15UL, 21UL, 17UL, 12UL, 0UL,  22UL, 25UL, 4UL,  24UL, 0UL,  10UL, 19UL, 22UL, 25UL, 17UL, 17UL, 29UL, 4UL,
+    30UL, 26UL, 16UL, 24UL, 26UL, 4UL,  5UL,  15UL, 14UL, 8UL,  5UL,  19UL, 8UL,  14UL, 25UL, 8UL,  7UL,  13UL, 10UL,
+    12UL, 27UL, 13UL, 24UL, 22UL, 8UL,  20UL, 3UL,  10UL, 31UL, 13UL, 13UL, 14UL, 24UL, 30UL, 10UL, 0UL,  19UL, 27UL,
+    26UL, 21UL, 26UL, 1UL,  1UL,  7UL,  12UL, 5UL,  5UL,  31UL, 16UL, 14UL, 1UL,  22UL, 7UL,  31UL, 18UL, 11UL, 11UL,
+    0UL,  13UL, 10UL, 6UL,  3UL,  11UL, 0UL,  0UL,  29UL, 23UL, 12UL, 6UL,  24UL, 16UL, 0UL,  0UL,  3UL,  25UL, 1UL,
+    22UL, 18UL, 2UL,  1UL,  3UL,  18UL, 30UL, 4UL,  5UL,  16UL, 2UL,  21UL, 20UL, 2UL,  29UL, 2UL,  24UL, 2UL,  29UL,
+    2UL,  22UL, 13UL, 21UL, 17UL, 24UL, 6UL,  20UL, 22UL, 8UL,  16UL, 4UL,  1UL,  4UL,  25UL, 8UL,  19UL, 5UL,  29UL,
+    25UL, 9UL,  8UL,  26UL, 5UL,  0UL,  26UL, 29UL, 15UL, 18UL, 7UL,  31UL, 8UL,  17UL, 28UL, 13UL, 10UL, 20UL, 24UL,
+    29UL, 2UL,  14UL, 16UL, 24UL, 7UL,  29UL, 24UL, 2UL,  24UL, 2UL,  10UL, 20UL, 10UL, 14UL, 3UL,  15UL, 2UL,  31UL,
+    25UL, 6UL,  1UL,  1UL,  18UL, 30UL, 17UL, 26UL, 1UL,  12UL, 12UL, 9UL,  2UL,  25UL, 0UL,  0UL,  27UL, 5UL,  20UL,
+    15UL, 13UL, 18UL, 1UL,  26UL, 9UL,  5UL,  29UL, 11UL, 8UL,  18UL, 28UL, 31UL, 28UL, 14UL, 2UL,  14UL, 2UL,  3UL,
+    28UL, 11UL, 24UL, 8UL,  31UL, 11UL, 5UL,  31UL, 24UL, 11UL, 5UL,  29UL, 9UL,  23UL, 31UL, 3UL,  1UL,  15UL, 20UL,
+    1UL,  16UL, 29UL, 21UL, 23UL, 17UL, 27UL, 24UL, 30UL, 14UL, 14UL, 17UL, 12UL, 14UL, 14UL, 29UL, 17UL,
+};
+uint32_t rand_arr_6_b6_w32_arr[1024] = {
+    41UL, 8UL,  11UL, 50UL, 58UL, 11UL, 13UL, 38UL, 16UL, 51UL, 46UL, 53UL, 53UL, 26UL, 23UL, 23UL, 41UL, 24UL, 20UL,
+    23UL, 6UL,  16UL, 24UL, 17UL, 32UL, 59UL, 20UL, 26UL, 20UL, 3UL,  24UL, 10UL, 38UL, 32UL, 23UL, 29UL, 38UL, 23UL,
+    56UL, 22UL, 7UL,  16UL, 6UL,  56UL, 29UL, 10UL, 59UL, 63UL, 61UL, 39UL, 29UL, 30UL, 36UL, 50UL, 43UL, 2UL,  36UL,
+    44UL, 32UL, 7UL,  59UL, 42UL, 32UL, 59UL, 26UL, 31UL, 34UL, 2UL,  8UL,  30UL, 45UL, 57UL, 16UL, 34UL, 17UL, 12UL,
+    19UL, 13UL, 47UL, 3UL,  11UL, 14UL, 8UL,  6UL,  11UL, 55UL, 40UL, 57UL, 41UL, 6UL,  9UL,  50UL, 55UL, 11UL, 43UL,
+    41UL, 6UL,  21UL, 38UL, 50UL, 20UL, 24UL, 26UL, 28UL, 48UL, 12UL, 45UL, 50UL, 26UL, 11UL, 15UL, 28UL, 20UL, 56UL,
+    22UL, 42UL, 56UL, 6UL,  12UL, 56UL, 9UL,  17UL, 52UL, 41UL, 35UL, 15UL, 46UL, 14UL, 13UL, 18UL, 6UL,  50UL, 54UL,
+    53UL, 44UL, 12UL, 45UL, 44UL, 61UL, 54UL, 48UL, 55UL, 2UL,  23UL, 14UL, 17UL, 24UL, 34UL, 41UL, 25UL, 62UL, 11UL,
+    58UL, 43UL, 10UL, 31UL, 11UL, 50UL, 63UL, 40UL, 3UL,  47UL, 16UL, 34UL, 45UL, 36UL, 32UL, 43UL, 44UL, 60UL, 19UL,
+    34UL, 20UL, 15UL, 61UL, 56UL, 54UL, 2UL,  27UL, 51UL, 13UL, 47UL, 40UL, 56UL, 28UL, 18UL, 6UL,  49UL, 1UL,  46UL,
+    45UL, 54UL, 45UL, 3UL,  40UL, 29UL, 60UL, 4UL,  28UL, 29UL, 55UL, 57UL, 3UL,  29UL, 8UL,  9UL,  15UL, 1UL,  10UL,
+    35UL, 19UL, 15UL, 52UL, 7UL,  32UL, 17UL, 50UL, 63UL, 11UL, 22UL, 11UL, 7UL,  58UL, 57UL, 36UL, 48UL, 4UL,  2UL,
+    7UL,  29UL, 37UL, 46UL, 63UL, 17UL, 56UL, 58UL, 60UL, 61UL, 54UL, 36UL, 59UL, 2UL,  19UL, 19UL, 43UL, 17UL, 32UL,
+    46UL, 30UL, 17UL, 43UL, 14UL, 30UL, 56UL, 46UL, 31UL, 1UL,  14UL, 25UL, 54UL, 52UL, 16UL, 25UL, 9UL,  44UL, 39UL,
+    2UL,  51UL, 46UL, 24UL, 53UL, 16UL, 8UL,  46UL, 36UL, 22UL, 60UL, 40UL, 3UL,  48UL, 63UL, 34UL, 40UL, 59UL, 7UL,
+    19UL, 32UL, 54UL, 56UL, 62UL, 46UL, 3UL,  4UL,  1UL,  35UL, 13UL, 0UL,  9UL,  34UL, 1UL,  4UL,  18UL, 22UL, 3UL,
+    50UL, 48UL, 59UL, 9UL,  30UL, 28UL, 48UL, 50UL, 7UL,  47UL, 55UL, 9UL,  54UL, 30UL, 33UL, 43UL, 5UL,  20UL, 46UL,
+    38UL, 23UL, 22UL, 29UL, 22UL, 40UL, 9UL,  37UL, 57UL, 62UL, 1UL,  35UL, 51UL, 20UL, 13UL, 59UL, 41UL, 3UL,  49UL,
+    47UL, 31UL, 18UL, 13UL, 28UL, 33UL, 2UL,  4UL,  1UL,  41UL, 7UL,  47UL, 24UL, 27UL, 43UL, 16UL, 7UL,  31UL, 26UL,
+    8UL,  28UL, 29UL, 14UL, 41UL, 22UL, 58UL, 30UL, 9UL,  21UL, 49UL, 25UL, 3UL,  43UL, 9UL,  44UL, 27UL, 50UL, 54UL,
+    20UL, 25UL, 47UL, 32UL, 31UL, 10UL, 48UL, 15UL, 61UL, 45UL, 62UL, 17UL, 47UL, 56UL, 18UL, 12UL, 30UL, 27UL, 21UL,
+    34UL, 30UL, 55UL, 57UL, 7UL,  32UL, 33UL, 11UL, 3UL,  56UL, 32UL, 37UL, 17UL, 11UL, 46UL, 62UL, 47UL, 5UL,  41UL,
+    11UL, 22UL, 18UL, 25UL, 47UL, 10UL, 2UL,  34UL, 50UL, 7UL,  40UL, 33UL, 49UL, 9UL,  60UL, 48UL, 22UL, 35UL, 25UL,
+    45UL, 30UL, 8UL,  6UL,  15UL, 38UL, 19UL, 22UL, 28UL, 11UL, 56UL, 11UL, 62UL, 32UL, 33UL, 20UL, 49UL, 23UL, 5UL,
+    24UL, 54UL, 27UL, 38UL, 54UL, 18UL, 3UL,  57UL, 16UL, 40UL, 35UL, 38UL, 44UL, 21UL, 54UL, 59UL, 11UL, 49UL, 13UL,
+    12UL, 56UL, 42UL, 40UL, 22UL, 36UL, 18UL, 33UL, 54UL, 4UL,  34UL, 57UL, 27UL, 12UL, 3UL,  57UL, 45UL, 38UL, 51UL,
+    63UL, 33UL, 2UL,  24UL, 16UL, 60UL, 12UL, 53UL, 0UL,  28UL, 38UL, 60UL, 42UL, 62UL, 17UL, 50UL, 1UL,  53UL, 48UL,
+    34UL, 16UL, 59UL, 48UL, 12UL, 31UL, 25UL, 29UL, 47UL, 62UL, 34UL, 57UL, 3UL,  38UL, 32UL, 53UL, 42UL, 47UL, 44UL,
+    47UL, 62UL, 59UL, 39UL, 31UL, 6UL,  53UL, 22UL, 27UL, 41UL, 10UL, 57UL, 9UL,  31UL, 45UL, 21UL, 1UL,  48UL, 55UL,
+    22UL, 29UL, 46UL, 15UL, 32UL, 45UL, 37UL, 48UL, 38UL, 28UL, 8UL,  16UL, 7UL,  51UL, 17UL, 15UL, 0UL,  9UL,  14UL,
+    50UL, 22UL, 52UL, 39UL, 24UL, 36UL, 2UL,  47UL, 19UL, 29UL, 2UL,  0UL,  61UL, 10UL, 24UL, 18UL, 10UL, 29UL, 14UL,
+    19UL, 56UL, 38UL, 34UL, 13UL, 36UL, 55UL, 32UL, 29UL, 13UL, 56UL, 35UL, 37UL, 23UL, 26UL, 26UL, 50UL, 54UL, 2UL,
+    23UL, 19UL, 44UL, 16UL, 11UL, 34UL, 46UL, 59UL, 15UL, 0UL,  6UL,  55UL, 39UL, 38UL, 26UL, 48UL, 11UL, 23UL, 22UL,
+    59UL, 3UL,  48UL, 19UL, 47UL, 39UL, 8UL,  21UL, 15UL, 16UL, 51UL, 63UL, 11UL, 56UL, 14UL, 7UL,  51UL, 44UL, 21UL,
+    28UL, 61UL, 16UL, 59UL, 60UL, 28UL, 34UL, 39UL, 24UL, 32UL, 8UL,  46UL, 14UL, 61UL, 51UL, 11UL, 60UL, 40UL, 62UL,
+    59UL, 47UL, 54UL, 59UL, 9UL,  25UL, 9UL,  7UL,  0UL,  53UL, 38UL, 1UL,  14UL, 32UL, 32UL, 31UL, 48UL, 7UL,  7UL,
+    0UL,  6UL,  48UL, 57UL, 8UL,  1UL,  10UL, 6UL,  31UL, 44UL, 39UL, 17UL, 29UL, 56UL, 61UL, 33UL, 58UL, 47UL, 8UL,
+    26UL, 11UL, 9UL,  7UL,  44UL, 53UL, 25UL, 23UL, 4UL,  21UL, 21UL, 40UL, 29UL, 47UL, 31UL, 42UL, 61UL, 46UL, 5UL,
+    42UL, 21UL, 33UL, 20UL, 8UL,  19UL, 57UL, 57UL, 0UL,  9UL,  33UL, 33UL, 28UL, 30UL, 43UL, 50UL, 18UL, 53UL, 39UL,
+    57UL, 46UL, 44UL, 11UL, 9UL,  39UL, 26UL, 30UL, 21UL, 41UL, 7UL,  38UL, 37UL, 20UL, 60UL, 5UL,  55UL, 58UL, 37UL,
+    5UL,  41UL, 50UL, 4UL,  4UL,  13UL, 23UL, 25UL, 62UL, 10UL, 46UL, 52UL, 61UL, 30UL, 55UL, 57UL, 14UL, 1UL,  54UL,
+    48UL, 39UL, 12UL, 48UL, 31UL, 13UL, 58UL, 26UL, 53UL, 32UL, 46UL, 14UL, 59UL, 34UL, 32UL, 56UL, 35UL, 20UL, 33UL,
+    30UL, 12UL, 27UL, 41UL, 29UL, 9UL,  6UL,  41UL, 10UL, 14UL, 5UL,  63UL, 38UL, 3UL,  6UL,  33UL, 61UL, 42UL, 56UL,
+    5UL,  51UL, 28UL, 13UL, 5UL,  60UL, 27UL, 7UL,  41UL, 60UL, 58UL, 28UL, 26UL, 51UL, 33UL, 4UL,  59UL, 17UL, 9UL,
+    30UL, 6UL,  43UL, 8UL,  50UL, 18UL, 14UL, 9UL,  49UL, 55UL, 25UL, 26UL, 52UL, 21UL, 52UL, 3UL,  18UL, 14UL, 3UL,
+    20UL, 8UL,  24UL, 27UL, 32UL, 22UL, 55UL, 58UL, 13UL, 36UL, 14UL, 35UL, 16UL, 32UL, 40UL, 53UL, 55UL, 30UL, 21UL,
+    54UL, 54UL, 52UL, 60UL, 17UL, 11UL, 17UL, 51UL, 29UL, 40UL, 39UL, 57UL, 36UL, 54UL, 5UL,  37UL, 63UL, 5UL,  10UL,
+    24UL, 45UL, 10UL, 59UL, 31UL, 46UL, 42UL, 6UL,  62UL, 56UL, 5UL,  63UL, 20UL, 44UL, 50UL, 3UL,  19UL, 37UL, 39UL,
+    30UL, 8UL,  1UL,  35UL, 32UL, 16UL, 53UL, 5UL,  3UL,  27UL, 16UL, 25UL, 32UL, 54UL, 31UL, 46UL, 51UL, 24UL, 51UL,
+    15UL, 2UL,  3UL,  6UL,  40UL, 15UL, 12UL, 38UL, 53UL, 7UL,  37UL, 36UL, 2UL,  58UL, 36UL, 29UL, 9UL,  55UL, 24UL,
+    23UL, 58UL, 50UL, 24UL, 32UL, 8UL,  19UL, 3UL,  28UL, 52UL, 15UL, 6UL,  47UL, 60UL, 22UL, 50UL, 6UL,  34UL, 58UL,
+    59UL, 19UL, 53UL, 32UL, 6UL,  41UL, 38UL, 1UL,  50UL, 9UL,  56UL, 60UL, 48UL, 44UL, 9UL,  2UL,  42UL, 38UL, 37UL,
+    18UL, 11UL, 11UL, 41UL, 22UL, 35UL, 21UL, 22UL, 41UL, 41UL, 7UL,  12UL, 7UL,  26UL, 35UL, 55UL, 37UL, 50UL, 5UL,
+    29UL, 53UL, 21UL, 12UL, 7UL,  5UL,  62UL, 50UL, 38UL, 31UL, 20UL, 39UL, 53UL, 44UL, 36UL, 48UL, 38UL,
+};
+uint32_t rand_arr_7_b7_w32_arr[1024] = {
+    85UL,  56UL,  107UL, 71UL,  29UL,  78UL,  1UL,   46UL,  0UL,   82UL,  108UL, 2UL,   83UL,  111UL, 3UL,   1UL,
+    70UL,  78UL,  122UL, 33UL,  81UL,  110UL, 22UL,  53UL,  17UL,  27UL,  19UL,  60UL,  17UL,  102UL, 23UL,  43UL,
+    86UL,  63UL,  55UL,  71UL,  110UL, 23UL,  5UL,   107UL, 49UL,  66UL,  14UL,  11UL,  15UL,  33UL,  103UL, 53UL,
+    87UL,  5UL,   87UL,  96UL,  0UL,   47UL,  51UL,  51UL,  9UL,   7UL,   4UL,   17UL,  84UL,  45UL,  102UL, 33UL,
+    87UL,  116UL, 103UL, 40UL,  33UL,  21UL,  70UL,  10UL,  72UL,  75UL,  16UL,  31UL,  50UL,  108UL, 73UL,  45UL,
+    61UL,  77UL,  126UL, 61UL,  120UL, 38UL,  124UL, 24UL,  32UL,  119UL, 9UL,   56UL,  100UL, 31UL,  62UL,  90UL,
+    0UL,   100UL, 43UL,  4UL,   93UL,  85UL,  34UL,  110UL, 54UL,  38UL,  115UL, 50UL,  115UL, 69UL,  49UL,  4UL,
+    116UL, 102UL, 61UL,  45UL,  104UL, 104UL, 126UL, 56UL,  124UL, 13UL,  92UL,  114UL, 10UL,  66UL,  97UL,  99UL,
+    79UL,  7UL,   40UL,  21UL,  61UL,  115UL, 85UL,  41UL,  70UL,  83UL,  6UL,   111UL, 17UL,  99UL,  69UL,  3UL,
+    8UL,   107UL, 8UL,   96UL,  27UL,  35UL,  73UL,  79UL,  38UL,  63UL,  15UL,  62UL,  33UL,  37UL,  66UL,  47UL,
+    67UL,  1UL,   46UL,  55UL,  79UL,  97UL,  18UL,  61UL,  62UL,  28UL,  120UL, 43UL,  11UL,  88UL,  35UL,  27UL,
+    46UL,  123UL, 114UL, 103UL, 71UL,  33UL,  66UL,  117UL, 106UL, 110UL, 5UL,   7UL,   113UL, 24UL,  35UL,  45UL,
+    35UL,  33UL,  4UL,   45UL,  96UL,  99UL,  112UL, 85UL,  122UL, 45UL,  44UL,  66UL,  17UL,  97UL,  120UL, 78UL,
+    9UL,   75UL,  23UL,  105UL, 109UL, 47UL,  66UL,  55UL,  13UL,  99UL,  46UL,  19UL,  29UL,  115UL, 107UL, 60UL,
+    26UL,  114UL, 127UL, 102UL, 113UL, 112UL, 108UL, 95UL,  72UL,  16UL,  44UL,  40UL,  94UL,  62UL,  41UL,  66UL,
+    90UL,  101UL, 65UL,  73UL,  85UL,  108UL, 108UL, 51UL,  62UL,  119UL, 6UL,   0UL,   39UL,  109UL, 22UL,  58UL,
+    43UL,  47UL,  14UL,  49UL,  52UL,  36UL,  80UL,  119UL, 110UL, 121UL, 111UL, 66UL,  118UL, 19UL,  102UL, 22UL,
+    53UL,  24UL,  9UL,   31UL,  65UL,  112UL, 110UL, 27UL,  100UL, 2UL,   26UL,  81UL,  110UL, 31UL,  124UL, 73UL,
+    60UL,  75UL,  95UL,  45UL,  5UL,   52UL,  116UL, 12UL,  100UL, 12UL,  104UL, 43UL,  39UL,  27UL,  47UL,  78UL,
+    80UL,  103UL, 1UL,   55UL,  116UL, 108UL, 115UL, 111UL, 40UL,  41UL,  106UL, 37UL,  1UL,   18UL,  116UL, 7UL,
+    57UL,  54UL,  81UL,  34UL,  111UL, 47UL,  76UL,  52UL,  116UL, 102UL, 96UL,  87UL,  82UL,  22UL,  115UL, 22UL,
+    119UL, 71UL,  101UL, 109UL, 45UL,  108UL, 93UL,  107UL, 47UL,  15UL,  106UL, 33UL,  5UL,   47UL,  12UL,  13UL,
+    115UL, 7UL,   5UL,   100UL, 83UL,  94UL,  28UL,  7UL,   24UL,  24UL,  54UL,  89UL,  95UL,  47UL,  23UL,  23UL,
+    98UL,  117UL, 99UL,  115UL, 107UL, 63UL,  111UL, 21UL,  73UL,  9UL,   12UL,  20UL,  93UL,  12UL,  121UL, 87UL,
+    57UL,  114UL, 114UL, 97UL,  71UL,  84UL,  98UL,  51UL,  8UL,   42UL,  57UL,  50UL,  104UL, 42UL,  53UL,  3UL,
+    76UL,  107UL, 104UL, 107UL, 24UL,  92UL,  60UL,  54UL,  80UL,  22UL,  5UL,   14UL,  70UL,  13UL,  75UL,  38UL,
+    12UL,  52UL,  46UL,  24UL,  76UL,  31UL,  38UL,  48UL,  106UL, 87UL,  122UL, 85UL,  45UL,  7UL,   60UL,  46UL,
+    111UL, 26UL,  52UL,  46UL,  63UL,  15UL,  85UL,  34UL,  73UL,  53UL,  36UL,  88UL,  50UL,  114UL, 27UL,  106UL,
+    106UL, 64UL,  0UL,   93UL,  70UL,  31UL,  68UL,  112UL, 12UL,  92UL,  111UL, 104UL, 113UL, 50UL,  94UL,  105UL,
+    41UL,  81UL,  83UL,  64UL,  52UL,  87UL,  30UL,  58UL,  22UL,  81UL,  69UL,  103UL, 13UL,  42UL,  24UL,  43UL,
+    74UL,  106UL, 83UL,  27UL,  28UL,  67UL,  113UL, 62UL,  0UL,   51UL,  121UL, 59UL,  89UL,  16UL,  65UL,  103UL,
+    51UL,  15UL,  27UL,  102UL, 120UL, 66UL,  41UL,  2UL,   61UL,  53UL,  51UL,  15UL,  29UL,  84UL,  117UL, 26UL,
+    61UL,  92UL,  57UL,  31UL,  11UL,  115UL, 103UL, 119UL, 101UL, 69UL,  50UL,  8UL,   12UL,  124UL, 66UL,  43UL,
+    74UL,  82UL,  60UL,  122UL, 89UL,  47UL,  80UL,  100UL, 121UL, 18UL,  35UL,  127UL, 100UL, 112UL, 14UL,  86UL,
+    107UL, 114UL, 71UL,  35UL,  66UL,  78UL,  27UL,  6UL,   75UL,  103UL, 98UL,  1UL,   13UL,  50UL,  101UL, 76UL,
+    4UL,   54UL,  29UL,  75UL,  56UL,  101UL, 44UL,  83UL,  40UL,  57UL,  122UL, 12UL,  38UL,  82UL,  12UL,  54UL,
+    47UL,  17UL,  18UL,  124UL, 44UL,  112UL, 53UL,  68UL,  26UL,  105UL, 10UL,  93UL,  96UL,  64UL,  80UL,  101UL,
+    95UL,  127UL, 87UL,  121UL, 33UL,  123UL, 115UL, 4UL,   22UL,  68UL,  30UL,  2UL,   25UL,  125UL, 21UL,  110UL,
+    43UL,  99UL,  9UL,   24UL,  48UL,  20UL,  15UL,  15UL,  1UL,   62UL,  54UL,  42UL,  77UL,  70UL,  48UL,  20UL,
+    91UL,  24UL,  33UL,  88UL,  109UL, 26UL,  5UL,   39UL,  73UL,  81UL,  7UL,   53UL,  37UL,  3UL,   125UL, 101UL,
+    17UL,  87UL,  18UL,  84UL,  87UL,  9UL,   4UL,   12UL,  108UL, 29UL,  16UL,  14UL,  14UL,  8UL,   58UL,  78UL,
+    29UL,  97UL,  88UL,  119UL, 84UL,  43UL,  10UL,  37UL,  52UL,  76UL,  105UL, 68UL,  68UL,  3UL,   91UL,  30UL,
+    75UL,  113UL, 105UL, 37UL,  10UL,  99UL,  43UL,  91UL,  22UL,  57UL,  126UL, 69UL,  97UL,  7UL,   32UL,  81UL,
+    88UL,  111UL, 38UL,  6UL,   96UL,  100UL, 38UL,  6UL,   46UL,  24UL,  111UL, 23UL,  89UL,  121UL, 126UL, 105UL,
+    113UL, 111UL, 15UL,  125UL, 16UL,  80UL,  84UL,  52UL,  57UL,  120UL, 92UL,  72UL,  126UL, 35UL,  80UL,  113UL,
+    33UL,  78UL,  46UL,  74UL,  126UL, 23UL,  31UL,  61UL,  7UL,   80UL,  2UL,   12UL,  89UL,  71UL,  61UL,  51UL,
+    74UL,  17UL,  24UL,  107UL, 59UL,  53UL,  44UL,  25UL,  7UL,   37UL,  22UL,  60UL,  72UL,  27UL,  34UL,  122UL,
+    41UL,  72UL,  42UL,  15UL,  81UL,  41UL,  13UL,  26UL,  6UL,   48UL,  72UL,  6UL,   9UL,   107UL, 85UL,  47UL,
+    12UL,  96UL,  121UL, 50UL,  23UL,  35UL,  38UL,  39UL,  100UL, 17UL,  38UL,  124UL, 73UL,  113UL, 59UL,  108UL,
+    72UL,  50UL,  116UL, 53UL,  73UL,  115UL, 56UL,  48UL,  117UL, 94UL,  117UL, 77UL,  32UL,  55UL,  59UL,  118UL,
+    33UL,  22UL,  70UL,  127UL, 108UL, 34UL,  32UL,  72UL,  126UL, 56UL,  3UL,   15UL,  63UL,  37UL,  4UL,   19UL,
+    4UL,   87UL,  36UL,  110UL, 117UL, 38UL,  83UL,  116UL, 95UL,  113UL, 22UL,  107UL, 108UL, 101UL, 3UL,   30UL,
+    72UL,  35UL,  120UL, 90UL,  105UL, 55UL,  21UL,  105UL, 34UL,  83UL,  92UL,  27UL,  29UL,  75UL,  108UL, 89UL,
+    122UL, 18UL,  95UL,  79UL,  61UL,  16UL,  9UL,   25UL,  46UL,  26UL,  45UL,  122UL, 58UL,  46UL,  5UL,   41UL,
+    108UL, 100UL, 39UL,  22UL,  43UL,  120UL, 69UL,  55UL,  9UL,   74UL,  99UL,  66UL,  17UL,  116UL, 79UL,  101UL,
+    97UL,  100UL, 71UL,  126UL, 59UL,  2UL,   122UL, 79UL,  97UL,  1UL,   28UL,  24UL,  29UL,  59UL,  83UL,  4UL,
+    92UL,  34UL,  57UL,  11UL,  81UL,  98UL,  25UL,  37UL,  32UL,  16UL,  98UL,  124UL, 66UL,  115UL, 110UL, 122UL,
+    126UL, 107UL, 40UL,  13UL,  114UL, 38UL,  48UL,  10UL,  16UL,  87UL,  26UL,  79UL,  52UL,  114UL, 70UL,  18UL,
+    3UL,   66UL,  42UL,  58UL,  104UL, 44UL,  127UL, 15UL,  11UL,  92UL,  97UL,  108UL, 104UL, 12UL,  69UL,  102UL,
+    103UL, 33UL,  57UL,  112UL, 38UL,  38UL,  87UL,  60UL,  54UL,  34UL,  116UL, 10UL,  104UL, 53UL,  9UL,   2UL,
+    74UL,  14UL,  10UL,  0UL,   66UL,  77UL,  51UL,  8UL,   78UL,  82UL,  39UL,  73UL,  15UL,  50UL,  78UL,  48UL,
+    52UL,  49UL,  41UL,  91UL,  85UL,  32UL,  79UL,  40UL,  9UL,   84UL,  13UL,  70UL,  68UL,  99UL,  102UL, 53UL,
+    118UL, 55UL,  60UL,  45UL,  24UL,  103UL, 27UL,  49UL,  99UL,  86UL,  0UL,   110UL, 89UL,  86UL,  121UL, 57UL,
+    106UL, 86UL,  118UL, 76UL,  101UL, 29UL,  16UL,  97UL,  57UL,  1UL,   82UL,  86UL,  82UL,  53UL,  118UL, 61UL,
+};
+uint32_t rand_arr_8_b8_w32_arr[1024] = {
+    238UL, 132UL, 121UL, 150UL, 241UL, 44UL,  160UL, 230UL, 100UL, 49UL,  6UL,   35UL,  162UL, 18UL,  158UL, 240UL,
+    2UL,   36UL,  153UL, 0UL,   141UL, 1UL,   122UL, 32UL,  85UL,  188UL, 244UL, 164UL, 0UL,   236UL, 214UL, 125UL,
+    184UL, 86UL,  232UL, 72UL,  217UL, 198UL, 193UL, 92UL,  236UL, 8UL,   102UL, 132UL, 3UL,   29UL,  111UL, 173UL,
+    233UL, 91UL,  234UL, 142UL, 120UL, 230UL, 78UL,  182UL, 108UL, 192UL, 35UL,  58UL,  170UL, 188UL, 66UL,  234UL,
+    205UL, 247UL, 172UL, 108UL, 207UL, 185UL, 70UL,  164UL, 219UL, 152UL, 37UL,  196UL, 111UL, 196UL, 179UL, 69UL,
+    216UL, 130UL, 172UL, 41UL,  40UL,  52UL,  216UL, 221UL, 237UL, 195UL, 98UL,  165UL, 57UL,  254UL, 100UL, 193UL,
+    59UL,  16UL,  60UL,  172UL, 243UL, 98UL,  38UL,  122UL, 132UL, 162UL, 246UL, 133UL, 206UL, 192UL, 205UL, 13UL,
+    157UL, 151UL, 67UL,  69UL,  128UL, 49UL,  128UL, 8UL,   218UL, 188UL, 36UL,  203UL, 133UL, 196UL, 19UL,  66UL,
+    162UL, 3UL,   230UL, 126UL, 151UL, 232UL, 53UL,  125UL, 123UL, 229UL, 128UL, 50UL,  66UL,  118UL, 244UL, 143UL,
+    90UL,  67UL,  113UL, 75UL,  68UL,  60UL,  81UL,  7UL,   103UL, 106UL, 216UL, 161UL, 41UL,  165UL, 242UL, 227UL,
+    25UL,  3UL,   104UL, 20UL,  18UL,  154UL, 38UL,  196UL, 253UL, 168UL, 22UL,  232UL, 55UL,  121UL, 198UL, 25UL,
+    30UL,  220UL, 170UL, 129UL, 70UL,  140UL, 178UL, 119UL, 162UL, 107UL, 126UL, 177UL, 140UL, 94UL,  137UL, 12UL,
+    59UL,  218UL, 0UL,   147UL, 188UL, 70UL,  227UL, 249UL, 211UL, 243UL, 15UL,  50UL,  250UL, 144UL, 53UL,  6UL,
+    176UL, 34UL,  67UL,  112UL, 10UL,  196UL, 238UL, 234UL, 200UL, 79UL,  30UL,  10UL,  221UL, 74UL,  55UL,  224UL,
+    63UL,  133UL, 51UL,  54UL,  201UL, 233UL, 106UL, 121UL, 126UL, 103UL, 128UL, 220UL, 222UL, 35UL,  192UL, 206UL,
+    178UL, 215UL, 146UL, 83UL,  195UL, 60UL,  245UL, 82UL,  241UL, 85UL,  98UL,  66UL,  238UL, 8UL,   211UL, 136UL,
+    152UL, 108UL, 46UL,  16UL,  140UL, 204UL, 213UL, 194UL, 139UL, 88UL,  142UL, 56UL,  104UL, 224UL, 148UL, 107UL,
+    203UL, 178UL, 193UL, 65UL,  110UL, 206UL, 106UL, 137UL, 132UL, 12UL,  183UL, 161UL, 204UL, 53UL,  96UL,  218UL,
+    61UL,  117UL, 163UL, 155UL, 250UL, 180UL, 171UL, 115UL, 185UL, 111UL, 125UL, 1UL,   118UL, 138UL, 217UL, 197UL,
+    36UL,  71UL,  48UL,  152UL, 131UL, 38UL,  113UL, 128UL, 107UL, 253UL, 46UL,  60UL,  149UL, 242UL, 85UL,  141UL,
+    242UL, 68UL,  17UL,  127UL, 86UL,  219UL, 88UL,  150UL, 52UL,  249UL, 112UL, 121UL, 184UL, 141UL, 70UL,  64UL,
+    84UL,  211UL, 205UL, 116UL, 241UL, 165UL, 106UL, 113UL, 94UL,  165UL, 213UL, 122UL, 189UL, 255UL, 48UL,  115UL,
+    155UL, 218UL, 42UL,  122UL, 4UL,   197UL, 91UL,  75UL,  71UL,  16UL,  43UL,  249UL, 58UL,  76UL,  8UL,   223UL,
+    188UL, 14UL,  111UL, 93UL,  210UL, 45UL,  212UL, 205UL, 211UL, 255UL, 220UL, 60UL,  112UL, 20UL,  78UL,  192UL,
+    79UL,  155UL, 160UL, 102UL, 10UL,  195UL, 195UL, 181UL, 229UL, 191UL, 124UL, 205UL, 105UL, 93UL,  55UL,  104UL,
+    25UL,  12UL,  56UL,  42UL,  237UL, 228UL, 42UL,  91UL,  190UL, 1UL,   81UL,  148UL, 54UL,  69UL,  98UL,  192UL,
+    185UL, 26UL,  1UL,   79UL,  160UL, 27UL,  99UL,  20UL,  120UL, 135UL, 142UL, 195UL, 228UL, 176UL, 63UL,  213UL,
+    200UL, 153UL, 212UL, 82UL,  213UL, 61UL,  244UL, 12UL,  40UL,  160UL, 89UL,  146UL, 206UL, 222UL, 214UL, 173UL,
+    92UL,  74UL,  28UL,  198UL, 124UL, 21UL,  116UL, 122UL, 241UL, 37UL,  135UL, 61UL,  62UL,  204UL, 242UL, 33UL,
+    214UL, 149UL, 14UL,  63UL,  186UL, 17UL,  18UL,  203UL, 90UL,  110UL, 71UL,  72UL,  188UL, 200UL, 84UL,  144UL,
+    72UL,  146UL, 234UL, 97UL,  251UL, 61UL,  219UL, 223UL, 79UL,  135UL, 113UL, 109UL, 153UL, 11UL,  149UL, 175UL,
+    51UL,  79UL,  212UL, 24UL,  107UL, 1UL,   139UL, 52UL,  242UL, 135UL, 93UL,  92UL,  4UL,   112UL, 91UL,  213UL,
+    126UL, 207UL, 241UL, 54UL,  142UL, 96UL,  105UL, 3UL,   88UL,  92UL,  172UL, 202UL, 190UL, 18UL,  179UL, 96UL,
+    163UL, 254UL, 44UL,  248UL, 6UL,   93UL,  134UL, 138UL, 70UL,  213UL, 124UL, 123UL, 107UL, 105UL, 222UL, 11UL,
+    206UL, 40UL,  47UL,  172UL, 161UL, 89UL,  97UL,  173UL, 181UL, 111UL, 165UL, 85UL,  73UL,  16UL,  28UL,  7UL,
+    28UL,  122UL, 9UL,   128UL, 85UL,  143UL, 253UL, 129UL, 29UL,  240UL, 76UL,  182UL, 171UL, 33UL,  33UL,  134UL,
+    214UL, 231UL, 37UL,  2UL,   17UL,  25UL,  33UL,  175UL, 220UL, 108UL, 249UL, 120UL, 124UL, 53UL,  14UL,  193UL,
+    246UL, 246UL, 50UL,  83UL,  22UL,  74UL,  94UL,  79UL,  189UL, 59UL,  115UL, 82UL,  77UL,  166UL, 56UL,  185UL,
+    235UL, 69UL,  198UL, 202UL, 41UL,  198UL, 214UL, 102UL, 57UL,  176UL, 242UL, 238UL, 174UL, 112UL, 129UL, 18UL,
+    94UL,  78UL,  124UL, 149UL, 98UL,  230UL, 188UL, 235UL, 122UL, 162UL, 77UL,  127UL, 144UL, 61UL,  199UL, 20UL,
+    111UL, 101UL, 150UL, 108UL, 227UL, 142UL, 173UL, 143UL, 167UL, 210UL, 70UL,  188UL, 136UL, 108UL, 201UL, 184UL,
+    203UL, 66UL,  232UL, 94UL,  156UL, 247UL, 251UL, 150UL, 201UL, 170UL, 66UL,  224UL, 34UL,  119UL, 14UL,  160UL,
+    8UL,   95UL,  128UL, 44UL,  61UL,  112UL, 13UL,  125UL, 206UL, 248UL, 13UL,  82UL,  192UL, 93UL,  250UL, 227UL,
+    228UL, 221UL, 40UL,  236UL, 157UL, 168UL, 246UL, 245UL, 207UL, 248UL, 83UL,  166UL, 247UL, 67UL,  12UL,  92UL,
+    217UL, 115UL, 233UL, 179UL, 35UL,  204UL, 154UL, 125UL, 51UL,  120UL, 196UL, 168UL, 231UL, 32UL,  218UL, 35UL,
+    55UL,  224UL, 251UL, 141UL, 41UL,  219UL, 173UL, 225UL, 228UL, 247UL, 212UL, 179UL, 78UL,  114UL, 220UL, 136UL,
+    0UL,   252UL, 249UL, 241UL, 156UL, 205UL, 48UL,  227UL, 67UL,  213UL, 49UL,  100UL, 107UL, 70UL,  142UL, 70UL,
+    67UL,  216UL, 166UL, 72UL,  141UL, 38UL,  113UL, 1UL,   241UL, 16UL,  71UL,  252UL, 78UL,  126UL, 141UL, 43UL,
+    119UL, 9UL,   14UL,  123UL, 6UL,   156UL, 148UL, 81UL,  24UL,  120UL, 36UL,  123UL, 50UL,  205UL, 8UL,   18UL,
+    223UL, 132UL, 203UL, 157UL, 126UL, 137UL, 37UL,  78UL,  112UL, 175UL, 46UL,  69UL,  202UL, 133UL, 235UL, 144UL,
+    26UL,  119UL, 196UL, 252UL, 45UL,  249UL, 60UL,  143UL, 56UL,  224UL, 184UL, 8UL,   201UL, 180UL, 123UL, 122UL,
+    31UL,  216UL, 15UL,  224UL, 114UL, 222UL, 109UL, 75UL,  221UL, 170UL, 164UL, 31UL,  93UL,  241UL, 130UL, 96UL,
+    119UL, 70UL,  89UL,  30UL,  81UL,  239UL, 225UL, 228UL, 9UL,   128UL, 91UL,  59UL,  70UL,  253UL, 153UL, 224UL,
+    142UL, 38UL,  39UL,  152UL, 246UL, 47UL,  180UL, 200UL, 147UL, 147UL, 240UL, 208UL, 49UL,  57UL,  66UL,  222UL,
+    9UL,   217UL, 46UL,  59UL,  107UL, 156UL, 230UL, 86UL,  91UL,  174UL, 153UL, 157UL, 8UL,   5UL,   93UL,  163UL,
+    134UL, 86UL,  124UL, 199UL, 31UL,  174UL, 165UL, 14UL,  230UL, 9UL,   113UL, 161UL, 8UL,   230UL, 241UL, 149UL,
+    43UL,  46UL,  165UL, 136UL, 57UL,  91UL,  134UL, 227UL, 45UL,  7UL,   20UL,  198UL, 181UL, 38UL,  239UL, 125UL,
+    16UL,  183UL, 79UL,  202UL, 27UL,  156UL, 76UL,  104UL, 145UL, 204UL, 240UL, 151UL, 160UL, 244UL, 1UL,   117UL,
+    73UL,  209UL, 166UL, 64UL,  172UL, 99UL,  229UL, 5UL,   150UL, 248UL, 140UL, 212UL, 245UL, 54UL,  170UL, 254UL,
+    253UL, 56UL,  78UL,  182UL, 169UL, 240UL, 1UL,   135UL, 128UL, 123UL, 99UL,  81UL,  140UL, 231UL, 82UL,  221UL,
+    7UL,   95UL,  174UL, 230UL, 131UL, 229UL, 196UL, 129UL, 15UL,  199UL, 191UL, 45UL,  45UL,  206UL, 186UL, 234UL,
+    111UL, 8UL,   162UL, 211UL, 77UL,  15UL,  103UL, 106UL, 118UL, 206UL, 202UL, 62UL,  2UL,   3UL,   161UL, 204UL,
+    136UL, 47UL,  101UL, 25UL,  209UL, 168UL, 89UL,  60UL,  99UL,  15UL,  223UL, 181UL, 145UL, 16UL,  217UL, 254UL,
+    207UL, 173UL, 85UL,  10UL,  226UL, 2UL,   26UL,  80UL,  96UL,  74UL,  68UL,  122UL, 52UL,  186UL, 16UL,  115UL,
+};
+uint32_t rand_arr_9_b9_w32_arr[1024] = {
+    441UL, 495UL, 229UL, 362UL, 40UL,  307UL, 376UL, 172UL, 476UL, 432UL, 325UL, 182UL, 75UL,  257UL, 462UL, 460UL,
+    339UL, 472UL, 58UL,  238UL, 326UL, 24UL,  187UL, 302UL, 267UL, 121UL, 441UL, 491UL, 191UL, 309UL, 86UL,  189UL,
+    509UL, 428UL, 30UL,  64UL,  428UL, 377UL, 195UL, 134UL, 236UL, 316UL, 505UL, 439UL, 59UL,  245UL, 71UL,  148UL,
+    126UL, 109UL, 127UL, 234UL, 510UL, 368UL, 472UL, 212UL, 138UL, 268UL, 381UL, 399UL, 467UL, 193UL, 255UL, 511UL,
+    285UL, 315UL, 419UL, 64UL,  131UL, 458UL, 468UL, 351UL, 122UL, 337UL, 154UL, 60UL,  256UL, 50UL,  57UL,  162UL,
+    469UL, 418UL, 64UL,  462UL, 24UL,  399UL, 313UL, 185UL, 162UL, 25UL,  281UL, 109UL, 375UL, 308UL, 166UL, 471UL,
+    167UL, 123UL, 428UL, 135UL, 356UL, 445UL, 91UL,  32UL,  28UL,  181UL, 52UL,  496UL, 347UL, 317UL, 427UL, 492UL,
+    477UL, 7UL,   432UL, 114UL, 288UL, 489UL, 288UL, 319UL, 253UL, 341UL, 19UL,  62UL,  125UL, 16UL,  237UL, 34UL,
+    483UL, 283UL, 154UL, 318UL, 270UL, 503UL, 164UL, 100UL, 436UL, 318UL, 396UL, 420UL, 147UL, 268UL, 83UL,  458UL,
+    432UL, 192UL, 239UL, 84UL,  413UL, 173UL, 369UL, 162UL, 460UL, 226UL, 463UL, 80UL,  249UL, 15UL,  485UL, 439UL,
+    279UL, 263UL, 202UL, 99UL,  407UL, 217UL, 221UL, 312UL, 402UL, 312UL, 408UL, 339UL, 226UL, 313UL, 116UL, 100UL,
+    415UL, 81UL,  470UL, 85UL,  314UL, 431UL, 348UL, 449UL, 428UL, 126UL, 368UL, 438UL, 455UL, 376UL, 483UL, 156UL,
+    429UL, 414UL, 403UL, 356UL, 71UL,  471UL, 166UL, 196UL, 396UL, 118UL, 179UL, 152UL, 378UL, 422UL, 464UL, 368UL,
+    36UL,  272UL, 187UL, 73UL,  143UL, 361UL, 62UL,  7UL,   257UL, 340UL, 372UL, 66UL,  478UL, 482UL, 165UL, 6UL,
+    89UL,  181UL, 500UL, 260UL, 200UL, 333UL, 472UL, 3UL,   457UL, 73UL,  70UL,  284UL, 315UL, 120UL, 481UL, 247UL,
+    325UL, 246UL, 144UL, 153UL, 231UL, 353UL, 328UL, 406UL, 164UL, 502UL, 59UL,  326UL, 188UL, 218UL, 369UL, 57UL,
+    312UL, 141UL, 395UL, 86UL,  29UL,  248UL, 31UL,  387UL, 309UL, 356UL, 227UL, 57UL,  394UL, 185UL, 167UL, 131UL,
+    487UL, 365UL, 71UL,  367UL, 502UL, 100UL, 498UL, 427UL, 433UL, 63UL,  142UL, 134UL, 464UL, 293UL, 285UL, 104UL,
+    292UL, 209UL, 180UL, 501UL, 476UL, 205UL, 24UL,  491UL, 188UL, 116UL, 225UL, 107UL, 362UL, 328UL, 257UL, 36UL,
+    397UL, 487UL, 109UL, 146UL, 323UL, 254UL, 495UL, 172UL, 333UL, 50UL,  260UL, 264UL, 69UL,  128UL, 331UL, 43UL,
+    408UL, 229UL, 454UL, 35UL,  310UL, 5UL,   54UL,  379UL, 222UL, 92UL,  299UL, 94UL,  80UL,  294UL, 472UL, 457UL,
+    450UL, 353UL, 280UL, 511UL, 483UL, 420UL, 287UL, 47UL,  117UL, 368UL, 213UL, 423UL, 167UL, 328UL, 313UL, 190UL,
+    288UL, 20UL,  350UL, 478UL, 49UL,  99UL,  136UL, 438UL, 461UL, 269UL, 115UL, 235UL, 334UL, 378UL, 4UL,   119UL,
+    113UL, 49UL,  111UL, 441UL, 382UL, 110UL, 47UL,  325UL, 54UL,  67UL,  259UL, 297UL, 30UL,  480UL, 168UL, 334UL,
+    457UL, 254UL, 454UL, 107UL, 154UL, 27UL,  508UL, 91UL,  262UL, 384UL, 429UL, 143UL, 264UL, 335UL, 94UL,  176UL,
+    143UL, 424UL, 148UL, 419UL, 374UL, 398UL, 429UL, 175UL, 461UL, 87UL,  433UL, 184UL, 251UL, 0UL,   367UL, 363UL,
+    212UL, 466UL, 377UL, 205UL, 338UL, 358UL, 64UL,  68UL,  358UL, 175UL, 274UL, 432UL, 143UL, 293UL, 352UL, 264UL,
+    354UL, 234UL, 121UL, 458UL, 357UL, 315UL, 102UL, 98UL,  306UL, 295UL, 24UL,  372UL, 479UL, 45UL,  271UL, 440UL,
+    170UL, 396UL, 264UL, 442UL, 459UL, 437UL, 140UL, 302UL, 210UL, 65UL,  436UL, 159UL, 441UL, 244UL, 487UL, 21UL,
+    442UL, 70UL,  401UL, 154UL, 363UL, 419UL, 474UL, 130UL, 217UL, 143UL, 45UL,  136UL, 313UL, 299UL, 352UL, 83UL,
+    393UL, 511UL, 272UL, 416UL, 488UL, 253UL, 67UL,  484UL, 469UL, 312UL, 44UL,  340UL, 273UL, 424UL, 322UL, 170UL,
+    299UL, 301UL, 123UL, 372UL, 148UL, 379UL, 487UL, 284UL, 416UL, 382UL, 29UL,  17UL,  81UL,  240UL, 217UL, 312UL,
+    509UL, 267UL, 71UL,  499UL, 255UL, 264UL, 417UL, 399UL, 246UL, 118UL, 75UL,  72UL,  416UL, 324UL, 86UL,  460UL,
+    75UL,  335UL, 77UL,  43UL,  345UL, 389UL, 302UL, 386UL, 188UL, 260UL, 181UL, 452UL, 366UL, 207UL, 407UL, 195UL,
+    383UL, 246UL, 6UL,   340UL, 424UL, 287UL, 137UL, 127UL, 502UL, 332UL, 142UL, 266UL, 170UL, 488UL, 289UL, 61UL,
+    484UL, 8UL,   508UL, 252UL, 160UL, 458UL, 153UL, 303UL, 447UL, 243UL, 337UL, 114UL, 121UL, 368UL, 291UL, 220UL,
+    324UL, 302UL, 503UL, 247UL, 34UL,  285UL, 501UL, 92UL,  127UL, 460UL, 14UL,  103UL, 66UL,  418UL, 191UL, 150UL,
+    298UL, 89UL,  191UL, 175UL, 408UL, 24UL,  129UL, 14UL,  83UL,  22UL,  250UL, 72UL,  121UL, 246UL, 140UL, 406UL,
+    363UL, 435UL, 18UL,  219UL, 173UL, 306UL, 485UL, 328UL, 384UL, 276UL, 291UL, 255UL, 333UL, 375UL, 125UL, 169UL,
+    343UL, 417UL, 51UL,  452UL, 490UL, 91UL,  379UL, 183UL, 92UL,  62UL,  197UL, 370UL, 394UL, 133UL, 359UL, 271UL,
+    111UL, 95UL,  240UL, 108UL, 31UL,  109UL, 170UL, 431UL, 255UL, 23UL,  355UL, 248UL, 87UL,  350UL, 55UL,  107UL,
+    104UL, 279UL, 49UL,  212UL, 305UL, 70UL,  15UL,  74UL,  190UL, 361UL, 435UL, 23UL,  120UL, 62UL,  332UL, 7UL,
+    441UL, 35UL,  148UL, 272UL, 103UL, 188UL, 136UL, 283UL, 137UL, 465UL, 272UL, 115UL, 494UL, 222UL, 202UL, 98UL,
+    433UL, 42UL,  152UL, 320UL, 204UL, 355UL, 14UL,  422UL, 345UL, 319UL, 172UL, 273UL, 241UL, 143UL, 31UL,  413UL,
+    257UL, 184UL, 475UL, 265UL, 290UL, 155UL, 503UL, 236UL, 374UL, 490UL, 430UL, 124UL, 318UL, 75UL,  72UL,  156UL,
+    275UL, 186UL, 246UL, 25UL,  370UL, 492UL, 451UL, 251UL, 241UL, 167UL, 59UL,  202UL, 339UL, 9UL,   141UL, 120UL,
+    511UL, 327UL, 137UL, 114UL, 267UL, 378UL, 162UL, 159UL, 426UL, 4UL,   93UL,  195UL, 214UL, 22UL,  62UL,  319UL,
+    458UL, 163UL, 9UL,   61UL,  164UL, 7UL,   2UL,   508UL, 50UL,  77UL,  496UL, 65UL,  111UL, 299UL, 239UL, 339UL,
+    364UL, 344UL, 473UL, 415UL, 93UL,  282UL, 139UL, 217UL, 234UL, 393UL, 129UL, 305UL, 223UL, 13UL,  120UL, 496UL,
+    218UL, 339UL, 139UL, 72UL,  319UL, 21UL,  117UL, 164UL, 177UL, 152UL, 202UL, 506UL, 190UL, 171UL, 430UL, 168UL,
+    117UL, 178UL, 434UL, 509UL, 472UL, 95UL,  325UL, 82UL,  324UL, 446UL, 142UL, 100UL, 137UL, 474UL, 316UL, 403UL,
+    176UL, 381UL, 27UL,  186UL, 140UL, 243UL, 398UL, 257UL, 56UL,  256UL, 317UL, 298UL, 80UL,  348UL, 145UL, 2UL,
+    71UL,  418UL, 245UL, 19UL,  325UL, 272UL, 203UL, 405UL, 14UL,  7UL,   393UL, 4UL,   358UL, 124UL, 185UL, 424UL,
+    88UL,  260UL, 457UL, 141UL, 415UL, 395UL, 447UL, 399UL, 151UL, 333UL, 56UL,  403UL, 143UL, 301UL, 51UL,  334UL,
+    267UL, 251UL, 256UL, 346UL, 388UL, 490UL, 400UL, 304UL, 218UL, 433UL, 313UL, 24UL,  214UL, 135UL, 95UL,  99UL,
+    466UL, 54UL,  368UL, 123UL, 238UL, 22UL,  170UL, 23UL,  315UL, 248UL, 188UL, 187UL, 345UL, 189UL, 460UL, 176UL,
+    442UL, 183UL, 336UL, 438UL, 434UL, 76UL,  49UL,  80UL,  114UL, 470UL, 81UL,  27UL,  201UL, 183UL, 251UL, 181UL,
+    21UL,  444UL, 352UL, 187UL, 160UL, 0UL,   417UL, 187UL, 175UL, 111UL, 407UL, 52UL,  278UL, 251UL, 326UL, 461UL,
+    480UL, 19UL,  420UL, 384UL, 480UL, 110UL, 262UL, 441UL, 471UL, 309UL, 335UL, 245UL, 329UL, 422UL, 376UL, 223UL,
+    270UL, 287UL, 59UL,  21UL,  299UL, 420UL, 144UL, 274UL, 13UL,  426UL, 474UL, 261UL, 154UL, 39UL,  370UL, 246UL,
+    124UL, 232UL, 147UL, 469UL, 345UL, 110UL, 247UL, 387UL, 322UL, 22UL,  137UL, 448UL, 92UL,  237UL, 117UL, 101UL,
+    23UL,  218UL, 31UL,  75UL,  211UL, 509UL, 89UL,  470UL, 242UL, 347UL, 483UL, 240UL, 168UL, 326UL, 44UL,  116UL,
+    3UL,   480UL, 335UL, 433UL, 43UL,  363UL, 81UL,  270UL, 264UL, 345UL, 145UL, 489UL, 82UL,  123UL, 415UL, 227UL,
+    151UL, 190UL, 473UL, 426UL, 261UL, 80UL,  14UL,  494UL, 160UL, 19UL,  336UL, 373UL, 147UL, 133UL, 107UL, 151UL,
+};
+uint32_t rand_arr_10_b10_w32_arr[1024] = {
+    928UL,  300UL, 244UL,  895UL, 229UL,  229UL,  132UL,  698UL, 340UL,  756UL,  787UL,  842UL,  547UL,  611UL,  33UL,
+    439UL,  269UL, 486UL,  163UL, 938UL,  729UL,  785UL,  614UL, 964UL,  52UL,   802UL,  887UL,  730UL,  1009UL, 847UL,
+    776UL,  559UL, 28UL,   18UL,  616UL,  1002UL, 510UL,  480UL, 135UL,  435UL,  427UL,  120UL,  668UL,  883UL,  233UL,
+    285UL,  571UL, 409UL,  293UL, 44UL,   542UL,  180UL,  728UL, 625UL,  702UL,  427UL,  171UL,  33UL,   494UL,  1021UL,
+    628UL,  24UL,  873UL,  371UL, 180UL,  505UL,  17UL,   44UL,  324UL,  60UL,   472UL,  946UL,  613UL,  745UL,  989UL,
+    1010UL, 470UL, 16UL,   302UL, 374UL,  516UL,  652UL,  609UL, 728UL,  991UL,  928UL,  26UL,   938UL,  725UL,  985UL,
+    553UL,  710UL, 10UL,   701UL, 376UL,  242UL,  342UL,  693UL, 694UL,  352UL,  633UL,  913UL,  287UL,  317UL,  555UL,
+    69UL,   120UL, 95UL,   852UL, 709UL,  846UL,  297UL,  989UL, 931UL,  846UL,  814UL,  7UL,    95UL,   269UL,  229UL,
+    539UL,  387UL, 134UL,  956UL, 37UL,   563UL,  393UL,  410UL, 634UL,  594UL,  207UL,  923UL,  329UL,  382UL,  416UL,
+    420UL,  100UL, 367UL,  659UL, 5UL,    7UL,    993UL,  695UL, 544UL,  644UL,  786UL,  166UL,  984UL,  823UL,  775UL,
+    961UL,  228UL, 297UL,  969UL, 937UL,  867UL,  359UL,  912UL, 922UL,  195UL,  609UL,  488UL,  73UL,   499UL,  188UL,
+    531UL,  135UL, 371UL,  337UL, 923UL,  791UL,  227UL,  332UL, 312UL,  274UL,  853UL,  513UL,  723UL,  540UL,  758UL,
+    311UL,  770UL, 221UL,  198UL, 276UL,  595UL,  364UL,  826UL, 17UL,   951UL,  788UL,  76UL,   135UL,  605UL,  740UL,
+    272UL,  39UL,  319UL,  259UL, 582UL,  589UL,  10UL,   130UL, 384UL,  602UL,  186UL,  458UL,  922UL,  514UL,  788UL,
+    337UL,  897UL, 827UL,  313UL, 511UL,  107UL,  995UL,  658UL, 9UL,    673UL,  110UL,  750UL,  782UL,  626UL,  837UL,
+    22UL,   400UL, 976UL,  884UL, 114UL,  978UL,  716UL,  198UL, 348UL,  887UL,  853UL,  657UL,  167UL,  599UL,  329UL,
+    664UL,  212UL, 56UL,   845UL, 818UL,  186UL,  198UL,  661UL, 108UL,  21UL,   663UL,  836UL,  230UL,  667UL,  39UL,
+    597UL,  32UL,  452UL,  668UL, 577UL,  546UL,  771UL,  598UL, 911UL,  4UL,    414UL,  304UL,  66UL,   319UL,  438UL,
+    350UL,  467UL, 463UL,  668UL, 295UL,  907UL,  414UL,  587UL, 931UL,  816UL,  629UL,  795UL,  1022UL, 129UL,  200UL,
+    490UL,  770UL, 185UL,  287UL, 769UL,  837UL,  90UL,   291UL, 532UL,  760UL,  435UL,  266UL,  840UL,  186UL,  340UL,
+    499UL,  158UL, 298UL,  297UL, 271UL,  133UL,  364UL,  6UL,   969UL,  786UL,  805UL,  980UL,  501UL,  932UL,  653UL,
+    263UL,  389UL, 751UL,  522UL, 238UL,  698UL,  576UL,  213UL, 162UL,  212UL,  829UL,  428UL,  610UL,  274UL,  484UL,
+    579UL,  19UL,  374UL,  570UL, 53UL,   497UL,  605UL,  831UL, 493UL,  962UL,  194UL,  685UL,  837UL,  324UL,  870UL,
+    193UL,  400UL, 554UL,  116UL, 717UL,  934UL,  197UL,  408UL, 602UL,  453UL,  978UL,  688UL,  395UL,  301UL,  700UL,
+    625UL,  212UL, 822UL,  374UL, 892UL,  693UL,  1004UL, 148UL, 76UL,   338UL,  294UL,  716UL,  456UL,  170UL,  549UL,
+    831UL,  718UL, 527UL,  751UL, 292UL,  547UL,  207UL,  893UL, 690UL,  354UL,  471UL,  260UL,  342UL,  809UL,  183UL,
+    17UL,   32UL,  999UL,  634UL, 868UL,  716UL,  834UL,  276UL, 850UL,  1014UL, 748UL,  982UL,  496UL,  97UL,   368UL,
+    68UL,   690UL, 622UL,  178UL, 170UL,  160UL,  971UL,  378UL, 709UL,  227UL,  920UL,  243UL,  568UL,  344UL,  235UL,
+    873UL,  550UL, 13UL,   47UL,  672UL,  413UL,  982UL,  638UL, 178UL,  877UL,  402UL,  763UL,  173UL,  607UL,  715UL,
+    211UL,  520UL, 355UL,  455UL, 243UL,  650UL,  344UL,  15UL,  292UL,  973UL,  573UL,  874UL,  861UL,  309UL,  436UL,
+    953UL,  853UL, 445UL,  440UL, 386UL,  718UL,  967UL,  127UL, 966UL,  845UL,  302UL,  556UL,  4UL,    619UL,  1002UL,
+    830UL,  453UL, 781UL,  310UL, 20UL,   838UL,  696UL,  404UL, 215UL,  480UL,  485UL,  745UL,  771UL,  20UL,   314UL,
+    298UL,  828UL, 378UL,  662UL, 258UL,  299UL,  432UL,  81UL,  518UL,  841UL,  295UL,  553UL,  238UL,  66UL,   621UL,
+    422UL,  222UL, 952UL,  522UL, 417UL,  1013UL, 574UL,  654UL, 55UL,   666UL,  611UL,  1013UL, 450UL,  544UL,  865UL,
+    100UL,  801UL, 380UL,  126UL, 92UL,   647UL,  865UL,  555UL, 232UL,  917UL,  985UL,  270UL,  509UL,  469UL,  1003UL,
+    560UL,  997UL, 724UL,  667UL, 25UL,   234UL,  830UL,  10UL,  399UL,  340UL,  324UL,  825UL,  222UL,  975UL,  723UL,
+    292UL,  51UL,  166UL,  525UL, 769UL,  452UL,  104UL,  503UL, 380UL,  620UL,  573UL,  750UL,  396UL,  763UL,  749UL,
+    735UL,  380UL, 996UL,  806UL, 468UL,  1009UL, 466UL,  440UL, 83UL,   721UL,  890UL,  564UL,  319UL,  469UL,  210UL,
+    140UL,  58UL,  811UL,  226UL, 673UL,  424UL,  955UL,  568UL, 967UL,  825UL,  874UL,  727UL,  407UL,  83UL,   918UL,
+    548UL,  58UL,  605UL,  277UL, 939UL,  803UL,  866UL,  957UL, 555UL,  476UL,  1010UL, 121UL,  819UL,  369UL,  289UL,
+    144UL,  158UL, 643UL,  841UL, 132UL,  70UL,   635UL,  397UL, 622UL,  292UL,  146UL,  891UL,  251UL,  269UL,  393UL,
+    299UL,  650UL, 363UL,  163UL, 671UL,  741UL,  749UL,  212UL, 736UL,  474UL,  586UL,  143UL,  855UL,  908UL,  311UL,
+    809UL,  865UL, 646UL,  48UL,  879UL,  198UL,  815UL,  723UL, 391UL,  961UL,  46UL,   340UL,  689UL,  677UL,  250UL,
+    832UL,  839UL, 829UL,  172UL, 109UL,  302UL,  920UL,  62UL,  322UL,  855UL,  912UL,  492UL,  149UL,  191UL,  695UL,
+    16UL,   756UL, 347UL,  908UL, 745UL,  137UL,  387UL,  421UL, 126UL,  860UL,  802UL,  573UL,  375UL,  997UL,  715UL,
+    778UL,  50UL,  339UL,  447UL, 369UL,  152UL,  233UL,  434UL, 12UL,   631UL,  413UL,  964UL,  884UL,  63UL,   77UL,
+    826UL,  458UL, 645UL,  399UL, 367UL,  65UL,   534UL,  791UL, 457UL,  838UL,  477UL,  795UL,  461UL,  344UL,  624UL,
+    918UL,  474UL, 990UL,  17UL,  642UL,  444UL,  486UL,  19UL,  942UL,  328UL,  734UL,  580UL,  67UL,   640UL,  3UL,
+    7UL,    927UL, 123UL,  804UL, 164UL,  44UL,   945UL,  315UL, 349UL,  905UL,  908UL,  230UL,  259UL,  294UL,  853UL,
+    118UL,  755UL, 728UL,  189UL, 904UL,  963UL,  200UL,  18UL,  308UL,  42UL,   288UL,  296UL,  51UL,   7UL,    983UL,
+    485UL,  108UL, 845UL,  558UL, 570UL,  135UL,  296UL,  850UL, 814UL,  379UL,  1015UL, 528UL,  738UL,  138UL,  575UL,
+    287UL,  547UL, 479UL,  272UL, 809UL,  761UL,  333UL,  487UL, 306UL,  757UL,  152UL,  361UL,  235UL,  965UL,  671UL,
+    113UL,  899UL, 420UL,  637UL, 688UL,  79UL,   905UL,  634UL, 953UL,  867UL,  974UL,  491UL,  177UL,  616UL,  506UL,
+    346UL,  618UL, 319UL,  911UL, 419UL,  892UL,  749UL,  20UL,  844UL,  337UL,  65UL,   815UL,  105UL,  492UL,  550UL,
+    542UL,  82UL,  64UL,   205UL, 754UL,  404UL,  257UL,  471UL, 244UL,  532UL,  411UL,  265UL,  722UL,  1020UL, 701UL,
+    925UL,  230UL, 835UL,  943UL, 875UL,  763UL,  397UL,  419UL, 695UL,  703UL,  496UL,  111UL,  174UL,  550UL,  339UL,
+    893UL,  851UL, 832UL,  968UL, 63UL,   547UL,  349UL,  467UL, 198UL,  257UL,  109UL,  710UL,  289UL,  800UL,  812UL,
+    516UL,  456UL, 585UL,  326UL, 488UL,  255UL,  450UL,  34UL,  64UL,   459UL,  623UL,  14UL,   417UL,  696UL,  917UL,
+    154UL,  266UL, 510UL,  210UL, 518UL,  934UL,  294UL,  911UL, 1023UL, 249UL,  488UL,  994UL,  2UL,    717UL,  603UL,
+    517UL,  459UL, 757UL,  940UL, 476UL,  825UL,  236UL,  511UL, 908UL,  83UL,   746UL,  29UL,   251UL,  749UL,  893UL,
+    576UL,  450UL, 351UL,  194UL, 1004UL, 717UL,  18UL,   147UL, 443UL,  399UL,  70UL,   785UL,  697UL,  1013UL, 547UL,
+    988UL,  851UL, 837UL,  199UL, 365UL,  584UL,  306UL,  256UL, 216UL,  150UL,  60UL,   76UL,   812UL,  604UL,  645UL,
+    4UL,    987UL, 131UL,  513UL, 337UL,  524UL,  692UL,  781UL, 115UL,  596UL,  659UL,  550UL,  898UL,  824UL,  259UL,
+    387UL,  929UL, 1022UL, 606UL, 380UL,  42UL,   392UL,  836UL, 1012UL, 187UL,  401UL,  510UL,  53UL,   538UL,  103UL,
+    227UL,  64UL,  333UL,  654UL, 716UL,  729UL,  365UL,  259UL, 951UL,  397UL,  50UL,   689UL,  621UL,  442UL,  52UL,
+    708UL,  596UL, 833UL,  98UL,  511UL,  785UL,  205UL,  49UL,  891UL,  634UL,  16UL,   307UL,  825UL,  348UL,  288UL,
+    873UL,  884UL, 798UL,  723UL, 903UL,  1022UL, 670UL,  803UL, 709UL,  459UL,  594UL,  871UL,  9UL,    808UL,  723UL,
+    535UL,  815UL, 696UL,  955UL, 715UL,  942UL,  59UL,   183UL, 917UL,  544UL,  726UL,  562UL,  649UL,  361UL,  863UL,
+    700UL,  828UL, 550UL,  828UL,
+};
+uint32_t rand_arr_11_b11_w32_arr[1024] = {
+    455UL,  1083UL, 678UL,  1075UL, 1646UL, 160UL,  415UL,  1084UL, 1015UL, 781UL,  395UL,  1932UL, 1474UL, 1595UL,
+    634UL,  791UL,  1645UL, 367UL,  1078UL, 1913UL, 801UL,  230UL,  1326UL, 1949UL, 1402UL, 202UL,  1179UL, 1925UL,
+    127UL,  969UL,  1342UL, 1165UL, 606UL,  24UL,   736UL,  265UL,  782UL,  1174UL, 1807UL, 840UL,  801UL,  1934UL,
+    170UL,  1102UL, 1108UL, 1287UL, 1679UL, 77UL,   1902UL, 326UL,  1043UL, 1645UL, 948UL,  1208UL, 828UL,  311UL,
+    399UL,  752UL,  294UL,  1806UL, 1946UL, 1666UL, 1013UL, 2042UL, 1238UL, 1567UL, 268UL,  856UL,  1342UL, 1990UL,
+    1693UL, 1480UL, 83UL,   332UL,  616UL,  865UL,  1347UL, 497UL,  1258UL, 1564UL, 743UL,  21UL,   1066UL, 383UL,
+    927UL,  1913UL, 108UL,  1906UL, 406UL,  1209UL, 1155UL, 182UL,  1405UL, 313UL,  222UL,  664UL,  1606UL, 1275UL,
+    1192UL, 1555UL, 226UL,  1740UL, 805UL,  2032UL, 898UL,  133UL,  253UL,  958UL,  678UL,  562UL,  1654UL, 1571UL,
+    1721UL, 1644UL, 845UL,  653UL,  431UL,  661UL,  1077UL, 704UL,  1569UL, 1265UL, 1926UL, 307UL,  1394UL, 1227UL,
+    1394UL, 376UL,  1808UL, 2015UL, 1875UL, 745UL,  490UL,  456UL,  197UL,  749UL,  1031UL, 694UL,  255UL,  133UL,
+    339UL,  755UL,  635UL,  218UL,  900UL,  1922UL, 1319UL, 1222UL, 337UL,  1654UL, 1607UL, 1052UL, 1276UL, 1363UL,
+    1007UL, 146UL,  47UL,   2040UL, 1934UL, 2021UL, 1404UL, 29UL,   62UL,   996UL,  244UL,  213UL,  336UL,  932UL,
+    2013UL, 1357UL, 1764UL, 951UL,  171UL,  849UL,  2037UL, 1133UL, 679UL,  709UL,  265UL,  1315UL, 755UL,  1128UL,
+    1845UL, 322UL,  1408UL, 1106UL, 1051UL, 475UL,  1876UL, 503UL,  1437UL, 301UL,  1373UL, 432UL,  1600UL, 1399UL,
+    2010UL, 1444UL, 237UL,  1677UL, 865UL,  670UL,  1796UL, 1973UL, 444UL,  323UL,  1109UL, 1775UL, 1280UL, 1582UL,
+    1331UL, 1191UL, 480UL,  1989UL, 1171UL, 287UL,  1805UL, 1754UL, 1680UL, 51UL,   1174UL, 1216UL, 30UL,   116UL,
+    962UL,  428UL,  1108UL, 532UL,  1433UL, 1187UL, 1333UL, 657UL,  197UL,  963UL,  1564UL, 672UL,  1038UL, 2008UL,
+    1814UL, 1178UL, 838UL,  977UL,  749UL,  485UL,  1704UL, 1541UL, 690UL,  38UL,   959UL,  608UL,  456UL,  1146UL,
+    289UL,  932UL,  390UL,  1709UL, 1503UL, 1805UL, 1298UL, 381UL,  1437UL, 1316UL, 1871UL, 302UL,  1690UL, 33UL,
+    1774UL, 118UL,  569UL,  32UL,   1225UL, 1670UL, 1340UL, 538UL,  169UL,  1655UL, 620UL,  109UL,  602UL,  402UL,
+    1278UL, 1956UL, 644UL,  1493UL, 1239UL, 132UL,  1307UL, 118UL,  1766UL, 1248UL, 691UL,  288UL,  475UL,  445UL,
+    1380UL, 1944UL, 1800UL, 789UL,  51UL,   985UL,  90UL,   423UL,  20UL,   1086UL, 1197UL, 1412UL, 620UL,  674UL,
+    1914UL, 609UL,  1375UL, 1371UL, 1227UL, 1821UL, 1904UL, 1064UL, 532UL,  108UL,  1745UL, 2028UL, 160UL,  335UL,
+    629UL,  1373UL, 1232UL, 1369UL, 277UL,  2021UL, 1191UL, 1846UL, 349UL,  878UL,  383UL,  160UL,  545UL,  1682UL,
+    1713UL, 1018UL, 1346UL, 1743UL, 1133UL, 78UL,   37UL,   11UL,   860UL,  336UL,  766UL,  971UL,  1381UL, 452UL,
+    1558UL, 1899UL, 1999UL, 1550UL, 458UL,  460UL,  421UL,  395UL,  945UL,  1633UL, 1161UL, 134UL,  877UL,  574UL,
+    1728UL, 1107UL, 1974UL, 1762UL, 1465UL, 413UL,  1372UL, 308UL,  724UL,  423UL,  1904UL, 24UL,   954UL,  1073UL,
+    934UL,  54UL,   415UL,  632UL,  1542UL, 2009UL, 1344UL, 752UL,  1141UL, 2010UL, 341UL,  335UL,  1051UL, 1707UL,
+    1812UL, 1050UL, 1348UL, 679UL,  1028UL, 1394UL, 1020UL, 1642UL, 1681UL, 1734UL, 54UL,   1550UL, 1302UL, 1726UL,
+    1157UL, 505UL,  89UL,   496UL,  49UL,   534UL,  1764UL, 1963UL, 1080UL, 1255UL, 1967UL, 16UL,   799UL,  1994UL,
+    1117UL, 97UL,   1903UL, 1807UL, 557UL,  268UL,  21UL,   892UL,  2043UL, 1704UL, 1885UL, 1583UL, 618UL,  1060UL,
+    1843UL, 1088UL, 1288UL, 942UL,  1478UL, 1433UL, 173UL,  575UL,  1034UL, 1969UL, 218UL,  805UL,  1743UL, 456UL,
+    303UL,  623UL,  1468UL, 1655UL, 1345UL, 1098UL, 337UL,  701UL,  1459UL, 16UL,   1194UL, 1648UL, 676UL,  1460UL,
+    399UL,  516UL,  262UL,  1899UL, 1526UL, 978UL,  324UL,  1367UL, 1114UL, 27UL,   1434UL, 453UL,  903UL,  233UL,
+    1551UL, 706UL,  267UL,  1206UL, 764UL,  870UL,  1841UL, 1080UL, 1208UL, 841UL,  994UL,  98UL,   1065UL, 1305UL,
+    1664UL, 209UL,  232UL,  601UL,  159UL,  890UL,  1102UL, 1760UL, 1136UL, 909UL,  836UL,  1775UL, 1533UL, 548UL,
+    877UL,  485UL,  1278UL, 1657UL, 1914UL, 1801UL, 1427UL, 291UL,  1698UL, 1720UL, 720UL,  1182UL, 1864UL, 1819UL,
+    956UL,  1838UL, 887UL,  1092UL, 954UL,  1230UL, 144UL,  1701UL, 685UL,  965UL,  1998UL, 1735UL, 2013UL, 1411UL,
+    886UL,  1270UL, 134UL,  1930UL, 355UL,  1179UL, 1975UL, 344UL,  1479UL, 1962UL, 562UL,  1029UL, 376UL,  1308UL,
+    1900UL, 829UL,  639UL,  1758UL, 1917UL, 1707UL, 698UL,  1355UL, 613UL,  942UL,  1197UL, 865UL,  1905UL, 836UL,
+    1656UL, 1826UL, 1507UL, 966UL,  1584UL, 1219UL, 1951UL, 1179UL, 466UL,  1733UL, 85UL,   1394UL, 803UL,  1546UL,
+    1681UL, 650UL,  898UL,  1516UL, 488UL,  1592UL, 1155UL, 1055UL, 1369UL, 524UL,  686UL,  490UL,  1391UL, 198UL,
+    1314UL, 7UL,    758UL,  395UL,  157UL,  987UL,  732UL,  1197UL, 1987UL, 1275UL, 1445UL, 376UL,  704UL,  330UL,
+    1841UL, 965UL,  428UL,  49UL,   1960UL, 1001UL, 46UL,   1161UL, 955UL,  1408UL, 1937UL, 1240UL, 819UL,  609UL,
+    1261UL, 475UL,  1629UL, 214UL,  436UL,  1952UL, 2009UL, 1494UL, 1526UL, 436UL,  1924UL, 818UL,  1601UL, 990UL,
+    1402UL, 1137UL, 893UL,  828UL,  511UL,  2021UL, 1150UL, 2006UL, 1104UL, 1355UL, 821UL,  565UL,  57UL,   562UL,
+    1137UL, 1029UL, 701UL,  1381UL, 1657UL, 2004UL, 249UL,  1329UL, 1167UL, 273UL,  1428UL, 1118UL, 1075UL, 1918UL,
+    608UL,  908UL,  1809UL, 1638UL, 201UL,  527UL,  355UL,  731UL,  558UL,  1012UL, 1315UL, 528UL,  1538UL, 581UL,
+    1553UL, 435UL,  295UL,  1407UL, 950UL,  805UL,  94UL,   1800UL, 1993UL, 1292UL, 515UL,  744UL,  942UL,  1509UL,
+    1397UL, 1857UL, 361UL,  1007UL, 1027UL, 1591UL, 1950UL, 1339UL, 1134UL, 966UL,  1765UL, 1047UL, 16UL,   1226UL,
+    1318UL, 748UL,  653UL,  1674UL, 1225UL, 1730UL, 1109UL, 195UL,  301UL,  100UL,  1667UL, 1343UL, 917UL,  354UL,
+    1062UL, 1221UL, 584UL,  1934UL, 1563UL, 1844UL, 1702UL, 1458UL, 1405UL, 1710UL, 45UL,   1076UL, 1894UL, 1966UL,
+    742UL,  168UL,  18UL,   2012UL, 12UL,   843UL,  383UL,  150UL,  1799UL, 886UL,  1962UL, 1669UL, 1503UL, 1681UL,
+    1307UL, 1382UL, 234UL,  502UL,  1914UL, 1389UL, 1394UL, 1794UL, 1075UL, 763UL,  38UL,   1261UL, 1498UL, 839UL,
+    471UL,  311UL,  575UL,  791UL,  1419UL, 2033UL, 1525UL, 470UL,  281UL,  191UL,  1577UL, 1022UL, 1656UL, 172UL,
+    833UL,  1789UL, 1413UL, 1142UL, 803UL,  1696UL, 1112UL, 409UL,  1762UL, 1965UL, 141UL,  830UL,  622UL,  287UL,
+    183UL,  1014UL, 108UL,  1766UL, 1100UL, 280UL,  1039UL, 707UL,  1997UL, 1070UL, 17UL,   662UL,  509UL,  212UL,
+    1388UL, 1776UL, 1150UL, 330UL,  1231UL, 1533UL, 60UL,   1307UL, 503UL,  318UL,  1655UL, 832UL,  772UL,  1145UL,
+    1395UL, 1611UL, 1283UL, 546UL,  998UL,  1633UL, 195UL,  2018UL, 1957UL, 565UL,  1339UL, 1763UL, 196UL,  1425UL,
+    1411UL, 800UL,  2000UL, 652UL,  143UL,  397UL,  1151UL, 1873UL, 1569UL, 92UL,   1281UL, 862UL,  1372UL, 770UL,
+    850UL,  311UL,  1958UL, 1282UL, 577UL,  5UL,    2007UL, 1905UL, 939UL,  1732UL, 332UL,  217UL,  131UL,  795UL,
+    1762UL, 17UL,   2032UL, 226UL,  114UL,  319UL,  459UL,  2012UL, 1367UL, 1633UL, 139UL,  1831UL, 511UL,  589UL,
+    501UL,  1185UL, 1228UL, 1690UL, 536UL,  1512UL, 893UL,  676UL,  814UL,  595UL,  1590UL, 1141UL, 1425UL, 717UL,
+    1218UL, 1888UL, 1870UL, 1560UL, 945UL,  1800UL, 1664UL, 1455UL, 346UL,  370UL,  1696UL, 440UL,  351UL,  1600UL,
+    718UL,  222UL,  1865UL, 1597UL, 708UL,  1433UL, 708UL,  1398UL, 1641UL, 161UL,  1144UL, 1965UL, 17UL,   1842UL,
+    1465UL, 1244UL, 137UL,  1820UL, 259UL,  1168UL, 704UL,  195UL,  1960UL, 643UL,  1363UL, 1578UL, 602UL,  1041UL,
+    365UL,  66UL,   1566UL, 29UL,   1105UL, 656UL,  2047UL, 242UL,  343UL,  1070UL, 305UL,  1233UL, 1299UL, 538UL,
+    363UL,  2043UL, 813UL,  919UL,  1039UL, 1558UL, 108UL,  804UL,  1156UL, 1694UL, 994UL,  1405UL, 254UL,  1343UL,
+    1842UL, 503UL,  987UL,  317UL,  494UL,  39UL,   573UL,  222UL,  1889UL, 1741UL, 659UL,  1391UL, 827UL,  1258UL,
+    2040UL, 1810UL, 178UL,  1892UL, 1168UL, 124UL,  198UL,  2UL,    656UL,  1313UL, 493UL,  514UL,  541UL,  418UL,
+    924UL,  276UL,  601UL,  811UL,  1174UL, 1995UL, 837UL,  1765UL, 736UL,  1970UL, 196UL,  1472UL, 230UL,  680UL,
+    1019UL, 957UL,  1229UL, 577UL,  1336UL, 1605UL, 225UL,  2039UL, 1590UL, 1387UL, 126UL,  1467UL, 1345UL, 1591UL,
+    1214UL, 2025UL, 926UL,  1322UL, 1805UL, 1376UL, 1741UL, 636UL,  1033UL, 820UL,  1116UL, 1349UL, 1979UL, 1772UL,
+    1749UL, 980UL,
+};
+uint32_t rand_arr_12_b12_w32_arr[1024] = {
+    1002UL, 997UL,  640UL,  694UL,  2632UL, 1879UL, 1834UL, 1941UL, 1499UL, 2825UL, 3551UL, 4066UL, 8UL,    1625UL,
+    659UL,  3244UL, 4088UL, 2173UL, 2796UL, 3483UL, 3239UL, 2424UL, 2852UL, 3653UL, 3427UL, 648UL,  3865UL, 1220UL,
+    413UL,  2815UL, 1182UL, 3801UL, 3587UL, 3989UL, 483UL,  3798UL, 3235UL, 3976UL, 3819UL, 3577UL, 1507UL, 3337UL,
+    3744UL, 1867UL, 2441UL, 869UL,  1342UL, 991UL,  3519UL, 1359UL, 1845UL, 2048UL, 55UL,   2130UL, 1704UL, 3333UL,
+    1256UL, 3490UL, 3305UL, 2866UL, 3775UL, 3279UL, 1154UL, 3667UL, 3637UL, 4042UL, 2038UL, 781UL,  2434UL, 542UL,
+    2368UL, 2162UL, 1496UL, 3911UL, 3728UL, 388UL,  2218UL, 2669UL, 3853UL, 179UL,  1504UL, 3403UL, 92UL,   3679UL,
+    1128UL, 1630UL, 2333UL, 3760UL, 1671UL, 1257UL, 3871UL, 551UL,  143UL,  3329UL, 3718UL, 3978UL, 2892UL, 2905UL,
+    3500UL, 1239UL, 2416UL, 2880UL, 1665UL, 1338UL, 3939UL, 903UL,  3295UL, 1526UL, 3868UL, 313UL,  3447UL, 2905UL,
+    693UL,  2545UL, 103UL,  403UL,  1357UL, 695UL,  3917UL, 2515UL, 3487UL, 1584UL, 599UL,  2671UL, 3227UL, 1304UL,
+    500UL,  2057UL, 2603UL, 1746UL, 233UL,  1716UL, 2319UL, 3179UL, 1778UL, 1361UL, 2533UL, 676UL,  541UL,  3005UL,
+    1407UL, 3168UL, 3997UL, 2033UL, 1495UL, 2695UL, 2762UL, 527UL,  1062UL, 2830UL, 2683UL, 289UL,  3009UL, 1765UL,
+    619UL,  1159UL, 513UL,  1873UL, 3611UL, 1423UL, 623UL,  1575UL, 1503UL, 56UL,   3381UL, 2785UL, 3032UL, 3179UL,
+    4018UL, 3243UL, 747UL,  3907UL, 1461UL, 2379UL, 1008UL, 1019UL, 1612UL, 3369UL, 2713UL, 2200UL, 1371UL, 3250UL,
+    2899UL, 208UL,  2328UL, 3129UL, 1674UL, 1015UL, 1278UL, 1458UL, 1282UL, 627UL,  1201UL, 908UL,  555UL,  2224UL,
+    2078UL, 2843UL, 1025UL, 3392UL, 3746UL, 247UL,  3850UL, 284UL,  80UL,   1308UL, 3314UL, 657UL,  305UL,  3649UL,
+    252UL,  1677UL, 2698UL, 3400UL, 1042UL, 2081UL, 242UL,  2250UL, 3081UL, 131UL,  370UL,  3041UL, 3909UL, 2273UL,
+    446UL,  2565UL, 3233UL, 3166UL, 1619UL, 2132UL, 2423UL, 4011UL, 3466UL, 3840UL, 814UL,  3492UL, 50UL,   2547UL,
+    2096UL, 2191UL, 2872UL, 1482UL, 3320UL, 3895UL, 1921UL, 2586UL, 2516UL, 2572UL, 719UL,  2058UL, 2950UL, 714UL,
+    1048UL, 773UL,  2284UL, 15UL,   2887UL, 2217UL, 3453UL, 2852UL, 3175UL, 2070UL, 2167UL, 3828UL, 3039UL, 3452UL,
+    1833UL, 1294UL, 2049UL, 97UL,   716UL,  989UL,  1147UL, 1891UL, 2069UL, 988UL,  2922UL, 1578UL, 2845UL, 580UL,
+    948UL,  2918UL, 2785UL, 384UL,  2954UL, 2927UL, 3369UL, 309UL,  1625UL, 706UL,  2891UL, 230UL,  3466UL, 1649UL,
+    3773UL, 3160UL, 2754UL, 3907UL, 3410UL, 3939UL, 4080UL, 1831UL, 818UL,  2788UL, 1839UL, 3901UL, 3811UL, 2354UL,
+    2834UL, 3965UL, 3607UL, 798UL,  3625UL, 705UL,  2130UL, 1859UL, 795UL,  1201UL, 3663UL, 169UL,  1946UL, 1271UL,
+    3791UL, 1578UL, 3177UL, 2746UL, 3395UL, 1347UL, 3674UL, 1697UL, 3547UL, 3863UL, 95UL,   576UL,  2075UL, 2206UL,
+    478UL,  3321UL, 1526UL, 3528UL, 2043UL, 3451UL, 3101UL, 1210UL, 960UL,  530UL,  3896UL, 2585UL, 2623UL, 567UL,
+    1548UL, 150UL,  3132UL, 2797UL, 1619UL, 3217UL, 712UL,  2867UL, 4021UL, 2945UL, 2839UL, 335UL,  2033UL, 694UL,
+    3919UL, 3841UL, 2780UL, 1372UL, 611UL,  3267UL, 573UL,  1658UL, 1449UL, 2857UL, 2022UL, 549UL,  203UL,  340UL,
+    2806UL, 2400UL, 3286UL, 231UL,  376UL,  2987UL, 3291UL, 2063UL, 118UL,  3158UL, 3892UL, 1067UL, 1487UL, 3713UL,
+    566UL,  164UL,  1255UL, 1104UL, 2323UL, 2374UL, 3739UL, 1032UL, 1154UL, 2853UL, 3716UL, 2602UL, 2194UL, 323UL,
+    3903UL, 3443UL, 2836UL, 1286UL, 3255UL, 385UL,  4059UL, 1397UL, 2208UL, 1813UL, 1968UL, 2379UL, 2834UL, 3051UL,
+    1205UL, 1585UL, 2730UL, 1679UL, 1741UL, 703UL,  1640UL, 2054UL, 3599UL, 3869UL, 1901UL, 546UL,  2637UL, 1746UL,
+    2662UL, 2354UL, 1181UL, 785UL,  338UL,  1858UL, 1673UL, 1822UL, 1159UL, 685UL,  3481UL, 4061UL, 2308UL, 845UL,
+    3991UL, 3643UL, 300UL,  1509UL, 2283UL, 3897UL, 2960UL, 2180UL, 3609UL, 2262UL, 3807UL, 1175UL, 448UL,  3656UL,
+    422UL,  2110UL, 2957UL, 3479UL, 3255UL, 1575UL, 1219UL, 2925UL, 590UL,  2889UL, 2525UL, 3662UL, 873UL,  3258UL,
+    1127UL, 3798UL, 3723UL, 2757UL, 241UL,  2801UL, 2331UL, 2944UL, 86UL,   3582UL, 29UL,   260UL,  1247UL, 1139UL,
+    3793UL, 2252UL, 2066UL, 1914UL, 1657UL, 3347UL, 1311UL, 3010UL, 2979UL, 3540UL, 3154UL, 2657UL, 3761UL, 4009UL,
+    2071UL, 210UL,  2780UL, 2134UL, 1947UL, 2375UL, 2863UL, 1978UL, 306UL,  4014UL, 327UL,  1593UL, 737UL,  3842UL,
+    465UL,  106UL,  3401UL, 667UL,  1035UL, 3629UL, 1796UL, 2062UL, 3726UL, 137UL,  2748UL, 1934UL, 2039UL, 3437UL,
+    334UL,  1387UL, 2752UL, 3225UL, 1339UL, 3830UL, 2523UL, 4027UL, 2706UL, 2172UL, 1566UL, 740UL,  637UL,  1937UL,
+    1961UL, 3900UL, 1435UL, 3259UL, 2940UL, 4073UL, 3319UL, 1405UL, 1281UL, 2566UL, 1404UL, 3049UL, 3242UL, 1772UL,
+    3048UL, 3014UL, 2249UL, 3389UL, 3026UL, 1754UL, 764UL,  2012UL, 1942UL, 1440UL, 282UL,  756UL,  1347UL, 1891UL,
+    904UL,  1161UL, 1993UL, 1056UL, 3355UL, 386UL,  2350UL, 2452UL, 817UL,  2583UL, 1992UL, 204UL,  3768UL, 3359UL,
+    2648UL, 3125UL, 1936UL, 1255UL, 1233UL, 2002UL, 3643UL, 1780UL, 3248UL, 2475UL, 3830UL, 2664UL, 1916UL, 2280UL,
+    3326UL, 3729UL, 2802UL, 2618UL, 748UL,  3811UL, 23UL,   1623UL, 3950UL, 655UL,  3619UL, 520UL,  4013UL, 2792UL,
+    3008UL, 1588UL, 2349UL, 3895UL, 1981UL, 4014UL, 1258UL, 657UL,  86UL,   2558UL, 2925UL, 3585UL, 3996UL, 3456UL,
+    334UL,  2150UL, 3406UL, 3905UL, 267UL,  4033UL, 609UL,  2280UL, 3014UL, 3506UL, 3348UL, 1702UL, 2057UL, 3034UL,
+    1426UL, 3745UL, 3906UL, 1263UL, 3226UL, 3908UL, 3986UL, 3907UL, 197UL,  1798UL, 1676UL, 1582UL, 988UL,  2551UL,
+    3939UL, 2704UL, 3739UL, 2452UL, 111UL,  1539UL, 3802UL, 2754UL, 2434UL, 2492UL, 2648UL, 2570UL, 1701UL, 2911UL,
+    2207UL, 2132UL, 2899UL, 2471UL, 3471UL, 2336UL, 2703UL, 555UL,  1503UL, 1699UL, 3939UL, 694UL,  653UL,  512UL,
+    3964UL, 3218UL, 817UL,  497UL,  2147UL, 3778UL, 3674UL, 2047UL, 3163UL, 2741UL, 1967UL, 117UL,  3958UL, 2727UL,
+    3583UL, 2205UL, 439UL,  2178UL, 362UL,  3296UL, 1304UL, 788UL,  1973UL, 2589UL, 1539UL, 3373UL, 1410UL, 1765UL,
+    2399UL, 86UL,   3476UL, 1600UL, 544UL,  382UL,  3157UL, 3897UL, 1834UL, 3087UL, 1084UL, 491UL,  2955UL, 3549UL,
+    944UL,  1480UL, 3748UL, 902UL,  2586UL, 1476UL, 779UL,  584UL,  1568UL, 559UL,  1745UL, 3690UL, 2832UL, 1390UL,
+    2348UL, 1932UL, 3051UL, 2196UL, 1793UL, 3671UL, 2031UL, 3568UL, 3491UL, 2548UL, 3720UL, 2334UL, 477UL,  2567UL,
+    2225UL, 453UL,  1880UL, 1238UL, 2127UL, 2057UL, 1649UL, 995UL,  2216UL, 3332UL, 2328UL, 3286UL, 3849UL, 3366UL,
+    390UL,  3630UL, 2261UL, 2699UL, 3482UL, 837UL,  2697UL, 1873UL, 3466UL, 2019UL, 3074UL, 945UL,  892UL,  1928UL,
+    3294UL, 1169UL, 2768UL, 3414UL, 1918UL, 948UL,  2074UL, 741UL,  1291UL, 989UL,  571UL,  3901UL, 3308UL, 4054UL,
+    3364UL, 2722UL, 2766UL, 1583UL, 545UL,  361UL,  2427UL, 3625UL, 1114UL, 3929UL, 3114UL, 3985UL, 971UL,  2488UL,
+    3319UL, 371UL,  2720UL, 2271UL, 2701UL, 2022UL, 826UL,  2434UL, 792UL,  2173UL, 1964UL, 805UL,  2287UL, 3028UL,
+    2178UL, 148UL,  1822UL, 3596UL, 3225UL, 3981UL, 280UL,  148UL,  334UL,  1108UL, 450UL,  731UL,  4037UL, 3590UL,
+    3614UL, 2388UL, 573UL,  3884UL, 3847UL, 1899UL, 1859UL, 173UL,  3711UL, 3641UL, 220UL,  3379UL, 2383UL, 1654UL,
+    1217UL, 3622UL, 4055UL, 1381UL, 1154UL, 814UL,  3758UL, 3454UL, 2099UL, 1489UL, 1406UL, 2745UL, 2455UL, 3568UL,
+    2125UL, 3702UL, 2800UL, 1175UL, 60UL,   2960UL, 1338UL, 3196UL, 977UL,  3823UL, 3648UL, 4014UL, 1388UL, 1508UL,
+    590UL,  2819UL, 2629UL, 524UL,  3525UL, 1113UL, 1531UL, 1410UL, 2849UL, 629UL,  3031UL, 536UL,  415UL,  3329UL,
+    2623UL, 692UL,  1615UL, 2625UL, 3132UL, 2120UL, 2202UL, 780UL,  1880UL, 1713UL, 3189UL, 1396UL, 3146UL, 2745UL,
+    2303UL, 4089UL, 2745UL, 3094UL, 28UL,   3331UL, 3550UL, 2946UL, 985UL,  4027UL, 2372UL, 1047UL, 3387UL, 394UL,
+    2201UL, 803UL,  1896UL, 4083UL, 1722UL, 3955UL, 461UL,  325UL,  2775UL, 3637UL, 2712UL, 2982UL, 1750UL, 517UL,
+    2221UL, 1677UL, 107UL,  2512UL, 2707UL, 3358UL, 1407UL, 1712UL, 4047UL, 4045UL, 1136UL, 479UL,  3647UL, 1375UL,
+    1584UL, 32UL,   3531UL, 318UL,  1254UL, 3104UL, 741UL,  3225UL, 2UL,    826UL,  53UL,   3550UL, 3463UL, 2058UL,
+    1515UL, 3438UL, 2041UL, 80UL,   2603UL, 1000UL, 600UL,  2682UL, 2190UL, 3559UL, 3833UL, 691UL,  192UL,  662UL,
+    2754UL, 1613UL, 1904UL, 3586UL, 2819UL, 3646UL, 238UL,  4028UL, 3921UL, 1565UL, 1873UL, 2311UL, 2220UL, 3364UL,
+    3374UL, 3594UL, 2339UL, 3060UL, 381UL,  1377UL, 1894UL, 1704UL, 2826UL, 3395UL, 3384UL, 1905UL, 4059UL, 1620UL,
+    1500UL, 2261UL, 3696UL, 3520UL, 1928UL, 2708UL, 947UL,  1974UL, 92UL,   756UL,  2898UL, 3413UL, 1130UL, 777UL,
+    166UL,  253UL,
+};
+uint32_t rand_arr_13_b13_w32_arr[1024] = {
+    4062UL, 1302UL, 473UL,  4312UL, 2947UL, 559UL,  1526UL, 6839UL, 2576UL, 6033UL, 2810UL, 6624UL, 6583UL, 7589UL,
+    0UL,    3323UL, 2183UL, 5721UL, 3210UL, 1722UL, 4169UL, 1717UL, 5172UL, 4337UL, 5898UL, 4748UL, 6341UL, 7939UL,
+    5706UL, 4766UL, 6300UL, 1709UL, 3290UL, 274UL,  4390UL, 4294UL, 8099UL, 4436UL, 8002UL, 6685UL, 130UL,  6814UL,
+    8021UL, 4419UL, 7811UL, 5781UL, 7631UL, 3620UL, 1229UL, 1009UL, 4136UL, 378UL,  2012UL, 6353UL, 559UL,  5859UL,
+    4738UL, 7776UL, 3886UL, 6134UL, 2287UL, 6498UL, 700UL,  7434UL, 3624UL, 6006UL, 5958UL, 420UL,  2299UL, 6568UL,
+    4112UL, 5356UL, 7341UL, 2147UL, 3927UL, 6574UL, 4006UL, 4450UL, 1487UL, 612UL,  8077UL, 2859UL, 1492UL, 5506UL,
+    3398UL, 5393UL, 797UL,  4755UL, 5106UL, 217UL,  3019UL, 4688UL, 3701UL, 1504UL, 6492UL, 4699UL, 2140UL, 4025UL,
+    7015UL, 34UL,   6571UL, 4337UL, 3899UL, 2330UL, 5675UL, 1804UL, 8146UL, 405UL,  3400UL, 5767UL, 7615UL, 868UL,
+    4133UL, 7969UL, 589UL,  6249UL, 3448UL, 4460UL, 3672UL, 2907UL, 5355UL, 5283UL, 2492UL, 4273UL, 3103UL, 6550UL,
+    1257UL, 3969UL, 6258UL, 7963UL, 7850UL, 807UL,  1848UL, 7303UL, 7649UL, 7406UL, 2778UL, 7629UL, 4445UL, 2838UL,
+    2282UL, 3082UL, 4795UL, 4009UL, 6537UL, 4575UL, 3593UL, 5484UL, 3035UL, 5501UL, 6038UL, 6848UL, 7838UL, 3455UL,
+    360UL,  6502UL, 7675UL, 4108UL, 2144UL, 1555UL, 7936UL, 728UL,  2469UL, 651UL,  4133UL, 5330UL, 564UL,  4406UL,
+    7312UL, 5780UL, 9UL,    318UL,  4890UL, 4328UL, 1336UL, 4126UL, 7691UL, 4895UL, 6972UL, 1515UL, 5002UL, 814UL,
+    6254UL, 2961UL, 4307UL, 7750UL, 3903UL, 7464UL, 5898UL, 1530UL, 448UL,  1630UL, 1608UL, 1775UL, 1353UL, 7885UL,
+    1695UL, 2856UL, 6564UL, 4102UL, 1659UL, 3587UL, 1478UL, 1324UL, 4958UL, 2674UL, 8090UL, 5950UL, 6353UL, 711UL,
+    1278UL, 7433UL, 3152UL, 3302UL, 3207UL, 2824UL, 7633UL, 6957UL, 4465UL, 4992UL, 8067UL, 4872UL, 4845UL, 6747UL,
+    5087UL, 6888UL, 1236UL, 2307UL, 6690UL, 3193UL, 6548UL, 6567UL, 2892UL, 770UL,  4722UL, 3475UL, 6267UL, 5496UL,
+    1594UL, 7421UL, 5682UL, 4267UL, 963UL,  106UL,  4894UL, 4178UL, 3796UL, 5110UL, 8100UL, 4127UL, 172UL,  4329UL,
+    1308UL, 5410UL, 7584UL, 5713UL, 5870UL, 6134UL, 6447UL, 2356UL, 2313UL, 1705UL, 2616UL, 7577UL, 690UL,  2155UL,
+    4921UL, 824UL,  5280UL, 8081UL, 685UL,  8091UL, 32UL,   6901UL, 86UL,   754UL,  5606UL, 7570UL, 6386UL, 2904UL,
+    1547UL, 2168UL, 6829UL, 6717UL, 4499UL, 6719UL, 7878UL, 2338UL, 6677UL, 2683UL, 5762UL, 4794UL, 5081UL, 835UL,
+    5808UL, 1239UL, 7722UL, 3311UL, 1833UL, 1817UL, 7458UL, 6389UL, 36UL,   5512UL, 3923UL, 5444UL, 6970UL, 5464UL,
+    2322UL, 4620UL, 820UL,  6687UL, 3222UL, 2641UL, 3203UL, 7039UL, 6484UL, 4475UL, 2344UL, 334UL,  6711UL, 2234UL,
+    5700UL, 7263UL, 6842UL, 3333UL, 7364UL, 4629UL, 5608UL, 4767UL, 6529UL, 6022UL, 6445UL, 663UL,  4360UL, 3991UL,
+    7899UL, 3821UL, 3534UL, 4017UL, 5500UL, 5361UL, 1454UL, 2619UL, 1464UL, 5229UL, 2246UL, 4755UL, 4607UL, 5007UL,
+    5258UL, 4624UL, 218UL,  6062UL, 4640UL, 7012UL, 2456UL, 1764UL, 1428UL, 2938UL, 3544UL, 3922UL, 3604UL, 47UL,
+    2039UL, 3789UL, 2722UL, 5752UL, 6048UL, 171UL,  4461UL, 2422UL, 4480UL, 6897UL, 5276UL, 8127UL, 904UL,  1096UL,
+    3421UL, 7468UL, 262UL,  2873UL, 6513UL, 277UL,  3007UL, 2637UL, 897UL,  4163UL, 8068UL, 3696UL, 1401UL, 3212UL,
+    4978UL, 4382UL, 3269UL, 6896UL, 7254UL, 5893UL, 1791UL, 5095UL, 5361UL, 3611UL, 3894UL, 3621UL, 1802UL, 3991UL,
+    1217UL, 1982UL, 2237UL, 1605UL, 4211UL, 6859UL, 7740UL, 3678UL, 2521UL, 5501UL, 6872UL, 7895UL, 7101UL, 2781UL,
+    1920UL, 6408UL, 1930UL, 810UL,  7399UL, 474UL,  3087UL, 99UL,   7557UL, 141UL,  4591UL, 7947UL, 8150UL, 2088UL,
+    6984UL, 1278UL, 5586UL, 3486UL, 2384UL, 6268UL, 1772UL, 5025UL, 1496UL, 5652UL, 1247UL, 1001UL, 7519UL, 362UL,
+    7079UL, 8128UL, 5842UL, 468UL,  308UL,  5460UL, 1490UL, 5037UL, 438UL,  2347UL, 3567UL, 4186UL, 6712UL, 3544UL,
+    6929UL, 6834UL, 2317UL, 1665UL, 5686UL, 7515UL, 560UL,  1314UL, 3534UL, 2855UL, 2514UL, 2998UL, 6486UL, 2932UL,
+    410UL,  1552UL, 1464UL, 270UL,  2850UL, 849UL,  1262UL, 2131UL, 2252UL, 6952UL, 568UL,  3229UL, 639UL,  3235UL,
+    3813UL, 766UL,  6310UL, 7585UL, 6932UL, 117UL,  1643UL, 1972UL, 6459UL, 7636UL, 707UL,  904UL,  329UL,  7426UL,
+    4731UL, 1799UL, 2478UL, 5642UL, 801UL,  6644UL, 4293UL, 3008UL, 1551UL, 3232UL, 1039UL, 63UL,   7445UL, 1531UL,
+    3219UL, 8005UL, 4890UL, 303UL,  386UL,  4680UL, 6968UL, 3981UL, 6162UL, 5490UL, 3151UL, 7475UL, 5560UL, 5629UL,
+    5226UL, 2142UL, 236UL,  2566UL, 3988UL, 1345UL, 7644UL, 6735UL, 1942UL, 7682UL, 2711UL, 2994UL, 4044UL, 6767UL,
+    5652UL, 1983UL, 6973UL, 98UL,   8165UL, 3582UL, 5744UL, 3725UL, 7349UL, 3123UL, 5426UL, 3874UL, 4941UL, 4278UL,
+    5615UL, 3170UL, 5934UL, 2695UL, 1903UL, 2195UL, 3323UL, 5293UL, 4356UL, 3307UL, 3390UL, 1834UL, 7085UL, 4804UL,
+    4889UL, 2289UL, 7442UL, 6657UL, 2350UL, 727UL,  5033UL, 5261UL, 1661UL, 7150UL, 311UL,  7322UL, 552UL,  4744UL,
+    3454UL, 820UL,  2875UL, 5449UL, 1681UL, 1591UL, 8001UL, 7554UL, 3289UL, 5623UL, 7100UL, 4161UL, 2687UL, 4343UL,
+    4058UL, 1280UL, 8051UL, 754UL,  6199UL, 4144UL, 2146UL, 681UL,  2829UL, 3566UL, 2358UL, 4883UL, 7534UL, 6692UL,
+    111UL,  5698UL, 5293UL, 6920UL, 1783UL, 4137UL, 4861UL, 1385UL, 5874UL, 6895UL, 6305UL, 3511UL, 4644UL, 7791UL,
+    7102UL, 5716UL, 4945UL, 3245UL, 2953UL, 3371UL, 3091UL, 5317UL, 2918UL, 7726UL, 6425UL, 7995UL, 3589UL, 723UL,
+    4664UL, 6995UL, 7270UL, 7808UL, 7646UL, 7384UL, 4709UL, 3140UL, 7450UL, 495UL,  5780UL, 954UL,  4561UL, 3146UL,
+    745UL,  4125UL, 7792UL, 1774UL, 7160UL, 2886UL, 3456UL, 6180UL, 2109UL, 1189UL, 6460UL, 6510UL, 4668UL, 1649UL,
+    1742UL, 7931UL, 1287UL, 140UL,  6040UL, 7307UL, 2464UL, 3538UL, 2679UL, 1814UL, 7335UL, 7064UL, 6710UL, 5801UL,
+    4208UL, 2628UL, 7000UL, 1626UL, 3287UL, 7400UL, 3673UL, 4492UL, 7891UL, 4031UL, 7359UL, 3806UL, 2264UL, 2676UL,
+    1304UL, 3353UL, 4456UL, 7042UL, 5486UL, 1702UL, 6795UL, 4104UL, 4838UL, 4471UL, 4658UL, 3363UL, 3307UL, 900UL,
+    1836UL, 6190UL, 4956UL, 2821UL, 5108UL, 176UL,  6313UL, 1159UL, 1482UL, 2554UL, 1872UL, 2445UL, 5514UL, 839UL,
+    196UL,  8084UL, 70UL,   1596UL, 5733UL, 5598UL, 4965UL, 10UL,   7942UL, 8044UL, 6648UL, 1680UL, 5127UL, 3915UL,
+    3031UL, 8011UL, 2905UL, 3316UL, 8051UL, 3555UL, 5992UL, 5326UL, 6289UL, 7192UL, 3430UL, 5537UL, 462UL,  1252UL,
+    6790UL, 4130UL, 7345UL, 1552UL, 7317UL, 267UL,  972UL,  7205UL, 4300UL, 1024UL, 3809UL, 5012UL, 6075UL, 5516UL,
+    6666UL, 2466UL, 6728UL, 1662UL, 7645UL, 6798UL, 2895UL, 7721UL, 4865UL, 5261UL, 5987UL, 7608UL, 4149UL, 4725UL,
+    2433UL, 7364UL, 1200UL, 5693UL, 3209UL, 4167UL, 3334UL, 314UL,  2073UL, 3188UL, 7605UL, 7964UL, 4795UL, 5961UL,
+    342UL,  3701UL, 4303UL, 675UL,  3268UL, 1037UL, 2224UL, 4191UL, 1971UL, 1954UL, 5634UL, 2778UL, 1080UL, 3974UL,
+    6808UL, 7736UL, 872UL,  3029UL, 3540UL, 3700UL, 7421UL, 2652UL, 3628UL, 2952UL, 2018UL, 5375UL, 2433UL, 5092UL,
+    4133UL, 356UL,  1570UL, 3223UL, 694UL,  2250UL, 8058UL, 4741UL, 3774UL, 5653UL, 6885UL, 5764UL, 6505UL, 5746UL,
+    5992UL, 6966UL, 4213UL, 2553UL, 1690UL, 7867UL, 5180UL, 7376UL, 925UL,  6613UL, 887UL,  4851UL, 3461UL, 1728UL,
+    7684UL, 6022UL, 3903UL, 4686UL, 2952UL, 2673UL, 2961UL, 4967UL, 7946UL, 6005UL, 5532UL, 5712UL, 2510UL, 7671UL,
+    1245UL, 4195UL, 5708UL, 3241UL, 506UL,  2505UL, 916UL,  470UL,  7140UL, 3147UL, 4613UL, 1302UL, 5781UL, 6228UL,
+    6312UL, 3652UL, 2785UL, 7525UL, 5190UL, 6279UL, 2885UL, 5957UL, 7854UL, 5211UL, 3055UL, 254UL,  5268UL, 1215UL,
+    7635UL, 6467UL, 7259UL, 408UL,  2631UL, 5246UL, 4435UL, 3999UL, 4975UL, 3596UL, 5701UL, 5173UL, 5875UL, 4815UL,
+    881UL,  1006UL, 632UL,  5978UL, 6302UL, 2729UL, 4560UL, 790UL,  232UL,  2172UL, 2213UL, 626UL,  5829UL, 3878UL,
+    3104UL, 5578UL, 3632UL, 3886UL, 7893UL, 7873UL, 7191UL, 6731UL, 4609UL, 1228UL, 847UL,  4612UL, 3328UL, 7633UL,
+    7828UL, 7224UL, 936UL,  8058UL, 6326UL, 1777UL, 1167UL, 2563UL, 5049UL, 8172UL, 2232UL, 6494UL, 7761UL, 2682UL,
+    8132UL, 3847UL, 2422UL, 5827UL, 414UL,  7656UL, 8142UL, 3759UL, 1750UL, 636UL,  974UL,  3618UL, 927UL,  7451UL,
+    6559UL, 347UL,  3220UL, 6232UL, 1986UL, 475UL,  2210UL, 3631UL, 4884UL, 5140UL, 4503UL, 5295UL, 5243UL, 320UL,
+    4772UL, 7663UL, 393UL,  326UL,  2176UL, 3006UL, 7496UL, 4557UL, 2629UL, 4368UL, 2793UL, 4908UL, 5492UL, 2018UL,
+    879UL,  7396UL, 186UL,  7347UL, 1701UL, 6410UL, 4812UL, 7206UL, 3933UL, 3370UL, 4574UL, 1635UL, 3958UL, 7039UL,
+    7612UL, 3311UL, 2024UL, 2452UL, 4806UL, 334UL,  3483UL, 4748UL, 5385UL, 1056UL, 7405UL, 4852UL, 6911UL, 4327UL,
+    1705UL, 375UL,
+};
+uint32_t rand_arr_14_b14_w32_arr[1024] = {
+    3785UL,  3480UL,  4534UL,  10761UL, 4380UL,  12375UL, 1927UL,  15671UL, 16345UL, 14322UL, 2493UL,  8577UL,  6098UL,
+    11190UL, 13468UL, 13885UL, 12342UL, 480UL,   10161UL, 8002UL,  1263UL,  9410UL,  217UL,   12410UL, 8277UL,  4926UL,
+    9962UL,  1641UL,  15056UL, 14717UL, 6556UL,  7574UL,  7656UL,  9551UL,  12688UL, 4490UL,  1176UL,  15954UL, 11942UL,
+    9175UL,  2047UL,  2181UL,  2063UL,  3730UL,  712UL,   10531UL, 3909UL,  7594UL,  7537UL,  4723UL,  13796UL, 13671UL,
+    2385UL,  12530UL, 16140UL, 2421UL,  13346UL, 14405UL, 819UL,   2026UL,  12514UL, 12539UL, 11629UL, 13834UL, 4905UL,
+    7972UL,  13264UL, 10652UL, 495UL,   570UL,   14630UL, 8099UL,  13714UL, 10654UL, 2664UL,  7332UL,  11449UL, 7963UL,
+    4717UL,  9212UL,  15033UL, 12740UL, 12653UL, 6850UL,  13466UL, 14538UL, 14936UL, 15088UL, 13693UL, 12447UL, 16051UL,
+    8967UL,  14233UL, 6584UL,  8336UL,  11258UL, 12388UL, 12740UL, 13572UL, 6458UL,  8409UL,  9056UL,  7716UL,  3507UL,
+    13911UL, 3411UL,  15831UL, 3958UL,  14575UL, 12789UL, 14972UL, 966UL,   5002UL,  15596UL, 293UL,   2705UL,  3675UL,
+    2683UL,  15447UL, 197UL,   3706UL,  2064UL,  11649UL, 7734UL,  14246UL, 11405UL, 9967UL,  11127UL, 14600UL, 1170UL,
+    9474UL,  4918UL,  11584UL, 14008UL, 3189UL,  10440UL, 13865UL, 13461UL, 13455UL, 15578UL, 7903UL,  10480UL, 3197UL,
+    1227UL,  2354UL,  13303UL, 7777UL,  4278UL,  3759UL,  5347UL,  7246UL,  7771UL,  3528UL,  10512UL, 1705UL,  4263UL,
+    8447UL,  10989UL, 5837UL,  14167UL, 334UL,   12416UL, 6616UL,  13632UL, 255UL,   1688UL,  5811UL,  8549UL,  14826UL,
+    11976UL, 724UL,   4287UL,  13910UL, 3538UL,  10688UL, 3833UL,  1321UL,  14757UL, 10901UL, 4703UL,  12008UL, 4357UL,
+    2698UL,  7828UL,  15579UL, 4453UL,  16164UL, 4426UL,  5141UL,  9481UL,  7020UL,  13689UL, 11800UL, 4518UL,  6579UL,
+    12492UL, 1550UL,  915UL,   2634UL,  13188UL, 7241UL,  161UL,   11117UL, 11196UL, 14039UL, 15158UL, 5627UL,  4473UL,
+    3105UL,  7659UL,  1665UL,  16278UL, 7937UL,  15155UL, 4680UL,  4492UL,  2809UL,  82UL,    1878UL,  8836UL,  5547UL,
+    5321UL,  11741UL, 327UL,   14549UL, 3699UL,  6358UL,  7565UL,  15937UL, 1450UL,  7630UL,  3653UL,  7530UL,  3660UL,
+    7150UL,  4233UL,  11815UL, 10136UL, 2888UL,  7632UL,  13145UL, 10906UL, 3822UL,  12717UL, 4845UL,  7215UL,  7323UL,
+    7169UL,  10608UL, 8865UL,  12000UL, 13898UL, 1926UL,  10665UL, 8217UL,  6882UL,  2315UL,  9882UL,  12320UL, 14582UL,
+    7328UL,  13549UL, 15340UL, 4474UL,  5765UL,  14278UL, 9065UL,  3785UL,  6241UL,  2894UL,  8719UL,  2522UL,  4569UL,
+    9588UL,  3272UL,  8343UL,  5998UL,  13589UL, 2173UL,  5990UL,  13567UL, 10182UL, 14120UL, 1457UL,  6844UL,  1792UL,
+    9485UL,  11512UL, 16228UL, 2669UL,  14011UL, 10488UL, 15673UL, 1662UL,  15140UL, 10347UL, 12646UL, 2235UL,  2509UL,
+    14898UL, 15509UL, 4055UL,  14137UL, 12375UL, 7134UL,  5229UL,  6095UL,  9861UL,  1235UL,  645UL,   7561UL,  7983UL,
+    6192UL,  3163UL,  10655UL, 9932UL,  9542UL,  16313UL, 1511UL,  6861UL,  6151UL,  1579UL,  3170UL,  5632UL,  14889UL,
+    6447UL,  13138UL, 11461UL, 6814UL,  9777UL,  15254UL, 11441UL, 11700UL, 9410UL,  4579UL,  11920UL, 9690UL,  980UL,
+    15988UL, 6121UL,  1299UL,  15077UL, 12340UL, 16163UL, 13179UL, 2089UL,  2981UL,  7748UL,  1794UL,  5954UL,  5860UL,
+    14116UL, 2817UL,  15594UL, 13218UL, 14460UL, 9709UL,  2127UL,  8571UL,  7833UL,  1707UL,  10724UL, 12039UL, 70UL,
+    3457UL,  1666UL,  15588UL, 14527UL, 8389UL,  4680UL,  16344UL, 11772UL, 8323UL,  5362UL,  15633UL, 11682UL, 8661UL,
+    16081UL, 2397UL,  1716UL,  13975UL, 7004UL,  12441UL, 5005UL,  15545UL, 4109UL,  15617UL, 13657UL, 5838UL,  14072UL,
+    16103UL, 9088UL,  7852UL,  14073UL, 2242UL,  15326UL, 237UL,   1917UL,  11516UL, 5036UL,  15070UL, 6835UL,  8962UL,
+    9817UL,  15765UL, 13966UL, 13624UL, 5861UL,  2684UL,  11887UL, 11557UL, 11155UL, 5364UL,  12389UL, 4334UL,  3439UL,
+    12806UL, 13723UL, 8429UL,  11170UL, 9337UL,  8792UL,  3846UL,  6525UL,  10726UL, 7121UL,  16184UL, 10824UL, 4303UL,
+    4454UL,  7234UL,  10646UL, 5863UL,  4316UL,  7984UL,  13255UL, 10327UL, 9063UL,  15667UL, 4396UL,  643UL,   6158UL,
+    3394UL,  10948UL, 12909UL, 1590UL,  5477UL,  9449UL,  11397UL, 6203UL,  2632UL,  4708UL,  7592UL,  10281UL, 928UL,
+    13617UL, 4010UL,  5299UL,  13945UL, 15825UL, 3581UL,  13320UL, 16380UL, 4117UL,  1864UL,  7125UL,  7096UL,  14183UL,
+    288UL,   9432UL,  5717UL,  3394UL,  8367UL,  10858UL, 3411UL,  4064UL,  3848UL,  2841UL,  5274UL,  5551UL,  15374UL,
+    12431UL, 11970UL, 9250UL,  1046UL,  14215UL, 13196UL, 13865UL, 16228UL, 1964UL,  4428UL,  2530UL,  7812UL,  1039UL,
+    1115UL,  4238UL,  11163UL, 4501UL,  1765UL,  7075UL,  14998UL, 12166UL, 8639UL,  6929UL,  10224UL, 12623UL, 10238UL,
+    965UL,   1648UL,  9977UL,  10288UL, 11232UL, 3161UL,  5811UL,  4997UL,  15174UL, 12721UL, 10861UL, 14400UL, 7899UL,
+    1943UL,  7814UL,  14911UL, 8737UL,  5032UL,  50UL,    15985UL, 3553UL,  12026UL, 5030UL,  1948UL,  7508UL,  2013UL,
+    10699UL, 7408UL,  13540UL, 7648UL,  38UL,    6650UL,  5371UL,  14582UL, 2447UL,  14837UL, 382UL,   11537UL, 10750UL,
+    16002UL, 15065UL, 15266UL, 68UL,    14022UL, 6042UL,  476UL,   5429UL,  13846UL, 6471UL,  4678UL,  7722UL,  2768UL,
+    13826UL, 11836UL, 4207UL,  4942UL,  11664UL, 2399UL,  1981UL,  11595UL, 14434UL, 2968UL,  12765UL, 8160UL,  8487UL,
+    240UL,   9171UL,  1062UL,  1165UL,  10379UL, 8870UL,  11760UL, 8451UL,  15556UL, 7079UL,  423UL,   14367UL, 10326UL,
+    8499UL,  9581UL,  12424UL, 6753UL,  1423UL,  5102UL,  10557UL, 7081UL,  8377UL,  410UL,   6774UL,  4934UL,  15471UL,
+    6379UL,  13463UL, 3799UL,  14408UL, 336UL,   2926UL,  16352UL, 1126UL,  8743UL,  2080UL,  6415UL,  1945UL,  12092UL,
+    13072UL, 86UL,    16097UL, 16346UL, 200UL,   15896UL, 11821UL, 5408UL,  1154UL,  15199UL, 310UL,   15240UL, 9224UL,
+    6035UL,  12213UL, 2332UL,  6017UL,  5633UL,  6781UL,  16181UL, 2440UL,  4893UL,  11466UL, 7012UL,  6370UL,  16219UL,
+    1818UL,  8852UL,  75UL,    12745UL, 5687UL,  5408UL,  15181UL, 5026UL,  951UL,   742UL,   13976UL, 4754UL,  4881UL,
+    10380UL, 13288UL, 10261UL, 365UL,   14573UL, 114UL,   6076UL,  16088UL, 5180UL,  1626UL,  11544UL, 886UL,   12372UL,
+    6241UL,  9733UL,  4059UL,  13553UL, 4968UL,  7862UL,  5607UL,  7154UL,  11644UL, 3487UL,  8313UL,  12902UL, 11294UL,
+    11616UL, 8311UL,  2462UL,  10162UL, 5154UL,  7454UL,  16275UL, 11774UL, 9724UL,  1377UL,  11253UL, 7247UL,  6381UL,
+    11687UL, 11624UL, 12260UL, 16308UL, 16295UL, 10128UL, 12138UL, 15260UL, 8239UL,  7184UL,  7062UL,  6099UL,  12982UL,
+    13425UL, 12614UL, 9205UL,  2388UL,  11941UL, 11563UL, 14778UL, 5123UL,  8441UL,  16014UL, 8162UL,  5889UL,  13577UL,
+    5783UL,  836UL,   1646UL,  14312UL, 16001UL, 10399UL, 14062UL, 15489UL, 1451UL,  8344UL,  9066UL,  11823UL, 9181UL,
+    6909UL,  16012UL, 11126UL, 15880UL, 7391UL,  3297UL,  15771UL, 10751UL, 4797UL,  4023UL,  1452UL,  12347UL, 9735UL,
+    6647UL,  5727UL,  13884UL, 14914UL, 4152UL,  4544UL,  14572UL, 15363UL, 9316UL,  9289UL,  1650UL,  2863UL,  5425UL,
+    14440UL, 7868UL,  5468UL,  8817UL,  4581UL,  7435UL,  14857UL, 4999UL,  14882UL, 15575UL, 9243UL,  1639UL,  11551UL,
+    2918UL,  2717UL,  15638UL, 13042UL, 14139UL, 11133UL, 14217UL, 2473UL,  1573UL,  641UL,   12237UL, 395UL,   3305UL,
+    12075UL, 2510UL,  14652UL, 4178UL,  10915UL, 8257UL,  13293UL, 10408UL, 12430UL, 886UL,   11274UL, 2682UL,  5231UL,
+    16173UL, 15634UL, 6988UL,  2680UL,  11487UL, 129UL,   6155UL,  280UL,   12696UL, 11335UL, 14911UL, 3858UL,  11464UL,
+    9476UL,  12773UL, 4324UL,  15251UL, 5709UL,  509UL,   6710UL,  14793UL, 10046UL, 15919UL, 1343UL,  8483UL,  16186UL,
+    296UL,   1294UL,  5223UL,  11105UL, 14146UL, 1983UL,  652UL,   60UL,    7964UL,  12998UL, 13435UL, 13383UL, 7439UL,
+    10041UL, 14494UL, 3393UL,  12055UL, 11428UL, 8902UL,  4758UL,  9513UL,  12056UL, 3427UL,  4787UL,  7849UL,  15456UL,
+    8338UL,  5367UL,  12446UL, 4846UL,  14638UL, 9421UL,  15328UL, 9517UL,  11258UL, 7119UL,  2903UL,  9317UL,  4440UL,
+    5353UL,  12366UL, 9158UL,  11825UL, 9346UL,  10835UL, 10616UL, 14564UL, 11550UL, 15725UL, 15950UL, 2765UL,  11129UL,
+    3954UL,  11278UL, 8836UL,  4019UL,  2527UL,  13700UL, 12324UL, 15724UL, 10337UL, 561UL,   11922UL, 15849UL, 9697UL,
+    12006UL, 6803UL,  8843UL,  2819UL,  7746UL,  6468UL,  1103UL,  3171UL,  8416UL,  2345UL,  9205UL,  6695UL,  8146UL,
+    7741UL,  6923UL,  484UL,   3696UL,  1714UL,  10245UL, 6390UL,  2029UL,  9028UL,  4684UL,  2119UL,  15080UL, 13809UL,
+    9882UL,  11128UL, 16127UL, 872UL,   14950UL, 788UL,   747UL,   13095UL, 6603UL,  5309UL,  3079UL,  15592UL, 12455UL,
+    3946UL,  3853UL,  2909UL,  16004UL, 12158UL, 10746UL, 13226UL, 5985UL,  2827UL,  9109UL,  7347UL,  999UL,   2541UL,
+    2234UL,  14302UL, 9316UL,  3114UL,  10206UL, 14439UL, 4679UL,  5495UL,  5983UL,  1203UL,  3874UL,  888UL,   14838UL,
+    10080UL, 8800UL,  14310UL, 5039UL,  12901UL, 15049UL, 13581UL, 1198UL,  3565UL,  8472UL,  11680UL, 13320UL, 14749UL,
+    11110UL, 10344UL, 1321UL,  242UL,   10127UL, 1654UL,  1279UL,  6865UL,  4541UL,  4359UL,  10650UL, 13310UL, 10912UL,
+    11311UL, 7094UL,  10932UL, 12266UL, 13800UL, 9528UL,  3843UL,  557UL,   10943UL, 1558UL,  7159UL,  7659UL,  1511UL,
+    12058UL, 10533UL, 3151UL,  7352UL,  17UL,    5996UL,  1987UL,  2751UL,  1557UL,  14947UL, 4910UL,  14728UL, 12185UL,
+    14712UL, 12230UL, 13432UL, 11464UL, 10622UL, 2245UL,  13802UL, 1856UL,  8224UL,  7196UL,  14806UL, 3013UL,  5318UL,
+    8346UL,  980UL,   16112UL, 9810UL,  2026UL,  3619UL,  2080UL,  12790UL, 445UL,   12233UL,
+};
+uint32_t rand_arr_15_b15_w32_arr[1024] = {
+    29603UL, 4215UL,  20567UL, 31500UL, 18624UL, 32584UL, 5828UL,  4454UL,  11633UL, 4895UL,  20294UL, 16323UL, 21208UL,
+    28279UL, 26414UL, 6845UL,  27740UL, 29220UL, 22929UL, 28501UL, 17292UL, 615UL,   25080UL, 26157UL, 26122UL, 30393UL,
+    11796UL, 23011UL, 19918UL, 11457UL, 6716UL,  16172UL, 1683UL,  10957UL, 31725UL, 14656UL, 22950UL, 954UL,   29604UL,
+    1826UL,  9723UL,  18765UL, 26771UL, 28267UL, 17348UL, 21400UL, 27717UL, 16465UL, 16378UL, 19855UL, 27936UL, 29819UL,
+    13803UL, 24846UL, 31474UL, 11647UL, 22070UL, 2189UL,  17617UL, 32219UL, 3415UL,  27502UL, 255UL,   14831UL, 29175UL,
+    31659UL, 23770UL, 27809UL, 15180UL, 23292UL, 16268UL, 352UL,   3353UL,  11359UL, 1729UL,  25167UL, 5864UL,  16738UL,
+    17817UL, 1491UL,  21972UL, 30928UL, 21246UL, 19254UL, 5723UL,  19376UL, 6769UL,  4041UL,  31880UL, 9291UL,  28880UL,
+    5387UL,  15694UL, 18391UL, 13208UL, 29582UL, 9488UL,  21470UL, 4011UL,  6865UL,  1379UL,  10389UL, 22087UL, 15670UL,
+    3140UL,  31967UL, 9114UL,  31480UL, 5953UL,  8553UL,  2323UL,  26525UL, 14587UL, 1538UL,  31082UL, 14752UL, 8470UL,
+    15240UL, 17655UL, 28842UL, 10378UL, 10209UL, 16321UL, 1808UL,  17566UL, 1102UL,  21102UL, 11705UL, 6492UL,  25913UL,
+    10858UL, 17569UL, 25392UL, 29051UL, 6510UL,  6485UL,  9221UL,  3778UL,  30313UL, 15274UL, 21771UL, 10687UL, 20630UL,
+    14274UL, 23488UL, 21879UL, 5148UL,  1050UL,  21817UL, 13856UL, 17450UL, 25231UL, 16838UL, 2039UL,  15126UL, 17800UL,
+    29894UL, 20288UL, 23068UL, 8290UL,  7244UL,  15137UL, 18305UL, 14748UL, 22769UL, 16714UL, 7284UL,  26366UL, 18014UL,
+    23460UL, 8439UL,  19964UL, 32141UL, 27991UL, 13588UL, 23487UL, 21003UL, 8067UL,  28380UL, 2228UL,  13402UL, 6669UL,
+    22472UL, 16912UL, 14989UL, 10258UL, 13990UL, 5922UL,  9167UL,  16992UL, 1375UL,  10UL,    4922UL,  25510UL, 22088UL,
+    8167UL,  6372UL,  15672UL, 28651UL, 8903UL,  8158UL,  4710UL,  13758UL, 7741UL,  26097UL, 4674UL,  24611UL, 20023UL,
+    16272UL, 6306UL,  7936UL,  1346UL,  24896UL, 26360UL, 6805UL,  9973UL,  8890UL,  27635UL, 4920UL,  11453UL, 10782UL,
+    2347UL,  17206UL, 30406UL, 12450UL, 30922UL, 16590UL, 16490UL, 25058UL, 22101UL, 2937UL,  27524UL, 24392UL, 6166UL,
+    18211UL, 15856UL, 28050UL, 22649UL, 31973UL, 29583UL, 14269UL, 30237UL, 18235UL, 25540UL, 30078UL, 7882UL,  15292UL,
+    17771UL, 27652UL, 14500UL, 15692UL, 9307UL,  11404UL, 32553UL, 30310UL, 6486UL,  16103UL, 30235UL, 25678UL, 7714UL,
+    32011UL, 27324UL, 27197UL, 17420UL, 25607UL, 2140UL,  202UL,   16620UL, 381UL,   22858UL, 13643UL, 20415UL, 14780UL,
+    30370UL, 19620UL, 13880UL, 5261UL,  29880UL, 32330UL, 11422UL, 28356UL, 492UL,   25324UL, 1409UL,  30242UL, 15452UL,
+    5920UL,  16844UL, 4342UL,  26522UL, 4042UL,  20037UL, 7907UL,  25726UL, 2451UL,  24281UL, 21547UL, 27060UL, 13099UL,
+    9254UL,  27651UL, 4494UL,  13675UL, 29834UL, 8426UL,  8723UL,  7953UL,  28170UL, 14628UL, 5488UL,  20132UL, 28985UL,
+    25988UL, 8375UL,  6822UL,  1723UL,  12051UL, 28031UL, 7538UL,  7702UL,  21208UL, 6268UL,  27092UL, 15105UL, 22134UL,
+    16923UL, 16117UL, 5000UL,  10885UL, 1403UL,  20264UL, 11653UL, 7855UL,  5844UL,  25896UL, 19195UL, 7882UL,  11407UL,
+    27488UL, 30213UL, 18855UL, 3668UL,  16524UL, 20300UL, 30529UL, 7685UL,  6155UL,  20958UL, 14992UL, 593UL,   18059UL,
+    31931UL, 8001UL,  18019UL, 12165UL, 16625UL, 18166UL, 30776UL, 11436UL, 2670UL,  10702UL, 22930UL, 17446UL, 13810UL,
+    16972UL, 26400UL, 3652UL,  5774UL,  27170UL, 27109UL, 20568UL, 16341UL, 2719UL,  26428UL, 29373UL, 9317UL,  11164UL,
+    16115UL, 30690UL, 5785UL,  21131UL, 14675UL, 30312UL, 25912UL, 31682UL, 14952UL, 21151UL, 12335UL, 6551UL,  28637UL,
+    14761UL, 3181UL,  8928UL,  12123UL, 27543UL, 18196UL, 23097UL, 19652UL, 17603UL, 9372UL,  19301UL, 5588UL,  16429UL,
+    26949UL, 31218UL, 32511UL, 7746UL,  12308UL, 5434UL,  3550UL,  32648UL, 24391UL, 31465UL, 10125UL, 14637UL, 6832UL,
+    3841UL,  26651UL, 32099UL, 7166UL,  19110UL, 26915UL, 5477UL,  15264UL, 23285UL, 7187UL,  20769UL, 10914UL, 12235UL,
+    15689UL, 7047UL,  23148UL, 9838UL,  5974UL,  24406UL, 31232UL, 32434UL, 3352UL,  28456UL, 30494UL, 18083UL, 2939UL,
+    17966UL, 24861UL, 5034UL,  18711UL, 1082UL,  30555UL, 10245UL, 19201UL, 27034UL, 13766UL, 14613UL, 19136UL, 4542UL,
+    20023UL, 21638UL, 8335UL,  31906UL, 31411UL, 30428UL, 32480UL, 19702UL, 11172UL, 6548UL,  19405UL, 22448UL, 30479UL,
+    1831UL,  23171UL, 30336UL, 12287UL, 23245UL, 18810UL, 32354UL, 6251UL,  20449UL, 17103UL, 4719UL,  29710UL, 20921UL,
+    31705UL, 63UL,    30097UL, 8551UL,  14533UL, 24368UL, 4302UL,  4955UL,  22275UL, 21498UL, 23885UL, 14902UL, 24056UL,
+    3306UL,  2648UL,  542UL,   27461UL, 22124UL, 6859UL,  6454UL,  14502UL, 6496UL,  31996UL, 27131UL, 10706UL, 6766UL,
+    25035UL, 3919UL,  27760UL, 21234UL, 3979UL,  7363UL,  30814UL, 32123UL, 5825UL,  32506UL, 7816UL,  19060UL, 12871UL,
+    17000UL, 19446UL, 8448UL,  1231UL,  16314UL, 7384UL,  2188UL,  23883UL, 32080UL, 27582UL, 31540UL, 5274UL,  1639UL,
+    8123UL,  81UL,    1713UL,  6407UL,  3915UL,  2885UL,  32129UL, 13626UL, 2731UL,  28921UL, 25961UL, 20214UL, 8079UL,
+    10943UL, 31474UL, 7537UL,  14929UL, 30714UL, 22039UL, 665UL,   3815UL,  18105UL, 22360UL, 25498UL, 14416UL, 18404UL,
+    17045UL, 23267UL, 18681UL, 26778UL, 29540UL, 4627UL,  14507UL, 18814UL, 3929UL,  1245UL,  16970UL, 30204UL, 31968UL,
+    20101UL, 9285UL,  5740UL,  10014UL, 8301UL,  4179UL,  16364UL, 559UL,   3301UL,  15637UL, 15616UL, 9830UL,  13658UL,
+    9299UL,  17635UL, 1554UL,  26754UL, 7667UL,  2735UL,  31162UL, 16491UL, 17040UL, 4497UL,  29824UL, 22228UL, 6180UL,
+    16626UL, 30586UL, 378UL,   22456UL, 12040UL, 3934UL,  31156UL, 13924UL, 31611UL, 29627UL, 12257UL, 22598UL, 4585UL,
+    29528UL, 16932UL, 11402UL, 21622UL, 24349UL, 5471UL,  2245UL,  30932UL, 30577UL, 25887UL, 29360UL, 15397UL, 17749UL,
+    483UL,   4288UL,  19278UL, 20099UL, 21844UL, 13362UL, 4852UL,  25935UL, 18860UL, 22192UL, 727UL,   30114UL, 5678UL,
+    20016UL, 22840UL, 31617UL, 7847UL,  19393UL, 1569UL,  7565UL,  29847UL, 25072UL, 28769UL, 15457UL, 26385UL, 13519UL,
+    3886UL,  27094UL, 28471UL, 1223UL,  27237UL, 4900UL,  21436UL, 15330UL, 26328UL, 1165UL,  10540UL, 8559UL,  9117UL,
+    21243UL, 21271UL, 21316UL, 6309UL,  16802UL, 3769UL,  8003UL,  28867UL, 25776UL, 24919UL, 27072UL, 15982UL, 27821UL,
+    6548UL,  6633UL,  22197UL, 22972UL, 4943UL,  31012UL, 8021UL,  9394UL,  1213UL,  14153UL, 15439UL, 9665UL,  4951UL,
+    11002UL, 31533UL, 17429UL, 6604UL,  29802UL, 2325UL,  27423UL, 11879UL, 31530UL, 26890UL, 27402UL, 14100UL, 2483UL,
+    4976UL,  14864UL, 19604UL, 31702UL, 15420UL, 15030UL, 7456UL,  17912UL, 13659UL, 25157UL, 26755UL, 5357UL,  30451UL,
+    27579UL, 25081UL, 18966UL, 12487UL, 28193UL, 11474UL, 18169UL, 27885UL, 2673UL,  9169UL,  27082UL, 6129UL,  15764UL,
+    21894UL, 18439UL, 25948UL, 4535UL,  30783UL, 27733UL, 31105UL, 6002UL,  10135UL, 17187UL, 10558UL, 28907UL, 14201UL,
+    10916UL, 19074UL, 8287UL,  25582UL, 23292UL, 16590UL, 18202UL, 12424UL, 17343UL, 10066UL, 15995UL, 6397UL,  18201UL,
+    32240UL, 15817UL, 1878UL,  12992UL, 24494UL, 880UL,   22046UL, 18547UL, 24634UL, 5885UL,  20107UL, 15602UL, 12769UL,
+    2072UL,  31017UL, 13633UL, 31045UL, 27299UL, 15326UL, 29860UL, 20117UL, 10538UL, 20867UL, 4810UL,  28782UL, 8631UL,
+    28473UL, 20421UL, 24421UL, 9047UL,  30488UL, 16640UL, 7904UL,  17337UL, 10182UL, 2643UL,  29567UL, 2710UL,  7424UL,
+    21970UL, 4724UL,  15604UL, 4671UL,  5516UL,  3128UL,  526UL,   22612UL, 22070UL, 30938UL, 26598UL, 8155UL,  14145UL,
+    24520UL, 23182UL, 17669UL, 28382UL, 32011UL, 574UL,   4130UL,  24300UL, 27632UL, 8079UL,  1540UL,  32298UL, 25164UL,
+    12235UL, 9340UL,  27930UL, 13701UL, 9700UL,  23585UL, 12883UL, 15319UL, 29853UL, 18590UL, 9110UL,  5178UL,  11798UL,
+    9821UL,  27219UL, 19432UL, 31830UL, 6696UL,  22823UL, 4394UL,  21996UL, 4489UL,  27112UL, 29615UL, 19287UL, 11416UL,
+    21885UL, 5212UL,  349UL,   28277UL, 28388UL, 10333UL, 4518UL,  5043UL,  28958UL, 15026UL, 2416UL,  11059UL, 8069UL,
+    13734UL, 978UL,   8333UL,  30040UL, 25410UL, 26047UL, 9190UL,  5244UL,  11494UL, 15432UL, 18776UL, 3805UL,  7018UL,
+    7006UL,  24678UL, 4959UL,  21954UL, 3487UL,  7176UL,  21092UL, 4947UL,  11677UL, 9660UL,  20413UL, 16413UL, 9095UL,
+    30127UL, 15423UL, 14993UL, 21805UL, 21797UL, 27108UL, 23430UL, 14706UL, 10300UL, 14478UL, 21043UL, 1536UL,  17448UL,
+    17882UL, 6295UL,  2913UL,  26671UL, 4612UL,  22738UL, 18665UL, 26463UL, 31543UL, 6470UL,  28919UL, 7977UL,  28270UL,
+    6434UL,  6304UL,  13059UL, 13115UL, 27796UL, 4503UL,  14446UL, 22066UL, 19021UL, 21938UL, 15075UL, 31224UL, 25990UL,
+    16238UL, 24181UL, 21410UL, 18355UL, 22692UL, 7869UL,  21445UL, 24692UL, 9039UL,  30860UL, 22290UL, 9560UL,  17701UL,
+    18006UL, 649UL,   15124UL, 21255UL, 7730UL,  16790UL, 17839UL, 6488UL,  15610UL, 12453UL, 25901UL, 18935UL, 10783UL,
+    7397UL,  22158UL, 18458UL, 27766UL, 15914UL, 139UL,   23503UL, 3745UL,  26845UL, 29291UL, 1555UL,  20892UL, 29563UL,
+    2309UL,  30026UL, 11690UL, 32174UL, 1529UL,  831UL,   26233UL, 22979UL, 31496UL, 28832UL, 7229UL,  21864UL, 23930UL,
+    12497UL, 32594UL, 21231UL, 3423UL,  24399UL, 26033UL, 6968UL,  27574UL, 6065UL,  31972UL, 31418UL, 5702UL,  32410UL,
+    18520UL, 17694UL, 7765UL,  28432UL, 10615UL, 9825UL,  20648UL, 27702UL, 8309UL,  4228UL,  11745UL, 15681UL, 27782UL,
+    11728UL, 17597UL, 24923UL, 14895UL, 12510UL, 464UL,   16925UL, 4435UL,  15192UL, 14781UL, 12774UL, 12644UL, 8220UL,
+    16153UL, 6596UL,  18040UL, 18874UL, 15750UL, 27253UL, 21940UL, 26906UL, 27508UL, 28797UL,
+};
+uint32_t rand_arr_16_b16_w32_arr[1024] = {
+    16500UL, 23931UL, 523UL,   59040UL, 16994UL, 40541UL, 4075UL,  51617UL, 64543UL, 57649UL, 58279UL, 12487UL, 45370UL,
+    4173UL,  9102UL,  60129UL, 19732UL, 24041UL, 22836UL, 47308UL, 39220UL, 45616UL, 44267UL, 17320UL, 55086UL, 27065UL,
+    29882UL, 50502UL, 37130UL, 45039UL, 60234UL, 10923UL, 40837UL, 5712UL,  44837UL, 7552UL,  42935UL, 7559UL,  2839UL,
+    40270UL, 46998UL, 9600UL,  51463UL, 8811UL,  13885UL, 30802UL, 47538UL, 14381UL, 38509UL, 56826UL, 6120UL,  30883UL,
+    37866UL, 4268UL,  59243UL, 18285UL, 6955UL,  36323UL, 43980UL, 25935UL, 40788UL, 29302UL, 39413UL, 30116UL, 23494UL,
+    2522UL,  24170UL, 58659UL, 6645UL,  6072UL,  10460UL, 45224UL, 5378UL,  9084UL,  42267UL, 13285UL, 28689UL, 57511UL,
+    26035UL, 44546UL, 40167UL, 57889UL, 10486UL, 22220UL, 51898UL, 41975UL, 37930UL, 26004UL, 36855UL, 33276UL, 10724UL,
+    6805UL,  50641UL, 30191UL, 55526UL, 11273UL, 49753UL, 21800UL, 8279UL,  51167UL, 44771UL, 1546UL,  14112UL, 10088UL,
+    50913UL, 30398UL, 36356UL, 54260UL, 23247UL, 42851UL, 30700UL, 2134UL,  56985UL, 49471UL, 63370UL, 21568UL, 5972UL,
+    39171UL, 15264UL, 11445UL, 62866UL, 25769UL, 44220UL, 5692UL,  63357UL, 35158UL, 32368UL, 4912UL,  41968UL, 4584UL,
+    38296UL, 53782UL, 28156UL, 3727UL,  3163UL,  58895UL, 62925UL, 64576UL, 51843UL, 64036UL, 37634UL, 1985UL,  27772UL,
+    57863UL, 19408UL, 24226UL, 23773UL, 50089UL, 52216UL, 316UL,   44937UL, 20546UL, 41052UL, 12768UL, 32969UL, 35062UL,
+    63166UL, 10975UL, 17948UL, 61096UL, 57068UL, 2373UL,  61627UL, 40864UL, 26210UL, 14850UL, 59879UL, 9969UL,  62045UL,
+    9081UL,  17469UL, 23408UL, 41592UL, 15507UL, 41123UL, 43858UL, 5902UL,  40596UL, 49126UL, 42188UL, 46291UL, 56166UL,
+    18443UL, 51592UL, 41707UL, 29514UL, 35856UL, 49267UL, 2051UL,  28921UL, 58786UL, 31799UL, 31844UL, 24957UL, 65073UL,
+    21071UL, 8487UL,  2236UL,  22911UL, 9219UL,  48743UL, 1839UL,  1219UL,  26739UL, 22156UL, 2350UL,  55996UL, 35466UL,
+    56594UL, 64418UL, 41624UL, 64655UL, 62876UL, 41379UL, 27053UL, 17541UL, 49093UL, 35958UL, 47010UL, 18085UL, 42737UL,
+    8916UL,  29122UL, 39317UL, 23625UL, 61988UL, 20295UL, 28362UL, 13652UL, 39856UL, 53254UL, 41901UL, 40069UL, 10679UL,
+    26291UL, 33525UL, 39415UL, 63569UL, 13754UL, 8373UL,  44687UL, 57528UL, 55966UL, 3347UL,  19990UL, 64054UL, 19462UL,
+    24658UL, 15746UL, 13874UL, 6271UL,  23611UL, 42497UL, 4132UL,  57909UL, 18559UL, 15267UL, 2912UL,  56600UL, 47411UL,
+    52450UL, 23975UL, 14052UL, 47647UL, 52582UL, 60931UL, 19596UL, 18041UL, 47558UL, 58596UL, 35260UL, 701UL,   63648UL,
+    62719UL, 33245UL, 2523UL,  30417UL, 1556UL,  63205UL, 44296UL, 54672UL, 48352UL, 32770UL, 57714UL, 59901UL, 2413UL,
+    13687UL, 33676UL, 59140UL, 2460UL,  63278UL, 61563UL, 24361UL, 33291UL, 322UL,   5098UL,  44877UL, 33796UL, 34501UL,
+    12877UL, 18099UL, 36107UL, 55616UL, 5990UL,  37646UL, 45199UL, 60974UL, 4586UL,  50233UL, 59373UL, 20038UL, 30897UL,
+    5216UL,  20825UL, 4144UL,  41900UL, 13328UL, 46495UL, 62265UL, 45587UL, 41863UL, 25534UL, 16264UL, 40551UL, 47396UL,
+    61955UL, 15056UL, 35884UL, 56210UL, 38908UL, 25380UL, 14906UL, 15099UL, 10432UL, 12989UL, 30847UL, 18033UL, 5523UL,
+    57174UL, 8378UL,  37540UL, 10283UL, 736UL,   46138UL, 18548UL, 52336UL, 55909UL, 46167UL, 56695UL, 65354UL, 31786UL,
+    48997UL, 30355UL, 38377UL, 27013UL, 14796UL, 13505UL, 28763UL, 63762UL, 9825UL,  13558UL, 7731UL,  61952UL, 479UL,
+    4822UL,  14215UL, 44574UL, 42275UL, 54234UL, 4642UL,  233UL,   13467UL, 5763UL,  61413UL, 37129UL, 35857UL, 51754UL,
+    7677UL,  26217UL, 43457UL, 7566UL,  60588UL, 3480UL,  60367UL, 30356UL, 34537UL, 38635UL, 35046UL, 22511UL, 47971UL,
+    58143UL, 40951UL, 13640UL, 6621UL,  15435UL, 10871UL, 16571UL, 24745UL, 36588UL, 14376UL, 2536UL,  2266UL,  17367UL,
+    2112UL,  14298UL, 42820UL, 55272UL, 32206UL, 12169UL, 54619UL, 15337UL, 2381UL,  15324UL, 33402UL, 6303UL,  5233UL,
+    17871UL, 11367UL, 24907UL, 53917UL, 32949UL, 4215UL,  8915UL,  10907UL, 52927UL, 24587UL, 5450UL,  27586UL, 64103UL,
+    60647UL, 37797UL, 15197UL, 53878UL, 39239UL, 18390UL, 59874UL, 39498UL, 37693UL, 37155UL, 4347UL,  8582UL,  21990UL,
+    43636UL, 53913UL, 33961UL, 6477UL,  47560UL, 23920UL, 42747UL, 23614UL, 14236UL, 49549UL, 15766UL, 27693UL, 6056UL,
+    6277UL,  35677UL, 35439UL, 28632UL, 18560UL, 57764UL, 59202UL, 20539UL, 50316UL, 1770UL,  35090UL, 10212UL, 5012UL,
+    16367UL, 25557UL, 1638UL,  53140UL, 20859UL, 7654UL,  19960UL, 8361UL,  5385UL,  26639UL, 51437UL, 63478UL, 9564UL,
+    29370UL, 60654UL, 62749UL, 9938UL,  13656UL, 8240UL,  9233UL,  13182UL, 47682UL, 43753UL, 18706UL, 63564UL, 25565UL,
+    56475UL, 7948UL,  1684UL,  48234UL, 28185UL, 10601UL, 53745UL, 6285UL,  64026UL, 26092UL, 28672UL, 42842UL, 27747UL,
+    62359UL, 30525UL, 31902UL, 28681UL, 43811UL, 45150UL, 59343UL, 13941UL, 57251UL, 4138UL,  61980UL, 7934UL,  47732UL,
+    7536UL,  29889UL, 7605UL,  19795UL, 29406UL, 1204UL,  25867UL, 506UL,   13928UL, 63210UL, 40701UL, 23525UL, 52878UL,
+    29507UL, 32615UL, 25218UL, 47399UL, 422UL,   53615UL, 51159UL, 26663UL, 57633UL, 11854UL, 48668UL, 24418UL, 34134UL,
+    22998UL, 4551UL,  35899UL, 25099UL, 20265UL, 8623UL,  31369UL, 15648UL, 60599UL, 4959UL,  11098UL, 46240UL, 49469UL,
+    19811UL, 17083UL, 65245UL, 29049UL, 35831UL, 40639UL, 15675UL, 51338UL, 34219UL, 21820UL, 52296UL, 60863UL, 29375UL,
+    35234UL, 8735UL,  30906UL, 1293UL,  62543UL, 19681UL, 31873UL, 26611UL, 50498UL, 21719UL, 58776UL, 37751UL, 55397UL,
+    10187UL, 39312UL, 36924UL, 57580UL, 6797UL,  8672UL,  51189UL, 49192UL, 22750UL, 21646UL, 6463UL,  53491UL, 45770UL,
+    19032UL, 54855UL, 39307UL, 25879UL, 34890UL, 9576UL,  55485UL, 9069UL,  20768UL, 5919UL,  8727UL,  35399UL, 15891UL,
+    17065UL, 50754UL, 48900UL, 27335UL, 20979UL, 59979UL, 47874UL, 19950UL, 3142UL,  1780UL,  8592UL,  15170UL, 43442UL,
+    54097UL, 25965UL, 21002UL, 6100UL,  11011UL, 12194UL, 2150UL,  3835UL,  15444UL, 4701UL,  53986UL, 49880UL, 25569UL,
+    14557UL, 60025UL, 6162UL,  31330UL, 52240UL, 58713UL, 16107UL, 8889UL,  1206UL,  14930UL, 33541UL, 47209UL, 33160UL,
+    393UL,   19375UL, 36172UL, 43236UL, 10660UL, 25598UL, 65506UL, 13746UL, 41398UL, 57756UL, 23774UL, 62151UL, 20974UL,
+    26636UL, 3852UL,  36047UL, 24159UL, 39828UL, 10306UL, 25339UL, 39406UL, 40538UL, 28504UL, 61848UL, 29954UL, 27437UL,
+    20885UL, 65078UL, 5479UL,  57082UL, 21056UL, 64017UL, 42408UL, 24460UL, 50946UL, 19445UL, 59508UL, 33509UL, 33966UL,
+    33225UL, 14557UL, 49749UL, 58700UL, 29174UL, 23160UL, 62132UL, 33040UL, 40556UL, 10053UL, 51984UL, 46602UL, 5225UL,
+    10738UL, 51389UL, 49252UL, 45399UL, 717UL,   14816UL, 27159UL, 36517UL, 40593UL, 62019UL, 43911UL, 44289UL, 9166UL,
+    45858UL, 44670UL, 1888UL,  13791UL, 18354UL, 25063UL, 43259UL, 14841UL, 12472UL, 30453UL, 259UL,   44938UL, 1264UL,
+    1996UL,  11960UL, 36554UL, 25617UL, 25621UL, 37117UL, 64533UL, 40698UL, 12345UL, 21548UL, 62812UL, 18483UL, 46966UL,
+    16541UL, 33862UL, 62017UL, 61343UL, 54717UL, 18686UL, 58836UL, 53957UL, 9332UL,  54566UL, 11732UL, 893UL,   44649UL,
+    8068UL,  20942UL, 38492UL, 16647UL, 29461UL, 19444UL, 23083UL, 45982UL, 44796UL, 57569UL, 6713UL,  20156UL, 57410UL,
+    55956UL, 12194UL, 19405UL, 46914UL, 43131UL, 28051UL, 62852UL, 44886UL, 39048UL, 9596UL,  30898UL, 21234UL, 2151UL,
+    24584UL, 20502UL, 51886UL, 14856UL, 61801UL, 34620UL, 56297UL, 13221UL, 14111UL, 52117UL, 41649UL, 32927UL, 29339UL,
+    35006UL, 50380UL, 51393UL, 38481UL, 46746UL, 56250UL, 32738UL, 45635UL, 47914UL, 48697UL, 47539UL, 18115UL, 13051UL,
+    61606UL, 458UL,   58993UL, 24229UL, 37024UL, 2484UL,  54435UL, 20956UL, 43874UL, 42858UL, 37717UL, 4054UL,  1539UL,
+    5616UL,  64792UL, 48137UL, 35206UL, 50771UL, 22739UL, 26794UL, 60078UL, 65241UL, 26483UL, 1938UL,  8218UL,  29040UL,
+    63027UL, 7769UL,  11750UL, 28589UL, 33531UL, 48412UL, 46754UL, 33351UL, 43051UL, 63328UL, 64409UL, 59268UL, 44282UL,
+    48793UL, 20518UL, 40485UL, 5909UL,  30875UL, 28810UL, 10730UL, 47184UL, 13499UL, 8539UL,  23841UL, 40984UL, 16548UL,
+    29507UL, 57795UL, 50776UL, 49508UL, 51387UL, 151UL,   47974UL, 65395UL, 30036UL, 31552UL, 26142UL, 8302UL,  45411UL,
+    53549UL, 36784UL, 32427UL, 62296UL, 48394UL, 65353UL, 50939UL, 26519UL, 35909UL, 2590UL,  18168UL, 13565UL, 13176UL,
+    36217UL, 48907UL, 5231UL,  57601UL, 34603UL, 5941UL,  20036UL, 1796UL,  56204UL, 56655UL, 47291UL, 20775UL, 61315UL,
+    1534UL,  49403UL, 31615UL, 61601UL, 32005UL, 23670UL, 22698UL, 52724UL, 643UL,   61959UL, 52415UL, 28789UL, 17527UL,
+    2882UL,  58262UL, 42801UL, 30101UL, 50609UL, 58674UL, 43976UL, 21198UL, 41223UL, 62975UL, 47999UL, 5625UL,  30899UL,
+    30827UL, 52037UL, 18794UL, 65353UL, 42310UL, 60769UL, 21361UL, 16470UL, 24661UL, 31902UL, 42840UL, 13181UL, 18491UL,
+    48506UL, 58354UL, 65252UL, 7366UL,  38316UL, 6790UL,  12593UL, 64937UL, 5946UL,  9419UL,  42516UL, 13882UL, 6894UL,
+    2994UL,  8203UL,  9691UL,  56853UL, 24840UL, 59550UL, 40003UL, 62935UL, 13501UL, 24188UL, 50769UL, 44039UL, 26964UL,
+    42693UL, 33312UL, 33192UL, 12345UL, 3851UL,  8794UL,  37661UL, 31375UL, 63776UL, 62835UL, 35452UL, 7270UL,  57904UL,
+    57022UL, 10632UL, 40001UL, 20296UL, 3989UL,  9138UL,  370UL,   12595UL, 5134UL,  5708UL,  48735UL, 65388UL, 23721UL,
+    13312UL, 53294UL, 1506UL,  61917UL, 62294UL, 43790UL, 46213UL, 37023UL, 11816UL, 9258UL,  30701UL, 37293UL, 50555UL,
+    13003UL, 35410UL, 18940UL, 32847UL, 18616UL, 39384UL, 33618UL, 58390UL, 39523UL, 27453UL, 29956UL, 17281UL, 61034UL,
+    45422UL, 4250UL,  10751UL, 8221UL,  51843UL, 15042UL, 20919UL, 60945UL, 11294UL, 17568UL,
+};
+uint32_t rand_arr_17_b17_w32_arr[1024] = {
+    127563UL, 41769UL,  72636UL,  115998UL, 126893UL, 83387UL,  11UL,     69609UL,  116808UL, 99119UL,  83367UL,
+    88635UL,  36993UL,  31101UL,  77310UL,  8080UL,   64685UL,  102852UL, 80743UL,  2430UL,   889UL,    69158UL,
+    53054UL,  40704UL,  40926UL,  68642UL,  33275UL,  87075UL,  20209UL,  103103UL, 80537UL,  117119UL, 56259UL,
+    115428UL, 31251UL,  60834UL,  7562UL,   57263UL,  75528UL,  117486UL, 31064UL,  21920UL,  116564UL, 96170UL,
+    29752UL,  126505UL, 57724UL,  91172UL,  97950UL,  47785UL,  96608UL,  42247UL,  82186UL,  62308UL,  1398UL,
+    59825UL,  69414UL,  35490UL,  25215UL,  70649UL,  3628UL,   35859UL,  40433UL,  81725UL,  97085UL,  51480UL,
+    48756UL,  105441UL, 34672UL,  42997UL,  78731UL,  25253UL,  104997UL, 39499UL,  17865UL,  83137UL,  125942UL,
+    6885UL,   122325UL, 131051UL, 19396UL,  41596UL,  17994UL,  115586UL, 8596UL,   13686UL,  43095UL,  106795UL,
+    18626UL,  32102UL,  102578UL, 6273UL,   114804UL, 125484UL, 90664UL,  72796UL,  53520UL,  6776UL,   126766UL,
+    84630UL,  44803UL,  58511UL,  62779UL,  101470UL, 73624UL,  64633UL,  2698UL,   94907UL,  98551UL,  24803UL,
+    49340UL,  36795UL,  105981UL, 13411UL,  24272UL,  13255UL,  117211UL, 50850UL,  77116UL,  17123UL,  94154UL,
+    47655UL,  118243UL, 6199UL,   59340UL,  73858UL,  103981UL, 64761UL,  98705UL,  33493UL,  105237UL, 27430UL,
+    11602UL,  61690UL,  63548UL,  12379UL,  56464UL,  49565UL,  52665UL,  14478UL,  41749UL,  87443UL,  96431UL,
+    108378UL, 41799UL,  79659UL,  89593UL,  116402UL, 99975UL,  49535UL,  33106UL,  123625UL, 114223UL, 5157UL,
+    3259UL,   19139UL,  107079UL, 118128UL, 63572UL,  79759UL,  8874UL,   68123UL,  113290UL, 62843UL,  53868UL,
+    6561UL,   8443UL,   98857UL,  12726UL,  62301UL,  69759UL,  9311UL,   33254UL,  107510UL, 111902UL, 119102UL,
+    4646UL,   120993UL, 112762UL, 72243UL,  25282UL,  82537UL,  94836UL,  24837UL,  40700UL,  15832UL,  16866UL,
+    64490UL,  83392UL,  76374UL,  37771UL,  34528UL,  124018UL, 55605UL,  79103UL,  1845UL,   125069UL, 4230UL,
+    54922UL,  73024UL,  46893UL,  40230UL,  45648UL,  3241UL,   69737UL,  98547UL,  3848UL,   45971UL,  41354UL,
+    11344UL,  52660UL,  121553UL, 2262UL,   106044UL, 36765UL,  938UL,    117706UL, 121519UL, 98972UL,  100999UL,
+    18610UL,  19678UL,  45204UL,  116107UL, 109061UL, 54001UL,  119768UL, 7310UL,   43283UL,  55318UL,  102493UL,
+    101329UL, 85816UL,  11581UL,  35556UL,  30424UL,  84153UL,  29833UL,  58472UL,  9466UL,   25096UL,  53580UL,
+    3988UL,   123283UL, 10499UL,  100238UL, 125559UL, 21248UL,  118240UL, 49522UL,  98715UL,  120430UL, 103072UL,
+    113837UL, 7092UL,   109050UL, 28773UL,  20428UL,  122909UL, 130334UL, 58402UL,  290UL,    83147UL,  84979UL,
+    24947UL,  24380UL,  118218UL, 44092UL,  31504UL,  64846UL,  76968UL,  70054UL,  102351UL, 124708UL, 53567UL,
+    92123UL,  14530UL,  4290UL,   86934UL,  118881UL, 107315UL, 51346UL,  113327UL, 84866UL,  28884UL,  130976UL,
+    99700UL,  63725UL,  100300UL, 78298UL,  101031UL, 103804UL, 38660UL,  79990UL,  97498UL,  688UL,    80482UL,
+    22266UL,  113854UL, 112332UL, 64923UL,  60652UL,  57934UL,  125314UL, 89005UL,  90070UL,  52905UL,  112361UL,
+    102117UL, 14463UL,  88870UL,  5523UL,   58507UL,  108904UL, 56876UL,  86516UL,  54573UL,  70797UL,  22318UL,
+    10919UL,  73874UL,  96312UL,  27473UL,  85026UL,  56963UL,  75371UL,  83186UL,  104039UL, 99300UL,  30769UL,
+    88757UL,  123466UL, 103187UL, 97365UL,  111712UL, 14380UL,  61049UL,  101784UL, 95366UL,  50356UL,  75525UL,
+    97981UL,  35869UL,  78838UL,  85646UL,  80410UL,  129399UL, 31358UL,  119192UL, 122335UL, 10047UL,  82101UL,
+    86185UL,  96139UL,  57678UL,  14484UL,  11263UL,  8930UL,   9985UL,   96756UL,  95310UL,  78582UL,  56255UL,
+    119686UL, 126466UL, 119682UL, 75420UL,  120153UL, 55362UL,  29761UL,  46392UL,  58475UL,  39585UL,  73416UL,
+    113151UL, 128732UL, 77557UL,  16337UL,  45751UL,  25703UL,  53669UL,  51999UL,  107784UL, 113277UL, 69603UL,
+    42861UL,  129969UL, 2117UL,   26464UL,  71911UL,  91656UL,  90234UL,  110225UL, 17172UL,  93946UL,  66248UL,
+    89522UL,  98276UL,  83705UL,  20567UL,  105906UL, 8661UL,   50416UL,  7451UL,   95817UL,  129129UL, 39226UL,
+    18694UL,  106315UL, 55733UL,  32640UL,  95731UL,  86770UL,  94277UL,  123129UL, 37652UL,  6828UL,   91252UL,
+    105687UL, 118106UL, 130984UL, 23657UL,  41174UL,  31488UL,  80491UL,  3273UL,   119071UL, 121326UL, 27074UL,
+    28719UL,  14020UL,  87569UL,  115553UL, 55895UL,  87056UL,  9965UL,   39635UL,  32822UL,  25601UL,  127305UL,
+    42557UL,  117764UL, 124602UL, 86443UL,  40325UL,  46233UL,  62062UL,  4798UL,   117300UL, 116883UL, 16657UL,
+    127607UL, 107918UL, 103697UL, 86020UL,  89778UL,  27133UL,  85323UL,  4227UL,   26370UL,  60412UL,  6764UL,
+    74533UL,  130474UL, 82610UL,  33413UL,  56007UL,  107129UL, 44076UL,  49862UL,  112901UL, 125129UL, 60553UL,
+    69294UL,  78252UL,  19454UL,  63037UL,  46426UL,  60119UL,  113883UL, 47707UL,  112963UL, 76261UL,  97704UL,
+    46967UL,  43974UL,  107249UL, 109207UL, 36792UL,  62663UL,  69986UL,  11572UL,  57811UL,  76588UL,  65195UL,
+    42483UL,  68046UL,  43162UL,  56512UL,  130179UL, 120228UL, 129125UL, 108452UL, 78977UL,  87929UL,  86812UL,
+    8303UL,   78517UL,  4467UL,   77899UL,  18304UL,  3885UL,   83463UL,  124661UL, 57832UL,  83606UL,  33924UL,
+    116870UL, 31573UL,  23475UL,  62951UL,  100918UL, 68079UL,  69541UL,  38028UL,  99821UL,  28302UL,  25604UL,
+    84066UL,  10909UL,  72212UL,  89759UL,  67696UL,  43067UL,  35716UL,  45852UL,  47801UL,  110582UL, 38495UL,
+    9790UL,   62106UL,  72177UL,  104410UL, 35556UL,  112476UL, 10666UL,  53868UL,  16577UL,  116376UL, 70852UL,
+    42659UL,  43405UL,  98024UL,  46785UL,  61350UL,  100199UL, 2276UL,   61830UL,  62418UL,  16056UL,  50392UL,
+    57088UL,  51320UL,  45238UL,  68461UL,  107139UL, 14007UL,  121396UL, 88605UL,  57617UL,  128126UL, 52177UL,
+    61808UL,  79675UL,  5376UL,   64686UL,  63149UL,  94596UL,  40394UL,  30225UL,  50659UL,  52348UL,  107508UL,
+    77081UL,  2893UL,   110787UL, 68867UL,  87383UL,  78150UL,  62052UL,  21585UL,  80381UL,  86400UL,  96618UL,
+    60110UL,  38329UL,  835UL,    49669UL,  111456UL, 72806UL,  25441UL,  68366UL,  105001UL, 4791UL,   64943UL,
+    122554UL, 120152UL, 11347UL,  25387UL,  26645UL,  104480UL, 44550UL,  110087UL, 35241UL,  6649UL,   36058UL,
+    113141UL, 46855UL,  119975UL, 110445UL, 31557UL,  76493UL,  32763UL,  66482UL,  121534UL, 110364UL, 6693UL,
+    108103UL, 51112UL,  53242UL,  89105UL,  2717UL,   74294UL,  93454UL,  89619UL,  44009UL,  90910UL,  53985UL,
+    89791UL,  102093UL, 59145UL,  44383UL,  109004UL, 61208UL,  16057UL,  126749UL, 74872UL,  3188UL,   87025UL,
+    79270UL,  89539UL,  28380UL,  129752UL, 4450UL,   100433UL, 58636UL,  18675UL,  39096UL,  100932UL, 14606UL,
+    21114UL,  63567UL,  108337UL, 51156UL,  47537UL,  89946UL,  107230UL, 76740UL,  67299UL,  33909UL,  123631UL,
+    53497UL,  63066UL,  117402UL, 22654UL,  93949UL,  11373UL,  95603UL,  43215UL,  130543UL, 110347UL, 77553UL,
+    109281UL, 96251UL,  105242UL, 37674UL,  104523UL, 79063UL,  77518UL,  78221UL,  83777UL,  44189UL,  124210UL,
+    76214UL,  6070UL,   113882UL, 53613UL,  73561UL,  85402UL,  15749UL,  49268UL,  5306UL,   13180UL,  127914UL,
+    34162UL,  22153UL,  97054UL,  55567UL,  105688UL, 10065UL,  71127UL,  39898UL,  70737UL,  104835UL, 42946UL,
+    101637UL, 27100UL,  2604UL,   68138UL,  23003UL,  12077UL,  15024UL,  43747UL,  34276UL,  66702UL,  50231UL,
+    58601UL,  35669UL,  111442UL, 107687UL, 55714UL,  78058UL,  88754UL,  9336UL,   78949UL,  103892UL, 111690UL,
+    22142UL,  28343UL,  98536UL,  120134UL, 20156UL,  54765UL,  1626UL,   5974UL,   77262UL,  81773UL,  79351UL,
+    79164UL,  12223UL,  39831UL,  1766UL,   120938UL, 34934UL,  67002UL,  60742UL,  38UL,     65579UL,  58281UL,
+    116276UL, 85180UL,  50136UL,  66563UL,  100537UL, 83387UL,  97638UL,  73703UL,  47428UL,  103650UL, 51140UL,
+    129461UL, 10966UL,  69394UL,  39408UL,  86915UL,  29717UL,  55754UL,  83387UL,  67415UL,  54774UL,  99435UL,
+    120874UL, 88279UL,  53519UL,  116919UL, 31214UL,  57889UL,  97705UL,  8327UL,   92677UL,  19882UL,  125086UL,
+    37802UL,  21814UL,  121839UL, 2745UL,   92923UL,  118099UL, 25192UL,  114637UL, 40735UL,  20480UL,  11944UL,
+    61412UL,  113125UL, 107199UL, 52623UL,  45094UL,  9033UL,   86088UL,  111925UL, 62044UL,  89102UL,  105522UL,
+    36704UL,  39737UL,  88803UL,  298UL,    22570UL,  112868UL, 122796UL, 111118UL, 2991UL,   79030UL,  126031UL,
+    50758UL,  120689UL, 114687UL, 96424UL,  44833UL,  101786UL, 129296UL, 46280UL,  98141UL,  25043UL,  69031UL,
+    63633UL,  88051UL,  99287UL,  121202UL, 118684UL, 63596UL,  54875UL,  46621UL,  9719UL,   39156UL,  66127UL,
+    76669UL,  76719UL,  110430UL, 3113UL,   47761UL,  89101UL,  3499UL,   118010UL, 70459UL,  87367UL,  58672UL,
+    88744UL,  98992UL,  59579UL,  116124UL, 83073UL,  5089UL,   120572UL, 89560UL,  20351UL,  71554UL,  116682UL,
+    28812UL,  20156UL,  48735UL,  33915UL,  106527UL, 22115UL,  123318UL, 63628UL,  46455UL,  64015UL,  2534UL,
+    64673UL,  31440UL,  112365UL, 50679UL,  124196UL, 33347UL,  74939UL,  13391UL,  22603UL,  2870UL,   34711UL,
+    72295UL,  128101UL, 24748UL,  87358UL,  34410UL,  59216UL,  80027UL,  93546UL,  120929UL, 15718UL,  98043UL,
+    98533UL,  104768UL, 80001UL,  78898UL,  95019UL,  72970UL,  80867UL,  18841UL,  4301UL,   66896UL,  55342UL,
+    63919UL,  21100UL,  128130UL, 125168UL, 31317UL,  44590UL,  53811UL,  36186UL,  92534UL,  17940UL,  89062UL,
+    10447UL,  100358UL, 18908UL,  83972UL,  72253UL,  38823UL,  128339UL, 59744UL,  93584UL,  99803UL,  94819UL,
+    122095UL, 97707UL,  91623UL,  86999UL,  23539UL,  69871UL,  32492UL,  80828UL,  114961UL, 113354UL, 27671UL,
+    29213UL,  71953UL,  109351UL, 74681UL,  52487UL,  78601UL,  96020UL,  10568UL,  58984UL,  77991UL,  49023UL,
+    82485UL,  72592UL,  91624UL,  9671UL,   124360UL, 53376UL,  37781UL,  130750UL, 11418UL,  126139UL, 21508UL,
+    129871UL, 128279UL, 94439UL,  109667UL, 20406UL,  122718UL, 59411UL,  111014UL, 80847UL,  14733UL,  117951UL,
+    120790UL, 91061UL,  67678UL,  96143UL,  122179UL, 57729UL,  48040UL,  106384UL, 24471UL,  15710UL,  58017UL,
+    101921UL, 7191UL,   28500UL,  48623UL,  116493UL, 124135UL, 50196UL,  46039UL,  66580UL,  63338UL,  56052UL,
+    94018UL,  103053UL, 82144UL,  59639UL,  70722UL,  83683UL,  69281UL,  86538UL,  112949UL, 129974UL, 11169UL,
+    90317UL,  127291UL, 120596UL, 100713UL, 36786UL,  119851UL, 116650UL, 119896UL, 58071UL,  52345UL,  23095UL,
+    25258UL,
+};
+uint32_t rand_arr_18_b18_w32_arr[1024] = {
+    134474UL, 97155UL,  147968UL, 204356UL, 175062UL, 94450UL,  84824UL,  24138UL,  103259UL, 105963UL, 256665UL,
+    164471UL, 87956UL,  205362UL, 21945UL,  166775UL, 164123UL, 23642UL,  23850UL,  243741UL, 86769UL,  85177UL,
+    200154UL, 69867UL,  100939UL, 50762UL,  49770UL,  207710UL, 163603UL, 71005UL,  144985UL, 233276UL, 228868UL,
+    91427UL,  72746UL,  83304UL,  204727UL, 47722UL,  116490UL, 173442UL, 160545UL, 235976UL, 54504UL,  226532UL,
+    127851UL, 131496UL, 168105UL, 105892UL, 208553UL, 146526UL, 150806UL, 116844UL, 21738UL,  262035UL, 229850UL,
+    24929UL,  105711UL, 181363UL, 35040UL,  208028UL, 8888UL,   80850UL,  173656UL, 137311UL, 50471UL,  166590UL,
+    173066UL, 164170UL, 109841UL, 131331UL, 126541UL, 231899UL, 45199UL,  41885UL,  94500UL,  250808UL, 5428UL,
+    163961UL, 251069UL, 29279UL,  254712UL, 150997UL, 148064UL, 212618UL, 194508UL, 226456UL, 139399UL, 191431UL,
+    128107UL, 98207UL,  114689UL, 145078UL, 127446UL, 54577UL,  128815UL, 42900UL,  141395UL, 215900UL, 152457UL,
+    228829UL, 65050UL,  73803UL,  144060UL, 254686UL, 44961UL,  170740UL, 54173UL,  128623UL, 4752UL,   37841UL,
+    8056UL,   47478UL,  84719UL,  28698UL,  89618UL,  220948UL, 55708UL,  212164UL, 152061UL, 158378UL, 110773UL,
+    106541UL, 96804UL,  85835UL,  75570UL,  11647UL,  2288UL,   221336UL, 206925UL, 84134UL,  64468UL,  217591UL,
+    53976UL,  122627UL, 253824UL, 132369UL, 229510UL, 139404UL, 33478UL,  169360UL, 157378UL, 111048UL, 163112UL,
+    192890UL, 242452UL, 2033UL,   111861UL, 210833UL, 125526UL, 201269UL, 185810UL, 117577UL, 242277UL, 261567UL,
+    230879UL, 63705UL,  64061UL,  44739UL,  89863UL,  140485UL, 193976UL, 31071UL,  75102UL,  74860UL,  244677UL,
+    200224UL, 148985UL, 256554UL, 205430UL, 88625UL,  240007UL, 126120UL, 255923UL, 49944UL,  10857UL,  187928UL,
+    24829UL,  55577UL,  94095UL,  223764UL, 75231UL,  42431UL,  101273UL, 118076UL, 94663UL,  93169UL,  192592UL,
+    124562UL, 105955UL, 154249UL, 28537UL,  108947UL, 212356UL, 80925UL,  64730UL,  197445UL, 196833UL, 68201UL,
+    131506UL, 110341UL, 64369UL,  86577UL,  196997UL, 194681UL, 126485UL, 89159UL,  5800UL,   143750UL, 224546UL,
+    155690UL, 195379UL, 206756UL, 143963UL, 256158UL, 67986UL,  32299UL,  16746UL,  126868UL, 25796UL,  73782UL,
+    180892UL, 91387UL,  67267UL,  85016UL,  179933UL, 49684UL,  21125UL,  235584UL, 6476UL,   191882UL, 4609UL,
+    132681UL, 153496UL, 138813UL, 254031UL, 96474UL,  58187UL,  232836UL, 65439UL,  46723UL,  183453UL, 209070UL,
+    48410UL,  217389UL, 76496UL,  20610UL,  107253UL, 84208UL,  257240UL, 103026UL, 189866UL, 76207UL,  209549UL,
+    117864UL, 219239UL, 227586UL, 157996UL, 29082UL,  139973UL, 178129UL, 44542UL,  10604UL,  194776UL, 75607UL,
+    16881UL,  175101UL, 23349UL,  148440UL, 122492UL, 14997UL,  229481UL, 112354UL, 158319UL, 132192UL, 62921UL,
+    85324UL,  148411UL, 160344UL, 31379UL,  130793UL, 24337UL,  160029UL, 46121UL,  2503UL,   257944UL, 241986UL,
+    15613UL,  244547UL, 176872UL, 153434UL, 100840UL, 79847UL,  10489UL,  62338UL,  135452UL, 86478UL,  14934UL,
+    62561UL,  28809UL,  104896UL, 162880UL, 59605UL,  148542UL, 51630UL,  256556UL, 53761UL,  16131UL,  250255UL,
+    115839UL, 75441UL,  216035UL, 195277UL, 123095UL, 34251UL,  120967UL, 257073UL, 110430UL, 8178UL,   55446UL,
+    153432UL, 154950UL, 22350UL,  67624UL,  86817UL,  138714UL, 197617UL, 173508UL, 248359UL, 75317UL,  9369UL,
+    247074UL, 221696UL, 224885UL, 59684UL,  4673UL,   58086UL,  33383UL,  191676UL, 135888UL, 80160UL,  51531UL,
+    36441UL,  84588UL,  216737UL, 61339UL,  83036UL,  188532UL, 242481UL, 8165UL,   200670UL, 170418UL, 111746UL,
+    119709UL, 220498UL, 86927UL,  51524UL,  40044UL,  87449UL,  130059UL, 43152UL,  37898UL,  106777UL, 71227UL,
+    123703UL, 236757UL, 45685UL,  130917UL, 201199UL, 133774UL, 194615UL, 71045UL,  206609UL, 210792UL, 763UL,
+    66780UL,  169407UL, 94248UL,  37041UL,  94733UL,  174201UL, 19701UL,  108432UL, 96276UL,  252167UL, 192407UL,
+    187212UL, 50501UL,  41241UL,  136698UL, 160904UL, 93273UL,  103156UL, 165717UL, 30349UL,  150790UL, 72105UL,
+    963UL,    107223UL, 242562UL, 109815UL, 61818UL,  80045UL,  28914UL,  175446UL, 257841UL, 128682UL, 208561UL,
+    147792UL, 204704UL, 226942UL, 187685UL, 3507UL,   240769UL, 12976UL,  215617UL, 249408UL, 181360UL, 241964UL,
+    173786UL, 179352UL, 126973UL, 62909UL,  124410UL, 120872UL, 178783UL, 144384UL, 110563UL, 90114UL,  9171UL,
+    15702UL,  25187UL,  114030UL, 236823UL, 25311UL,  16257UL,  49527UL,  193822UL, 255019UL, 176480UL, 223166UL,
+    133670UL, 225454UL, 134987UL, 139451UL, 59673UL,  86825UL,  175644UL, 239291UL, 166591UL, 74498UL,  140020UL,
+    170016UL, 85376UL,  236259UL, 161128UL, 224545UL, 237469UL, 88971UL,  61613UL,  96938UL,  23696UL,  171608UL,
+    65506UL,  248232UL, 8249UL,   228703UL, 239215UL, 18797UL,  83530UL,  166552UL, 13371UL,  196920UL, 23184UL,
+    182462UL, 215165UL, 165761UL, 25741UL,  162057UL, 167868UL, 229928UL, 61012UL,  139559UL, 34059UL,  187376UL,
+    58767UL,  163330UL, 205561UL, 163056UL, 233939UL, 71277UL,  87880UL,  14751UL,  174273UL, 115272UL, 86781UL,
+    94300UL,  127193UL, 258235UL, 66179UL,  146897UL, 224661UL, 232393UL, 65342UL,  55255UL,  58473UL,  42851UL,
+    113387UL, 61947UL,  49509UL,  91159UL,  69851UL,  123549UL, 251724UL, 34140UL,  161047UL, 47093UL,  56552UL,
+    80930UL,  119474UL, 78616UL,  195760UL, 64793UL,  176859UL, 76961UL,  144319UL, 123892UL, 126792UL, 8689UL,
+    242665UL, 17460UL,  14690UL,  190100UL, 119145UL, 114709UL, 90398UL,  134245UL, 122163UL, 85182UL,  173110UL,
+    49346UL,  247301UL, 98355UL,  251563UL, 195781UL, 251248UL, 64345UL,  161955UL, 120233UL, 115842UL, 122189UL,
+    186919UL, 41577UL,  197229UL, 170565UL, 189342UL, 177196UL, 123675UL, 82337UL,  114963UL, 105967UL, 18422UL,
+    152188UL, 11329UL,  221949UL, 64292UL,  7596UL,   255099UL, 111271UL, 48061UL,  86111UL,  237506UL, 4824UL,
+    218949UL, 241260UL, 73649UL,  41851UL,  220599UL, 68605UL,  19042UL,  186152UL, 170861UL, 171071UL, 193040UL,
+    209631UL, 184563UL, 184492UL, 68268UL,  177944UL, 160612UL, 171896UL, 20581UL,  191455UL, 180439UL, 47913UL,
+    261056UL, 171648UL, 164941UL, 202239UL, 148726UL, 28313UL,  134051UL, 256509UL, 225076UL, 95664UL,  12216UL,
+    64129UL,  131637UL, 212145UL, 58692UL,  241918UL, 108656UL, 176486UL, 230218UL, 3185UL,   209718UL, 62734UL,
+    138434UL, 184095UL, 161810UL, 222138UL, 137635UL, 215811UL, 184970UL, 134918UL, 158196UL, 155801UL, 4765UL,
+    71888UL,  88205UL,  95903UL,  110441UL, 258233UL, 173380UL, 78304UL,  154051UL, 147049UL, 11070UL,  15074UL,
+    108251UL, 213099UL, 50178UL,  141932UL, 255689UL, 120265UL, 143580UL, 129148UL, 115934UL, 184315UL, 161213UL,
+    19480UL,  44626UL,  55044UL,  245576UL, 201518UL, 189246UL, 137802UL, 60059UL,  70069UL,  193301UL, 178623UL,
+    189831UL, 135223UL, 51452UL,  92013UL,  225223UL, 26359UL,  162301UL, 35393UL,  34643UL,  128824UL, 66108UL,
+    163317UL, 174627UL, 93254UL,  143083UL, 255998UL, 183335UL, 8986UL,   182998UL, 258697UL, 39226UL,  91332UL,
+    59109UL,  204362UL, 2313UL,   7726UL,   131835UL, 118936UL, 248401UL, 105861UL, 166303UL, 238747UL, 13880UL,
+    8427UL,   252226UL, 193054UL, 42122UL,  199752UL, 195738UL, 232467UL, 153299UL, 251597UL, 250898UL, 196942UL,
+    191374UL, 257616UL, 52726UL,  215799UL, 106485UL, 254120UL, 80136UL,  6664UL,   197268UL, 18594UL,  113598UL,
+    114693UL, 196721UL, 151864UL, 164070UL, 66339UL,  141080UL, 176320UL, 256458UL, 54700UL,  159394UL, 215313UL,
+    131449UL, 251544UL, 51859UL,  71105UL,  128601UL, 32710UL,  229947UL, 115219UL, 246354UL, 237901UL, 195673UL,
+    221273UL, 208785UL, 188236UL, 90782UL,  232662UL, 90277UL,  12554UL,  78601UL,  90844UL,  110417UL, 159253UL,
+    57634UL,  177221UL, 100333UL, 165169UL, 132161UL, 112498UL, 17695UL,  255811UL, 182319UL, 224493UL, 173862UL,
+    40753UL,  197330UL, 24665UL,  121360UL, 55238UL,  98079UL,  218283UL, 248849UL, 80009UL,  241929UL, 106844UL,
+    61878UL,  93065UL,  225349UL, 262027UL, 256992UL, 178457UL, 168917UL, 177021UL, 239511UL, 78577UL,  252996UL,
+    252726UL, 147139UL, 114145UL, 207024UL, 28064UL,  154507UL, 183618UL, 167778UL, 150853UL, 226899UL, 49822UL,
+    199945UL, 242001UL, 98826UL,  111673UL, 138689UL, 117214UL, 53523UL,  126528UL, 184732UL, 26227UL,  154931UL,
+    168996UL, 59103UL,  136025UL, 84276UL,  88396UL,  165396UL, 123002UL, 12829UL,  20676UL,  223829UL, 59297UL,
+    192316UL, 183940UL, 85598UL,  117499UL, 259632UL, 177290UL, 213675UL, 234221UL, 96262UL,  116977UL, 128597UL,
+    246856UL, 212883UL, 142796UL, 131027UL, 106381UL, 233858UL, 9114UL,   32053UL,  15332UL,  157442UL, 222646UL,
+    212661UL, 147283UL, 211218UL, 95518UL,  67624UL,  209176UL, 118330UL, 91020UL,  233183UL, 156470UL, 250758UL,
+    222867UL, 126078UL, 187948UL, 135225UL, 129033UL, 88838UL,  122512UL, 231717UL, 117924UL, 223805UL, 50065UL,
+    151714UL, 38183UL,  81953UL,  99537UL,  35699UL,  239469UL, 118303UL, 195992UL, 107029UL, 250781UL, 189976UL,
+    222245UL, 76743UL,  138211UL, 236110UL, 46415UL,  42903UL,  15364UL,  48468UL,  214083UL, 42150UL,  74421UL,
+    45636UL,  90797UL,  41885UL,  244894UL, 166932UL, 120041UL, 59209UL,  198516UL, 63408UL,  242452UL, 135377UL,
+    50663UL,  93579UL,  213688UL, 18258UL,  127196UL, 134947UL, 31484UL,  78886UL,  13335UL,  187227UL, 185733UL,
+    65531UL,  92858UL,  99493UL,  73189UL,  38072UL,  234325UL, 145736UL, 250321UL, 235143UL, 124358UL, 225789UL,
+    181727UL, 98544UL,  6114UL,   237816UL, 197670UL, 198034UL, 119176UL, 34156UL,  6979UL,   21386UL,  15351UL,
+    99699UL,  92464UL,  25321UL,  197856UL, 21558UL,  209964UL, 14614UL,  33870UL,  49143UL,  127486UL, 98332UL,
+    42535UL,  16242UL,  136658UL, 80229UL,  70650UL,  46050UL,  105567UL, 204284UL, 234203UL, 213211UL, 126475UL,
+    83474UL,  87829UL,  68632UL,  191986UL, 63610UL,  200153UL, 140553UL, 149523UL, 32484UL,  106326UL, 43266UL,
+    72304UL,  115616UL, 79749UL,  243887UL, 18613UL,  2739UL,   58332UL,  229930UL, 45849UL,  193925UL, 5843UL,
+    58031UL,  40452UL,  54128UL,  79271UL,  57103UL,  102012UL, 38434UL,  198620UL, 79572UL,  56607UL,  14778UL,
+    65935UL,  216520UL, 136181UL, 72172UL,  248146UL, 179695UL, 173622UL, 47449UL,  176256UL, 61442UL,  255239UL,
+    166916UL, 109605UL, 121747UL, 216771UL, 5274UL,   101428UL, 91186UL,  149964UL, 219520UL, 88676UL,  177597UL,
+    176259UL, 390UL,    44021UL,  27523UL,  104830UL, 23573UL,  49461UL,  211695UL, 47694UL,  221578UL, 112534UL,
+    90642UL,  63721UL,  213594UL, 144343UL, 27807UL,  8284UL,   198022UL, 217942UL, 140060UL, 249810UL, 21326UL,
+    14458UL,
+};
+uint32_t rand_arr_19_b19_w32_arr[1024] = {
+    111559UL, 40571UL,  480022UL, 184996UL, 49467UL,  226478UL, 211529UL, 375896UL, 39834UL,  522579UL, 333128UL,
+    123323UL, 281350UL, 310785UL, 420599UL, 502901UL, 152382UL, 214811UL, 87913UL,  440807UL, 470162UL, 136474UL,
+    355566UL, 486339UL, 50549UL,  72766UL,  498991UL, 410931UL, 410669UL, 447652UL, 32473UL,  354659UL, 470040UL,
+    300519UL, 308547UL, 336978UL, 255994UL, 2921UL,   506594UL, 255247UL, 25654UL,  235813UL, 243412UL, 351071UL,
+    496533UL, 85024UL,  242150UL, 203377UL, 216150UL, 230116UL, 254320UL, 423348UL, 357913UL, 499968UL, 90437UL,
+    45943UL,  89932UL,  183661UL, 129293UL, 505644UL, 94318UL,  450407UL, 385369UL, 36347UL,  49396UL,  329782UL,
+    186612UL, 25352UL,  187150UL, 154722UL, 213192UL, 16534UL,  368342UL, 483491UL, 122948UL, 165378UL, 456372UL,
+    220690UL, 284122UL, 509029UL, 224194UL, 148045UL, 308674UL, 212743UL, 495031UL, 386141UL, 474941UL, 89805UL,
+    246325UL, 308971UL, 183546UL, 24106UL,  218863UL, 155357UL, 222240UL, 315356UL, 156512UL, 183525UL, 334061UL,
+    362262UL, 127457UL, 84657UL,  274361UL, 261255UL, 471717UL, 493362UL, 518040UL, 466458UL, 215617UL, 300722UL,
+    523345UL, 442966UL, 498342UL, 287658UL, 18734UL,  201609UL, 515623UL, 337044UL, 382209UL, 50947UL,  458248UL,
+    100424UL, 322380UL, 95990UL,  392435UL, 137274UL, 386845UL, 316678UL, 274432UL, 512846UL, 184504UL, 483448UL,
+    230584UL, 189420UL, 217813UL, 121158UL, 207625UL, 396152UL, 491704UL, 224709UL, 356202UL, 468116UL, 21432UL,
+    206290UL, 196557UL, 342344UL, 441511UL, 204405UL, 405363UL, 295431UL, 186571UL, 414378UL, 4346UL,   160477UL,
+    234452UL, 290147UL, 455713UL, 500143UL, 126128UL, 435801UL, 168917UL, 5671UL,   43917UL,  301313UL, 281334UL,
+    15330UL,  75165UL,  204764UL, 317072UL, 51008UL,  234044UL, 222972UL, 387466UL, 283224UL, 448317UL, 289973UL,
+    507266UL, 169549UL, 148405UL, 333419UL, 487616UL, 478586UL, 157138UL, 320919UL, 395854UL, 10191UL,  130265UL,
+    209346UL, 147539UL, 411320UL, 476895UL, 248202UL, 475163UL, 27852UL,  111667UL, 470611UL, 125117UL, 184511UL,
+    376172UL, 206040UL, 439735UL, 254665UL, 366776UL, 138160UL, 140244UL, 523106UL, 259716UL, 178226UL, 158127UL,
+    213186UL, 349328UL, 48634UL,  63526UL,  512130UL, 166293UL, 163738UL, 78757UL,  87245UL,  22474UL,  428357UL,
+    15853UL,  369649UL, 416442UL, 53245UL,  490721UL, 86579UL,  133735UL, 152142UL, 312478UL, 338196UL, 457554UL,
+    93485UL,  214612UL, 284793UL, 481017UL, 44756UL,  197409UL, 162806UL, 511355UL, 434993UL, 152665UL, 453012UL,
+    264249UL, 11658UL,  445190UL, 211513UL, 64229UL,  447368UL, 90308UL,  139479UL, 423645UL, 519724UL, 355422UL,
+    349637UL, 413685UL, 353377UL, 30259UL,  269397UL, 285855UL, 366539UL, 126739UL, 200973UL, 308879UL, 138744UL,
+    259056UL, 408056UL, 436841UL, 158133UL, 138061UL, 248482UL, 92599UL,  462010UL, 185055UL, 235049UL, 401291UL,
+    110357UL, 389187UL, 56181UL,  237545UL, 477723UL, 379394UL, 303559UL, 102429UL, 5427UL,   38042UL,  408629UL,
+    351883UL, 397195UL, 373163UL, 189501UL, 366249UL, 486238UL, 37387UL,  186461UL, 384565UL, 468779UL, 482315UL,
+    54885UL,  142027UL, 4951UL,   44242UL,  58155UL,  137342UL, 421518UL, 287766UL, 7417UL,   423218UL, 2778UL,
+    2266UL,   469003UL, 218226UL, 446558UL, 155324UL, 89116UL,  457308UL, 385487UL, 230845UL, 381427UL, 39566UL,
+    383554UL, 258858UL, 150019UL, 473663UL, 519657UL, 95844UL,  196841UL, 169703UL, 195868UL, 94122UL,  326022UL,
+    219225UL, 110220UL, 521443UL, 168039UL, 131159UL, 78008UL,  230146UL, 359002UL, 170898UL, 281648UL, 228804UL,
+    453553UL, 107150UL, 198425UL, 246595UL, 461002UL, 383221UL, 183295UL, 142678UL, 398598UL, 518494UL, 176584UL,
+    104321UL, 247187UL, 116649UL, 197909UL, 137683UL, 312075UL, 291938UL, 20066UL,  300148UL, 6524UL,   161434UL,
+    278162UL, 403769UL, 193777UL, 58400UL,  466060UL, 220791UL, 312586UL, 135102UL, 431826UL, 455903UL, 456068UL,
+    392352UL, 435806UL, 12950UL,  194666UL, 403172UL, 449604UL, 232298UL, 458533UL, 197710UL, 520676UL, 258738UL,
+    5033UL,   140351UL, 278818UL, 289755UL, 475579UL, 21551UL,  414708UL, 296356UL, 69320UL,  240822UL, 112955UL,
+    441552UL, 219018UL, 146428UL, 473322UL, 418055UL, 12234UL,  16784UL,  390845UL, 278580UL, 151270UL, 479404UL,
+    431631UL, 281489UL, 198025UL, 205573UL, 410464UL, 454515UL, 74073UL,  454825UL, 382225UL, 505814UL, 484961UL,
+    185194UL, 144285UL, 317119UL, 93390UL,  190494UL, 227391UL, 397869UL, 195707UL, 314782UL, 150318UL, 264524UL,
+    275630UL, 267490UL, 434932UL, 358919UL, 369691UL, 394146UL, 260050UL, 165982UL, 23174UL,  594UL,    266196UL,
+    247312UL, 15760UL,  206395UL, 23186UL,  401629UL, 16190UL,  311124UL, 425825UL, 452002UL, 426302UL, 4335UL,
+    5903UL,   153679UL, 56525UL,  367038UL, 358650UL, 126904UL, 5653UL,   155856UL, 382511UL, 227668UL, 102026UL,
+    98580UL,  287270UL, 391989UL, 290542UL, 382292UL, 55725UL,  115768UL, 25300UL,  499211UL, 134014UL, 238922UL,
+    378640UL, 429444UL, 15850UL,  213488UL, 427534UL, 47017UL,  333114UL, 393233UL, 463203UL, 412034UL, 150723UL,
+    231665UL, 45251UL,  163315UL, 301633UL, 460127UL, 393180UL, 202232UL, 269646UL, 5419UL,   256748UL, 506572UL,
+    359072UL, 310389UL, 65990UL,  491988UL, 216638UL, 126753UL, 3777UL,   194654UL, 86136UL,  441923UL, 449699UL,
+    336397UL, 158284UL, 80007UL,  359449UL, 264503UL, 96782UL,  258965UL, 422135UL, 136692UL, 511208UL, 480583UL,
+    413795UL, 189839UL, 252253UL, 25062UL,  337259UL, 175798UL, 495643UL, 150211UL, 455770UL, 86015UL,  77990UL,
+    213007UL, 369322UL, 228474UL, 206526UL, 208234UL, 343347UL, 500626UL, 178460UL, 218592UL, 90633UL,  43067UL,
+    267958UL, 225893UL, 55451UL,  187404UL, 371928UL, 296813UL, 389681UL, 220844UL, 208677UL, 23548UL,  43178UL,
+    327935UL, 448158UL, 451528UL, 47171UL,  171117UL, 251489UL, 508719UL, 291673UL, 210231UL, 266980UL, 125353UL,
+    213247UL, 498709UL, 19633UL,  446485UL, 339180UL, 371553UL, 470278UL, 214526UL, 52977UL,  64014UL,  487001UL,
+    480220UL, 369491UL, 457876UL, 309872UL, 313901UL, 50790UL,  453957UL, 371311UL, 236615UL, 50841UL,  155754UL,
+    358362UL, 309864UL, 3430UL,   292771UL, 358977UL, 224111UL, 92453UL,  70238UL,  345935UL, 82699UL,  296810UL,
+    398232UL, 294481UL, 248968UL, 120503UL, 408112UL, 10171UL,  9702UL,   519518UL, 221404UL, 278457UL, 193695UL,
+    313433UL, 31485UL,  313132UL, 292783UL, 474163UL, 401830UL, 96046UL,  268909UL, 351833UL, 326332UL, 267272UL,
+    485899UL, 452079UL, 438561UL, 73071UL,  368905UL, 342584UL, 412191UL, 437739UL, 42245UL,  487682UL, 360610UL,
+    41582UL,  81540UL,  359589UL, 456514UL, 473899UL, 486218UL, 473408UL, 374684UL, 136579UL, 393157UL, 2438UL,
+    226075UL, 141343UL, 103330UL, 373551UL, 440586UL, 437646UL, 337570UL, 397993UL, 81173UL,  43372UL,  115468UL,
+    503144UL, 245292UL, 222358UL, 148698UL, 222921UL, 16223UL,  213680UL, 359982UL, 31423UL,  214991UL, 322963UL,
+    31800UL,  54425UL,  88194UL,  208899UL, 443123UL, 81561UL,  454539UL, 445910UL, 494418UL, 179961UL, 167220UL,
+    438081UL, 115096UL, 321567UL, 121195UL, 418718UL, 352694UL, 77935UL,  44520UL,  102566UL, 475730UL, 172735UL,
+    286608UL, 517198UL, 120018UL, 115911UL, 57380UL,  165845UL, 2519UL,   15530UL,  322703UL, 169458UL, 27657UL,
+    4652UL,   271550UL, 275349UL, 266011UL, 195526UL, 232431UL, 44349UL,  228923UL, 364478UL, 238800UL, 465143UL,
+    143831UL, 451036UL, 440577UL, 325060UL, 457699UL, 157306UL, 92138UL,  246700UL, 104748UL, 51737UL,  93327UL,
+    475414UL, 170430UL, 285541UL, 412713UL, 284853UL, 518824UL, 435068UL, 236619UL, 236974UL, 447529UL, 414443UL,
+    241879UL, 29502UL,  225181UL, 454661UL, 224666UL, 442037UL, 218972UL, 4460UL,   312796UL, 410337UL, 312967UL,
+    319048UL, 486321UL, 228741UL, 214639UL, 419445UL, 216752UL, 60067UL,  390292UL, 480082UL, 202810UL, 497716UL,
+    512649UL, 480935UL, 479681UL, 292206UL, 191758UL, 273378UL, 10270UL,  23255UL,  75783UL,  521202UL, 314827UL,
+    130616UL, 166674UL, 50887UL,  358047UL, 500420UL, 42339UL,  432498UL, 97594UL,  227556UL, 436103UL, 344839UL,
+    254948UL, 90744UL,  516778UL, 380814UL, 321581UL, 42345UL,  193871UL, 452397UL, 253689UL, 23839UL,  14902UL,
+    158097UL, 242271UL, 192937UL, 124757UL, 324371UL, 296949UL, 206111UL, 462839UL, 141577UL, 221554UL, 360239UL,
+    324250UL, 472796UL, 396925UL, 122978UL, 490271UL, 132770UL, 338599UL, 221007UL, 185021UL, 416372UL, 112798UL,
+    26702UL,  79707UL,  3811UL,   489469UL, 378922UL, 19187UL,  491708UL, 506796UL, 126888UL, 168625UL, 261361UL,
+    418617UL, 84068UL,  114586UL, 327556UL, 471681UL, 179458UL, 396714UL, 290574UL, 466137UL, 218041UL, 394135UL,
+    77035UL,  314529UL, 32067UL,  339712UL, 240437UL, 465245UL, 60823UL,  516272UL, 381931UL, 71563UL,  413249UL,
+    279794UL, 288123UL, 516796UL, 212288UL, 357299UL, 222534UL, 414152UL, 516512UL, 172550UL, 247224UL, 387172UL,
+    142012UL, 429971UL, 360536UL, 402362UL, 447678UL, 336209UL, 284038UL, 161744UL, 324659UL, 332904UL, 240914UL,
+    110228UL, 132871UL, 285035UL, 171865UL, 505230UL, 392026UL, 220473UL, 249763UL, 118360UL, 71379UL,  358004UL,
+    293508UL, 50993UL,  402359UL, 482190UL, 192175UL, 196392UL, 304452UL, 371270UL, 42438UL,  54293UL,  134359UL,
+    210827UL, 13633UL,  23070UL,  108794UL, 342576UL, 110273UL, 7389UL,   126349UL, 45483UL,  180134UL, 65333UL,
+    55054UL,  489431UL, 231091UL, 58833UL,  237787UL, 424931UL, 510496UL, 171811UL, 12923UL,  122490UL, 304183UL,
+    82242UL,  157693UL, 3931UL,   488141UL, 501726UL, 262705UL, 397275UL, 469398UL, 100092UL, 121603UL, 395448UL,
+    361558UL, 75210UL,  478588UL, 452182UL, 168232UL, 503898UL, 139839UL, 102444UL, 55360UL,  154115UL, 364677UL,
+    35563UL,  250275UL, 335766UL, 467048UL, 470579UL, 21837UL,  401849UL, 98093UL,  354668UL, 172063UL, 454309UL,
+    95473UL,  495859UL, 264584UL, 273320UL, 120137UL, 500679UL, 481928UL, 322693UL, 264912UL, 268634UL, 79751UL,
+    387501UL, 225022UL, 119393UL, 35229UL,  177849UL, 359507UL, 143717UL, 390716UL, 81186UL,  171596UL, 422876UL,
+    112422UL, 384615UL, 397329UL, 504419UL, 455887UL, 314485UL, 300370UL, 407411UL, 344582UL, 237939UL, 300407UL,
+    102752UL, 497530UL, 401790UL, 261268UL, 216135UL, 2365UL,   432233UL, 268490UL, 170895UL, 216345UL, 75416UL,
+    166273UL, 421211UL, 237652UL, 223068UL, 189503UL, 495473UL, 396156UL, 12623UL,  338462UL, 431920UL, 511463UL,
+    405655UL, 167515UL, 497041UL, 108987UL, 211099UL, 153470UL, 96430UL,  28907UL,  12754UL,  366105UL, 50471UL,
+    121271UL, 458549UL, 234519UL, 145073UL, 318018UL, 113513UL, 17653UL,  210538UL, 183488UL, 499810UL, 68268UL,
+    369179UL, 436311UL, 306744UL, 429566UL, 359510UL, 500202UL, 67010UL,  493579UL, 184121UL, 450152UL, 69800UL,
+    82862UL,
+};
+uint32_t rand_arr_20_b20_w32_arr[1024] = {
+    35062UL,   49920UL,   554790UL,  796847UL,  753988UL,  716076UL,  349780UL,  887169UL,  582898UL,  574934UL,
+    435971UL,  558785UL,  332828UL,  654584UL,  702730UL,  703653UL,  392832UL,  619181UL,  312364UL,  231731UL,
+    770352UL,  627567UL,  653756UL,  516178UL,  782574UL,  55448UL,   624797UL,  127178UL,  36028UL,   246876UL,
+    244968UL,  134030UL,  207491UL,  647720UL,  907568UL,  868748UL,  345100UL,  413677UL,  419049UL,  410148UL,
+    857711UL,  988991UL,  551397UL,  542657UL,  548663UL,  879758UL,  99594UL,   633987UL,  328381UL,  264474UL,
+    632645UL,  548853UL,  1003734UL, 203973UL,  336622UL,  554140UL,  172124UL,  949528UL,  776798UL,  74221UL,
+    249125UL,  164397UL,  594435UL,  1016948UL, 307014UL,  830341UL,  401840UL,  266669UL,  364676UL,  880811UL,
+    123154UL,  909529UL,  863514UL,  821289UL,  328719UL,  571516UL,  240737UL,  524947UL,  468596UL,  543309UL,
+    669655UL,  337822UL,  833147UL,  495867UL,  64563UL,   910763UL,  227430UL,  386647UL,  621800UL,  331678UL,
+    199022UL,  725081UL,  376174UL,  290650UL,  361775UL,  816997UL,  164619UL,  323921UL,  278433UL,  9854UL,
+    366710UL,  534653UL,  789423UL,  33844UL,   596032UL,  472394UL,  447815UL,  989644UL,  607269UL,  11455UL,
+    902741UL,  69829UL,   138030UL,  277396UL,  373397UL,  295856UL,  321599UL,  434014UL,  747192UL,  259686UL,
+    599720UL,  945612UL,  804474UL,  960279UL,  989960UL,  1003479UL, 576288UL,  633303UL,  894683UL,  616223UL,
+    45021UL,   841006UL,  99169UL,   74335UL,   206120UL,  66201UL,   902967UL,  283272UL,  135623UL,  699852UL,
+    66265UL,   818569UL,  592093UL,  638879UL,  910254UL,  27130UL,   162116UL,  540321UL,  250839UL,  180037UL,
+    962524UL,  589266UL,  773452UL,  605185UL,  781761UL,  580506UL,  578309UL,  781519UL,  179209UL,  32160UL,
+    220847UL,  1001678UL, 355562UL,  538506UL,  985997UL,  20507UL,   464322UL,  315207UL,  952887UL,  40221UL,
+    456156UL,  397872UL,  475942UL,  730681UL,  640852UL,  158968UL,  280158UL,  572882UL,  505749UL,  199769UL,
+    923683UL,  311144UL,  227935UL,  506656UL,  330152UL,  314558UL,  734058UL,  559190UL,  166520UL,  935551UL,
+    48366UL,   859787UL,  392658UL,  654256UL,  460457UL,  406222UL,  730939UL,  615111UL,  604239UL,  282294UL,
+    694560UL,  611894UL,  594487UL,  83908UL,   670479UL,  443531UL,  363141UL,  358942UL,  713702UL,  429929UL,
+    848332UL,  422762UL,  947814UL,  679539UL,  780314UL,  800166UL,  330654UL,  874663UL,  225669UL,  589330UL,
+    499351UL,  935268UL,  1044563UL, 497402UL,  616740UL,  151338UL,  606694UL,  907737UL,  322357UL,  1002652UL,
+    896424UL,  181506UL,  1021259UL, 182637UL,  904071UL,  110247UL,  139791UL,  598627UL,  149572UL,  131015UL,
+    83022UL,   186247UL,  409094UL,  4698UL,    306107UL,  88304UL,   609729UL,  891490UL,  397422UL,  283333UL,
+    347989UL,  650950UL,  202729UL,  258703UL,  896815UL,  237588UL,  909311UL,  331350UL,  392955UL,  62390UL,
+    106585UL,  1016054UL, 56784UL,   137630UL,  65688UL,   108314UL,  428044UL,  247596UL,  502969UL,  867751UL,
+    202730UL,  395795UL,  296154UL,  660734UL,  459021UL,  915106UL,  111404UL,  445611UL,  881721UL,  192904UL,
+    392160UL,  558190UL,  153915UL,  898546UL,  83840UL,   734114UL,  88668UL,   36927UL,   548240UL,  690521UL,
+    259242UL,  717769UL,  468166UL,  614203UL,  6744UL,    311534UL,  539209UL,  927525UL,  28636UL,   419290UL,
+    731536UL,  1040789UL, 545189UL,  133272UL,  623635UL,  543142UL,  47225UL,   690449UL,  551443UL,  621067UL,
+    821726UL,  343126UL,  292609UL,  318004UL,  707090UL,  907331UL,  642832UL,  1023866UL, 312571UL,  1000356UL,
+    127905UL,  653585UL,  647190UL,  440103UL,  969105UL,  827843UL,  115419UL,  973592UL,  518250UL,  627321UL,
+    791079UL,  873675UL,  439573UL,  1047978UL, 314109UL,  913794UL,  367660UL,  233636UL,  817908UL,  732774UL,
+    28237UL,   94707UL,   834609UL,  181487UL,  975448UL,  854171UL,  270355UL,  440177UL,  834187UL,  404064UL,
+    453889UL,  934049UL,  653913UL,  147001UL,  176511UL,  278140UL,  385376UL,  544639UL,  606583UL,  738705UL,
+    731318UL,  77992UL,   632601UL,  348013UL,  897950UL,  231054UL,  985004UL,  915808UL,  50277UL,   754675UL,
+    1008067UL, 184772UL,  374101UL,  359254UL,  802833UL,  735055UL,  967349UL,  657443UL,  1020945UL, 932442UL,
+    970513UL,  790326UL,  1037162UL, 92625UL,   827233UL,  137301UL,  641987UL,  231204UL,  240506UL,  302327UL,
+    22580UL,   428349UL,  386363UL,  24549UL,   777395UL,  34826UL,   283697UL,  434578UL,  972515UL,  666318UL,
+    161793UL,  824906UL,  936637UL,  818068UL,  893283UL,  402308UL,  466697UL,  616414UL,  820099UL,  215246UL,
+    295533UL,  748194UL,  112514UL,  514838UL,  949054UL,  839542UL,  423858UL,  113754UL,  292780UL,  217690UL,
+    664617UL,  571593UL,  451099UL,  305439UL,  618661UL,  413169UL,  150235UL,  975553UL,  585484UL,  950635UL,
+    293688UL,  366286UL,  742978UL,  597629UL,  519237UL,  759132UL,  144497UL,  114646UL,  1035005UL, 654368UL,
+    952249UL,  191014UL,  775605UL,  691387UL,  65378UL,   762619UL,  149031UL,  105815UL,  409599UL,  647552UL,
+    836239UL,  696723UL,  884203UL,  902696UL,  175093UL,  47500UL,   649047UL,  64334UL,   724233UL,  539735UL,
+    895687UL,  930881UL,  930578UL,  566580UL,  914860UL,  72091UL,   273055UL,  304594UL,  63911UL,   459223UL,
+    423159UL,  412236UL,  956645UL,  523952UL,  674675UL,  21798UL,   875258UL,  174382UL,  550895UL,  223063UL,
+    712943UL,  194398UL,  1039126UL, 639748UL,  534972UL,  505907UL,  825912UL,  636528UL,  1025965UL, 257105UL,
+    107790UL,  978398UL,  593836UL,  673039UL,  199218UL,  700878UL,  93643UL,   502077UL,  443736UL,  314155UL,
+    924952UL,  782327UL,  41137UL,   168608UL,  884646UL,  551217UL,  795350UL,  637197UL,  789860UL,  369458UL,
+    365417UL,  891046UL,  532507UL,  68230UL,   383276UL,  222087UL,  191606UL,  835912UL,  962808UL,  465553UL,
+    676334UL,  278911UL,  405772UL,  556698UL,  102724UL,  598402UL,  948212UL,  96626UL,   195458UL,  82371UL,
+    550162UL,  312436UL,  349132UL,  271032UL,  236155UL,  699659UL,  303283UL,  235913UL,  575399UL,  940878UL,
+    450516UL,  604264UL,  61514UL,   1023165UL, 639907UL,  809413UL,  868274UL,  702924UL,  22069UL,   82752UL,
+    714972UL,  827709UL,  161296UL,  982270UL,  1037089UL, 865831UL,  52358UL,   949462UL,  324710UL,  654797UL,
+    801524UL,  562929UL,  952423UL,  1001149UL, 402528UL,  863051UL,  861828UL,  869623UL,  361414UL,  234350UL,
+    288136UL,  538262UL,  1015994UL, 94695UL,   757288UL,  498542UL,  338856UL,  1037401UL, 114810UL,  815567UL,
+    714633UL,  987375UL,  664464UL,  922064UL,  910703UL,  864346UL,  375153UL,  113502UL,  557746UL,  749564UL,
+    36482UL,   485452UL,  390896UL,  573700UL,  564561UL,  121081UL,  268759UL,  256771UL,  516934UL,  129723UL,
+    87946UL,   681639UL,  772301UL,  814644UL,  592766UL,  675272UL,  874376UL,  1034749UL, 364832UL,  998454UL,
+    999592UL,  410317UL,  361626UL,  882096UL,  572323UL,  95137UL,   202116UL,  310532UL,  705713UL,  18942UL,
+    217452UL,  837172UL,  616741UL,  439716UL,  903142UL,  986568UL,  177709UL,  110941UL,  248521UL,  628500UL,
+    926925UL,  285560UL,  12163UL,   762727UL,  916950UL,  290761UL,  133315UL,  25905UL,   160394UL,  597668UL,
+    876422UL,  426655UL,  361167UL,  1017056UL, 56061UL,   59059UL,   597659UL,  524920UL,  51599UL,   648531UL,
+    821408UL,  442663UL,  934998UL,  974718UL,  845011UL,  972514UL,  818923UL,  328237UL,  744945UL,  409560UL,
+    312893UL,  276032UL,  604447UL,  177553UL,  987985UL,  107754UL,  515447UL,  418850UL,  977652UL,  404408UL,
+    963249UL,  418463UL,  826524UL,  332611UL,  622494UL,  944586UL,  129331UL,  97326UL,   752188UL,  465777UL,
+    536050UL,  425776UL,  404455UL,  821349UL,  904323UL,  146528UL,  205582UL,  858176UL,  153590UL,  2441UL,
+    392351UL,  733253UL,  428092UL,  101668UL,  51250UL,   604637UL,  58281UL,   847673UL,  123582UL,  210102UL,
+    222546UL,  328793UL,  362874UL,  866050UL,  1033246UL, 274348UL,  12488UL,   71458UL,   714631UL,  137557UL,
+    84786UL,   676103UL,  664624UL,  268948UL,  469881UL,  478407UL,  48175UL,   131138UL,  406675UL,  1003279UL,
+    165706UL,  1011525UL, 178152UL,  984363UL,  630526UL,  509010UL,  78755UL,   506256UL,  707484UL,  933281UL,
+    940353UL,  122639UL,  1012442UL, 762525UL,  931224UL,  198431UL,  365300UL,  101397UL,  372674UL,  126688UL,
+    250347UL,  703886UL,  329681UL,  310286UL,  794003UL,  531316UL,  1002029UL, 376489UL,  80331UL,   840531UL,
+    129744UL,  539380UL,  141017UL,  818214UL,  113971UL,  819748UL,  148170UL,  909789UL,  567082UL,  670342UL,
+    972343UL,  639887UL,  387082UL,  576951UL,  987865UL,  985940UL,  174251UL,  591027UL,  422035UL,  710468UL,
+    570786UL,  898467UL,  783315UL,  934358UL,  808114UL,  10938UL,   811848UL,  966364UL,  482628UL,  179697UL,
+    389909UL,  624122UL,  924393UL,  408101UL,  915128UL,  983706UL,  287931UL,  570957UL,  414353UL,  931665UL,
+    427994UL,  303681UL,  809567UL,  265996UL,  885256UL,  779708UL,  342821UL,  114315UL,  637131UL,  414323UL,
+    208009UL,  563833UL,  95343UL,   44924UL,   64484UL,   693817UL,  136047UL,  655873UL,  139470UL,  982031UL,
+    131979UL,  104754UL,  18451UL,   838551UL,  1022594UL, 151160UL,  41962UL,   714327UL,  140557UL,  488576UL,
+    296518UL,  68751UL,   744728UL,  924545UL,  338958UL,  470740UL,  429983UL,  130111UL,  218245UL,  286062UL,
+    864103UL,  1041978UL, 297700UL,  215823UL,  685287UL,  335957UL,  1016277UL, 694367UL,  943529UL,  204415UL,
+    118269UL,  632032UL,  73643UL,   703480UL,  773533UL,  49000UL,   78861UL,   194956UL,  198900UL,  355741UL,
+    166462UL,  326152UL,  94524UL,   6097UL,    379956UL,  655435UL,  791914UL,  684857UL,  226206UL,  70015UL,
+    285712UL,  815405UL,  898795UL,  745366UL,  22393UL,   21586UL,   594200UL,  89783UL,   563577UL,  874061UL,
+    510973UL,  593940UL,  982320UL,  338213UL,  875584UL,  658637UL,  721211UL,  857008UL,  528306UL,  334275UL,
+    931802UL,  346979UL,  908660UL,  767450UL,  763390UL,  678129UL,  893923UL,  292179UL,  419193UL,  204913UL,
+    4377UL,    326143UL,  835286UL,  1004537UL, 657341UL,  1024134UL, 603625UL,  784001UL,  216017UL,  599137UL,
+    850208UL,  93557UL,   159871UL,  619119UL,  607738UL,  1008057UL, 716094UL,  465540UL,  1044304UL, 52726UL,
+    793816UL,  909721UL,  241229UL,  113108UL,  818510UL,  576894UL,  68038UL,   362953UL,  591873UL,  896534UL,
+    293240UL,  782818UL,  875780UL,  944894UL,  949201UL,  245978UL,  642775UL,  415738UL,  117289UL,  938923UL,
+    158725UL,  811633UL,  70184UL,   812051UL,  202387UL,  397515UL,  596237UL,  7000UL,    404785UL,  855542UL,
+    815541UL,  1012147UL, 829629UL,  948648UL,  253530UL,  849469UL,  640053UL,  544635UL,  850070UL,  385906UL,
+    319980UL,  439674UL,  894667UL,  203449UL,  929418UL,  98582UL,   141651UL,  460793UL,  507031UL,  243763UL,
+    35536UL,   559338UL,  419413UL,  138844UL,  708266UL,  715915UL,  576572UL,  792861UL,  382472UL,  766731UL,
+    187143UL,  875275UL,  140374UL,  120775UL,  944088UL,  405435UL,  452819UL,  278774UL,  617932UL,  278057UL,
+    885433UL,  997604UL,  336462UL,  739795UL,  251807UL,  108535UL,  738679UL,  290945UL,  876256UL,  733981UL,
+    277760UL,  518314UL,  566953UL,  861938UL,  144893UL,  316545UL,  884854UL,  569484UL,  75006UL,   939895UL,
+    947278UL,  894742UL,  976772UL,  301789UL,  696299UL,  978708UL,  1007984UL, 255572UL,  236900UL,  482650UL,
+    22713UL,   134385UL,  586576UL,  322728UL,  281767UL,  374053UL,  54519UL,   759864UL,  1013728UL, 60001UL,
+    827931UL,  626309UL,  504585UL,  1040754UL,
+};
+uint32_t rand_arr_21_b21_w32_arr[1024] = {
+    1142451UL, 262412UL,  12784UL,   1561867UL, 446801UL,  398260UL,  1003656UL, 84944UL,   1123252UL, 1179572UL,
+    2056097UL, 94353UL,   1351473UL, 310675UL,  1668174UL, 175470UL,  954139UL,  663023UL,  1363503UL, 1308925UL,
+    1869203UL, 911402UL,  585629UL,  354500UL,  846093UL,  641329UL,  1961849UL, 1319919UL, 1409132UL, 848488UL,
+    1912467UL, 2066620UL, 1603714UL, 232053UL,  473385UL,  1524003UL, 1988030UL, 741397UL,  194522UL,  1562968UL,
+    1337624UL, 240777UL,  1816959UL, 937095UL,  300565UL,  327773UL,  300912UL,  956042UL,  611509UL,  1128524UL,
+    81408UL,   323418UL,  520580UL,  645651UL,  894160UL,  476869UL,  259021UL,  1782755UL, 357729UL,  903093UL,
+    1906878UL, 1221884UL, 778190UL,  1617847UL, 962201UL,  328616UL,  417424UL,  1636289UL, 1329298UL, 2072084UL,
+    1870289UL, 488498UL,  1283409UL, 1872829UL, 498455UL,  1655279UL, 1229920UL, 306594UL,  1650032UL, 290155UL,
+    1209343UL, 253389UL,  1334063UL, 1392989UL, 1595128UL, 886468UL,  1677981UL, 2037689UL, 168728UL,  946742UL,
+    469727UL,  1258708UL, 292731UL,  520458UL,  1180659UL, 271839UL,  1574968UL, 1227986UL, 67447UL,   1130544UL,
+    846948UL,  330513UL,  1119729UL, 733835UL,  883367UL,  388480UL,  1846378UL, 399280UL,  1933598UL, 30001UL,
+    987361UL,  165253UL,  2076593UL, 795914UL,  1218241UL, 1784788UL, 483892UL,  1706365UL, 671970UL,  820826UL,
+    386380UL,  1922111UL, 1528910UL, 1469053UL, 947571UL,  1732689UL, 52145UL,   468930UL,  1234889UL, 1574863UL,
+    144835UL,  1006511UL, 284768UL,  2017383UL, 1323349UL, 858592UL,  484016UL,  189503UL,  1027365UL, 1642407UL,
+    431987UL,  334107UL,  307348UL,  1371730UL, 1601666UL, 1923437UL, 1945435UL, 842849UL,  632508UL,  1157911UL,
+    1973276UL, 2032629UL, 1689045UL, 1863320UL, 809548UL,  1038122UL, 341560UL,  1664998UL, 1359465UL, 1146949UL,
+    1351412UL, 1371299UL, 796816UL,  1992100UL, 1271523UL, 1645599UL, 675017UL,  505562UL,  1012507UL, 1741081UL,
+    1277407UL, 373708UL,  790161UL,  1649995UL, 1787106UL, 1477659UL, 1711640UL, 839194UL,  1291755UL, 764246UL,
+    1468513UL, 25721UL,   146105UL,  22837UL,   1866863UL, 23600UL,   61765UL,   1897327UL, 564533UL,  420106UL,
+    1737092UL, 575147UL,  863999UL,  1128000UL, 561889UL,  1021125UL, 1618866UL, 1657212UL, 1447173UL, 974376UL,
+    838917UL,  1157616UL, 1626494UL, 116026UL,  859448UL,  778018UL,  979190UL,  1494922UL, 1787599UL, 1727179UL,
+    1283125UL, 2057160UL, 898765UL,  364172UL,  19705UL,   1833412UL, 2045216UL, 1369104UL, 1886413UL, 1726321UL,
+    167782UL,  524479UL,  415936UL,  21113UL,   1555713UL, 1415821UL, 482040UL,  1219759UL, 121968UL,  1292915UL,
+    1541647UL, 976614UL,  1690810UL, 1715324UL, 973837UL,  1815655UL, 315515UL,  1927592UL, 1428426UL, 994590UL,
+    570312UL,  522374UL,  351310UL,  495591UL,  1963297UL, 657995UL,  782483UL,  877789UL,  1138147UL, 1293200UL,
+    1113030UL, 1162381UL, 1378126UL, 515602UL,  1477506UL, 1485099UL, 2029914UL, 1766398UL, 808143UL,  2071116UL,
+    40190UL,   1625471UL, 1747658UL, 1071486UL, 1417845UL, 306617UL,  24767UL,   1263674UL, 1891221UL, 740247UL,
+    169420UL,  818163UL,  215470UL,  609926UL,  183189UL,  154594UL,  1810209UL, 1617322UL, 297829UL,  1515393UL,
+    793920UL,  241974UL,  2035487UL, 953375UL,  110753UL,  196513UL,  2079007UL, 2055435UL, 1526555UL, 827011UL,
+    1106310UL, 932841UL,  1671573UL, 794582UL,  1167819UL, 1856733UL, 475587UL,  530498UL,  626175UL,  424429UL,
+    938922UL,  1469702UL, 261351UL,  1899497UL, 479744UL,  1720031UL, 527875UL,  805728UL,  954368UL,  1329752UL,
+    409342UL,  90185UL,   1142087UL, 1830939UL, 1745353UL, 523066UL,  243550UL,  1664056UL, 986637UL,  694361UL,
+    1503815UL, 439299UL,  1279174UL, 430744UL,  1327012UL, 542353UL,  1206201UL, 725698UL,  108112UL,  1511492UL,
+    1819232UL, 1892195UL, 435645UL,  1589447UL, 1800771UL, 1953248UL, 2050590UL, 1914575UL, 754515UL,  1695799UL,
+    1714751UL, 251261UL,  1594146UL, 1116369UL, 1334930UL, 1261973UL, 2010301UL, 593145UL,  1120953UL, 1137924UL,
+    1051552UL, 1551162UL, 1226654UL, 481853UL,  1733187UL, 1856239UL, 1540000UL, 2007845UL, 1753893UL, 1076897UL,
+    1547708UL, 943808UL,  1278779UL, 1538978UL, 1368428UL, 403960UL,  922330UL,  268991UL,  1978478UL, 539579UL,
+    634644UL,  942361UL,  1245806UL, 496660UL,  678081UL,  701391UL,  1908520UL, 1076152UL, 1862247UL, 1717933UL,
+    516246UL,  1593601UL, 1099194UL, 229783UL,  585894UL,  725569UL,  928016UL,  332830UL,  281775UL,  668637UL,
+    824440UL,  872572UL,  1938802UL, 1564118UL, 640618UL,  1485056UL, 621892UL,  149939UL,  454534UL,  833642UL,
+    795581UL,  811398UL,  539468UL,  768603UL,  1397590UL, 758628UL,  1434045UL, 1840287UL, 1595871UL, 21777UL,
+    568561UL,  1860206UL, 1738846UL, 726110UL,  1103016UL, 1218762UL, 426046UL,  1424593UL, 345305UL,  300582UL,
+    1580618UL, 930759UL,  356046UL,  144594UL,  1129525UL, 1568038UL, 1935329UL, 620488UL,  622246UL,  555973UL,
+    1875652UL, 362999UL,  1103052UL, 115215UL,  1096652UL, 841646UL,  1870110UL, 1971021UL, 1878449UL, 164670UL,
+    1513551UL, 1679927UL, 1815033UL, 1319247UL, 72871UL,   207134UL,  1093179UL, 324025UL,  276407UL,  344447UL,
+    1086412UL, 997273UL,  1767744UL, 1360740UL, 1421772UL, 2039295UL, 465667UL,  1857168UL, 43734UL,   343404UL,
+    892547UL,  5473UL,    1108273UL, 562760UL,  1485577UL, 1619704UL, 1372893UL, 278698UL,  237993UL,  2059994UL,
+    1553073UL, 939390UL,  2040223UL, 931587UL,  2038117UL, 17887UL,   1254783UL, 1679954UL, 1702741UL, 1138135UL,
+    998379UL,  1122742UL, 1575690UL, 1535973UL, 1419122UL, 651150UL,  720106UL,  283027UL,  560605UL,  616321UL,
+    718381UL,  1925339UL, 1101186UL, 382733UL,  1618213UL, 1830880UL, 910956UL,  565645UL,  1422564UL, 815578UL,
+    1920329UL, 2049568UL, 1779096UL, 1491891UL, 595006UL,  914584UL,  511182UL,  508266UL,  66022UL,   1823495UL,
+    1499435UL, 963395UL,  1168224UL, 1712068UL, 1420180UL, 781878UL,  1785455UL, 1969771UL, 1434915UL, 425944UL,
+    499877UL,  1134212UL, 721927UL,  1829156UL, 1800155UL, 631436UL,  1376330UL, 1851157UL, 782251UL,  1606807UL,
+    32958UL,   966750UL,  778544UL,  1736052UL, 425891UL,  1252627UL, 203338UL,  2072935UL, 1725394UL, 1481381UL,
+    539014UL,  1470083UL, 1527055UL, 344173UL,  317621UL,  1812422UL, 1883261UL, 1075735UL, 1346301UL, 44656UL,
+    519450UL,  2030852UL, 185468UL,  2000662UL, 1205185UL, 1681730UL, 428656UL,  1224862UL, 197385UL,  225810UL,
+    920662UL,  996339UL,  2089257UL, 501441UL,  1124541UL, 1097679UL, 1700326UL, 1824410UL, 1014253UL, 865337UL,
+    590752UL,  429874UL,  1480444UL, 765730UL,  1679549UL, 51415UL,   1149913UL, 1502947UL, 1113380UL, 1956756UL,
+    525258UL,  1376317UL, 1605105UL, 564103UL,  1493121UL, 802923UL,  978359UL,  1182440UL, 1427946UL, 584450UL,
+    391792UL,  1670505UL, 1611145UL, 45608UL,   1479334UL, 195434UL,  218022UL,  511889UL,  204844UL,  1623231UL,
+    484383UL,  105516UL,  2083281UL, 1406823UL, 1902632UL, 2085662UL, 391539UL,  698461UL,  1295763UL, 1267796UL,
+    348303UL,  638721UL,  216167UL,  259925UL,  1142937UL, 189526UL,  1249318UL, 1385987UL, 1348648UL, 1926838UL,
+    1987157UL, 641213UL,  832313UL,  1439366UL, 662660UL,  2075355UL, 755859UL,  784157UL,  361369UL,  100114UL,
+    1648915UL, 491487UL,  716475UL,  263269UL,  1804057UL, 1540510UL, 1631322UL, 1713528UL, 910459UL,  236301UL,
+    147553UL,  81439UL,   1890271UL, 888798UL,  1899352UL, 1422585UL, 127629UL,  1074784UL, 1562386UL, 1886427UL,
+    1469311UL, 1244338UL, 1472521UL, 1478641UL, 1268953UL, 1564996UL, 1055725UL, 1153707UL, 720152UL,  1794046UL,
+    770282UL,  1342218UL, 1662106UL, 366583UL,  114322UL,  1714030UL, 1398443UL, 581355UL,  600554UL,  2081037UL,
+    369635UL,  1633474UL, 1461520UL, 1779307UL, 1551654UL, 1050781UL, 734304UL,  269284UL,  259108UL,  458060UL,
+    470109UL,  1949438UL, 537376UL,  1512499UL, 1506124UL, 1833400UL, 1454607UL, 1503633UL, 1635543UL, 589512UL,
+    493209UL,  83630UL,   1662265UL, 2011599UL, 1109133UL, 1185248UL, 1473755UL, 796751UL,  728473UL,  2033529UL,
+    1293675UL, 720350UL,  1269308UL, 222657UL,  1933052UL, 1635014UL, 1036276UL, 7138UL,    1149167UL, 1210712UL,
+    696138UL,  1895901UL, 162040UL,  852478UL,  342010UL,  1940795UL, 992926UL,  1683776UL, 850149UL,  625815UL,
+    1245164UL, 1537029UL, 1404659UL, 1409459UL, 1530617UL, 137347UL,  1651082UL, 1813224UL, 225711UL,  1800496UL,
+    858717UL,  439543UL,  217402UL,  1891633UL, 1623392UL, 246212UL,  1366034UL, 500863UL,  1531UL,    654541UL,
+    1321113UL, 1024829UL, 1041674UL, 985558UL,  291628UL,  32719UL,   818912UL,  526270UL,  821615UL,  1112302UL,
+    22003UL,   2003975UL, 749604UL,  1086642UL, 1090064UL, 722735UL,  896850UL,  430186UL,  3352UL,    1406497UL,
+    35368UL,   756628UL,  534139UL,  221264UL,  773219UL,  545038UL,  1519695UL, 2065857UL, 367260UL,  415709UL,
+    1861761UL, 1947057UL, 792224UL,  980030UL,  621248UL,  2028283UL, 1159079UL, 1582894UL, 1201877UL, 383835UL,
+    1701041UL, 936318UL,  735106UL,  1670245UL, 1384801UL, 865794UL,  1992449UL, 1481139UL, 1831579UL, 1032406UL,
+    298313UL,  65309UL,   986964UL,  1647900UL, 491444UL,  241014UL,  655531UL,  1374710UL, 640176UL,  1460363UL,
+    1303104UL, 456535UL,  583283UL,  1479697UL, 705684UL,  1714745UL, 1868475UL, 272472UL,  1148200UL, 403055UL,
+    952678UL,  66175UL,   3794UL,    64675UL,   1662084UL, 1210180UL, 137557UL,  1055799UL, 785234UL,  1694895UL,
+    360979UL,  1021217UL, 859677UL,  523811UL,  96433UL,   1045138UL, 1322376UL, 1345UL,    280685UL,  1024866UL,
+    201324UL,  481956UL,  581951UL,  1578240UL, 1167751UL, 1200264UL, 1053939UL, 335207UL,  1266120UL, 5178UL,
+    289331UL,  2044078UL, 595233UL,  1131824UL, 261707UL,  1845544UL, 524853UL,  109438UL,  1535346UL, 160857UL,
+    764619UL,  1706268UL, 2079573UL, 1050032UL, 1486843UL, 636311UL,  1415513UL, 1822616UL, 169483UL,  413708UL,
+    1125251UL, 671002UL,  1968556UL, 1733346UL, 851319UL,  575584UL,  1309578UL, 1320833UL, 57555UL,   545792UL,
+    755474UL,  2092106UL, 665499UL,  1631384UL, 1559066UL, 151976UL,  2041523UL, 1581654UL, 2090376UL, 1991321UL,
+    2052307UL, 479565UL,  1174487UL, 1431018UL, 1725067UL, 559495UL,  535136UL,  11730UL,   1333511UL, 1313803UL,
+    557585UL,  930175UL,  571964UL,  1525581UL, 351511UL,  426032UL,  1512758UL, 964540UL,  960333UL,  319418UL,
+    2010604UL, 1222476UL, 1131865UL, 1948497UL, 1547788UL, 1034105UL, 106053UL,  745155UL,  1709202UL, 1060613UL,
+    1322734UL, 401122UL,  389211UL,  1293873UL, 233828UL,  1396655UL, 1874064UL, 159718UL,  801635UL,  258345UL,
+    1983426UL, 346178UL,  960713UL,  1895446UL, 239313UL,  1018486UL, 206639UL,  1765208UL, 772526UL,  1966249UL,
+    1183902UL, 1531184UL, 1364469UL, 593763UL,  808840UL,  846338UL,  163179UL,  930505UL,  1910485UL, 1274321UL,
+    899147UL,  247085UL,  148304UL,  1709798UL, 127760UL,  471890UL,  1726927UL, 1701929UL, 319571UL,  2060618UL,
+    1509388UL, 2039027UL, 1798677UL, 1451289UL, 472794UL,  895067UL,  2040725UL, 1256397UL, 1099746UL, 1190542UL,
+    1811618UL, 1193508UL, 1277589UL, 1242008UL, 621326UL,  1397177UL, 1622211UL, 441281UL,  463332UL,  360027UL,
+    1207323UL, 229583UL,  27551UL,   369121UL,  1480174UL, 1758508UL, 862348UL,  1084269UL, 1243780UL, 1670726UL,
+    113915UL,  1238094UL, 2072807UL, 747204UL,  1720638UL, 1209747UL, 641725UL,  1715839UL, 1735000UL, 1320450UL,
+    368769UL,  557370UL,  1898083UL, 1044537UL, 1327635UL, 1898586UL, 1185343UL, 1395967UL, 1352815UL, 1890136UL,
+    1030193UL, 827554UL,  522641UL,  1824939UL, 393581UL,  131704UL,  270339UL,  550350UL,  32315UL,   1238884UL,
+    1128974UL, 1456711UL, 190429UL,  1197071UL, 1348662UL, 769566UL,  1614342UL, 1514834UL, 668552UL,  676775UL,
+    627830UL,  880053UL,  239147UL,  37597UL,
+};
+uint32_t rand_arr_22_b22_w32_arr[1024] = {
+    151274UL,  4108417UL, 3970972UL, 213497UL,  3806418UL, 3598865UL, 2597567UL, 425568UL,  126286UL,  3269295UL,
+    672071UL,  3578363UL, 527994UL,  4037487UL, 3278044UL, 2415897UL, 3887917UL, 3248790UL, 589369UL,  3480759UL,
+    3348562UL, 3911414UL, 2014867UL, 535441UL,  3603734UL, 1196505UL, 1637535UL, 1952395UL, 3216579UL, 3748868UL,
+    28216UL,   2368266UL, 1736620UL, 2573563UL, 1665848UL, 1965389UL, 780206UL,  3961886UL, 2455087UL, 3387402UL,
+    1417015UL, 670190UL,  2559450UL, 2918991UL, 2824718UL, 225877UL,  3091595UL, 3426606UL, 130164UL,  805470UL,
+    1941386UL, 2401796UL, 3009771UL, 1701225UL, 4124043UL, 954880UL,  2386816UL, 3722091UL, 3453833UL, 1870538UL,
+    1095383UL, 1371465UL, 3324850UL, 173128UL,  67889UL,   2978925UL, 3393288UL, 567845UL,  2425793UL, 3837622UL,
+    2873419UL, 1029246UL, 3216668UL, 688357UL,  707680UL,  2779763UL, 639340UL,  4054262UL, 2725401UL, 722846UL,
+    2470938UL, 473334UL,  1920662UL, 2638774UL, 2972997UL, 2869768UL, 1075485UL, 2977507UL, 803304UL,  3211414UL,
+    1617808UL, 2185853UL, 592469UL,  3367895UL, 1119161UL, 1409569UL, 887852UL,  2538078UL, 4069674UL, 1871220UL,
+    2704672UL, 3620619UL, 785604UL,  3844446UL, 1919596UL, 484023UL,  3028971UL, 3437824UL, 2799817UL, 1690550UL,
+    1290192UL, 3696830UL, 2751877UL, 569031UL,  2663946UL, 3623933UL, 3551483UL, 2493602UL, 3645204UL, 4062061UL,
+    1007441UL, 3755770UL, 1760633UL, 690632UL,  3559147UL, 2255113UL, 16786UL,   1676326UL, 3917743UL, 3531270UL,
+    281744UL,  2975841UL, 1200227UL, 2216657UL, 3304866UL, 2521105UL, 2946624UL, 368256UL,  1013825UL, 99596UL,
+    1367565UL, 1538396UL, 1359210UL, 7382UL,    3927416UL, 3326666UL, 3145700UL, 619820UL,  2853949UL, 2714621UL,
+    3458752UL, 2386819UL, 516164UL,  1609247UL, 890461UL,  1173306UL, 1789517UL, 3431552UL, 3595088UL, 2584762UL,
+    1387741UL, 211703UL,  3238774UL, 2993452UL, 1478114UL, 819076UL,  2867112UL, 2070353UL, 2873881UL, 702598UL,
+    3017363UL, 2609063UL, 1967310UL, 1169907UL, 1964444UL, 3316100UL, 2349365UL, 3125528UL, 2423644UL, 1146516UL,
+    1055470UL, 3809850UL, 1533588UL, 1815662UL, 3385319UL, 3717414UL, 1820309UL, 3747140UL, 1973541UL, 569139UL,
+    1695004UL, 2092542UL, 575692UL,  1440918UL, 987906UL,  677315UL,  451513UL,  3195573UL, 2690363UL, 3706740UL,
+    1092975UL, 3513517UL, 3699600UL, 2190977UL, 3637726UL, 3139154UL, 2633673UL, 356926UL,  2582689UL, 3734826UL,
+    2942801UL, 3473823UL, 714464UL,  4033629UL, 1540907UL, 1904010UL, 148728UL,  3595894UL, 3900686UL, 3245UL,
+    1426559UL, 3410226UL, 3066973UL, 637218UL,  2631692UL, 929002UL,  1724972UL, 1487837UL, 1182755UL, 3405384UL,
+    3145117UL, 3980986UL, 681863UL,  785506UL,  3942255UL, 1764437UL, 160456UL,  1960902UL, 2270363UL, 2822512UL,
+    1026301UL, 2612667UL, 2057736UL, 2452828UL, 2339271UL, 2533326UL, 1514392UL, 2422887UL, 3720240UL, 2833834UL,
+    2304666UL, 4122088UL, 2760827UL, 2408053UL, 1154325UL, 366863UL,  1773707UL, 4077708UL, 2674110UL, 2470560UL,
+    643718UL,  2196495UL, 2089234UL, 3388802UL, 2159734UL, 2418309UL, 2537692UL, 253572UL,  229088UL,  2105148UL,
+    3623557UL, 2479759UL, 898550UL,  3758700UL, 3276764UL, 1587991UL, 3260619UL, 3057831UL, 1953087UL, 2957758UL,
+    3421979UL, 588888UL,  913307UL,  1750359UL, 2541943UL, 1862710UL, 1165990UL, 1352925UL, 2978759UL, 3168642UL,
+    535991UL,  2283668UL, 2276071UL, 4082873UL, 2086593UL, 1383450UL, 4009226UL, 1462362UL, 974160UL,  273687UL,
+    1550302UL, 2225057UL, 3590338UL, 3980192UL, 71034UL,   2874067UL, 1960503UL, 3778766UL, 1518052UL, 1437841UL,
+    1343105UL, 2338319UL, 3296215UL, 1869780UL, 2886443UL, 1582315UL, 2493124UL, 3652405UL, 1488286UL, 4071416UL,
+    284634UL,  1654864UL, 3171186UL, 358347UL,  156176UL,  433732UL,  675201UL,  1521439UL, 526462UL,  1224569UL,
+    2399507UL, 3499235UL, 291227UL,  3504418UL, 2137731UL, 1276486UL, 3723712UL, 2699859UL, 1524561UL, 1578072UL,
+    2795791UL, 125229UL,  3451068UL, 1003846UL, 1734494UL, 1511980UL, 1458739UL, 1619745UL, 2142119UL, 3929674UL,
+    83914UL,   3976648UL, 686049UL,  3881689UL, 2329224UL, 3848902UL, 4159774UL, 2111843UL, 3286521UL, 3169102UL,
+    1085010UL, 3336513UL, 2532164UL, 219882UL,  4035524UL, 2815046UL, 3162941UL, 1607942UL, 1258344UL, 3013191UL,
+    2818179UL, 2393530UL, 3278276UL, 3738071UL, 2443569UL, 498095UL,  3811628UL, 4068385UL, 1297371UL, 3656544UL,
+    2406636UL, 211101UL,  2599810UL, 314686UL,  810476UL,  3542764UL, 1543648UL, 1986252UL, 1365179UL, 2870310UL,
+    1305925UL, 393672UL,  2774147UL, 3170343UL, 3945928UL, 2601876UL, 788507UL,  3857781UL, 2962652UL, 2950595UL,
+    1718104UL, 2812798UL, 1509082UL, 2639847UL, 1777337UL, 196344UL,  2695440UL, 3324003UL, 2031863UL, 3481134UL,
+    855519UL,  3982632UL, 1107959UL, 2530386UL, 1239901UL, 2316102UL, 283256UL,  1411075UL, 1736134UL, 766769UL,
+    3271400UL, 54356UL,   208831UL,  2244800UL, 1078960UL, 2311822UL, 1527370UL, 2473962UL, 4178437UL, 2270532UL,
+    2727775UL, 2750831UL, 3287381UL, 1157535UL, 3430165UL, 2363871UL, 499291UL,  1161320UL, 1155844UL, 590301UL,
+    1058098UL, 502079UL,  475561UL,  763340UL,  2269835UL, 2633997UL, 1238391UL, 1677976UL, 1007549UL, 2804227UL,
+    174385UL,  3985327UL, 989680UL,  3378819UL, 590533UL,  4009007UL, 2208652UL, 185257UL,  273760UL,  2582388UL,
+    3506236UL, 3584004UL, 1412957UL, 143840UL,  1041943UL, 3560475UL, 3370724UL, 1013223UL, 3258690UL, 3942720UL,
+    1873271UL, 1029683UL, 1427276UL, 2981110UL, 3802163UL, 938551UL,  2631341UL, 102337UL,  1223063UL, 2018032UL,
+    2404496UL, 1860797UL, 2186638UL, 1712182UL, 2085183UL, 1900937UL, 3912489UL, 3202705UL, 3447049UL, 2165867UL,
+    1816758UL, 3415312UL, 2625434UL, 3715477UL, 3062731UL, 4007495UL, 592808UL,  1048791UL, 1959427UL, 822883UL,
+    1718919UL, 4112731UL, 657396UL,  2205864UL, 2218655UL, 3523406UL, 2446777UL, 1301368UL, 2345925UL, 1291426UL,
+    219430UL,  3527981UL, 3742408UL, 1444141UL, 402301UL,  1147185UL, 3549634UL, 2246968UL, 3361183UL, 3168186UL,
+    1445257UL, 902484UL,  2074829UL, 3487559UL, 1298709UL, 688925UL,  1419606UL, 1156396UL, 3059018UL, 1289724UL,
+    1664092UL, 2081907UL, 2354801UL, 135069UL,  432710UL,  3930449UL, 4038076UL, 3863132UL, 413936UL,  2328768UL,
+    1190540UL, 2011191UL, 3899666UL, 2849223UL, 3423980UL, 1552682UL, 2563069UL, 3501868UL, 1874884UL, 3372987UL,
+    3202091UL, 3864362UL, 2692844UL, 858770UL,  3049194UL, 1886740UL, 1057139UL, 786752UL,  276352UL,  2604744UL,
+    1023326UL, 2556025UL, 2966722UL, 1340107UL, 2355133UL, 1797913UL, 807825UL,  2985061UL, 751210UL,  1278319UL,
+    2869921UL, 1958058UL, 86477UL,   2936294UL, 191567UL,  619342UL,  2165747UL, 576255UL,  560949UL,  2317931UL,
+    2966333UL, 3293816UL, 1093196UL, 652411UL,  2040463UL, 2356116UL, 2484239UL, 2149979UL, 3331016UL, 864881UL,
+    1470255UL, 1925052UL, 1765580UL, 793201UL,  562569UL,  3329407UL, 3149281UL, 2509562UL, 159796UL,  3349314UL,
+    808010UL,  646495UL,  3440177UL, 2762478UL, 3815711UL, 2639976UL, 2853549UL, 1278556UL, 3087881UL, 3537001UL,
+    3404179UL, 1040983UL, 3676989UL, 400990UL,  1327220UL, 3407524UL, 10926UL,   129139UL,  492982UL,  2499057UL,
+    3115679UL, 936484UL,  2726611UL, 1051063UL, 1681135UL, 1423671UL, 137574UL,  3598233UL, 4124813UL, 2823931UL,
+    3796127UL, 1318509UL, 3429266UL, 866322UL,  299844UL,  1704655UL, 2025013UL, 809295UL,  977063UL,  1892262UL,
+    3375910UL, 2449957UL, 3031002UL, 3529357UL, 4059186UL, 2132680UL, 4169532UL, 1161749UL, 1370882UL, 2724699UL,
+    1410527UL, 3862199UL, 2296861UL, 173652UL,  1522415UL, 1912035UL, 3567092UL, 906172UL,  943615UL,  1272270UL,
+    1855705UL, 1744246UL, 2699444UL, 2914274UL, 3129995UL, 2397229UL, 2352972UL, 77117UL,   3570106UL, 493317UL,
+    1332883UL, 1908643UL, 1909125UL, 2237710UL, 1231499UL, 4168326UL, 2018246UL, 2661651UL, 1885539UL, 4004990UL,
+    1245376UL, 1795096UL, 678483UL,  2320025UL, 2831621UL, 1882004UL, 99827UL,   2900722UL, 2201778UL, 3528301UL,
+    981353UL,  570909UL,  2407504UL, 3608271UL, 3092377UL, 150322UL,  1914394UL, 1621391UL, 2962863UL, 1130908UL,
+    465701UL,  904177UL,  1191083UL, 909674UL,  2957004UL, 2545710UL, 728686UL,  3757985UL, 2952763UL, 1690785UL,
+    3794511UL, 2678027UL, 1999031UL, 3212653UL, 1329665UL, 3120378UL, 3935246UL, 3439491UL, 955404UL,  1629515UL,
+    881014UL,  3045894UL, 4149709UL, 2862334UL, 843476UL,  2169365UL, 1416845UL, 3085095UL, 2104031UL, 508659UL,
+    2320090UL, 593869UL,  1383375UL, 2426089UL, 3070508UL, 3691733UL, 2726874UL, 3486341UL, 2657430UL, 3558873UL,
+    3072687UL, 1492804UL, 1154812UL, 2453575UL, 435184UL,  2335664UL, 2972783UL, 3313006UL, 2324937UL, 2097725UL,
+    1011796UL, 3066313UL, 1541183UL, 1925641UL, 2579675UL, 3065259UL, 3600278UL, 142660UL,  1982942UL, 3359955UL,
+    2770272UL, 2653372UL, 428647UL,  3739209UL, 3114412UL, 3183774UL, 1585201UL, 325188UL,  354268UL,  81111UL,
+    2894201UL, 418359UL,  1580262UL, 4164572UL, 2645407UL, 4150020UL, 3323808UL, 691100UL,  3861952UL, 2061660UL,
+    3007279UL, 2158664UL, 1141874UL, 3809976UL, 60072UL,   8712UL,    2610815UL, 657115UL,  1125855UL, 2044379UL,
+    3789892UL, 1933254UL, 4017948UL, 1923123UL, 3929234UL, 3329857UL, 3778737UL, 2046553UL, 2937406UL, 1242718UL,
+    3391737UL, 2740404UL, 1158730UL, 3505749UL, 3265704UL, 1255633UL, 948978UL,  741105UL,  3508407UL, 3178132UL,
+    769604UL,  1256858UL, 4180145UL, 1112753UL, 1340689UL, 2230567UL, 895212UL,  2855189UL, 2110371UL, 3484748UL,
+    489413UL,  4074928UL, 1395283UL, 4174966UL, 3740175UL, 3020252UL, 3152902UL, 1836498UL, 2763070UL, 3689546UL,
+    3346982UL, 4107785UL, 2466738UL, 737835UL,  2736825UL, 207617UL,  1355221UL, 1931136UL, 1898025UL, 3976622UL,
+    648423UL,  1741744UL, 323605UL,  3191724UL, 546446UL,  3773091UL, 887516UL,  2091505UL, 3495841UL, 3099388UL,
+    2517259UL, 521269UL,  1578590UL, 57988UL,   611269UL,  3418200UL, 3544893UL, 2338344UL, 1575015UL, 2825640UL,
+    3962858UL, 934777UL,  610036UL,  2581301UL, 2906968UL, 3048754UL, 224858UL,  2759097UL, 3993014UL, 797418UL,
+    556326UL,  1299116UL, 1564952UL, 1968476UL, 3984587UL, 963944UL,  3022121UL, 1350477UL, 2802268UL, 295331UL,
+    349061UL,  940681UL,  2869943UL, 3392276UL, 1144457UL, 2615697UL, 1505590UL, 2803136UL, 3896785UL, 1502817UL,
+    3542291UL, 1464556UL, 2329273UL, 3949538UL, 3305136UL, 258792UL,  3983958UL, 3772730UL, 672368UL,  3005912UL,
+    3199320UL, 2246819UL, 2146120UL, 1486176UL, 894128UL,  724911UL,  2596574UL, 3063528UL, 1954899UL, 3410881UL,
+    869050UL,  4163590UL, 2443890UL, 4110192UL, 3542947UL, 1551688UL, 4174165UL, 3326059UL, 1412478UL, 3049528UL,
+    3670926UL, 2709425UL, 1257371UL, 3110092UL, 739517UL,  3624619UL, 1756964UL, 83480UL,   5824UL,    1693171UL,
+    358304UL,  3313811UL, 2524027UL, 3095698UL, 974688UL,  3725900UL, 136237UL,  2185573UL, 2272382UL, 2262053UL,
+    1098720UL, 83121UL,   4181050UL, 3748808UL, 1399419UL, 3818699UL, 2577463UL, 1019257UL, 3879470UL, 770248UL,
+    127933UL,  2524593UL, 3975938UL, 1730430UL, 1930600UL, 429470UL,  2539007UL, 2476894UL, 1260397UL, 2296052UL,
+    620153UL,  1406920UL, 887072UL,  1865149UL, 1289456UL, 2608934UL, 1213396UL, 3550113UL, 2382657UL, 3915302UL,
+    2734678UL, 1990996UL, 1961272UL, 3225850UL, 2030856UL, 3087106UL, 393505UL,  327080UL,  2735029UL, 2173345UL,
+    1490028UL, 391293UL,  1909458UL, 2621660UL, 508919UL,  4030518UL, 786490UL,  440079UL,  1352482UL, 194223UL,
+    113261UL,  3938565UL, 2852427UL, 2689087UL, 4177824UL, 232552UL,  1382187UL, 3189337UL, 2969405UL, 906731UL,
+    3812081UL, 1339481UL, 173309UL,  3452721UL, 1316711UL, 2870730UL, 2699947UL, 324611UL,  3743546UL, 3913760UL,
+    1112600UL, 2973079UL, 3304889UL, 1841470UL, 1365381UL, 3840355UL, 1007382UL, 2608778UL, 918849UL,  1703411UL,
+    3957585UL, 1437834UL, 1205422UL, 440018UL,
+};
+uint32_t rand_arr_23_b23_w32_arr[1024] = {
+    2561892UL, 1045804UL, 6098096UL, 8335633UL, 2218593UL, 3743205UL, 1655120UL, 5162250UL, 809507UL,  6497291UL,
+    5304672UL, 3625153UL, 5844750UL, 3539UL,    7466143UL, 1381443UL, 550612UL,  1574953UL, 8310018UL, 3677251UL,
+    5645247UL, 7809986UL, 1075808UL, 5938275UL, 5445411UL, 2654218UL, 2295353UL, 2402383UL, 1284838UL, 6849358UL,
+    3710848UL, 386877UL,  3577724UL, 7050296UL, 2396924UL, 2614826UL, 6624829UL, 6685940UL, 6668559UL, 555426UL,
+    5059656UL, 85846UL,   6968059UL, 6074951UL, 1098422UL, 5776095UL, 1561870UL, 6151913UL, 7182908UL, 6652473UL,
+    376598UL,  7900873UL, 2365221UL, 3124569UL, 6833909UL, 6850251UL, 769385UL,  7022815UL, 6322205UL, 3258077UL,
+    2270123UL, 6199205UL, 3610604UL, 1659008UL, 6052697UL, 1078008UL, 7711851UL, 516029UL,  2779513UL, 464936UL,
+    329066UL,  7765338UL, 4935048UL, 5767449UL, 3572229UL, 4581963UL, 699018UL,  5125827UL, 5165865UL, 2284779UL,
+    4125935UL, 4251584UL, 3496999UL, 3789486UL, 1644657UL, 4072949UL, 203418UL,  5843783UL, 1761387UL, 6798970UL,
+    4976977UL, 1059878UL, 2966944UL, 904550UL,  2234298UL, 1590781UL, 5543283UL, 7097501UL, 4463353UL, 3753608UL,
+    4816802UL, 8074567UL, 390059UL,  3412250UL, 735476UL,  6303457UL, 5632238UL, 4522441UL, 2043790UL, 1656103UL,
+    4427874UL, 148969UL,  3833710UL, 7380116UL, 1808777UL, 5109269UL, 8171477UL, 6890476UL, 4130805UL, 4095041UL,
+    6916037UL, 977670UL,  5547549UL, 6174571UL, 3132551UL, 2410328UL, 1773009UL, 3541050UL, 8124586UL, 4909981UL,
+    7448323UL, 5114941UL, 7950055UL, 2369370UL, 6800214UL, 6882232UL, 3094123UL, 4931221UL, 2455777UL, 407342UL,
+    3506421UL, 1377452UL, 2222680UL, 5263780UL, 5578707UL, 5887773UL, 8314178UL, 5494821UL, 1377176UL, 7013951UL,
+    3346995UL, 2230147UL, 1988888UL, 4068146UL, 1663735UL, 4838123UL, 791384UL,  1564699UL, 5209589UL, 2020866UL,
+    7116994UL, 5648892UL, 2622483UL, 2439349UL, 1856583UL, 2046544UL, 4283966UL, 450762UL,  5674552UL, 5370239UL,
+    2606041UL, 8001916UL, 5882657UL, 3228209UL, 3034762UL, 4686371UL, 4559572UL, 529216UL,  5950440UL, 2311278UL,
+    7350200UL, 6995699UL, 6972028UL, 2476453UL, 5395918UL, 6947982UL, 7633663UL, 2653519UL, 1506001UL, 2122818UL,
+    7477606UL, 4915305UL, 2599801UL, 767195UL,  4476009UL, 5414923UL, 5812115UL, 7678818UL, 3831745UL, 3691692UL,
+    4808363UL, 4182242UL, 1380079UL, 6933103UL, 7012693UL, 1699766UL, 737290UL,  4616705UL, 3624407UL, 2154458UL,
+    6825539UL, 6708120UL, 6461159UL, 4979493UL, 2517081UL, 641925UL,  7587741UL, 2079312UL, 7503285UL, 5676912UL,
+    8060555UL, 8295691UL, 3993661UL, 5009814UL, 4489293UL, 2830951UL, 1400889UL, 4617476UL, 254614UL,  7729113UL,
+    1856647UL, 4794130UL, 1630260UL, 1277455UL, 3684186UL, 478973UL,  2552270UL, 6044819UL, 5116470UL, 935539UL,
+    683519UL,  8033108UL, 2569425UL, 7320140UL, 128201UL,  6065729UL, 3888432UL, 2484030UL, 605236UL,  1878147UL,
+    3555045UL, 5625583UL, 403675UL,  6650346UL, 827737UL,  3448634UL, 5127540UL, 783232UL,  4924550UL, 8129172UL,
+    8007646UL, 6712154UL, 5082651UL, 4961462UL, 3070350UL, 4247964UL, 2431296UL, 5539575UL, 6127207UL, 1235837UL,
+    2471603UL, 4626997UL, 2742563UL, 1220962UL, 7112952UL, 3618279UL, 4746345UL, 7224598UL, 1938065UL, 6951562UL,
+    7028150UL, 8064016UL, 7437608UL, 1945027UL, 7426240UL, 2563809UL, 384851UL,  7337602UL, 7138135UL, 3717721UL,
+    2145515UL, 1078669UL, 6824252UL, 8004920UL, 2180298UL, 5946426UL, 560198UL,  6249696UL, 6883091UL, 1265158UL,
+    8134132UL, 5436673UL, 2865587UL, 5522942UL, 4466876UL, 7718773UL, 4283634UL, 4673643UL, 6322911UL, 4868515UL,
+    5674792UL, 436116UL,  4204161UL, 6075227UL, 1263038UL, 1285010UL, 816028UL,  672974UL,  3148815UL, 2699842UL,
+    6876734UL, 1078952UL, 6378213UL, 1151159UL, 3941368UL, 3133145UL, 4864606UL, 5753612UL, 3774013UL, 849135UL,
+    7230335UL, 6988116UL, 2197839UL, 559994UL,  6761522UL, 6758816UL, 6385393UL, 5474176UL, 2908820UL, 2478063UL,
+    4037174UL, 2343783UL, 6121007UL, 1385830UL, 4226130UL, 1881813UL, 3386314UL, 6267955UL, 6243077UL, 4087640UL,
+    5522661UL, 1711224UL, 1363736UL, 7008097UL, 5298259UL, 6522867UL, 1057085UL, 5470628UL, 4492254UL, 300582UL,
+    3159621UL, 7762056UL, 415116UL,  4549326UL, 3720095UL, 2331181UL, 7913106UL, 3599486UL, 968543UL,  3978575UL,
+    7284393UL, 7757193UL, 273869UL,  2785582UL, 4143645UL, 5331603UL, 1255767UL, 4919935UL, 2984997UL, 671389UL,
+    1514682UL, 5622394UL, 7484871UL, 8354836UL, 2365287UL, 6275453UL, 4045533UL, 370155UL,  2876451UL, 16895UL,
+    5635238UL, 5635372UL, 2954534UL, 7006868UL, 6459198UL, 6801681UL, 7571293UL, 853502UL,  7902170UL, 5979684UL,
+    7711310UL, 2702373UL, 4890386UL, 7375457UL, 6840619UL, 111329UL,  3915005UL, 6150705UL, 2582302UL, 4820708UL,
+    5551933UL, 7454490UL, 2686795UL, 3234330UL, 1023145UL, 3469046UL, 744029UL,  4213227UL, 7187075UL, 3179528UL,
+    6012850UL, 7238233UL, 7045738UL, 766868UL,  4989125UL, 953238UL,  6369249UL, 1071913UL, 8333330UL, 4761942UL,
+    6384083UL, 357867UL,  8104335UL, 5681858UL, 2448154UL, 7510708UL, 5729728UL, 726535UL,  4604873UL, 377153UL,
+    1954910UL, 1703844UL, 6862475UL, 1907276UL, 7488208UL, 4403926UL, 6752150UL, 394356UL,  179799UL,  2565125UL,
+    2335149UL, 7348234UL, 1496982UL, 5559400UL, 2676078UL, 1106876UL, 6586895UL, 6053078UL, 8089368UL, 8178046UL,
+    4081974UL, 1857897UL, 5064193UL, 2147912UL, 3608884UL, 8081235UL, 5699366UL, 5446402UL, 1736487UL, 4498987UL,
+    1773167UL, 3948497UL, 3190847UL, 5725005UL, 1660276UL, 6932432UL, 2817036UL, 5847778UL, 3820434UL, 2433481UL,
+    3478272UL, 2283067UL, 2871975UL, 759640UL,  6357456UL, 328022UL,  2388896UL, 6513243UL, 4645576UL, 6206993UL,
+    7275630UL, 4229932UL, 2401686UL, 7986365UL, 876005UL,  5217848UL, 6732058UL, 4970304UL, 7080038UL, 744349UL,
+    2695419UL, 7010933UL, 7735868UL, 1861803UL, 3973461UL, 230581UL,  1735519UL, 3402220UL, 5480480UL, 3769939UL,
+    726467UL,  5302385UL, 5976073UL, 6187689UL, 1429329UL, 3879488UL, 3728138UL, 3224194UL, 1174180UL, 2663902UL,
+    6987695UL, 5556622UL, 6839433UL, 2877115UL, 7536637UL, 7333260UL, 6116315UL, 7120082UL, 4740194UL, 4605901UL,
+    723062UL,  1211327UL, 1203259UL, 2926090UL, 6453470UL, 4408420UL, 1658800UL, 2113151UL, 7836939UL, 7758896UL,
+    3677055UL, 4817173UL, 375949UL,  963264UL,  4858857UL, 1552994UL, 7388702UL, 5968720UL, 653658UL,  5502440UL,
+    866416UL,  5572683UL, 5481276UL, 7772640UL, 8168201UL, 3887297UL, 4374079UL, 2455102UL, 2281906UL, 4289570UL,
+    7825254UL, 6728207UL, 4266941UL, 4103838UL, 541455UL,  7059965UL, 4330096UL, 4950159UL, 2905086UL, 8351457UL,
+    7020650UL, 3950524UL, 1974485UL, 1332706UL, 3561060UL, 4600153UL, 1268832UL, 1295645UL, 2400940UL, 2226534UL,
+    5199070UL, 3736626UL, 2000212UL, 2609312UL, 1606783UL, 3736532UL, 6977389UL, 8298183UL, 7165837UL, 2950309UL,
+    1990736UL, 5775888UL, 1374185UL, 8226244UL, 4080792UL, 532492UL,  7198426UL, 4983263UL, 4166067UL, 3540854UL,
+    2819604UL, 1327714UL, 6295153UL, 6745342UL, 3623516UL, 3719869UL, 5421303UL, 1860842UL, 4773847UL, 6515013UL,
+    3076520UL, 2343341UL, 7241434UL, 864591UL,  2598066UL, 2718972UL, 8136451UL, 4859777UL, 3158444UL, 3993966UL,
+    6213534UL, 7001982UL, 1799599UL, 7909007UL, 4509989UL, 2225009UL, 4563572UL, 7240700UL, 166176UL,  6201682UL,
+    923870UL,  1615327UL, 1847037UL, 548023UL,  4583361UL, 2309259UL, 1018312UL, 7079990UL, 2419650UL, 3030269UL,
+    2661605UL, 6336770UL, 6809944UL, 1342179UL, 4291326UL, 6013378UL, 5700531UL, 3248251UL, 673712UL,  6233081UL,
+    6456799UL, 599675UL,  4498206UL, 1833200UL, 8148778UL, 7535707UL, 4569648UL, 943190UL,  7018073UL, 8183922UL,
+    2846819UL, 6497248UL, 4244176UL, 3267436UL, 6233051UL, 2613769UL, 5415104UL, 7325795UL, 6858034UL, 7622643UL,
+    3475327UL, 117013UL,  2743802UL, 5397222UL, 1859213UL, 8287936UL, 1977910UL, 4216494UL, 803825UL,  2481559UL,
+    503878UL,  6978780UL, 5441274UL, 17453UL,   5282898UL, 6557413UL, 2645662UL, 6826288UL, 3573825UL, 539019UL,
+    7436346UL, 898247UL,  5210082UL, 6129941UL, 8177280UL, 8022688UL, 978722UL,  4640752UL, 336284UL,  298512UL,
+    7310667UL, 6321749UL, 1480628UL, 6115747UL, 1132098UL, 1919394UL, 7251447UL, 7989734UL, 6014750UL, 2885817UL,
+    850155UL,  1571421UL, 698638UL,  4826714UL, 4342994UL, 731178UL,  3441957UL, 419585UL,  3647030UL, 3963719UL,
+    2364591UL, 974487UL,  2503897UL, 6914953UL, 4261852UL, 2625982UL, 6950578UL, 3711396UL, 5057636UL, 2459067UL,
+    617778UL,  6593387UL, 3229699UL, 5086163UL, 549606UL,  7957282UL, 2267971UL, 3216562UL, 583576UL,  5601935UL,
+    2113811UL, 3544578UL, 6101338UL, 380784UL,  2819244UL, 2287310UL, 709098UL,  7295582UL, 5087224UL, 4552357UL,
+    2131020UL, 981474UL,  3507875UL, 1652275UL, 3167926UL, 867955UL,  6319887UL, 6895359UL, 521657UL,  172225UL,
+    7962542UL, 5018957UL, 3618370UL, 5806068UL, 3823579UL, 1871699UL, 6907163UL, 4885218UL, 825475UL,  2645696UL,
+    49759UL,   5155940UL, 3535248UL, 849886UL,  3867001UL, 6429277UL, 1628160UL, 1056693UL, 19212UL,   2273149UL,
+    4919186UL, 6300427UL, 4534892UL, 8169216UL, 1342242UL, 1783524UL, 960025UL,  3103738UL, 3982278UL, 6464950UL,
+    1306UL,    2497288UL, 1291618UL, 1851485UL, 1547259UL, 7239011UL, 8120396UL, 6902354UL, 7372642UL, 7455414UL,
+    7829895UL, 1422801UL, 3842797UL, 3196746UL, 4237539UL, 2620263UL, 560092UL,  6015810UL, 5389776UL, 7797087UL,
+    2976508UL, 6266834UL, 1407508UL, 5862016UL, 1409636UL, 3744740UL, 7541172UL, 384582UL,  3204877UL, 2786006UL,
+    7966344UL, 6634209UL, 2753273UL, 6521435UL, 4678679UL, 4300401UL, 4691476UL, 4366021UL, 2549636UL, 4955891UL,
+    1462096UL, 4466853UL, 7707831UL, 3839857UL, 47521UL,   4696861UL, 5691771UL, 1035696UL, 5500234UL, 2230513UL,
+    7826392UL, 2255583UL, 6594839UL, 1120936UL, 4146012UL, 3952392UL, 3660146UL, 4596362UL, 8280013UL, 7806462UL,
+    2369124UL, 4977211UL, 6271574UL, 7576605UL, 4920931UL, 445856UL,  4112781UL, 3920119UL, 1499232UL, 7135958UL,
+    4235325UL, 316251UL,  3498184UL, 4174798UL, 4297658UL, 4962517UL, 3372249UL, 7535777UL, 3279449UL, 6939950UL,
+    3457754UL, 5761484UL, 2326582UL, 8225197UL, 807211UL,  4972497UL, 1733923UL, 4222520UL, 686472UL,  4006223UL,
+    941954UL,  8257968UL, 159496UL,  219127UL,  5756961UL, 6526593UL, 1124159UL, 373320UL,  3364176UL, 27536UL,
+    3274185UL, 6947926UL, 4047133UL, 8318368UL, 3285214UL, 7689457UL, 8013391UL, 4839703UL, 5068447UL, 7585457UL,
+    7971780UL, 314790UL,  7036291UL, 7858514UL, 256775UL,  6545792UL, 4618981UL, 6817541UL, 1437113UL, 3681095UL,
+    802952UL,  8342799UL, 156415UL,  4080934UL, 3533273UL, 6464535UL, 1790480UL, 2928532UL, 3360252UL, 5900019UL,
+    4919891UL, 340374UL,  6344577UL, 4585529UL, 6718380UL, 1605133UL, 4142495UL, 3913378UL, 5381819UL, 915125UL,
+    5866185UL, 7327286UL, 1245685UL, 6106077UL, 2489115UL, 4557619UL, 1660900UL, 8185686UL, 719911UL,  5734952UL,
+    3562691UL, 8143958UL, 8252503UL, 3443275UL, 5960686UL, 6740242UL, 6176136UL, 2820395UL, 5590076UL, 3321564UL,
+    7360889UL, 5841303UL, 762780UL,  2856824UL, 3819389UL, 7325225UL, 3365374UL, 8071935UL, 5729697UL, 2505406UL,
+    4948232UL, 3948590UL, 1256105UL, 3985108UL, 5940267UL, 790861UL,  4611546UL, 2151840UL, 4010671UL, 5691981UL,
+    7058047UL, 1745113UL, 672521UL,  1276860UL, 2128748UL, 4569066UL, 5723400UL, 5210686UL, 1869767UL, 3479029UL,
+    1482664UL, 5499609UL, 3020718UL, 7629360UL, 2633485UL, 4977958UL, 6402110UL, 2207520UL, 3529559UL, 3512700UL,
+    3473586UL, 2297493UL, 919915UL,  6226177UL, 5168537UL, 6455158UL, 2870250UL, 3753800UL, 3648139UL, 4173729UL,
+    4496564UL, 3080493UL, 5321918UL, 6502057UL, 4041409UL, 5332539UL, 1428533UL, 1742579UL, 4960056UL, 4187533UL,
+    352399UL,  3594964UL, 1380442UL, 7428841UL, 677757UL,  3246183UL, 645057UL,  3380113UL, 7953991UL, 6956126UL,
+    341905UL,  7985762UL, 3194757UL, 6259546UL,
+};
+uint32_t rand_arr_24_b24_w32_arr[1024] = {
+    2336682UL,  6876651UL,  3886159UL,  9606588UL,  1954858UL,  5141439UL,  3398718UL,  4756643UL,  12260205UL,
+    13338162UL, 2263011UL,  1613382UL,  12686990UL, 4584437UL,  3315996UL,  1055404UL,  6240645UL,  11261464UL,
+    10607764UL, 2435044UL,  6657016UL,  4479335UL,  3800020UL,  6839284UL,  4754171UL,  12413361UL, 619316UL,
+    8031747UL,  9466550UL,  5165495UL,  9859041UL,  2085600UL,  13053596UL, 6680129UL,  11484495UL, 14054485UL,
+    14722143UL, 12344921UL, 7451837UL,  1068753UL,  3829522UL,  6070186UL,  2878814UL,  2264639UL,  16172053UL,
+    882009UL,   7692037UL,  7838029UL,  13547569UL, 16516744UL, 6072779UL,  14302965UL, 11306409UL, 4583526UL,
+    8037264UL,  2832963UL,  2212727UL,  8243698UL,  8578630UL,  15282758UL, 10868009UL, 15853564UL, 584765UL,
+    287378UL,   16364626UL, 14438746UL, 15639194UL, 4395765UL,  1210668UL,  10400673UL, 11390590UL, 3993079UL,
+    14721020UL, 13616643UL, 13613399UL, 3864437UL,  11275967UL, 6142999UL,  5522231UL,  2788752UL,  5266987UL,
+    8438089UL,  6295774UL,  12819722UL, 6437528UL,  336092UL,   1244609UL,  16476862UL, 11157235UL, 739632UL,
+    8760066UL,  15664893UL, 570725UL,   56240UL,    12402808UL, 83387UL,    7106198UL,  14248662UL, 8440767UL,
+    6465582UL,  2118260UL,  8548844UL,  11856231UL, 8131468UL,  997581UL,   5053517UL,  16072095UL, 1123631UL,
+    9132595UL,  1846578UL,  1138687UL,  12624898UL, 5718908UL,  9264116UL,  3287427UL,  6309715UL,  299717UL,
+    5201015UL,  6972364UL,  3922151UL,  7788628UL,  8148124UL,  12381736UL, 10286224UL, 14132861UL, 5558655UL,
+    13813356UL, 15394252UL, 10330916UL, 6636656UL,  6921937UL,  8015698UL,  13310503UL, 10304067UL, 15552973UL,
+    10931797UL, 98286UL,    1487963UL,  1124669UL,  10997373UL, 10680473UL, 15261697UL, 13173078UL, 5512719UL,
+    6309368UL,  9261981UL,  10336665UL, 9881325UL,  7478807UL,  10487641UL, 5566100UL,  3587659UL,  13691124UL,
+    3605556UL,  3228853UL,  1379515UL,  14328076UL, 403944UL,   1317843UL,  8871047UL,  1891794UL,  3990855UL,
+    887741UL,   9246874UL,  4472861UL,  3250587UL,  11348781UL, 11828846UL, 11735890UL, 11808851UL, 9483042UL,
+    15079838UL, 7332044UL,  9962196UL,  8725014UL,  9060036UL,  6413932UL,  6185518UL,  9482061UL,  5956760UL,
+    7284305UL,  11150320UL, 7930030UL,  2032076UL,  266829UL,   6671929UL,  13173432UL, 7257791UL,  3893065UL,
+    9962285UL,  11180176UL, 9231155UL,  13466382UL, 11065606UL, 14654890UL, 7557036UL,  1122088UL,  15919277UL,
+    3891369UL,  13320244UL, 6393818UL,  2138517UL,  921109UL,   6076917UL,  12802622UL, 7599502UL,  16749917UL,
+    10215082UL, 1261078UL,  14737563UL, 1639037UL,  11769381UL, 8648589UL,  9520354UL,  195412UL,   9915194UL,
+    180665UL,   12030455UL, 3426498UL,  12889123UL, 15391913UL, 16108471UL, 9018947UL,  1192252UL,  5828279UL,
+    4319786UL,  10446307UL, 8600650UL,  7561477UL,  9604457UL,  4499576UL,  5502966UL,  9374739UL,  6179133UL,
+    12392336UL, 4182071UL,  13061844UL, 586770UL,   15118487UL, 9998266UL,  12999954UL, 16377106UL, 6402487UL,
+    8714988UL,  3280499UL,  3364535UL,  12941878UL, 9634098UL,  2557764UL,  7627484UL,  2865688UL,  5655887UL,
+    2944614UL,  1338847UL,  8303524UL,  5949597UL,  2473888UL,  3323446UL,  12460476UL, 5657261UL,  10913360UL,
+    3995298UL,  10089950UL, 12873194UL, 2622136UL,  9666304UL,  15159344UL, 5784347UL,  990772UL,   3789273UL,
+    13620336UL, 6105726UL,  6600788UL,  5410419UL,  9259792UL,  9395344UL,  8097935UL,  10549751UL, 13883044UL,
+    3119203UL,  6163791UL,  6039995UL,  15807841UL, 8849780UL,  13519543UL, 6874073UL,  13213520UL, 13348326UL,
+    9793301UL,  699987UL,   13251273UL, 4866790UL,  8803659UL,  13221253UL, 9922098UL,  6290847UL,  1961064UL,
+    4074710UL,  937910UL,   5954459UL,  13750801UL, 12426159UL, 16747389UL, 1478918UL,  10105526UL, 10179752UL,
+    12267397UL, 6189344UL,  3200475UL,  3345074UL,  14727311UL, 5569120UL,  1440685UL,  7763703UL,  10993920UL,
+    14927928UL, 5854637UL,  5522548UL,  16255597UL, 3133159UL,  12508370UL, 8972857UL,  5197666UL,  4274561UL,
+    12508254UL, 13374432UL, 66316UL,    15443840UL, 15141202UL, 8233968UL,  11688324UL, 8306774UL,  3867588UL,
+    4645538UL,  3822245UL,  12233684UL, 16201363UL, 2524844UL,  11256558UL, 5089825UL,  12065491UL, 5964816UL,
+    2994841UL,  7396430UL,  12629190UL, 10178116UL, 3937138UL,  6240344UL,  112594UL,   5437386UL,  15229625UL,
+    14191620UL, 16267161UL, 4345358UL,  3881216UL,  3900843UL,  16277754UL, 13814310UL, 9895633UL,  15674138UL,
+    3109976UL,  2140740UL,  12108208UL, 7443206UL,  12987315UL, 15970617UL, 7436725UL,  11987628UL, 14333918UL,
+    3945452UL,  7162759UL,  12055969UL, 11308457UL, 6986847UL,  13223631UL, 3562397UL,  2720974UL,  1145516UL,
+    9479939UL,  1540336UL,  529824UL,   1655175UL,  8890985UL,  14559274UL, 5116724UL,  7900331UL,  14546736UL,
+    14236882UL, 3079841UL,  5293072UL,  3131327UL,  5639798UL,  8394505UL,  204444UL,   549281UL,   4580880UL,
+    2197702UL,  8041935UL,  5963938UL,  10596257UL, 6388855UL,  10496869UL, 16384950UL, 13815911UL, 4956324UL,
+    2511219UL,  5857100UL,  1301719UL,  8462984UL,  4601934UL,  15412549UL, 11671561UL, 7339701UL,  14523857UL,
+    6626358UL,  6499335UL,  9694036UL,  11570918UL, 13394847UL, 2787980UL,  3343636UL,  15001158UL, 14109283UL,
+    4043733UL,  6123191UL,  12062212UL, 2682272UL,  14525024UL, 11706852UL, 3740824UL,  16541848UL, 9894605UL,
+    6292021UL,  11657690UL, 11492683UL, 9004135UL,  9346154UL,  7538804UL,  9840745UL,  2981746UL,  1141837UL,
+    8763706UL,  824620UL,   3287011UL,  6105575UL,  15015925UL, 5887679UL,  5532237UL,  10620680UL, 15109665UL,
+    13280003UL, 11955373UL, 16636088UL, 5948687UL,  5997421UL,  8554254UL,  8408285UL,  1103020UL,  823744UL,
+    8458369UL,  12984132UL, 4888512UL,  14433363UL, 7579579UL,  8720383UL,  16240407UL, 16313255UL, 2293835UL,
+    6201090UL,  7062187UL,  12912057UL, 9027910UL,  7750233UL,  5657990UL,  7354001UL,  11085542UL, 13822744UL,
+    2522033UL,  3227278UL,  2334273UL,  15730136UL, 13967610UL, 3716779UL,  3269124UL,  11002482UL, 8398145UL,
+    1921895UL,  412689UL,   6490112UL,  7359892UL,  193610UL,   2785347UL,  11069953UL, 14804798UL, 4478081UL,
+    3643095UL,  13671303UL, 1528394UL,  15125217UL, 6635591UL,  16122420UL, 2309395UL,  13218855UL, 1734261UL,
+    13771499UL, 7657256UL,  110575UL,   6675481UL,  9902080UL,  15275237UL, 7440732UL,  11255212UL, 11459633UL,
+    4510186UL,  8341440UL,  15215464UL, 6859740UL,  558672UL,   8439224UL,  10218660UL, 11282266UL, 3639103UL,
+    14706152UL, 8931104UL,  8252760UL,  11699000UL, 250052UL,   13788479UL, 8296580UL,  6868998UL,  15900554UL,
+    11546017UL, 15445511UL, 2820303UL,  9161883UL,  2471305UL,  11500988UL, 12180508UL, 7443849UL,  16408520UL,
+    11384173UL, 4715500UL,  15267753UL, 5255014UL,  14595748UL, 3715271UL,  14841532UL, 14401955UL, 8482951UL,
+    16110609UL, 11916UL,    5585678UL,  7150778UL,  6951633UL,  3225821UL,  659007UL,   9511842UL,  6745876UL,
+    3402171UL,  4990854UL,  9862474UL,  9831027UL,  2083722UL,  6138325UL,  15968834UL, 1448543UL,  7382679UL,
+    7233598UL,  5534947UL,  8559739UL,  2037715UL,  10852648UL, 3725473UL,  5755669UL,  9776053UL,  16348564UL,
+    11505940UL, 3606850UL,  16263652UL, 10553111UL, 2500919UL,  10054169UL, 7877498UL,  13380420UL, 7684830UL,
+    2397054UL,  12290795UL, 3665133UL,  9275935UL,  8357443UL,  10036521UL, 717544UL,   11733782UL, 11071392UL,
+    14323041UL, 12255890UL, 7343472UL,  12707391UL, 8274645UL,  6249041UL,  14480836UL, 6231493UL,  6374464UL,
+    8153153UL,  9195413UL,  15670881UL, 14253327UL, 16055409UL, 457671UL,   9537762UL,  277792UL,   12547106UL,
+    13319914UL, 10806405UL, 304599UL,   8127175UL,  560635UL,   3995819UL,  2623525UL,  13851165UL, 3614052UL,
+    15010351UL, 1866183UL,  12811482UL, 11053690UL, 1059101UL,  5657984UL,  14881517UL, 4249747UL,  14866580UL,
+    15864963UL, 14872686UL, 9778006UL,  11260912UL, 8981457UL,  6887112UL,  10153550UL, 16214219UL, 5769659UL,
+    14738611UL, 4835284UL,  2596205UL,  13245618UL, 3177520UL,  4360785UL,  8652129UL,  7133082UL,  13658312UL,
+    2961587UL,  9056203UL,  3606448UL,  3352616UL,  398292UL,   7363445UL,  15697522UL, 12159445UL, 15114111UL,
+    1723882UL,  14853573UL, 8216896UL,  8644641UL,  10717000UL, 9661839UL,  11045396UL, 4405038UL,  6960429UL,
+    16260832UL, 8817296UL,  2603606UL,  3920316UL,  13492088UL, 9871148UL,  2923981UL,  13304485UL, 11430591UL,
+    12528090UL, 4019780UL,  12947202UL, 3250217UL,  16263614UL, 12360040UL, 7256962UL,  144325UL,   7419304UL,
+    2409084UL,  8184978UL,  6045539UL,  12739762UL, 10076430UL, 13337942UL, 13676UL,    14367841UL, 5937179UL,
+    15746583UL, 15100291UL, 12823358UL, 10154174UL, 2860085UL,  8626761UL,  14723131UL, 3264395UL,  12881669UL,
+    13957605UL, 10790066UL, 8798332UL,  5509109UL,  2944282UL,  13152398UL, 14618895UL, 903228UL,   2779534UL,
+    8165565UL,  3242788UL,  16729750UL, 8149617UL,  14616272UL, 14176934UL, 8002846UL,  11306754UL, 449749UL,
+    14883215UL, 6553347UL,  295640UL,   7383912UL,  16249717UL, 11838536UL, 9651975UL,  1982392UL,  10073390UL,
+    2845639UL,  2399895UL,  11493260UL, 13626816UL, 2342751UL,  1371652UL,  3459133UL,  12244886UL, 977896UL,
+    9872315UL,  1360512UL,  10632920UL, 7608820UL,  10399820UL, 8986480UL,  13423961UL, 11037393UL, 1322052UL,
+    13506134UL, 6583154UL,  1947122UL,  10911216UL, 11884099UL, 4624520UL,  5476560UL,  12188034UL, 2996447UL,
+    15979531UL, 8738389UL,  8682347UL,  4343859UL,  4970020UL,  894371UL,   5913953UL,  14307464UL, 12576336UL,
+    15658996UL, 5219024UL,  5963417UL,  187454UL,   15806537UL, 9166645UL,  16361784UL, 9243402UL,  6052699UL,
+    14287773UL, 16008386UL, 6726317UL,  12667205UL, 1229421UL,  13521443UL, 4654415UL,  15451137UL, 193602UL,
+    4777974UL,  2745263UL,  12293999UL, 4272977UL,  10327231UL, 7783132UL,  7623915UL,  14949046UL, 10111550UL,
+    2918331UL,  6066675UL,  10003301UL, 1542396UL,  10232852UL, 6135916UL,  2003097UL,  9172622UL,  6407840UL,
+    1799346UL,  5899490UL,  14902010UL, 865405UL,   7152240UL,  8014664UL,  12166923UL, 13358788UL, 4484365UL,
+    13860856UL, 13252922UL, 13041851UL, 3971235UL,  9557888UL,  12819922UL, 11724002UL, 10559341UL, 1190890UL,
+    9763913UL,  11586191UL, 7301234UL,  2838900UL,  4479893UL,  15723915UL, 10185475UL, 8196780UL,  421893UL,
+    10433000UL, 1702906UL,  6203134UL,  3463007UL,  11491932UL, 16717658UL, 2363088UL,  4875695UL,  270996UL,
+    1745332UL,  15979547UL, 9803648UL,  14188986UL, 4552721UL,  15923984UL, 16406790UL, 956027UL,   6739111UL,
+    12058461UL, 10444934UL, 12801058UL, 2107874UL,  10887983UL, 4651629UL,  3687897UL,  4031786UL,  12978485UL,
+    14316193UL, 1488444UL,  663583UL,   6429266UL,  5666912UL,  15586757UL, 13427535UL, 1060308UL,  12705196UL,
+    7434900UL,  8708923UL,  15838589UL, 11028159UL, 8968518UL,  5389546UL,  2741429UL,  5899364UL,  12646757UL,
+    9388354UL,  16391534UL, 12776695UL, 6111746UL,  14427074UL, 15673886UL, 13998240UL, 16037740UL, 3741754UL,
+    1452987UL,  3970556UL,  13660979UL, 14968120UL, 10469944UL, 3385248UL,  11452896UL, 14649025UL, 5686792UL,
+    1244056UL,  13015433UL, 13579073UL, 13481167UL, 479225UL,   13361175UL, 9053549UL,  16131096UL, 10820067UL,
+    9957356UL,  12426019UL, 5376281UL,  2881852UL,  14724336UL, 8554039UL,  8435026UL,  5840959UL,  6144486UL,
+    10699844UL, 13081190UL, 10240798UL, 6850847UL,  3042962UL,  15576290UL, 13096097UL, 8797907UL,  367099UL,
+    10546622UL, 14341804UL, 16177643UL, 2029071UL,  7850593UL,  12405549UL, 870917UL,   677927UL,   11275008UL,
+    11492329UL, 9381720UL,  15586374UL, 4047218UL,  4835364UL,  7599974UL,  9117660UL,  1272063UL,  1191672UL,
+    7357762UL,  13937509UL, 986496UL,   3775395UL,  4090355UL,  13451720UL, 6060958UL,  8511610UL,  3031463UL,
+    5131027UL,  928835UL,   12375089UL, 9225321UL,  6897561UL,  9095078UL,  3291533UL,  1800201UL,  4400611UL,
+    1713348UL,  7617841UL,  2409205UL,  15965790UL, 15048795UL, 1255032UL,  11443846UL, 322127UL,   9820440UL,
+    3399511UL,  11298006UL, 3870761UL,  2221204UL,  4469898UL,  16088000UL, 3634853UL,  3763942UL,  9697206UL,
+    6506988UL,  886910UL,   16480322UL, 2991343UL,  5755670UL,  6608506UL,  10966589UL, 4327222UL,  5943067UL,
+    11756487UL, 11386930UL, 402035UL,   15099815UL, 2136949UL,  11286441UL, 11865807UL, 9728074UL,  9012940UL,
+    3904641UL,  16620924UL, 738932UL,   6808844UL,  5195330UL,  7802882UL,  699876UL,   7408088UL,  1153807UL,
+    1298775UL,  9388087UL,  3528032UL,  14766609UL, 11242818UL, 9133843UL,  3758999UL,  13727814UL, 11021895UL,
+    10226159UL, 11891837UL, 13600585UL, 1896459UL,  4697458UL,  7923528UL,  2597909UL,  1381925UL,  1559156UL,
+    9851395UL,  7178127UL,  8127725UL,  11669888UL, 13822429UL, 8521621UL,  11946888UL,
+};
+uint32_t rand_arr_25_b25_w32_arr[1024] = {
+    23618826UL, 30297245UL, 32649804UL, 27799455UL, 29737403UL, 13414531UL, 4131225UL,  13459949UL, 7108723UL,
+    15056683UL, 1393896UL,  32490810UL, 27945581UL, 14701246UL, 31733583UL, 25354829UL, 29081164UL, 17651038UL,
+    12351812UL, 31734415UL, 12959350UL, 27639883UL, 19203360UL, 24122368UL, 12822699UL, 3212963UL,  20332441UL,
+    31407293UL, 8491773UL,  16656181UL, 26176641UL, 26301734UL, 31540167UL, 24876380UL, 3035168UL,  7467443UL,
+    6453018UL,  8764534UL,  27774243UL, 7165377UL,  11637578UL, 11899269UL, 14025849UL, 33456782UL, 8484468UL,
+    27683779UL, 10169390UL, 15197752UL, 28973424UL, 24136973UL, 27039711UL, 24283117UL, 21811775UL, 10339938UL,
+    17354748UL, 2767299UL,  6638282UL,  6355273UL,  7022343UL,  1200512UL,  18622564UL, 3154647UL,  20072368UL,
+    7463532UL,  8964626UL,  16891072UL, 26037322UL, 12334449UL, 15758250UL, 848210UL,   32740937UL, 22590121UL,
+    17485021UL, 21410152UL, 5105538UL,  6722951UL,  33433844UL, 22830686UL, 20344483UL, 30548410UL, 10468347UL,
+    30420947UL, 27346281UL, 16988216UL, 28475358UL, 13835155UL, 2904158UL,  16226907UL, 9884076UL,  18292267UL,
+    14306325UL, 12660949UL, 7815341UL,  18675542UL, 20654079UL, 17883692UL, 20972153UL, 18731687UL, 18806289UL,
+    14643526UL, 2399924UL,  30912153UL, 27626839UL, 17316845UL, 30546304UL, 11533551UL, 28628059UL, 10406712UL,
+    5736917UL,  14684591UL, 28442538UL, 14339101UL, 20691004UL, 9684933UL,  1609982UL,  5866514UL,  10630323UL,
+    28446102UL, 30222971UL, 26094996UL, 17002140UL, 4191369UL,  21697999UL, 28789124UL, 30388926UL, 4147594UL,
+    15085502UL, 27551806UL, 998335UL,   6456320UL,  143049UL,   1575973UL,  18008306UL, 4670744UL,  175557UL,
+    16785889UL, 12255666UL, 9770708UL,  9964930UL,  25168459UL, 25627542UL, 6177672UL,  28866630UL, 32323475UL,
+    30209948UL, 31549846UL, 23145552UL, 14195939UL, 4576271UL,  21650916UL, 6500342UL,  14537584UL, 8971479UL,
+    13518185UL, 19091871UL, 3283900UL,  25540626UL, 15190258UL, 16943823UL, 8999980UL,  21628138UL, 32512118UL,
+    27558576UL, 28657939UL, 23960894UL, 13968853UL, 12957258UL, 14360738UL, 3205834UL,  23312012UL, 16840501UL,
+    3961827UL,  532018UL,   22046244UL, 20550440UL, 4996092UL,  22941456UL, 33411301UL, 7618507UL,  15607467UL,
+    15996853UL, 18726699UL, 3642791UL,  28542029UL, 30732693UL, 14165928UL, 16394128UL, 26946815UL, 14056217UL,
+    26999950UL, 15012157UL, 31539717UL, 12510840UL, 409865UL,   20634598UL, 7131121UL,  32979707UL, 27459010UL,
+    23969667UL, 8513270UL,  23886825UL, 11654034UL, 10268948UL, 14706725UL, 20914860UL, 18536524UL, 15468390UL,
+    29050776UL, 6659566UL,  7657299UL,  19260422UL, 24557892UL, 1920458UL,  16879944UL, 13005986UL, 9965073UL,
+    15457352UL, 10301178UL, 33387893UL, 28698831UL, 1800638UL,  12104550UL, 9161511UL,  4319496UL,  23285346UL,
+    1466400UL,  27376300UL, 16289567UL, 24595926UL, 17666037UL, 7478732UL,  23201202UL, 19089656UL, 26357609UL,
+    25790319UL, 15184425UL, 25167303UL, 25534920UL, 28975316UL, 12855227UL, 30782877UL, 30403606UL, 16713671UL,
+    2248385UL,  8011705UL,  1020931UL,  14498757UL, 1703785UL,  12526831UL, 28899729UL, 2026428UL,  5459001UL,
+    25031872UL, 27802902UL, 21703754UL, 18606140UL, 33033370UL, 31211617UL, 9883785UL,  1973080UL,  3204493UL,
+    929489UL,   7772133UL,  32881112UL, 7623732UL,  23732135UL, 24989823UL, 12806115UL, 1079876UL,  10199147UL,
+    27474240UL, 23365123UL, 19311591UL, 16494897UL, 12663429UL, 14826493UL, 11374353UL, 19548341UL, 20627023UL,
+    26141171UL, 16794470UL, 24057515UL, 27447772UL, 6655391UL,  28150998UL, 3436636UL,  6571885UL,  18969962UL,
+    10912326UL, 23949657UL, 3763016UL,  21049695UL, 19803251UL, 10368100UL, 9940455UL,  6429981UL,  8087976UL,
+    26294464UL, 1150379UL,  32285570UL, 22832018UL, 26070097UL, 24201843UL, 28464178UL, 2210248UL,  15114548UL,
+    5972181UL,  8019113UL,  8862496UL,  2442966UL,  24338599UL, 4949369UL,  19799591UL, 12693018UL, 15447553UL,
+    5579135UL,  10802838UL, 16523278UL, 23045660UL, 24298480UL, 13062977UL, 24947709UL, 3962585UL,  22005962UL,
+    20466065UL, 5919664UL,  16818807UL, 16570891UL, 2832786UL,  13084698UL, 2332985UL,  28453696UL, 28489358UL,
+    21989543UL, 25146595UL, 14703200UL, 24004948UL, 22884285UL, 23507436UL, 24492855UL, 8979155UL,  18067910UL,
+    12481714UL, 29495210UL, 15322018UL, 16907936UL, 6168277UL,  14906599UL, 20819045UL, 3513576UL,  26192853UL,
+    30661835UL, 2434193UL,  21486194UL, 618624UL,   27919674UL, 7232030UL,  2472833UL,  21489992UL, 25078972UL,
+    126289UL,   14101998UL, 13788953UL, 24838972UL, 15161956UL, 28231094UL, 6843863UL,  70138UL,    24185646UL,
+    25516149UL, 27286693UL, 21438388UL, 30346041UL, 12029117UL, 13170338UL, 12334215UL, 412138UL,   15570164UL,
+    27439949UL, 16115174UL, 8112125UL,  652759UL,   28373890UL, 26109581UL, 13034178UL, 32757448UL, 3308739UL,
+    4310171UL,  6115683UL,  4367180UL,  18388753UL, 1384208UL,  22829466UL, 33513479UL, 3698352UL,  11469751UL,
+    513469UL,   33263116UL, 19792138UL, 1528662UL,  8813919UL,  16917027UL, 10273139UL, 10009262UL, 4697640UL,
+    13548328UL, 27171688UL, 12783317UL, 30481439UL, 17963971UL, 2905243UL,  13256929UL, 32295737UL, 4229218UL,
+    26201794UL, 14040054UL, 11324668UL, 6302896UL,  20239798UL, 4427271UL,  1925092UL,  22969184UL, 12530720UL,
+    3028844UL,  746396UL,   32188314UL, 21770498UL, 22336289UL, 6949304UL,  8261325UL,  23424871UL, 1355168UL,
+    681924UL,   27776292UL, 6041918UL,  26357173UL, 29812412UL, 13997950UL, 15049198UL, 20886966UL, 18980032UL,
+    18957294UL, 16386347UL, 4349562UL,  29010431UL, 3444197UL,  20152140UL, 5263340UL,  18390166UL, 21063066UL,
+    28632840UL, 13143790UL, 26243109UL, 17874556UL, 7718737UL,  265552UL,   6263126UL,  25936267UL, 20818161UL,
+    11529365UL, 33370151UL, 24551248UL, 29746724UL, 32532809UL, 25201880UL, 22687940UL, 7117837UL,  31964838UL,
+    15780621UL, 31130192UL, 30044865UL, 12392918UL, 20039733UL, 14076268UL, 2591568UL,  5118942UL,  8992907UL,
+    18483569UL, 19894707UL, 27225299UL, 13002929UL, 17900624UL, 10146548UL, 28986029UL, 18335835UL, 24153670UL,
+    8099687UL,  28794239UL, 22288464UL, 25333972UL, 8179231UL,  24601701UL, 32211212UL, 32749330UL, 27937715UL,
+    26173645UL, 31447999UL, 29351363UL, 25876090UL, 22212770UL, 21173257UL, 17193155UL, 30253298UL, 29470169UL,
+    8536961UL,  23412458UL, 24410333UL, 25809525UL, 5829785UL,  13577872UL, 15031749UL, 8130775UL,  5997127UL,
+    29846799UL, 3001393UL,  17260271UL, 17559810UL, 12446901UL, 20069328UL, 12270171UL, 19387408UL, 30120949UL,
+    3221905UL,  19008578UL, 9064361UL,  15867995UL, 16310895UL, 9575024UL,  26801826UL, 32974972UL, 19803056UL,
+    1748813UL,  31468282UL, 8332141UL,  5874718UL,  22385155UL, 16418731UL, 9600476UL,  13826011UL, 29285334UL,
+    32690487UL, 14732072UL, 1816814UL,  31151270UL, 22844669UL, 24098930UL, 14977922UL, 3548870UL,  23074522UL,
+    1788703UL,  10856857UL, 1598116UL,  18647915UL, 2160349UL,  28580652UL, 23658693UL, 18451508UL, 32019202UL,
+    27804496UL, 6360215UL,  8847705UL,  28226369UL, 1354126UL,  29822689UL, 23325412UL, 14913808UL, 33287007UL,
+    7882850UL,  23633560UL, 12695974UL, 26451388UL, 2021923UL,  24272915UL, 14305112UL, 6093366UL,  5150761UL,
+    16080265UL, 24038011UL, 25713144UL, 24113894UL, 20663607UL, 1438190UL,  19908440UL, 25246187UL, 14054400UL,
+    14816954UL, 29334964UL, 4535220UL,  11844280UL, 19600683UL, 27786008UL, 26120673UL, 14831610UL, 23245087UL,
+    25192295UL, 6790295UL,  13806077UL, 5776129UL,  4260767UL,  15364398UL, 2325143UL,  15233568UL, 31074327UL,
+    1424326UL,  28773306UL, 11836616UL, 1183571UL,  4104084UL,  8442309UL,  21297292UL, 12916514UL, 3584198UL,
+    21980209UL, 1139854UL,  32314065UL, 20566417UL, 30247249UL, 22288542UL, 19328077UL, 6307339UL,  15033601UL,
+    929354UL,   13167320UL, 19176213UL, 14913701UL, 20887959UL, 29382833UL, 28329788UL, 10046445UL, 10262155UL,
+    26793194UL, 24352875UL, 2665953UL,  15125649UL, 13550062UL, 26309732UL, 14749110UL, 24652871UL, 15652453UL,
+    11379017UL, 166247UL,   12909935UL, 27473888UL, 14739181UL, 11629470UL, 7391882UL,  18298847UL, 9076031UL,
+    25790594UL, 7627376UL,  13862524UL, 6771922UL,  3432744UL,  6995344UL,  11844745UL, 11608922UL, 27405554UL,
+    27133916UL, 19873849UL, 985894UL,   19679109UL, 7744842UL,  29841830UL, 27085369UL, 18453938UL, 598464UL,
+    16095396UL, 7902189UL,  1869047UL,  29001077UL, 14637260UL, 12145105UL, 28149587UL, 32527026UL, 19742123UL,
+    14797396UL, 1207463UL,  6518289UL,  12103148UL, 15598050UL, 12344484UL, 25751355UL, 3753405UL,  3133409UL,
+    6001778UL,  18833238UL, 21005837UL, 24954078UL, 27152032UL, 4691345UL,  25958611UL, 16612951UL, 27483024UL,
+    26610545UL, 22189238UL, 15846418UL, 7099531UL,  2636474UL,  14532010UL, 31533333UL, 19577855UL, 3314520UL,
+    30112744UL, 2950333UL,  11351671UL, 11773573UL, 16763151UL, 1279054UL,  22774985UL, 4264275UL,  22067390UL,
+    1421141UL,  19923907UL, 24639933UL, 2499242UL,  3713523UL,  17792878UL, 10438213UL, 30357293UL, 15323208UL,
+    23319565UL, 14071380UL, 29519365UL, 25784127UL, 14357140UL, 29774740UL, 15978581UL, 15946501UL, 19473211UL,
+    16993195UL, 4769470UL,  28823396UL, 13082810UL, 17747986UL, 22987109UL, 12133392UL, 13943532UL, 24064491UL,
+    25323275UL, 13323519UL, 17840871UL, 20338329UL, 33341747UL, 29673454UL, 33264724UL, 17186170UL, 16002749UL,
+    16568131UL, 10818196UL, 14684990UL, 27020980UL, 12555802UL, 8289884UL,  23891812UL, 15947491UL, 2499061UL,
+    10358627UL, 8386695UL,  952224UL,   18342995UL, 29846520UL, 2642163UL,  8444443UL,  31559391UL, 10245384UL,
+    9325019UL,  26748905UL, 33142832UL, 7079270UL,  6005507UL,  32766245UL, 1012238UL,  3004977UL,  30141309UL,
+    25198553UL, 1513717UL,  13900451UL, 14683579UL, 3093305UL,  17205711UL, 2316896UL,  14386937UL, 6959129UL,
+    7507376UL,  4986839UL,  30699019UL, 7951559UL,  22145084UL, 25121173UL, 13119958UL, 7017819UL,  21009047UL,
+    30406184UL, 18848566UL, 31331327UL, 24229876UL, 16264387UL, 23968969UL, 8747707UL,  3337547UL,  32246219UL,
+    25044614UL, 1083182UL,  10794058UL, 11952171UL, 6048657UL,  2181341UL,  2390024UL,  26356891UL, 21899758UL,
+    22766584UL, 10553787UL, 19063508UL, 2414745UL,  9692381UL,  20350841UL, 22243401UL, 13097788UL, 25995534UL,
+    10386941UL, 25889489UL, 19091104UL, 1395874UL,  9231081UL,  4760640UL,  20107827UL, 30749951UL, 17637116UL,
+    23339854UL, 9902002UL,  13126456UL, 7664563UL,  32706535UL, 19929149UL, 18339168UL, 247088UL,   15226939UL,
+    24996855UL, 1425687UL,  4660955UL,  2207870UL,  22309625UL, 28688152UL, 1406027UL,  27966318UL, 32453045UL,
+    17140494UL, 9926353UL,  2739282UL,  8957923UL,  28730170UL, 6848801UL,  8198302UL,  4887146UL,  895709UL,
+    31061071UL, 7772463UL,  29470483UL, 28464993UL, 5013617UL,  33135521UL, 26366197UL, 8358547UL,  16914267UL,
+    1095757UL,  28836284UL, 22475185UL, 9511363UL,  18323616UL, 3763886UL,  10693050UL, 5980629UL,  11341113UL,
+    778550UL,   9571102UL,  22056032UL, 20777678UL, 32880250UL, 16364442UL, 26661619UL, 29043934UL, 8542587UL,
+    13015609UL, 15387585UL, 26280342UL, 26179063UL, 11819073UL, 2471402UL,  21933201UL, 18245201UL, 12019043UL,
+    16430848UL, 20515875UL, 29706660UL, 3266128UL,  13474268UL, 10321227UL, 14455830UL, 13169670UL, 7440880UL,
+    29502971UL, 19464408UL, 29448965UL, 20979666UL, 32766847UL, 23278403UL, 9668590UL,  13104382UL, 21832865UL,
+    6920098UL,  23608713UL, 3874892UL,  24718693UL, 7302335UL,  13484380UL, 18015988UL, 6545161UL,  8684682UL,
+    8348359UL,  369672UL,   18590408UL, 28000943UL, 20820887UL, 29343948UL, 33349540UL, 17678400UL, 25735729UL,
+    27928431UL, 10516091UL, 30084856UL, 5205004UL,  11248684UL, 24396107UL, 32317470UL, 33518706UL, 29266972UL,
+    11625291UL, 9412759UL,  31712518UL, 31755258UL, 4358240UL,  5669697UL,  17240115UL, 17376612UL, 24761116UL,
+    31004011UL, 1295819UL,  11236417UL, 16710365UL, 979446UL,   5965377UL,  906727UL,   26145411UL, 18059094UL,
+    27924181UL, 4879626UL,  27435459UL, 27206989UL, 14409291UL, 32965410UL, 28719251UL, 30781375UL, 25105183UL,
+    9314266UL,  22752262UL, 22709032UL, 5971958UL,  1685121UL,  5551113UL,  3208331UL,  2047685UL,  18114704UL,
+    4207818UL,  24975261UL, 31610857UL, 31789353UL, 18321058UL, 27122605UL, 8415006UL,  7276864UL,  16881434UL,
+    29897730UL, 16002888UL, 1572811UL,  27553564UL, 22165058UL, 24526178UL, 6894910UL,  19465101UL, 31343441UL,
+    10441389UL, 6859892UL,  19749936UL, 12254740UL, 19929608UL, 2905506UL,  31901431UL, 2008582UL,  6734443UL,
+    26827242UL, 22182854UL, 30219824UL, 187567UL,   1729450UL,  20764406UL, 13243901UL, 2787907UL,  13615193UL,
+    14079563UL, 12123899UL, 12562199UL, 16568587UL, 4551641UL,  13210651UL, 16042372UL, 12239593UL, 29175772UL,
+    17841280UL, 22469317UL, 31324881UL, 25678482UL, 24707444UL, 32456056UL, 520528UL,
+};
+uint32_t rand_arr_26_b26_w32_arr[1024] = {
+    1957413UL,  635781UL,   49373242UL, 4591354UL,  4579438UL,  14217022UL, 6793661UL,  58087901UL, 36362516UL,
+    13891197UL, 4137292UL,  18699064UL, 66312566UL, 27571875UL, 814498UL,   47315914UL, 9299578UL,  29446460UL,
+    23265924UL, 34655870UL, 303275UL,   41468751UL, 52159218UL, 51363854UL, 53044527UL, 65063814UL, 67028381UL,
+    30968722UL, 29460408UL, 62640273UL, 4764780UL,  31644449UL, 57512345UL, 8951599UL,  39375528UL, 21190952UL,
+    32296337UL, 50742840UL, 62217365UL, 36446983UL, 13466209UL, 25752512UL, 55264875UL, 58695190UL, 42462438UL,
+    28682821UL, 63123619UL, 42048908UL, 57472958UL, 28681571UL, 34210834UL, 61041724UL, 38006063UL, 41416171UL,
+    57231576UL, 61915212UL, 55244058UL, 57453421UL, 41247383UL, 46818762UL, 37330545UL, 4247513UL,  33822086UL,
+    7726862UL,  23640014UL, 62889748UL, 16478743UL, 64862033UL, 31727437UL, 67053588UL, 45245965UL, 18497956UL,
+    58788047UL, 39437567UL, 16312357UL, 46645820UL, 20351312UL, 17526670UL, 58935119UL, 33373695UL, 53153750UL,
+    2082572UL,  12648730UL, 10333196UL, 54594186UL, 65384407UL, 35535895UL, 3845087UL,  59315069UL, 43061627UL,
+    2249538UL,  44978457UL, 63611334UL, 52933310UL, 25653232UL, 42296114UL, 32363884UL, 23095626UL, 41782963UL,
+    50729719UL, 57880861UL, 15398823UL, 56928828UL, 52394098UL, 40545567UL, 6067596UL,  40075131UL, 22329130UL,
+    44009746UL, 10550981UL, 42114512UL, 19553384UL, 42380110UL, 63738248UL, 8616783UL,  8744850UL,  35014565UL,
+    15996408UL, 18677797UL, 32241271UL, 37402522UL, 63173708UL, 41765379UL, 63529108UL, 55116861UL, 58858866UL,
+    35903706UL, 60843773UL, 11872265UL, 9593599UL,  18329443UL, 58557404UL, 55100194UL, 59250802UL, 53274657UL,
+    11495025UL, 2789821UL,  63119015UL, 21857432UL, 2126376UL,  26153578UL, 49051178UL, 11493630UL, 38765928UL,
+    47405746UL, 64604490UL, 40940271UL, 40752600UL, 53622521UL, 9952796UL,  40748041UL, 26263150UL, 5163323UL,
+    56239944UL, 48116273UL, 30097538UL, 5574313UL,  45848828UL, 25500719UL, 46316111UL, 9119563UL,  25763699UL,
+    55193615UL, 63757332UL, 14030233UL, 13021830UL, 4537144UL,  58520361UL, 50153191UL, 20863524UL, 6149112UL,
+    66101801UL, 12727522UL, 25254067UL, 38829044UL, 29394691UL, 16102730UL, 54747231UL, 30215227UL, 11024280UL,
+    49239862UL, 44049632UL, 31149678UL, 12224121UL, 8720145UL,  9277014UL,  12419985UL, 55026537UL, 50816601UL,
+    41586071UL, 47737581UL, 10527396UL, 45636996UL, 39699399UL, 377268UL,   53157618UL, 691474UL,   24018430UL,
+    26878672UL, 4410977UL,  5002125UL,  27076387UL, 63376783UL, 7208649UL,  62004191UL, 48627505UL, 66335102UL,
+    60116195UL, 45891561UL, 8092268UL,  22847963UL, 40160611UL, 27781242UL, 11605509UL, 35207795UL, 22646992UL,
+    28895813UL, 7738377UL,  67060371UL, 38723051UL, 62847210UL, 47008461UL, 16171562UL, 7849684UL,  3632183UL,
+    19060410UL, 60652964UL, 32504629UL, 12452323UL, 28609360UL, 7550017UL,  10181666UL, 53482461UL, 46212332UL,
+    12914001UL, 35456423UL, 19544344UL, 35061498UL, 20981720UL, 28184304UL, 29345469UL, 16296453UL, 11333203UL,
+    30686335UL, 48954822UL, 22393716UL, 14479172UL, 55457742UL, 61283046UL, 61281286UL, 48980459UL, 62004187UL,
+    35244001UL, 24665437UL, 65820094UL, 22518134UL, 10997373UL, 48930165UL, 56871701UL, 23141460UL, 53868401UL,
+    66161937UL, 55467568UL, 66551955UL, 47625948UL, 15602479UL, 28433684UL, 14329574UL, 22172716UL, 53774143UL,
+    26008837UL, 10483850UL, 21472867UL, 55768823UL, 30813220UL, 13920898UL, 11367279UL, 12989266UL, 21057929UL,
+    41706684UL, 66392755UL, 50858871UL, 48472567UL, 26028718UL, 36285597UL, 51268791UL, 48312819UL, 50276950UL,
+    30667036UL, 63517170UL, 60209662UL, 43217945UL, 19491860UL, 34338808UL, 51196200UL, 49702876UL, 58528229UL,
+    36022144UL, 45770818UL, 51499199UL, 1301309UL,  64436885UL, 19782631UL, 28898184UL, 14121378UL, 32386372UL,
+    46389085UL, 59654657UL, 12248951UL, 629799UL,   1081503UL,  32852137UL, 8902188UL,  38715509UL, 28394983UL,
+    29981907UL, 22767933UL, 6990313UL,  10460339UL, 11533611UL, 14938330UL, 31053542UL, 27024349UL, 49721916UL,
+    5334356UL,  56197605UL, 7213574UL,  17264135UL, 4254033UL,  29442942UL, 39962674UL, 7968932UL,  60868227UL,
+    4078716UL,  28102921UL, 65116165UL, 9139146UL,  20505824UL, 63628298UL, 48327431UL, 40617046UL, 31748409UL,
+    45797841UL, 66689525UL, 35209626UL, 62286377UL, 24749339UL, 4727485UL,  21526814UL, 11571733UL, 12206568UL,
+    12640187UL, 38055753UL, 40830868UL, 34718734UL, 32742604UL, 31052373UL, 36189785UL, 9992775UL,  28236901UL,
+    25223305UL, 66294922UL, 55534092UL, 11266581UL, 20577092UL, 7070271UL,  66486416UL, 11566014UL, 45796665UL,
+    44546726UL, 1065959UL,  51358141UL, 60782530UL, 20761249UL, 13688470UL, 29695617UL, 2788647UL,  49490667UL,
+    64534871UL, 25332918UL, 41095293UL, 5867476UL,  28994961UL, 7600484UL,  51552479UL, 35184389UL, 1645008UL,
+    36021116UL, 37087868UL, 56500556UL, 56684408UL, 42957317UL, 35789533UL, 3273565UL,  3326133UL,  66976106UL,
+    19313762UL, 2728092UL,  27395578UL, 29137448UL, 2029835UL,  62532949UL, 37289466UL, 63960226UL, 19452337UL,
+    53408546UL, 3924058UL,  20041034UL, 50859607UL, 40692845UL, 10651150UL, 27496944UL, 40240590UL, 46047245UL,
+    3389588UL,  14929407UL, 59061797UL, 21014402UL, 30175872UL, 22335914UL, 35704176UL, 22116309UL, 51111345UL,
+    4853684UL,  2238839UL,  26243934UL, 49871280UL, 63183256UL, 61095422UL, 38127681UL, 44865568UL, 32442631UL,
+    59510776UL, 53391183UL, 20912779UL, 19985246UL, 6222285UL,  28433830UL, 18061780UL, 12413656UL, 27335885UL,
+    24412728UL, 903725UL,   5935544UL,  5305532UL,  8763156UL,  10651395UL, 16913371UL, 259407UL,   25296086UL,
+    4485460UL,  41858117UL, 36990055UL, 30581021UL, 18978572UL, 63380141UL, 19954244UL, 22829236UL, 35896655UL,
+    119231UL,   10015725UL, 41007207UL, 18052219UL, 49040064UL, 44662084UL, 46431644UL, 26115596UL, 14011028UL,
+    5282585UL,  17057160UL, 49361872UL, 66599283UL, 34594480UL, 46796122UL, 45521984UL, 7658354UL,  32798529UL,
+    29453746UL, 11305098UL, 14467873UL, 66672372UL, 59618557UL, 43131087UL, 32994824UL, 51859535UL, 66028362UL,
+    10127401UL, 13197001UL, 14526251UL, 55545418UL, 16888244UL, 25114046UL, 49838397UL, 268533UL,   15871665UL,
+    46508485UL, 37345375UL, 52044212UL, 47116627UL, 29745481UL, 62836717UL, 55978759UL, 42315526UL, 36234921UL,
+    62550692UL, 25848864UL, 27994838UL, 55975912UL, 26758077UL, 33915369UL, 3353537UL,  8251253UL,  53891087UL,
+    60064113UL, 53525283UL, 24385969UL, 21565417UL, 23002259UL, 43517062UL, 55700551UL, 10506706UL, 17595972UL,
+    4861426UL,  24356764UL, 29144789UL, 9594766UL,  37480517UL, 6307988UL,  350797UL,   7487899UL,  19012079UL,
+    46197314UL, 59024984UL, 43523219UL, 5263360UL,  46571976UL, 39088020UL, 50382851UL, 2878777UL,  10837620UL,
+    37871476UL, 18815001UL, 58844149UL, 18182223UL, 31585334UL, 56319051UL, 51128141UL, 25694117UL, 56116304UL,
+    14392255UL, 49665801UL, 65243081UL, 41924680UL, 42312197UL, 5764042UL,  30430922UL, 37157977UL, 59878944UL,
+    58644158UL, 13038433UL, 57737908UL, 1142210UL,  55941957UL, 12788607UL, 23183726UL, 18878674UL, 13911933UL,
+    27476744UL, 1183831UL,  38702636UL, 13396319UL, 54810452UL, 31372183UL, 58776246UL, 43641800UL, 7503285UL,
+    41412395UL, 39768122UL, 11637916UL, 37788400UL, 10050552UL, 62563283UL, 22529676UL, 49867454UL, 33977905UL,
+    18090751UL, 53633572UL, 18649690UL, 25982545UL, 66013788UL, 46536887UL, 37582316UL, 63249002UL, 35946800UL,
+    55600462UL, 63376975UL, 59440174UL, 44013572UL, 31666408UL, 30653174UL, 18310649UL, 53851958UL, 22643358UL,
+    7383037UL,  56644995UL, 48691093UL, 8928589UL,  39320382UL, 55884385UL, 40532527UL, 65970421UL, 18745314UL,
+    52265037UL, 8689677UL,  22205616UL, 46258112UL, 48386659UL, 52382081UL, 1935908UL,  58012942UL, 52652097UL,
+    66896786UL, 47060066UL, 3040000UL,  1734517UL,  8675248UL,  47510853UL, 8500292UL,  7574981UL,  37974002UL,
+    12948258UL, 60259568UL, 52885751UL, 61602621UL, 44642950UL, 40922506UL, 19961914UL, 16819111UL, 50606577UL,
+    44183010UL, 52757134UL, 27125734UL, 9495068UL,  33862978UL, 2958931UL,  24584532UL, 48253746UL, 14560685UL,
+    29962252UL, 18552054UL, 39184230UL, 34690376UL, 31724095UL, 51477095UL, 11320168UL, 45447805UL, 1538531UL,
+    22658148UL, 28501612UL, 56418713UL, 45432976UL, 61573158UL, 35915965UL, 62924643UL, 61835184UL, 66037345UL,
+    45574681UL, 45555753UL, 49401855UL, 3640604UL,  9437704UL,  35325567UL, 58321672UL, 10130775UL, 2483321UL,
+    51889650UL, 25223304UL, 335367UL,   39884627UL, 66155254UL, 64417239UL, 66611673UL, 601989UL,   5878726UL,
+    35913677UL, 57040158UL, 44829119UL, 14510356UL, 50146286UL, 52393406UL, 16509093UL, 25093212UL, 14380779UL,
+    20811377UL, 57766462UL, 63217002UL, 5290909UL,  17165473UL, 38156729UL, 24838023UL, 32055281UL, 28725776UL,
+    27546999UL, 1924816UL,  21016835UL, 26003542UL, 44407245UL, 27565064UL, 27863746UL, 55062807UL, 4455451UL,
+    34326551UL, 25148145UL, 58510994UL, 38330820UL, 16109064UL, 25642320UL, 2788663UL,  24219776UL, 49160771UL,
+    12311648UL, 1732781UL,  41797270UL, 9270469UL,  56463563UL, 31673609UL, 57040699UL, 62387117UL, 40935906UL,
+    50624274UL, 2602071UL,  25752656UL, 61071823UL, 22243091UL, 58180225UL, 27400290UL, 14107908UL, 57074706UL,
+    40701053UL, 59236578UL, 64602402UL, 10986436UL, 24917826UL, 40872540UL, 5913498UL,  37778507UL, 35778861UL,
+    6440192UL,  32477364UL, 22360364UL, 31529494UL, 48107759UL, 7108955UL,  25451992UL, 54569725UL, 3651800UL,
+    9525725UL,  52738671UL, 25365771UL, 18183123UL, 53606866UL, 34896340UL, 12739788UL, 65483273UL, 57037618UL,
+    28388582UL, 19169096UL, 38280241UL, 45850318UL, 29672079UL, 54725291UL, 3807567UL,  25699447UL, 34735587UL,
+    15032092UL, 36517928UL, 46845425UL, 6750175UL,  24001318UL, 10977940UL, 36536672UL, 16943150UL, 25516432UL,
+    12164318UL, 9026175UL,  62252098UL, 16809795UL, 33622621UL, 55334478UL, 10933695UL, 53705756UL, 62176640UL,
+    63730106UL, 37209969UL, 36222978UL, 49590644UL, 9033250UL,  11204691UL, 25959583UL, 45591413UL, 12099529UL,
+    2111989UL,  2055877UL,  9227698UL,  15592820UL, 1300255UL,  33805130UL, 64557236UL, 2018656UL,  16272955UL,
+    60353772UL, 14746981UL, 28850146UL, 3491739UL,  48256727UL, 19987371UL, 53883025UL, 12237394UL, 34172171UL,
+    58262299UL, 56403535UL, 39927257UL, 42283381UL, 18387008UL, 46505477UL, 3429571UL,  53751396UL, 31551631UL,
+    25355068UL, 61038212UL, 29814775UL, 21628015UL, 14512659UL, 29825230UL, 38679879UL, 22983132UL, 55212307UL,
+    14135283UL, 65615182UL, 55879256UL, 63543316UL, 6871278UL,  28672306UL, 25893871UL, 34732552UL, 9895544UL,
+    35067633UL, 20660941UL, 17682631UL, 25695494UL, 31556740UL, 44897854UL, 11580354UL, 34941630UL, 34033520UL,
+    7095292UL,  49715318UL, 38684354UL, 57121208UL, 12137640UL, 22538619UL, 56672562UL, 31738465UL, 19148820UL,
+    33325457UL, 38582794UL, 65313907UL, 23416531UL, 35308257UL, 30972211UL, 11825398UL, 39123482UL, 63049003UL,
+    11141101UL, 56043216UL, 31167806UL, 8518224UL,  55544006UL, 46425529UL, 57151537UL, 50316680UL, 23445705UL,
+    53590623UL, 2916133UL,  12856419UL, 27298363UL, 63800139UL, 34400443UL, 33185740UL, 16226947UL, 43906140UL,
+    49138331UL, 39373171UL, 34965218UL, 62979746UL, 21693499UL, 32423579UL, 11935627UL, 56390007UL, 56483046UL,
+    37460858UL, 59796635UL, 12260392UL, 31852598UL, 12099029UL, 51113358UL, 49812993UL, 23657139UL, 1187178UL,
+    16657455UL, 9078054UL,  36988060UL, 59425056UL, 6074825UL,  39713448UL, 17065837UL, 28703102UL, 20298822UL,
+    59727941UL, 18886155UL, 16565910UL, 10423092UL, 58054972UL, 25079352UL, 61135129UL, 57294084UL, 33128498UL,
+    22811367UL, 15464748UL, 10972241UL, 45225866UL, 26687754UL, 8717476UL,  1528461UL,  61435254UL, 49553158UL,
+    43429029UL, 52482336UL, 5722452UL,  5172649UL,  62205261UL, 49310247UL, 29167705UL, 20455151UL, 10913501UL,
+    57650858UL, 41999226UL, 59927181UL, 20210638UL, 50881861UL, 8431169UL,  18252955UL, 19977928UL, 24126426UL,
+    12105520UL, 29466349UL, 39888276UL, 15548512UL, 28189047UL, 47875185UL, 64276375UL, 5621570UL,  312501UL,
+    31836489UL, 14002280UL, 42532107UL, 15373194UL, 61766626UL, 63059807UL, 58516609UL, 41471256UL, 54084322UL,
+    53531118UL, 19008511UL, 34429453UL, 38021204UL, 7904452UL,  54142867UL, 38590129UL, 56075063UL, 3829119UL,
+    36659029UL, 3373021UL,  61763137UL, 56225118UL, 10608110UL, 16822364UL, 1842122UL,  53943834UL, 41974173UL,
+    8016112UL,  27382825UL, 28483525UL, 52773583UL, 49753668UL, 63830135UL, 5494255UL,  60949156UL, 27729490UL,
+    23554875UL, 37673665UL, 22154082UL, 51302170UL, 55141985UL, 19597563UL, 29596614UL, 59253463UL, 63526227UL,
+    51229467UL, 57300066UL, 20357153UL, 30730002UL, 33102779UL, 63877265UL, 34321871UL, 34140567UL, 40675236UL,
+    29446127UL, 30953472UL, 53207926UL, 59688678UL, 35878188UL, 30271950UL, 22674402UL,
+};
+uint32_t rand_arr_27_b27_w32_arr[1024] = {
+    118921869UL, 43826944UL,  29253024UL,  105969257UL, 103409663UL, 105111727UL, 61160869UL,  32956159UL,  7908978UL,
+    81888362UL,  29353972UL,  82578341UL,  28149230UL,  31369712UL,  125481670UL, 54434325UL,  89844203UL,  43403077UL,
+    30427756UL,  125733497UL, 2581207UL,   96661462UL,  29766512UL,  100415157UL, 45363120UL,  120655532UL, 52510791UL,
+    23636618UL,  49914096UL,  61733464UL,  51466814UL,  13582622UL,  39046724UL,  125652825UL, 37889078UL,  94747792UL,
+    113937511UL, 63691680UL,  52695970UL,  45210986UL,  39606585UL,  46767067UL,  73724160UL,  73343633UL,  17800210UL,
+    125935100UL, 66648948UL,  98132609UL,  67595427UL,  108841335UL, 81475748UL,  114433302UL, 133071953UL, 4180528UL,
+    116843021UL, 27288148UL,  33931800UL,  75374358UL,  121808134UL, 56042062UL,  116024677UL, 88672932UL,  88650657UL,
+    35588758UL,  64089487UL,  61356336UL,  36213305UL,  127937624UL, 116098597UL, 98430702UL,  121665605UL, 6687271UL,
+    105083000UL, 18665779UL,  74908481UL,  2566149UL,   93554088UL,  70939662UL,  29326378UL,  19684365UL,  33632185UL,
+    3474459UL,   13911379UL,  122097073UL, 120953749UL, 116967925UL, 56749940UL,  71603651UL,  77273819UL,  35017697UL,
+    110259069UL, 52627176UL,  79140814UL,  120626110UL, 33613817UL,  66125189UL,  81021544UL,  59727012UL,  57872671UL,
+    40849979UL,  59433951UL,  25804390UL,  129495428UL, 23029202UL,  65244079UL,  16922780UL,  78324387UL,  86746657UL,
+    66385057UL,  18371251UL,  99677287UL,  97608484UL,  74845549UL,  83727609UL,  118283398UL, 85352981UL,  54897229UL,
+    32970369UL,  78699982UL,  94565365UL,  112291154UL, 38183666UL,  21551667UL,  9423599UL,   102784684UL, 50311771UL,
+    2189803UL,   20282723UL,  130013559UL, 2758009UL,   76421116UL,  61143570UL,  21899453UL,  32525252UL,  48692592UL,
+    96176068UL,  92875894UL,  106600627UL, 89437309UL,  131943322UL, 72577491UL,  74720108UL,  62089879UL,  105282079UL,
+    114504821UL, 46193373UL,  81172766UL,  99510835UL,  100520022UL, 77593386UL,  124845330UL, 119201270UL, 45114574UL,
+    82247140UL,  66302739UL,  85162737UL,  23606617UL,  8282568UL,   65300219UL,  86202685UL,  98752745UL,  128091854UL,
+    89153521UL,  57408847UL,  112650299UL, 107374548UL, 92635080UL,  45375246UL,  17281516UL,  123929370UL, 61942771UL,
+    21469741UL,  84502090UL,  132572061UL, 24174542UL,  103475603UL, 17456203UL,  107706431UL, 15555218UL,  40902174UL,
+    96569552UL,  90161696UL,  108047388UL, 34829724UL,  41040722UL,  118838661UL, 113736605UL, 60241038UL,  6616112UL,
+    10356146UL,  83153671UL,  26734233UL,  71843035UL,  32861264UL,  104770172UL, 60547416UL,  113380825UL, 50195679UL,
+    97186900UL,  118385261UL, 118316427UL, 49575756UL,  344195UL,    92014643UL,  31715963UL,  33069387UL,  39577309UL,
+    95618640UL,  119310365UL, 24626714UL,  13756561UL,  21964393UL,  22979085UL,  40322982UL,  124905475UL, 127534097UL,
+    121465673UL, 92980921UL,  101820500UL, 12095743UL,  7417953UL,   75557393UL,  66925518UL,  122546990UL, 129573490UL,
+    41494746UL,  73289619UL,  56430532UL,  60759976UL,  60000176UL,  63379283UL,  24416922UL,  106180821UL, 56181310UL,
+    88559471UL,  46630287UL,  48987523UL,  101950989UL, 106163722UL, 65037466UL,  105524790UL, 69058249UL,  119252565UL,
+    3280995UL,   93933157UL,  122601479UL, 64300747UL,  50550754UL,  31847847UL,  85656886UL,  54730360UL,  53364854UL,
+    34997664UL,  64052122UL,  85596232UL,  16061531UL,  79919063UL,  121957095UL, 44059365UL,  83004049UL,  118121706UL,
+    22200196UL,  127608034UL, 40571992UL,  6724853UL,   26360955UL,  1075648UL,   94364142UL,  12089264UL,  133521610UL,
+    53570235UL,  8153876UL,   98785687UL,  54863619UL,  113445402UL, 131431833UL, 78203660UL,  122624232UL, 53061422UL,
+    104681519UL, 110803254UL, 125423149UL, 92862884UL,  77424829UL,  37585283UL,  27662411UL,  11111636UL,  127823875UL,
+    65712445UL,  104891892UL, 39121293UL,  122385942UL, 25292238UL,  18460693UL,  96405090UL,  97182316UL,  102128368UL,
+    8136260UL,   106914676UL, 20187010UL,  32200889UL,  13170006UL,  47221315UL,  90631052UL,  15339181UL,  133710203UL,
+    47601948UL,  122633326UL, 19130583UL,  79706585UL,  12740387UL,  14298442UL,  47279194UL,  102794616UL, 54804734UL,
+    78311626UL,  101645308UL, 103886983UL, 127851537UL, 117540909UL, 105842018UL, 90879596UL,  37147420UL,  32458676UL,
+    36757850UL,  60751882UL,  2094945UL,   83046431UL,  21583503UL,  109154015UL, 95059509UL,  107988677UL, 93402391UL,
+    72829934UL,  133333299UL, 110589832UL, 14661259UL,  17732688UL,  70714014UL,  16221843UL,  88327643UL,  38213541UL,
+    41378534UL,  49435628UL,  16284951UL,  8372340UL,   110381448UL, 93921563UL,  123798022UL, 41151869UL,  62778843UL,
+    86298829UL,  130529126UL, 31608824UL,  109842390UL, 85463203UL,  15234211UL,  69522935UL,  22651411UL,  36496799UL,
+    87202569UL,  109067383UL, 42443967UL,  119093565UL, 41436117UL,  23821591UL,  16822054UL,  25927945UL,  108665405UL,
+    54658998UL,  42519173UL,  64449545UL,  61443172UL,  8937992UL,   30673680UL,  9749249UL,   41128145UL,  130809323UL,
+    103695818UL, 71054404UL,  94345528UL,  106058530UL, 112781671UL, 71183863UL,  19169399UL,  45256995UL,  22777588UL,
+    93162731UL,  126979742UL, 22658285UL,  120148724UL, 38031114UL,  115764612UL, 35125576UL,  15871634UL,  113881241UL,
+    6888643UL,   9280298UL,   41625532UL,  121879867UL, 128884336UL, 47126307UL,  1213740UL,   65439199UL,  91681263UL,
+    68452714UL,  106363313UL, 109065859UL, 81981098UL,  110311368UL, 128062031UL, 65925397UL,  82844066UL,  2334723UL,
+    105769595UL, 73443145UL,  119754055UL, 113369057UL, 104044759UL, 78228891UL,  30947568UL,  130658555UL, 46410887UL,
+    104765074UL, 83813407UL,  80429879UL,  68188019UL,  53763581UL,  17196116UL,  90266618UL,  78825599UL,  89375923UL,
+    85522978UL,  97982694UL,  119040123UL, 46823521UL,  125983877UL, 37892069UL,  54051517UL,  104748697UL, 9769824UL,
+    115422576UL, 120880213UL, 115571356UL, 40063922UL,  61844751UL,  6253677UL,   57302157UL,  57064010UL,  35322503UL,
+    62341291UL,  127907121UL, 31745316UL,  5700044UL,   66388033UL,  32436049UL,  60219716UL,  54010213UL,  122827537UL,
+    43788975UL,  5288502UL,   54688542UL,  76797671UL,  25202779UL,  94651253UL,  45483099UL,  14918620UL,  93992809UL,
+    70378759UL,  85368519UL,  69648043UL,  59964694UL,  128396282UL, 77053300UL,  48819130UL,  67961125UL,  60952395UL,
+    120582691UL, 53737202UL,  48129411UL,  26748274UL,  37579422UL,  104408590UL, 112737085UL, 1059307UL,   51673360UL,
+    18138486UL,  44313459UL,  118958437UL, 15335083UL,  99979347UL,  59926095UL,  87152218UL,  89980587UL,  36099213UL,
+    53565620UL,  33126955UL,  110586644UL, 64999265UL,  42824152UL,  50114418UL,  12045061UL,  64755314UL,  64549131UL,
+    24813319UL,  95378460UL,  109866152UL, 128442171UL, 28017577UL,  77117096UL,  85816228UL,  33343743UL,  122508673UL,
+    26872841UL,  82146071UL,  24518345UL,  80529204UL,  42642294UL,  46078442UL,  58126932UL,  23130772UL,  20609474UL,
+    55105123UL,  75547939UL,  94736170UL,  3957158UL,   29864936UL,  55836045UL,  15460678UL,  45414872UL,  52847061UL,
+    98418859UL,  116573555UL, 33491168UL,  107570648UL, 62458937UL,  13559804UL,  117664156UL, 53472993UL,  116705139UL,
+    127588523UL, 130042787UL, 73099349UL,  70242606UL,  94627514UL,  111980641UL, 46936817UL,  110490001UL, 56832203UL,
+    13075555UL,  85098515UL,  41121162UL,  28156230UL,  126200759UL, 8717361UL,   89591656UL,  71249871UL,  104901235UL,
+    14169741UL,  73917260UL,  35961089UL,  88656359UL,  11669721UL,  35687023UL,  83109513UL,  131110741UL, 7490207UL,
+    28436672UL,  75536484UL,  78410584UL,  61111417UL,  31063357UL,  107082220UL, 133025816UL, 121058329UL, 64421847UL,
+    44722418UL,  54641390UL,  77693800UL,  115332191UL, 31394981UL,  88082138UL,  71864121UL,  101295737UL, 10231199UL,
+    10293174UL,  107676207UL, 114250085UL, 96892477UL,  85862131UL,  41035240UL,  42586702UL,  58121398UL,  78557084UL,
+    73835612UL,  90464299UL,  133918392UL, 56861734UL,  37027705UL,  63923153UL,  114045923UL, 95498996UL,  64345536UL,
+    70835954UL,  2148941UL,   53209844UL,  100645145UL, 85408832UL,  89691738UL,  56796311UL,  109115867UL, 37154640UL,
+    71034471UL,  131493404UL, 72720232UL,  88812819UL,  86301125UL,  79619779UL,  132120691UL, 69968318UL,  21397084UL,
+    66546374UL,  51638271UL,  22590607UL,  68235960UL,  41831452UL,  80472783UL,  79180995UL,  85152299UL,  966785UL,
+    123396671UL, 128230121UL, 106296049UL, 126878151UL, 6549374UL,   13518093UL,  75360888UL,  22610291UL,  50622150UL,
+    55871357UL,  119757375UL, 121062693UL, 24046161UL,  85948833UL,  14723258UL,  112117822UL, 35560441UL,  31507742UL,
+    112207383UL, 89239631UL,  64785420UL,  39421493UL,  32577480UL,  93360740UL,  33448209UL,  130155789UL, 78582880UL,
+    29580016UL,  115788260UL, 102755407UL, 37618709UL,  81274232UL,  86388433UL,  54072286UL,  116054033UL, 101513656UL,
+    57577482UL,  121887613UL, 43489349UL,  92054434UL,  133221735UL, 50805556UL,  41373067UL,  20981477UL,  51940558UL,
+    18486190UL,  66862081UL,  122171078UL, 93870368UL,  125194116UL, 78834495UL,  29365669UL,  13845053UL,  25813322UL,
+    128824379UL, 16016847UL,  105887825UL, 92113290UL,  106349204UL, 55670434UL,  104441677UL, 9305496UL,   71772143UL,
+    40411226UL,  41553962UL,  52965789UL,  45682998UL,  108049050UL, 10660568UL,  62434589UL,  59324573UL,  30225079UL,
+    27502701UL,  24842793UL,  36775533UL,  64038361UL,  31243131UL,  39906342UL,  35255640UL,  117787677UL, 13099337UL,
+    25173531UL,  46696667UL,  76248249UL,  119834657UL, 118045403UL, 28813899UL,  82349890UL,  15856837UL,  66516613UL,
+    115828876UL, 119743941UL, 104372616UL, 63266898UL,  108887074UL, 4602495UL,   95818772UL,  106472634UL, 81785811UL,
+    84400846UL,  95594080UL,  80881871UL,  49443441UL,  79643629UL,  64454157UL,  118725683UL, 61789102UL,  5737748UL,
+    115441428UL, 68294042UL,  20531925UL,  47893871UL,  32398153UL,  7244751UL,   2446724UL,   55267909UL,  122076048UL,
+    103683291UL, 54407103UL,  88273622UL,  93720923UL,  35192327UL,  22703106UL,  74713865UL,  91724471UL,  53315434UL,
+    62160484UL,  39744131UL,  8079641UL,   50207472UL,  11505006UL,  94262681UL,  10669353UL,  124678429UL, 61546688UL,
+    62163262UL,  71340469UL,  97535261UL,  74515097UL,  30836421UL,  106840498UL, 63928913UL,  45248648UL,  55503058UL,
+    41625620UL,  29287535UL,  25747476UL,  125625233UL, 3662249UL,   53924749UL,  93578035UL,  75155751UL,  70632684UL,
+    70421070UL,  46522081UL,  94539591UL,  6851137UL,   56509468UL,  30630317UL,  65891610UL,  70341470UL,  15598513UL,
+    1167180UL,   45307242UL,  9279383UL,   134064447UL, 33354520UL,  58848025UL,  22833778UL,  45574715UL,  78622046UL,
+    54216066UL,  34680931UL,  67292584UL,  12174541UL,  63607926UL,  33002419UL,  76414264UL,  117656264UL, 10385426UL,
+    103614121UL, 93408420UL,  77786387UL,  67352228UL,  58998083UL,  121758902UL, 25218056UL,  11253731UL,  72861834UL,
+    7518123UL,   65110389UL,  46094065UL,  56129495UL,  55718544UL,  18018732UL,  105264780UL, 45265980UL,  60792304UL,
+    61349770UL,  59350042UL,  16865066UL,  77339657UL,  78958450UL,  18228251UL,  118862807UL, 77012364UL,  30477420UL,
+    75748984UL,  131707934UL, 69897262UL,  64938395UL,  87945968UL,  19651849UL,  77183740UL,  63448418UL,  47331527UL,
+    124401916UL, 35082988UL,  26095858UL,  94275610UL,  126977488UL, 105393999UL, 112920837UL, 114898840UL, 72166489UL,
+    41390684UL,  100678520UL, 69422706UL,  66437830UL,  17027446UL,  64540755UL,  14684957UL,  92129270UL,  60882269UL,
+    126210863UL, 89967355UL,  35698173UL,  24534219UL,  80434454UL,  1608753UL,   78902265UL,  75143027UL,  125578710UL,
+    123611479UL, 78796020UL,  131843519UL, 61624263UL,  59774104UL,  12965165UL,  88896862UL,  132061254UL, 26572617UL,
+    108545009UL, 16508131UL,  120061065UL, 11390064UL,  35217451UL,  53032964UL,  11650565UL,  97887011UL,  105144707UL,
+    71702103UL,  75595494UL,  44389543UL,  121189294UL, 84479803UL,  59326456UL,  7814517UL,   79642728UL,  72886677UL,
+    22243774UL,  52441498UL,  27583705UL,  96730952UL,  107337910UL, 128572519UL, 102928816UL, 28516191UL,  64854197UL,
+    55000847UL,  133545857UL, 105239624UL, 122591031UL, 131437883UL, 35054516UL,  83388902UL,  40855682UL,  51136226UL,
+    100255426UL, 6655135UL,   105625149UL, 87365728UL,  79175770UL,  123648874UL, 19662711UL,  60948135UL,  26216635UL,
+    52313655UL,  91497529UL,  49944737UL,  74330462UL,  131684496UL, 111526810UL, 27568398UL,  125288947UL, 55568222UL,
+    30067024UL,  43445028UL,  83699772UL,  36333342UL,  111136345UL, 50962647UL,  89671121UL,  35352724UL,  80459125UL,
+    95851798UL,  11118423UL,  44220246UL,  1165717UL,   10212145UL,  81547111UL,  31966702UL,  132831307UL, 85165816UL,
+    51612493UL,  27569906UL,  96008924UL,  87088329UL,  68540576UL,  114162848UL, 15190324UL,  97836966UL,  112221324UL,
+    47641207UL,  128478809UL, 78797949UL,  53994995UL,  70784865UL,  47925310UL,  83444720UL,  20564192UL,  45410159UL,
+    11768875UL,  104861001UL, 60631799UL,  1687029UL,   65584530UL,  114978782UL, 60766296UL,  37428446UL,  92451467UL,
+    3550245UL,   48870807UL,  92441253UL,  25875829UL,  40515638UL,  36986225UL,  131983697UL, 28413906UL,  97844707UL,
+    35885595UL,  89021130UL,  124865726UL, 112266096UL, 14024620UL,  95007114UL,  92693444UL,  13442284UL,  132559591UL,
+    12364395UL,  52878734UL,  69272682UL,  122778427UL, 67786673UL,  24960330UL,  96407661UL,  58097765UL,  116058492UL,
+    281395UL,    113443462UL, 126701157UL, 90199231UL,  73489318UL,  18863781UL,  131565599UL, 41216946UL,  73375194UL,
+    128591691UL, 23129635UL,  54180484UL,  54080228UL,  94989541UL,  94053655UL,  113944922UL,
+};
+uint32_t rand_arr_28_b28_w32_arr[1024] = {
+    91382563UL,  17405797UL,  152141391UL, 172807831UL, 27356511UL,  242210864UL, 5689526UL,   38783690UL,  159880704UL,
+    90666658UL,  28917423UL,  177251879UL, 249431042UL, 176111207UL, 240442505UL, 58715382UL,  264765558UL, 104302170UL,
+    96504013UL,  210754856UL, 206969166UL, 208077324UL, 155692283UL, 47113357UL,  83094796UL,  106752570UL, 17754531UL,
+    214858030UL, 68216004UL,  260009325UL, 105701343UL, 246866944UL, 61465413UL,  115884631UL, 163733102UL, 165612641UL,
+    40351876UL,  215189222UL, 96506725UL,  59582743UL,  234483547UL, 26291509UL,  9722969UL,   202009261UL, 20212434UL,
+    98185930UL,  219788848UL, 140390497UL, 5138224UL,   226852982UL, 212192625UL, 21848805UL,  125514029UL, 70791878UL,
+    10771196UL,  190951659UL, 56768510UL,  19642318UL,  17226642UL,  107292674UL, 208376701UL, 188194976UL, 225125525UL,
+    6711114UL,   62309141UL,  137573294UL, 179271510UL, 107276935UL, 140093121UL, 267496169UL, 268192033UL, 151628957UL,
+    39070507UL,  231924374UL, 182620240UL, 114621064UL, 121526143UL, 161450169UL, 32259973UL,  57699268UL,  208142650UL,
+    165486791UL, 167367031UL, 216645182UL, 31552828UL,  162972161UL, 230693UL,    265370768UL, 124098599UL, 249157327UL,
+    118640950UL, 140216326UL, 209653342UL, 245581265UL, 227820420UL, 209839161UL, 196727518UL, 143227436UL, 46324389UL,
+    81135901UL,  17914104UL,  87534298UL,  142808139UL, 225672009UL, 267849147UL, 261304854UL, 53718127UL,  244940026UL,
+    230034165UL, 149218102UL, 21154215UL,  67753673UL,  92313470UL,  170763865UL, 75572767UL,  243591651UL, 147234933UL,
+    106708605UL, 215901899UL, 205302000UL, 236031771UL, 94668921UL,  145592061UL, 121420815UL, 120482216UL, 28792984UL,
+    33035205UL,  5362436UL,   141305200UL, 112326406UL, 129639618UL, 244217976UL, 89424236UL,  241624084UL, 29253052UL,
+    182764692UL, 168348740UL, 188152742UL, 183184494UL, 223171535UL, 58539531UL,  144001209UL, 63468095UL,  212572748UL,
+    20883962UL,  233081633UL, 72429073UL,  142730784UL, 93352849UL,  198902686UL, 58673524UL,  169636941UL, 173342133UL,
+    110323277UL, 160190889UL, 200929037UL, 87230043UL,  184027550UL, 40476532UL,  6383101UL,   172974512UL, 78856790UL,
+    217444197UL, 98481365UL,  155646350UL, 169774744UL, 119146723UL, 159897946UL, 62395627UL,  31581892UL,  4884819UL,
+    88386933UL,  230611860UL, 102771213UL, 178647370UL, 190159314UL, 24567516UL,  45855883UL,  224790870UL, 6318082UL,
+    61050602UL,  146897132UL, 152926937UL, 244253333UL, 222434384UL, 95868186UL,  223376669UL, 88746084UL,  260483156UL,
+    85102796UL,  122483754UL, 193684605UL, 243517347UL, 144831135UL, 64583091UL,  8180880UL,   121191053UL, 118993683UL,
+    143184797UL, 255109515UL, 63311582UL,  165962156UL, 59358598UL,  38769305UL,  14411724UL,  267595544UL, 165564636UL,
+    223902245UL, 44839797UL,  21541269UL,  201492933UL, 250660384UL, 189505687UL, 219641060UL, 42825716UL,  156538096UL,
+    179835859UL, 103532045UL, 212203467UL, 218742020UL, 173498912UL, 265299885UL, 152450268UL, 57945193UL,  147555891UL,
+    6997961UL,   17345974UL,  145897912UL, 51051371UL,  14609165UL,  69821982UL,  54455500UL,  152819782UL, 117235758UL,
+    51967985UL,  234643441UL, 163062892UL, 103338176UL, 108508838UL, 190500888UL, 28713765UL,  234549548UL, 113124476UL,
+    6608199UL,   197276002UL, 120441180UL, 63313820UL,  25552977UL,  77745150UL,  265595176UL, 155480412UL, 41108492UL,
+    31021011UL,  113650739UL, 249865869UL, 132449442UL, 44316319UL,  224431837UL, 11993408UL,  167813881UL, 171830564UL,
+    36840790UL,  58428864UL,  137943844UL, 3614772UL,   4906254UL,   69753819UL,  171083281UL, 20121192UL,  175831217UL,
+    221945560UL, 147737074UL, 257407312UL, 141815602UL, 265934384UL, 141278855UL, 94392887UL,  13772082UL,  142084721UL,
+    124324078UL, 65649527UL,  99557427UL,  140035357UL, 3453167UL,   169102567UL, 97074483UL,  36655901UL,  90537915UL,
+    124131714UL, 91016347UL,  179391892UL, 201605899UL, 96619846UL,  80356915UL,  135478149UL, 58724031UL,  165550440UL,
+    56590521UL,  25042927UL,  230590526UL, 200133837UL, 11437918UL,  234628337UL, 204308878UL, 175507178UL, 225655994UL,
+    182730835UL, 220779444UL, 20781435UL,  48109254UL,  21740473UL,  23628707UL,  165318429UL, 230091236UL, 55918688UL,
+    251699976UL, 112186278UL, 26776313UL,  226642704UL, 239023395UL, 132216092UL, 97167305UL,  129009630UL, 29640248UL,
+    149457430UL, 213788803UL, 113641393UL, 215058784UL, 256832579UL, 109409934UL, 112551387UL, 31647599UL,  194838663UL,
+    86493186UL,  176078568UL, 138348302UL, 224705931UL, 97415780UL,  183654137UL, 188488578UL, 215871916UL, 224635913UL,
+    150023406UL, 81223024UL,  83722194UL,  69849399UL,  128737036UL, 239365950UL, 33588814UL,  192989528UL, 771019UL,
+    101758383UL, 40414511UL,  178663416UL, 151206201UL, 82037834UL,  161567886UL, 65421066UL,  132879802UL, 156975038UL,
+    17061502UL,  214892917UL, 114866521UL, 98469847UL,  56589539UL,  132493076UL, 87182437UL,  139661513UL, 23453289UL,
+    108650174UL, 1351573UL,   137328282UL, 201285674UL, 105332226UL, 8176219UL,   242720022UL, 31058776UL,  127904736UL,
+    164071411UL, 163365324UL, 240440729UL, 115526215UL, 59656026UL,  90441034UL,  121826335UL, 30503429UL,  50674559UL,
+    196385671UL, 267194340UL, 92048449UL,  144477203UL, 222146252UL, 58541842UL,  215114489UL, 41881896UL,  27704283UL,
+    239112001UL, 48057431UL,  162392273UL, 86103362UL,  472468UL,    220148627UL, 202687166UL, 224193249UL, 19460795UL,
+    65711151UL,  167492855UL, 241209542UL, 135533180UL, 237497645UL, 134566883UL, 228262025UL, 260136235UL, 222439155UL,
+    210928436UL, 132738859UL, 120984340UL, 44074078UL,  25001471UL,  223915292UL, 96686789UL,  21660310UL,  228145905UL,
+    161705155UL, 50172908UL,  80566528UL,  11691181UL,  238587869UL, 5687596UL,   34513601UL,  95148826UL,  221487713UL,
+    70626483UL,  148786310UL, 179319210UL, 182687350UL, 12123553UL,  247244190UL, 236673326UL, 234867538UL, 30256865UL,
+    44866351UL,  23672121UL,  99018188UL,  88428349UL,  200281200UL, 85412330UL,  156004331UL, 189031473UL, 126067537UL,
+    78645965UL,  236546024UL, 129124869UL, 80382336UL,  84948873UL,  37889925UL,  22461957UL,  129322498UL, 35045040UL,
+    69575712UL,  49508996UL,  88519759UL,  67690469UL,  57408542UL,  40841593UL,  232420320UL, 178360312UL, 88110132UL,
+    63848458UL,  56817490UL,  148867776UL, 157345307UL, 223535431UL, 212497799UL, 21797050UL,  95129740UL,  75871872UL,
+    108897154UL, 261839031UL, 134472088UL, 40263979UL,  97943230UL,  45546175UL,  197461711UL, 214059618UL, 28091759UL,
+    84808188UL,  165359484UL, 190178841UL, 240373888UL, 151502201UL, 146470617UL, 42709932UL,  99955856UL,  140099174UL,
+    243110057UL, 107941276UL, 103748510UL, 4350682UL,   89188131UL,  58343398UL,  27467286UL,  134932899UL, 51617848UL,
+    188453249UL, 223845646UL, 244635246UL, 119909890UL, 239965961UL, 233950227UL, 212762844UL, 206814166UL, 161586687UL,
+    80573272UL,  138681647UL, 238855441UL, 11482194UL,  33895193UL,  35254814UL,  122540738UL, 96289943UL,  47498520UL,
+    203972128UL, 98530840UL,  94283670UL,  143777155UL, 113972243UL, 190630797UL, 251880562UL, 174099143UL, 19077091UL,
+    218551453UL, 95695980UL,  121656388UL, 93560975UL,  172949121UL, 143408415UL, 241090559UL, 90865783UL,  133699895UL,
+    128555946UL, 134572279UL, 12977079UL,  251167800UL, 531582UL,    201268010UL, 261386227UL, 148377795UL, 139420711UL,
+    36584125UL,  64663464UL,  115049215UL, 31806777UL,  52779924UL,  92151552UL,  70755119UL,  34450093UL,  148017251UL,
+    146650795UL, 34714169UL,  219752266UL, 136377159UL, 39093774UL,  209153260UL, 119169096UL, 129017743UL, 27830464UL,
+    88292687UL,  119327568UL, 241868962UL, 11347888UL,  211270175UL, 134916146UL, 111725133UL, 108180236UL, 9171648UL,
+    144043197UL, 226213856UL, 251245781UL, 165882294UL, 31971164UL,  133067891UL, 147207258UL, 203592553UL, 137083981UL,
+    100859428UL, 119571930UL, 247779108UL, 76017389UL,  127653710UL, 168335743UL, 8213005UL,   55759549UL,  168305834UL,
+    162742805UL, 227655025UL, 178637270UL, 51696659UL,  12680421UL,  77684606UL,  237050485UL, 228004942UL, 113728029UL,
+    250939922UL, 122288531UL, 170454772UL, 130030473UL, 212410372UL, 176924203UL, 129918876UL, 177573761UL, 104078576UL,
+    63519353UL,  263619745UL, 204151691UL, 242358474UL, 87840548UL,  163986154UL, 119721049UL, 130638093UL, 32765674UL,
+    109188334UL, 141063432UL, 199748299UL, 257480053UL, 248834206UL, 150417185UL, 255382808UL, 22074065UL,  22010217UL,
+    21374403UL,  29354738UL,  111338068UL, 58043440UL,  240431190UL, 154039456UL, 205899733UL, 197645533UL, 180673098UL,
+    196907426UL, 207388823UL, 164078071UL, 207245338UL, 264821454UL, 121819896UL, 128971978UL, 200657208UL, 147141508UL,
+    38274780UL,  196903112UL, 162334366UL, 33407944UL,  35759138UL,  179603978UL, 4984522UL,   215599112UL, 204338151UL,
+    205753787UL, 133050441UL, 131053315UL, 29600420UL,  236207297UL, 191056036UL, 264984887UL, 154924703UL, 241799004UL,
+    33791930UL,  14881129UL,  188393364UL, 65045992UL,  219973141UL, 26914639UL,  104044895UL, 217397934UL, 87988068UL,
+    48240869UL,  264795631UL, 198234415UL, 156335075UL, 77774738UL,  152265256UL, 239189108UL, 119330515UL, 59193162UL,
+    175234829UL, 120959111UL, 21147468UL,  230597815UL, 84282890UL,  24706651UL,  97018105UL,  235370644UL, 14446963UL,
+    188682100UL, 138688650UL, 12378269UL,  32679299UL,  61185082UL,  261857835UL, 585025UL,    227404405UL, 246915383UL,
+    196817552UL, 36775469UL,  250870713UL, 23698819UL,  135287126UL, 143865724UL, 101118524UL, 184970001UL, 223779289UL,
+    174902778UL, 252405630UL, 78810998UL,  190082775UL, 110386901UL, 5884714UL,   222364416UL, 62762136UL,  244265973UL,
+    207195000UL, 188618493UL, 139663166UL, 158504221UL, 134095841UL, 57202353UL,  183231191UL, 15154094UL,  64737205UL,
+    16600851UL,  237373903UL, 265330787UL, 33471859UL,  123353194UL, 235915813UL, 47860415UL,  155600007UL, 53295655UL,
+    221389675UL, 82795666UL,  29794851UL,  174613790UL, 93485185UL,  84087998UL,  59040229UL,  191917303UL, 52744800UL,
+    187806145UL, 230498280UL, 210456141UL, 258410798UL, 267033406UL, 122099643UL, 29165148UL,  182653643UL, 206966195UL,
+    194017145UL, 193679224UL, 158492110UL, 101617469UL, 155092710UL, 1109721UL,   242905798UL, 144046123UL, 129516180UL,
+    27351915UL,  112442142UL, 211335396UL, 143829750UL, 132203684UL, 205359569UL, 184926171UL, 187730873UL, 236665745UL,
+    2988303UL,   163514847UL, 100992323UL, 3299691UL,   208212070UL, 1901340UL,   88547524UL,  74117938UL,  98296570UL,
+    19900008UL,  49367170UL,  240629869UL, 8902877UL,   2016120UL,   115073326UL, 182927016UL, 97368719UL,  253995276UL,
+    91943779UL,  105108098UL, 150176885UL, 263442898UL, 157636591UL, 140754106UL, 138294642UL, 6464808UL,   96779908UL,
+    254580948UL, 208354849UL, 81209388UL,  233753315UL, 9383555UL,   111006246UL, 152773051UL, 209298481UL, 46521770UL,
+    1638772UL,   121528410UL, 103840684UL, 357867UL,    219876172UL, 45567392UL,  178173128UL, 29728574UL,  101230403UL,
+    60584127UL,  42715761UL,  241224950UL, 83396300UL,  172395833UL, 82710871UL,  148517193UL, 113081148UL, 52081438UL,
+    61874873UL,  88243245UL,  107470935UL, 119858UL,    230275967UL, 25308207UL,  179247891UL, 187872815UL, 29109660UL,
+    39292550UL,  224643951UL, 45848727UL,  165909795UL, 159924034UL, 240462112UL, 5766196UL,   267977285UL, 4367001UL,
+    145539478UL, 254511172UL, 157516323UL, 13564303UL,  170770231UL, 254860003UL, 62814097UL,  108455251UL, 221613598UL,
+    114774384UL, 112272222UL, 197597488UL, 157059137UL, 261317505UL, 219404184UL, 88985757UL,  230731628UL, 59357386UL,
+    206231495UL, 66498087UL,  204242281UL, 253800221UL, 240082630UL, 257156990UL, 60408966UL,  191906571UL, 99724095UL,
+    119058637UL, 116962383UL, 61099569UL,  75806023UL,  207049342UL, 181533238UL, 88848743UL,  233193032UL, 259830176UL,
+    244538594UL, 251825021UL, 212616885UL, 51252514UL,  155957235UL, 4823002UL,   257701969UL, 130322609UL, 65572977UL,
+    212067835UL, 108568979UL, 142646045UL, 17220768UL,  231672371UL, 124842975UL, 55626977UL,  214919748UL, 125924696UL,
+    251254390UL, 76960501UL,  139097245UL, 54646867UL,  183778450UL, 34743594UL,  101496976UL, 118991700UL, 107355981UL,
+    151738653UL, 242664583UL, 2844710UL,   108566900UL, 22279040UL,  198936483UL, 138723025UL, 224936407UL, 102949335UL,
+    93501586UL,  28234792UL,  229356086UL, 166751343UL, 36179092UL,  162441291UL, 226659165UL, 58911798UL,  253677190UL,
+    174239353UL, 48787743UL,  226713007UL, 57043335UL,  170004043UL, 219856043UL, 220786469UL, 8317355UL,   180025186UL,
+    184357167UL, 150254315UL, 81243168UL,  203152868UL, 217406648UL, 208351352UL, 143685398UL, 164446065UL, 178440253UL,
+    202934176UL, 59462104UL,  92916758UL,  250956821UL, 129009106UL, 176433810UL, 220762588UL, 48779734UL,  16104103UL,
+    216232380UL, 136847669UL, 174816203UL, 228819137UL, 132080292UL, 36419895UL,  244335711UL, 221236943UL, 264312327UL,
+    41596033UL,  164316222UL, 183796591UL, 171356738UL, 153686378UL, 61491772UL,  96426361UL,  199770263UL, 25100012UL,
+    243876117UL, 109494452UL, 268326267UL, 125025493UL, 198578030UL, 64138403UL,  151263220UL, 44539444UL,  178388906UL,
+    131835133UL, 54250956UL,  263779820UL, 192230064UL, 73377378UL,  72379642UL,  240889393UL, 247159617UL, 213025972UL,
+    70744685UL,  163063816UL, 257865540UL, 204358249UL, 22944461UL,  162103754UL, 243121602UL, 87456173UL,  174550866UL,
+    111213766UL, 122110017UL, 78609006UL,  78744461UL,  199594349UL, 201238271UL, 226572492UL, 12572649UL,  175023673UL,
+    37392350UL,  234731488UL, 191403412UL, 46635344UL,  145033094UL, 21231602UL,  87497595UL,  242397630UL, 111776069UL,
+    252859683UL, 66464824UL,  156815431UL, 53364153UL,  156318348UL, 18921719UL,  205739636UL,
+};
+uint32_t rand_arr_29_b29_w32_arr[1024] = {
+    451412267UL, 275079873UL, 449113719UL, 389760076UL, 302411971UL, 269747306UL, 517157729UL, 411039607UL, 390517765UL,
+    147802995UL, 222462165UL, 240834195UL, 36797089UL,  263280536UL, 381707458UL, 32239867UL,  218768232UL, 464792393UL,
+    294457611UL, 486405852UL, 33807091UL,  328098303UL, 284864773UL, 12618589UL,  156415875UL, 348869743UL, 343089055UL,
+    492913161UL, 133775082UL, 317578588UL, 7488520UL,   347047687UL, 521493310UL, 421399421UL, 345276462UL, 495162332UL,
+    268981765UL, 255316148UL, 499769017UL, 520797094UL, 76014755UL,  505798069UL, 41140314UL,  212559328UL, 138424541UL,
+    406706243UL, 389077055UL, 175495595UL, 365818057UL, 320770644UL, 250187104UL, 375294488UL, 512282576UL, 29338491UL,
+    437795197UL, 335980308UL, 518700256UL, 188347988UL, 203906490UL, 125100169UL, 95784160UL,  53334809UL,  13214900UL,
+    183557110UL, 99247048UL,  284559316UL, 159909190UL, 119480349UL, 438507270UL, 12280076UL,  44753491UL,  189009272UL,
+    307265020UL, 285608561UL, 493980812UL, 29744503UL,  195269362UL, 272523965UL, 260362105UL, 234832641UL, 427541578UL,
+    131856728UL, 433030038UL, 349422004UL, 115664490UL, 360997066UL, 267206274UL, 32640436UL,  474408439UL, 472448832UL,
+    411618380UL, 161422333UL, 155907991UL, 503737431UL, 379628161UL, 340463623UL, 21868213UL,  113865018UL, 124551181UL,
+    139055397UL, 197758974UL, 121315993UL, 77584442UL,  188499768UL, 462598799UL, 199173435UL, 34996193UL,  489399958UL,
+    290857935UL, 238959055UL, 487739677UL, 202412035UL, 1949683UL,   438343827UL, 388351670UL, 360403699UL, 36135440UL,
+    83979235UL,  132045150UL, 519262663UL, 168237978UL, 531572772UL, 195527681UL, 335153206UL, 293453894UL, 132781678UL,
+    51596895UL,  60242469UL,  437058760UL, 116607237UL, 11868858UL,  252546960UL, 209673133UL, 372329962UL, 306170930UL,
+    456177306UL, 428849236UL, 422250519UL, 121620671UL, 351168202UL, 118579630UL, 78759185UL,  78085235UL,  339840078UL,
+    212637338UL, 325102979UL, 530612657UL, 105521668UL, 340343136UL, 406790156UL, 103594114UL, 71823161UL,  522069207UL,
+    115335231UL, 168656437UL, 280941912UL, 168333505UL, 411571630UL, 53470233UL,  388575180UL, 128625390UL, 97867345UL,
+    354406749UL, 117274183UL, 381000054UL, 54540325UL,  210107560UL, 442993415UL, 205491114UL, 330730337UL, 35623661UL,
+    360377411UL, 44130923UL,  60851210UL,  471708474UL, 308086616UL, 448760203UL, 144349660UL, 443535062UL, 165320301UL,
+    231059662UL, 301636862UL, 370464495UL, 387375671UL, 217124390UL, 266155216UL, 189470323UL, 408341275UL, 241295322UL,
+    208317851UL, 329785503UL, 41434416UL,  354669288UL, 389553118UL, 388406064UL, 121885368UL, 115481787UL, 495493163UL,
+    250457885UL, 312570367UL, 468097826UL, 353442278UL, 487450920UL, 292741021UL, 105721951UL, 21735377UL,  377936943UL,
+    88222041UL,  377251741UL, 96467791UL,  492887536UL, 68141192UL,  154735772UL, 97332583UL,  413195020UL, 125499739UL,
+    525343106UL, 52434316UL,  337366588UL, 231294371UL, 25279464UL,  4138887UL,   3719078UL,   222118929UL, 37601540UL,
+    303999060UL, 376367116UL, 53621600UL,  439905005UL, 120620440UL, 358988367UL, 365568637UL, 13790782UL,  71809318UL,
+    488030191UL, 229739261UL, 291880197UL, 54964743UL,  451950805UL, 233159364UL, 203855835UL, 252169478UL, 19604463UL,
+    424007019UL, 476273413UL, 122479455UL, 50334214UL,  534054481UL, 444089885UL, 460104452UL, 12942810UL,  76975065UL,
+    487734542UL, 57990175UL,  2653132UL,   44608874UL,  258453797UL, 240509346UL, 300656183UL, 178887357UL, 143640820UL,
+    25158928UL,  360641295UL, 7517379UL,   489349468UL, 168389979UL, 191366991UL, 479957111UL, 339192960UL, 330255479UL,
+    51623694UL,  367828060UL, 233100183UL, 104010053UL, 125061182UL, 153343462UL, 266373920UL, 354411299UL, 1868435UL,
+    76516117UL,  448441079UL, 288219836UL, 218060569UL, 442404084UL, 81420742UL,  157211072UL, 359926729UL, 467337632UL,
+    409635197UL, 405788695UL, 488174848UL, 507785807UL, 355197616UL, 72733970UL,  510292291UL, 152872743UL, 256980917UL,
+    518955583UL, 433210670UL, 277003110UL, 437052209UL, 382618548UL, 143473948UL, 216394150UL, 38717897UL,  96823578UL,
+    113418767UL, 184613361UL, 445251580UL, 208764493UL, 400198303UL, 340969579UL, 333723777UL, 485249864UL, 304831018UL,
+    34997095UL,  364959066UL, 295179078UL, 407044340UL, 504527143UL, 100707837UL, 385662628UL, 19135533UL,  95706132UL,
+    194444047UL, 300038602UL, 106635958UL, 454345994UL, 49714279UL,  274657380UL, 119985808UL, 427559224UL, 200766125UL,
+    437074UL,    467776292UL, 72493912UL,  486688228UL, 507522287UL, 444336313UL, 373306638UL, 439990553UL, 461470450UL,
+    232525348UL, 493444190UL, 390832354UL, 178665104UL, 418299732UL, 506773889UL, 38265273UL,  87857217UL,  369088363UL,
+    234869772UL, 478627215UL, 446705505UL, 8366568UL,   338257020UL, 109680384UL, 115290538UL, 410562890UL, 404145175UL,
+    231249400UL, 171029345UL, 208638162UL, 28496905UL,  104205250UL, 479336926UL, 248537134UL, 320708935UL, 157767601UL,
+    471047096UL, 249947260UL, 451981512UL, 389538224UL, 67246149UL,  137970200UL, 44464393UL,  507061739UL, 63995400UL,
+    239975841UL, 45159992UL,  381282487UL, 437322047UL, 417461476UL, 13184998UL,  324002991UL, 115890108UL, 187786293UL,
+    150469376UL, 339680259UL, 348856524UL, 374947875UL, 178460791UL, 398137784UL, 438829427UL, 203458849UL, 139572279UL,
+    387034979UL, 337391232UL, 73120666UL,  342617117UL, 279352388UL, 346817973UL, 490758071UL, 458303552UL, 377171541UL,
+    521854106UL, 251034331UL, 385051669UL, 228059982UL, 304414625UL, 253259521UL, 344820904UL, 357443254UL, 470891143UL,
+    505020340UL, 514574887UL, 140462840UL, 297835725UL, 313244653UL, 96407317UL,  113913189UL, 457608259UL, 246357875UL,
+    185207213UL, 218030640UL, 524564548UL, 32863299UL,  76428349UL,  224512047UL, 446589482UL, 277404736UL, 137420240UL,
+    156961298UL, 530149753UL, 69302346UL,  62883470UL,  368902757UL, 91134036UL,  392956528UL, 89459012UL,  31715309UL,
+    144719673UL, 263234544UL, 154537649UL, 251956781UL, 140230665UL, 413699173UL, 33451157UL,  88651102UL,  218581853UL,
+    19676562UL,  140985677UL, 133565176UL, 123129001UL, 369402984UL, 109239464UL, 279152060UL, 99075520UL,  319406376UL,
+    131861392UL, 255776671UL, 450750136UL, 350303385UL, 302483376UL, 7515065UL,   396910793UL, 357621428UL, 146652033UL,
+    121147410UL, 282125297UL, 257326736UL, 521572026UL, 353372618UL, 461403471UL, 336321767UL, 266719268UL, 142912900UL,
+    44753147UL,  186205230UL, 59864502UL,  410445616UL, 337388167UL, 170518903UL, 328558110UL, 35967342UL,  218107557UL,
+    465915182UL, 223252786UL, 347698473UL, 461921032UL, 349717539UL, 7267512UL,   492854020UL, 172237653UL, 454737135UL,
+    356909534UL, 134869642UL, 269120791UL, 510756403UL, 476100751UL, 359468626UL, 275980897UL, 311225910UL, 140373489UL,
+    102270284UL, 271036934UL, 192222283UL, 260703694UL, 105789347UL, 491559279UL, 370424643UL, 536364117UL, 331783179UL,
+    434448136UL, 189323620UL, 264505499UL, 503401722UL, 500499421UL, 394486245UL, 409248790UL, 169488642UL, 517048538UL,
+    347402436UL, 168028885UL, 220265379UL, 435286622UL, 529346173UL, 156725482UL, 338963700UL, 16820112UL,  462674833UL,
+    139750563UL, 240057022UL, 114417454UL, 536318353UL, 5012556UL,   62170698UL,  248557392UL, 458059878UL, 415652333UL,
+    373792033UL, 326686957UL, 63839035UL,  471608705UL, 292621907UL, 178037456UL, 205306584UL, 350713957UL, 80360735UL,
+    85555368UL,  63405735UL,  35301896UL,  64958566UL,  494661203UL, 447787845UL, 209798683UL, 423775163UL, 139715847UL,
+    210000731UL, 131270014UL, 212731868UL, 331893601UL, 469119766UL, 482746849UL, 184402210UL, 153059110UL, 245964820UL,
+    168117865UL, 245495349UL, 455095775UL, 513389824UL, 36754978UL,  191003004UL, 389777429UL, 239339197UL, 269148040UL,
+    518014928UL, 111735338UL, 371873901UL, 195117797UL, 88869913UL,  285958567UL, 257649427UL, 472590149UL, 25741075UL,
+    489790182UL, 444616748UL, 522842270UL, 73303507UL,  215945527UL, 531506281UL, 180587357UL, 359629130UL, 443487923UL,
+    465519863UL, 385596938UL, 454132408UL, 429953903UL, 487288672UL, 112315489UL, 43172892UL,  183956019UL, 270236323UL,
+    362252040UL, 69870186UL,  289984535UL, 261959248UL, 319647386UL, 414416528UL, 63915593UL,  281244309UL, 796248UL,
+    327302877UL, 263753368UL, 532508096UL, 427313197UL, 433165013UL, 207136998UL, 384984514UL, 280252417UL, 225136731UL,
+    183697322UL, 528017033UL, 351285978UL, 104045439UL, 319441837UL, 357534725UL, 7416528UL,   286166906UL, 285329365UL,
+    406493742UL, 252315107UL, 483911256UL, 7634353UL,   502504390UL, 480872342UL, 309165217UL, 311407741UL, 103707720UL,
+    67673178UL,  427531942UL, 12195976UL,  509663812UL, 226823695UL, 16443669UL,  166654904UL, 318636155UL, 297834476UL,
+    535476319UL, 287894498UL, 288692027UL, 350829223UL, 440745047UL, 437350792UL, 163248494UL, 16789843UL,  120020976UL,
+    368609152UL, 140000428UL, 186680887UL, 140578234UL, 326259707UL, 30922046UL,  427157919UL, 5669309UL,   292918452UL,
+    504022223UL, 330764223UL, 60500856UL,  100719988UL, 278137074UL, 159031255UL, 408292291UL, 182466475UL, 493556038UL,
+    60859116UL,  261611371UL, 91887427UL,  417345870UL, 475614631UL, 309892367UL, 440229414UL, 138294316UL, 386464569UL,
+    334567301UL, 277255696UL, 371743601UL, 157043710UL, 526755882UL, 260856032UL, 279246157UL, 324353726UL, 245153796UL,
+    369786767UL, 210120130UL, 511515214UL, 172823799UL, 246620560UL, 57695241UL,  70891956UL,  243855348UL, 346514969UL,
+    51651126UL,  483355836UL, 482055344UL, 395261528UL, 571184UL,    406024296UL, 311287678UL, 349037819UL, 290268740UL,
+    141953833UL, 443085250UL, 318277102UL, 91431897UL,  347123832UL, 52843513UL,  372343700UL, 471529126UL, 275710636UL,
+    49790741UL,  535880231UL, 223545320UL, 493777493UL, 463087552UL, 408737439UL, 456297657UL, 322503139UL, 271330321UL,
+    349338939UL, 143949972UL, 471692976UL, 405632941UL, 520370275UL, 31820382UL,  383417317UL, 283006862UL, 36704373UL,
+    462456249UL, 166267380UL, 353835493UL, 48383987UL,  429249035UL, 97713826UL,  492415272UL, 108113133UL, 113859952UL,
+    489369203UL, 344834207UL, 230945644UL, 21976390UL,  154280234UL, 125262779UL, 527762080UL, 311282180UL, 63166273UL,
+    59995190UL,  496054623UL, 527208686UL, 307319187UL, 535865321UL, 224913172UL, 518914580UL, 183787550UL, 35480917UL,
+    422521075UL, 217743708UL, 498711870UL, 83202482UL,  517111144UL, 131980050UL, 475283331UL, 171469793UL, 377848514UL,
+    263093522UL, 463487845UL, 363318851UL, 429485019UL, 288215399UL, 475322909UL, 169517389UL, 168878500UL, 261969479UL,
+    365265561UL, 136305935UL, 32033324UL,  121899550UL, 268397533UL, 235108689UL, 520129511UL, 423109057UL, 417533183UL,
+    314865950UL, 138503699UL, 457594255UL, 282083049UL, 12592983UL,  55417306UL,  524709289UL, 78588572UL,  486354337UL,
+    356195837UL, 200657787UL, 7383178UL,   510762100UL, 400949634UL, 523622537UL, 454118275UL, 514903803UL, 361876425UL,
+    43098564UL,  486968993UL, 487855910UL, 506613306UL, 501480442UL, 198189560UL, 76224570UL,  128891292UL, 486345054UL,
+    224087374UL, 18022265UL,  429989819UL, 60824294UL,  243478417UL, 472364271UL, 391984232UL, 329530973UL, 303053818UL,
+    107166934UL, 253503628UL, 323317273UL, 93296538UL,  284352448UL, 114977653UL, 314264754UL, 358896188UL, 36041993UL,
+    471837833UL, 473728655UL, 439438499UL, 525264175UL, 499104996UL, 506121905UL, 169676282UL, 20415704UL,  438326034UL,
+    504763326UL, 256733831UL, 12062705UL,  133715946UL, 407258905UL, 420477064UL, 212332997UL, 28069303UL,  148166878UL,
+    335362643UL, 3688601UL,   503821269UL, 376331747UL, 358899642UL, 62825904UL,  375761959UL, 528094396UL, 118993678UL,
+    379573146UL, 150638351UL, 259111305UL, 527970848UL, 271543011UL, 49448706UL,  516810535UL, 149270906UL, 53329646UL,
+    336089499UL, 371587155UL, 498465260UL, 334308738UL, 157207922UL, 339676242UL, 312470535UL, 456001876UL, 33820063UL,
+    29730408UL,  505726970UL, 6704481UL,   52248901UL,  113869355UL, 126015721UL, 196697756UL, 229021152UL, 379085252UL,
+    166015023UL, 208129567UL, 533527011UL, 37179469UL,  417189470UL, 53258490UL,  438770838UL, 246179068UL, 109079090UL,
+    273420731UL, 88757012UL,  135411226UL, 415738531UL, 530813375UL, 513329610UL, 178675488UL, 344623355UL, 216135634UL,
+    215215572UL, 457218600UL, 369059460UL, 20030706UL,  360864145UL, 349548995UL, 288133853UL, 463718601UL, 146477239UL,
+    416425727UL, 341398677UL, 440233792UL, 478752845UL, 226010244UL, 43494528UL,  201153753UL, 28641455UL,  201667533UL,
+    464040504UL, 103719615UL, 407061533UL, 309239853UL, 36547634UL,  418303800UL, 117939047UL, 159327141UL, 405092177UL,
+    399000195UL, 446519657UL, 500613928UL, 100414610UL, 42548435UL,  203074529UL, 64486593UL,  350300958UL, 384056468UL,
+    329065045UL, 245654946UL, 254922888UL, 498134639UL, 529400910UL, 382077009UL, 123213476UL, 94963563UL,  47884159UL,
+    85791944UL,  252621769UL, 455946620UL, 3654099UL,   499007694UL, 314400091UL, 367050949UL, 286917882UL, 251381258UL,
+    383805538UL, 288617653UL, 32086919UL,  284827027UL, 58623743UL,  373337198UL, 281372488UL, 184073550UL, 287315130UL,
+    98575134UL,  320096940UL, 380440499UL, 534767772UL, 483031891UL, 147821764UL, 391864211UL, 344245791UL, 490323229UL,
+    255383515UL, 170707482UL, 400137433UL, 318200777UL, 220651817UL, 143518359UL, 13752375UL,  523069043UL, 45150798UL,
+    12243053UL,  360733407UL, 185395517UL, 86690659UL,  414191855UL, 166939637UL, 298838420UL, 320863161UL, 53700770UL,
+    323473828UL, 327129066UL, 392702138UL, 296807315UL, 64856536UL,  227015936UL, 301258118UL, 269472120UL, 504068951UL,
+    225153120UL, 473484266UL, 76486005UL,  455554663UL, 137973678UL, 24952930UL,  50611764UL,  155626938UL, 248563303UL,
+    263886292UL, 159800103UL, 398853309UL, 177050307UL, 399921780UL, 356963342UL, 223831812UL,
+};
+uint32_t rand_arr_30_b30_w32_arr[1024] = {
+    807940519UL,  538077809UL,  563164143UL,  581258844UL,  861544604UL,  628504679UL,  699444984UL,  776175882UL,
+    592455546UL,  229713776UL,  568909375UL,  463263951UL,  557408711UL,  972558479UL,  364911996UL,  429289821UL,
+    926027821UL,  75512096UL,   513036998UL,  103065933UL,  736562422UL,  600261771UL,  899215483UL,  386474132UL,
+    279281496UL,  436490642UL,  969516126UL,  474444011UL,  886312182UL,  25823610UL,   847486627UL,  77412782UL,
+    1046863314UL, 651324438UL,  672482533UL,  80950471UL,   875985798UL,  705495088UL,  776203579UL,  435385739UL,
+    311273166UL,  822323532UL,  175714606UL,  314077096UL,  800591045UL,  914234038UL,  1013276696UL, 760743235UL,
+    618140012UL,  939843583UL,  799070582UL,  310696523UL,  18677554UL,   232701118UL,  1057381933UL, 860970939UL,
+    437587413UL,  243525277UL,  643209433UL,  43626682UL,   829918187UL,  1040362467UL, 439375128UL,  369017686UL,
+    633400943UL,  430956124UL,  470617178UL,  103155516UL,  769835468UL,  146554066UL,  984576808UL,  22795817UL,
+    460019739UL,  240088125UL,  377209766UL,  1060043542UL, 445007842UL,  902238495UL,  513607414UL,  626668729UL,
+    24214007UL,   320378578UL,  943958202UL,  1008163533UL, 673790657UL,  885405919UL,  37402426UL,   832105856UL,
+    625803996UL,  374373504UL,  568084471UL,  733259780UL,  259799098UL,  303142682UL,  382558864UL,  680104436UL,
+    254872808UL,  241893767UL,  521855938UL,  556848935UL,  911668356UL,  877649483UL,  818485970UL,  1021595889UL,
+    715808584UL,  1012019951UL, 856903165UL,  43844120UL,   222240682UL,  711209327UL,  165597006UL,  29516878UL,
+    847107340UL,  8155497UL,    443242294UL,  153772046UL,  552606836UL,  50049653UL,   967624233UL,  152514435UL,
+    1024821034UL, 1030629141UL, 60639652UL,   922039240UL,  355038196UL,  62103959UL,   52002146UL,   953419680UL,
+    189120966UL,  659198492UL,  632839437UL,  149760863UL,  916460121UL,  1012087670UL, 1053153534UL, 417866138UL,
+    28980933UL,   233099610UL,  912212889UL,  823331334UL,  390103613UL,  353300339UL,  415674686UL,  589724965UL,
+    353850079UL,  784779582UL,  223734821UL,  886838577UL,  125656887UL,  51055760UL,   805941815UL,  383324771UL,
+    285420630UL,  281250602UL,  549274611UL,  822242714UL,  453584706UL,  739994994UL,  433940425UL,  253283932UL,
+    708442770UL,  403005944UL,  237295296UL,  46452327UL,   237261459UL,  152403395UL,  526896325UL,  699437091UL,
+    852983322UL,  53647873UL,   741639811UL,  338168384UL,  65851106UL,   41133057UL,   802605497UL,  504502017UL,
+    338209305UL,  543150784UL,  165488559UL,  202339243UL,  553987881UL,  815634969UL,  742661937UL,  513841484UL,
+    301486476UL,  1056018002UL, 573181936UL,  969376622UL,  741173497UL,  988052851UL,  338731734UL,  699073899UL,
+    254875475UL,  374675049UL,  545234629UL,  541199808UL,  816498190UL,  894964099UL,  808819842UL,  224872575UL,
+    514985726UL,  226998056UL,  658082378UL,  252171946UL,  932656243UL,  768644974UL,  1031771739UL, 641427106UL,
+    160168494UL,  881278481UL,  475370917UL,  749927478UL,  953584059UL,  1037193927UL, 805208274UL,  496674213UL,
+    858781456UL,  812780507UL,  891334032UL,  173139728UL,  688237921UL,  487446440UL,  565290498UL,  447737288UL,
+    873675273UL,  351015105UL,  492561209UL,  251158874UL,  99445499UL,   334625575UL,  325655646UL,  350819917UL,
+    149475516UL,  471099995UL,  654696613UL,  566231033UL,  753635544UL,  544156317UL,  445694404UL,  394307793UL,
+    811290929UL,  974275956UL,  107043324UL,  742353637UL,  909572204UL,  357557222UL,  1044596618UL, 239167610UL,
+    978925924UL,  642367995UL,  433530764UL,  274529154UL,  525882523UL,  243036081UL,  147901949UL,  891730881UL,
+    246257517UL,  400225295UL,  487018821UL,  662106118UL,  790471282UL,  952047796UL,  974609796UL,  241445098UL,
+    239169704UL,  161094818UL,  983416674UL,  105598737UL,  502738165UL,  431127163UL,  356502242UL,  454326874UL,
+    901274390UL,  237750677UL,  262729115UL,  17907442UL,   865679431UL,  388417059UL,  1023688248UL, 763828439UL,
+    755740786UL,  695694466UL,  678083246UL,  400605585UL,  59205629UL,   1024450004UL, 524457767UL,  729286297UL,
+    520144034UL,  496747204UL,  650468849UL,  120452119UL,  310269399UL,  1070480201UL, 293347881UL,  394296866UL,
+    120895038UL,  171516360UL,  707885417UL,  43579729UL,   153449365UL,  690421650UL,  549736483UL,  892098726UL,
+    148486355UL,  975557784UL,  409002641UL,  984340398UL,  784613711UL,  652245831UL,  588919640UL,  65078075UL,
+    159878125UL,  138029085UL,  755984627UL,  983535224UL,  981993724UL,  300837403UL,  26673019UL,   505455319UL,
+    957281542UL,  421908196UL,  562979259UL,  954790776UL,  826921719UL,  597133798UL,  521866208UL,  255992505UL,
+    139547393UL,  616942864UL,  961065757UL,  818873973UL,  599682788UL,  4351276UL,    304378213UL,  971139126UL,
+    386284286UL,  604014884UL,  981247567UL,  660021636UL,  201983245UL,  631514761UL,  526078212UL,  776033252UL,
+    821245604UL,  1012716263UL, 336799977UL,  295739339UL,  678193499UL,  299932190UL,  682701069UL,  1064529988UL,
+    528203962UL,  464891646UL,  343098168UL,  250983784UL,  413877165UL,  546576675UL,  111299577UL,  594102348UL,
+    246753472UL,  597422116UL,  981293724UL,  31248958UL,   392173569UL,  132306052UL,  167310860UL,  512731875UL,
+    910894668UL,  402689176UL,  29799289UL,   464212573UL,  1009217271UL, 772392555UL,  265850925UL,  761856936UL,
+    648986164UL,  487389786UL,  286719914UL,  492325323UL,  769016413UL,  281594382UL,  499334418UL,  478773448UL,
+    941271332UL,  917050793UL,  930565143UL,  610173898UL,  933431412UL,  59585861UL,   282295815UL,  298579863UL,
+    227845346UL,  1014359357UL, 188146735UL,  908148274UL,  1006646758UL, 393022303UL,  211350699UL,  135850369UL,
+    95049990UL,   1042510352UL, 775811308UL,  451023729UL,  175163680UL,  787727890UL,  252463198UL,  1053567724UL,
+    698893561UL,  685939274UL,  262589629UL,  461696882UL,  699029810UL,  698790097UL,  809593971UL,  259046726UL,
+    786982405UL,  204230626UL,  818561291UL,  122762262UL,  803571958UL,  719404110UL,  891106454UL,  169958744UL,
+    948352091UL,  193444586UL,  403132539UL,  219871242UL,  277749249UL,  307816922UL,  1027599871UL, 48677329UL,
+    886706077UL,  927805321UL,  962833441UL,  403866616UL,  623831959UL,  880385550UL,  149202956UL,  576163493UL,
+    1007915358UL, 522834156UL,  347767714UL,  179553008UL,  401982UL,     961942180UL,  718266641UL,  1008885895UL,
+    375534661UL,  86024028UL,   1018882804UL, 1039098296UL, 566007353UL,  447769928UL,  440203765UL,  12331064UL,
+    352711164UL,  613853371UL,  592702818UL,  901513444UL,  301490078UL,  177591457UL,  891489621UL,  868360015UL,
+    141066399UL,  449217750UL,  790165851UL,  138833326UL,  197963007UL,  146993280UL,  872879832UL,  288401626UL,
+    288471616UL,  693580469UL,  443231955UL,  306087444UL,  73520180UL,   88412903UL,   361936739UL,  794455318UL,
+    471984408UL,  488581831UL,  918104613UL,  135461528UL,  252153784UL,  948183020UL,  138351567UL,  355882152UL,
+    170015672UL,  591029981UL,  489807899UL,  725009025UL,  817145017UL,  111222255UL,  891606995UL,  46373673UL,
+    281044232UL,  916167502UL,  593390488UL,  950852059UL,  23932432UL,   599496145UL,  87699320UL,   171696936UL,
+    795714927UL,  204021416UL,  223549707UL,  639675920UL,  449873579UL,  97502317UL,   370526789UL,  298724771UL,
+    871994024UL,  736477322UL,  965992437UL,  713365474UL,  108436166UL,  483800830UL,  418176518UL,  183811354UL,
+    32336364UL,   45922609UL,   363888009UL,  943915415UL,  824875980UL,  623643650UL,  877065238UL,  637733176UL,
+    310768951UL,  373563255UL,  624431480UL,  549632694UL,  829235116UL,  620209180UL,  894986538UL,  582259986UL,
+    1023583569UL, 81200516UL,   354687045UL,  887021738UL,  848308322UL,  616334523UL,  8109543UL,    855043518UL,
+    686578633UL,  1045664641UL, 1041125034UL, 5217125UL,    638249344UL,  135839310UL,  458277110UL,  755459831UL,
+    725357863UL,  781161738UL,  673344313UL,  158185353UL,  56609384UL,   995517499UL,  253133351UL,  983724926UL,
+    895863803UL,  138730811UL,  251364618UL,  657731417UL,  994018086UL,  800690609UL,  626934356UL,  1056096815UL,
+    367929504UL,  345293109UL,  1027844996UL, 1031816446UL, 868141846UL,  722984050UL,  405494496UL,  276823628UL,
+    227126769UL,  236071056UL,  183082046UL,  157469549UL,  640253836UL,  634811010UL,  603172902UL,  565102840UL,
+    1005654232UL, 753290910UL,  126212116UL,  460380185UL,  670150654UL,  836467780UL,  593291107UL,  1029870381UL,
+    667531189UL,  766531071UL,  243111428UL,  207836705UL,  640215900UL,  1028703026UL, 138126574UL,  137093107UL,
+    590821412UL,  647080672UL,  829780672UL,  335895807UL,  589281573UL,  651890480UL,  623079776UL,  76542723UL,
+    386014504UL,  611897672UL,  59137511UL,   343311101UL,  494633248UL,  186210217UL,  541700116UL,  20968450UL,
+    734087438UL,  311382436UL,  175670085UL,  180825446UL,  135215026UL,  135772856UL,  590340626UL,  153131082UL,
+    43259689UL,   1067362287UL, 972318370UL,  1014539813UL, 466120151UL,  769306479UL,  765941245UL,  384346019UL,
+    519329581UL,  221906915UL,  182400910UL,  21712183UL,   684736878UL,  485636620UL,  277565501UL,  60356684UL,
+    592139048UL,  1014161644UL, 426246368UL,  727461695UL,  542504177UL,  985625192UL,  652000222UL,  537139092UL,
+    248655865UL,  8231578UL,    802320889UL,  806032267UL,  172013591UL,  883186178UL,  495725083UL,  55454116UL,
+    801513742UL,  581819739UL,  19856088UL,   569249431UL,  386363010UL,  983864773UL,  717153240UL,  21883129UL,
+    1031884246UL, 734838062UL,  1016217015UL, 197483886UL,  281549552UL,  649399699UL,  692979813UL,  985284236UL,
+    294165022UL,  93319676UL,   283730818UL,  87713563UL,   647161718UL,  872885855UL,  940055078UL,  1058372985UL,
+    874720832UL,  1057447535UL, 758647045UL,  837043053UL,  653634114UL,  842905745UL,  823811563UL,  1040586204UL,
+    433882202UL,  582177645UL,  952053448UL,  609590920UL,  38619898UL,   339806476UL,  674735295UL,  130688428UL,
+    1067896206UL, 478312917UL,  295339518UL,  167850255UL,  924472643UL,  980679181UL,  248693912UL,  754636229UL,
+    277817053UL,  759554083UL,  640819962UL,  437837809UL,  831051111UL,  541323601UL,  126512623UL,  827798489UL,
+    53944506UL,   565330734UL,  957144194UL,  39449860UL,   132439111UL,  595694662UL,  327261303UL,  998530485UL,
+    570514105UL,  235184649UL,  46455727UL,   368224766UL,  222240601UL,  1023509021UL, 948857863UL,  493229011UL,
+    249766744UL,  559691263UL,  647975756UL,  120158316UL,  545953115UL,  151007459UL,  293639927UL,  476961503UL,
+    76618947UL,   737111805UL,  486966915UL,  874675752UL,  66372011UL,   294935735UL,  790651962UL,  332595416UL,
+    923387500UL,  657730849UL,  859440225UL,  163209126UL,  138491251UL,  500053286UL,  279855670UL,  969689196UL,
+    378219358UL,  726696354UL,  51869853UL,   514775271UL,  986606374UL,  917054795UL,  962602798UL,  874825691UL,
+    304941307UL,  673598034UL,  164393359UL,  905757370UL,  84345226UL,   584939534UL,  603533799UL,  450695100UL,
+    769380791UL,  526506409UL,  888673388UL,  183472975UL,  426243335UL,  925589177UL,  825674161UL,  3378279UL,
+    564133298UL,  951977857UL,  249448652UL,  692156442UL,  480264937UL,  157950454UL,  874917188UL,  446416485UL,
+    943720802UL,  623947882UL,  854952752UL,  78768387UL,   797174281UL,  776739579UL,  661277803UL,  136317060UL,
+    903285036UL,  883100668UL,  826515764UL,  450854770UL,  761376532UL,  660168452UL,  311954954UL,  648390859UL,
+    217977057UL,  740828766UL,  120340988UL,  514612686UL,  789746077UL,  870737215UL,  870903519UL,  824134840UL,
+    758142510UL,  868865239UL,  1030508039UL, 1024508411UL, 314515391UL,  1023733307UL, 34761971UL,   66610115UL,
+    709728557UL,  925254226UL,  374443817UL,  373467815UL,  566032516UL,  955770323UL,  1016034985UL, 436601228UL,
+    702460727UL,  235201955UL,  637824157UL,  972402511UL,  537521019UL,  767144001UL,  890855486UL,  943018185UL,
+    790239448UL,  579125379UL,  516683133UL,  794481625UL,  533314845UL,  690040042UL,  145815983UL,  104290604UL,
+    514031716UL,  1070730460UL, 23597029UL,   737612467UL,  95809135UL,   1056061218UL, 668333713UL,  303655377UL,
+    155035199UL,  884428451UL,  933181319UL,  40643271UL,   12156137UL,   405020374UL,  482736881UL,  883718975UL,
+    549222603UL,  526860717UL,  175710990UL,  846924255UL,  870575428UL,  298598280UL,  831041165UL,  912695600UL,
+    394425434UL,  409399044UL,  1031888191UL, 1002539736UL, 812703222UL,  390832485UL,  1007417469UL, 738985324UL,
+    164640149UL,  291731629UL,  937711012UL,  931800619UL,  93759191UL,   174796881UL,  819019691UL,  798005058UL,
+    517638979UL,  396233389UL,  681378285UL,  146780522UL,  740594644UL,  211914875UL,  74003309UL,   598168630UL,
+    364642369UL,  29005289UL,   809405332UL,  198290116UL,  142556123UL,  981984758UL,  914760481UL,  199974006UL,
+    826614320UL,  1037917051UL, 778379728UL,  859160003UL,  1016199436UL, 83699787UL,   1040289900UL, 521422285UL,
+    148982121UL,  817608706UL,  165494927UL,  408872099UL,  938571881UL,  972427549UL,  454891168UL,  608916348UL,
+    231628188UL,  555556488UL,  232524166UL,  122702895UL,  260195404UL,  363945666UL,  523028677UL,  779210332UL,
+    748029850UL,  834325086UL,  633663958UL,  1004052029UL, 424585577UL,  901686526UL,  933870241UL,  500257747UL,
+    592698685UL,  762904852UL,  946454561UL,  72443127UL,   193458894UL,  389198965UL,  851348553UL,  128589609UL,
+    1046993651UL, 663312439UL,  46112796UL,   457525243UL,  967841660UL,  928635642UL,  649132529UL,  890795050UL,
+    131712788UL,  889145894UL,  403454934UL,  949910060UL,  174664643UL,  560308502UL,  551689020UL,  450166649UL,
+    606788437UL,  181890390UL,  15201730UL,   463918956UL,  124651997UL,  698975269UL,  869784932UL,  96186794UL,
+    17469089UL,   253830213UL,  229370772UL,  864147824UL,  155718854UL,  910970490UL,  1003341049UL, 982766750UL,
+    319827694UL,  14507793UL,   664433977UL,  616732236UL,  889061312UL,  784704412UL,  933063107UL,  569815294UL,
+    623420262UL,  169764504UL,  345722131UL,  727566248UL,  74086045UL,   909341134UL,  841649135UL,  674550844UL,
+    1020052387UL, 997144734UL,  222196441UL,  708354832UL,  1061932869UL, 63968561UL,   1064369147UL, 938280911UL,
+    562581221UL,  969087391UL,  76015393UL,   207191576UL,  145659324UL,  698518711UL,  21168621UL,   867866221UL,
+    226584799UL,  683252791UL,  83726360UL,   896063035UL,  177479255UL,  989675765UL,  618323947UL,  791437513UL,
+    999889993UL,  1027106996UL, 340088796UL,  509835334UL,  636562329UL,  1046147410UL, 73779620UL,   47287996UL,
+    1022858775UL, 610423466UL,  836899081UL,  827624575UL,  1024591810UL, 424931028UL,  679003992UL,  356097572UL,
+};
+uint32_t rand_arr_31_b31_w32_arr[1024] = {
+    1684946915UL, 176666303UL,  243658860UL,  325052151UL,  853118448UL,  66803180UL,   1411787356UL, 734466805UL,
+    126980350UL,  1002192848UL, 638391211UL,  1835781798UL, 295168083UL,  817712660UL,  889725607UL,  1613811058UL,
+    1465042349UL, 1574833729UL, 787624608UL,  1502946551UL, 1296661079UL, 151688713UL,  48677795UL,   1236956610UL,
+    1263993671UL, 1418497101UL, 751687780UL,  103184683UL,  1279852223UL, 374482301UL,  1983529063UL, 1962696070UL,
+    616864721UL,  1830346575UL, 23198888UL,   1401033893UL, 679410638UL,  601827096UL,  949170282UL,  1382667103UL,
+    316372496UL,  1887146321UL, 957825403UL,  1884204852UL, 482518352UL,  414521397UL,  1339546182UL, 2136253449UL,
+    1868567573UL, 248325249UL,  1879564873UL, 562885844UL,  1180294991UL, 1332720640UL, 268363249UL,  297371644UL,
+    2030080065UL, 1068632987UL, 306737616UL,  861826139UL,  1232580724UL, 959461685UL,  1331982049UL, 285683160UL,
+    476232396UL,  361874979UL,  331568664UL,  590828501UL,  1653463724UL, 1998855758UL, 1930057952UL, 1271460783UL,
+    1694999115UL, 1773109976UL, 1178796820UL, 339458433UL,  1062667532UL, 1164077984UL, 1491011428UL, 1987159177UL,
+    1196509667UL, 1659446858UL, 340122272UL,  1948376767UL, 284514145UL,  1474746697UL, 475759352UL,  1310779814UL,
+    252393778UL,  911466088UL,  1819649611UL, 1842436377UL, 1184464619UL, 1130046430UL, 1617343940UL, 792597855UL,
+    2062121628UL, 2001044767UL, 97036638UL,   1243164266UL, 254825135UL,  95456108UL,   828204905UL,  1667676920UL,
+    1034468037UL, 16485240UL,   1133958601UL, 1710788898UL, 1572425389UL, 415919171UL,  306430057UL,  1861239936UL,
+    1603772755UL, 709452588UL,  1027278352UL, 853572664UL,  744268706UL,  1125012413UL, 138126527UL,  2145350648UL,
+    340520821UL,  1806218262UL, 476638008UL,  1130466369UL, 1448949822UL, 1316381585UL, 1560689167UL, 805655399UL,
+    896109594UL,  1641641110UL, 1851058360UL, 1025177447UL, 1525235662UL, 1350947371UL, 602193984UL,  1991718278UL,
+    237698249UL,  1645279425UL, 838091398UL,  918925699UL,  2124927282UL, 59226746UL,   262205471UL,  309573035UL,
+    1896264941UL, 70835897UL,   1485587917UL, 353276480UL,  1760683746UL, 1780066094UL, 893223631UL,  1666675182UL,
+    1951579942UL, 1724044905UL, 29126245UL,   935204747UL,  1687711978UL, 522280349UL,  1765151764UL, 757282659UL,
+    1564655845UL, 860150243UL,  586434017UL,  833999430UL,  1093692297UL, 285744700UL,  1360914965UL, 1456388481UL,
+    2002798203UL, 1057625347UL, 1219294176UL, 577714135UL,  2014643866UL, 1107792373UL, 1745319531UL, 665573926UL,
+    1181325841UL, 349420381UL,  16593645UL,   1691035917UL, 346220228UL,  445977140UL,  1337080772UL, 2099178509UL,
+    188103950UL,  256476360UL,  2110971730UL, 685428688UL,  707071601UL,  432472201UL,  84581934UL,   629769387UL,
+    725481239UL,  1235159446UL, 1176886392UL, 306663720UL,  438884211UL,  1364549332UL, 1665504149UL, 898456801UL,
+    1371562511UL, 229313160UL,  1546235734UL, 56186442UL,   1095287331UL, 1999878478UL, 1327449893UL, 1769072116UL,
+    1560515366UL, 1704586139UL, 96242223UL,   515803887UL,  1247493836UL, 419095398UL,  1349014479UL, 603966031UL,
+    2020855446UL, 572022338UL,  597457733UL,  674201546UL,  1227981653UL, 234198371UL,  2024763032UL, 609809234UL,
+    23162224UL,   251442569UL,  679539659UL,  269343518UL,  1778488871UL, 390219885UL,  1651426306UL, 1745736629UL,
+    1860164744UL, 291083002UL,  1786582938UL, 340063057UL,  1879700105UL, 1173039881UL, 1127426458UL, 1114479383UL,
+    504267368UL,  2134822932UL, 1143156506UL, 1956837745UL, 791684633UL,  212129400UL,  1393730403UL, 2139221000UL,
+    202936072UL,  1441321065UL, 1418787297UL, 1852056172UL, 538792578UL,  157405207UL,  500104262UL,  1484877810UL,
+    507781228UL,  1581328458UL, 764246838UL,  1528407361UL, 558588279UL,  993108363UL,  1122502321UL, 1516290047UL,
+    1755563646UL, 20481107UL,   1869408165UL, 542486229UL,  1284734673UL, 510021841UL,  1580432909UL, 1052577267UL,
+    789025026UL,  1932766919UL, 1520081771UL, 1369955702UL, 1330981273UL, 930488149UL,  814910479UL,  541315553UL,
+    324203713UL,  2076401205UL, 584506899UL,  1226215771UL, 934959459UL,  1459838251UL, 163331290UL,  276603579UL,
+    1821038362UL, 1173749346UL, 226150686UL,  1093817201UL, 1563765970UL, 2064771838UL, 757899401UL,  1480023649UL,
+    1476776371UL, 249354797UL,  1266308844UL, 1767412221UL, 678364253UL,  1459961037UL, 1054555110UL, 1918306994UL,
+    1389280633UL, 216929086UL,  901765005UL,  651227266UL,  1645315441UL, 1757337047UL, 558756392UL,  1741213785UL,
+    909529275UL,  1089539447UL, 877584459UL,  1796338666UL, 1288414412UL, 1777946555UL, 1775382893UL, 565029343UL,
+    1812696450UL, 1971490631UL, 372287486UL,  733450809UL,  998684146UL,  1474940338UL, 596958791UL,  1481852855UL,
+    932140175UL,  1412987583UL, 2114075811UL, 744132384UL,  347985614UL,  1645358875UL, 1734307407UL, 88012779UL,
+    1880297523UL, 1138921223UL, 1680924628UL, 596460899UL,  1769804627UL, 913418740UL,  1019909978UL, 934181040UL,
+    1460683798UL, 567855724UL,  470218786UL,  229338798UL,  1794456756UL, 1429134096UL, 48610843UL,   1738464822UL,
+    1953614918UL, 870752011UL,  1917364952UL, 1219591289UL, 1274345052UL, 211219230UL,  638646675UL,  773061668UL,
+    1399112844UL, 1015777831UL, 2106087948UL, 1496141123UL, 841677362UL,  1371144044UL, 1091580381UL, 2035437642UL,
+    766760710UL,  1956590886UL, 824310895UL,  2006087798UL, 111293868UL,  581392959UL,  2097302124UL, 570114385UL,
+    679367325UL,  884056051UL,  1906783096UL, 972179727UL,  101186484UL,  971709561UL,  722377216UL,  1585626010UL,
+    1754534354UL, 1921258485UL, 278903847UL,  927098280UL,  1276441471UL, 193626799UL,  523945225UL,  1800395183UL,
+    233299831UL,  553616365UL,  444817911UL,  735564570UL,  1352501936UL, 1640239090UL, 567102713UL,  480499949UL,
+    473792815UL,  547204762UL,  210778125UL,  1502784374UL, 1860441234UL, 129612125UL,  1604920174UL, 1678153615UL,
+    298168545UL,  2074059543UL, 1668918219UL, 983573051UL,  1355457782UL, 745151883UL,  820436678UL,  77563937UL,
+    1385118628UL, 286039858UL,  130237194UL,  235028663UL,  2043425416UL, 1421256022UL, 720031470UL,  1721909553UL,
+    1803508920UL, 2100299216UL, 358029089UL,  209988263UL,  1939526159UL, 431550282UL,  761733377UL,  1446649893UL,
+    2034463376UL, 1967454040UL, 1642965372UL, 1231708311UL, 1697766675UL, 1446125323UL, 140164595UL,  1542746575UL,
+    2061108770UL, 660088248UL,  575217352UL,  271793092UL,  35359288UL,   2046996582UL, 34120436UL,   1755573637UL,
+    1839346236UL, 1834912608UL, 344714074UL,  367961716UL,  288489185UL,  922407761UL,  391203897UL,  297519661UL,
+    1750716699UL, 1540098398UL, 530153845UL,  855516188UL,  1940917627UL, 2096757968UL, 256040409UL,  1661438928UL,
+    1056657936UL, 494282864UL,  205517327UL,  1791852662UL, 1194586112UL, 1370003715UL, 1490202694UL, 2102240647UL,
+    1315770591UL, 2128109196UL, 1018323949UL, 1877686310UL, 120996597UL,  553540005UL,  1862708434UL, 1302167447UL,
+    31127859UL,   1972712222UL, 1057398034UL, 2003695359UL, 1308690409UL, 2085087219UL, 1655724404UL, 1208914636UL,
+    961390131UL,  1087436704UL, 775874652UL,  877320864UL,  727768882UL,  1108617375UL, 956177111UL,  1605901886UL,
+    1077478762UL, 1476926446UL, 1922909009UL, 1689815841UL, 958617075UL,  212852174UL,  980515599UL,  1759127887UL,
+    1885886074UL, 859379961UL,  2130325147UL, 494330617UL,  600804786UL,  499026410UL,  1525348746UL, 1359237735UL,
+    127007040UL,  886411665UL,  2029499238UL, 919628452UL,  28854331UL,   892935776UL,  200036066UL,  207507571UL,
+    288998624UL,  272707345UL,  1835568601UL, 1744989832UL, 306113572UL,  1141266136UL, 2126467683UL, 800038464UL,
+    1171854960UL, 1898479653UL, 1695209990UL, 1195652112UL, 182437135UL,  1756924142UL, 528175243UL,  1328447348UL,
+    1843320094UL, 1690229607UL, 1255826579UL, 765876015UL,  563363558UL,  1152168590UL, 1025601UL,    2061717444UL,
+    1833205641UL, 1774233148UL, 413615926UL,  1473382125UL, 1574072408UL, 508667074UL,  947299233UL,  13621151UL,
+    700128613UL,  1767665514UL, 1234899072UL, 910384826UL,  712924545UL,  357733826UL,  959645854UL,  200208249UL,
+    862941421UL,  1402381795UL, 638638196UL,  180091113UL,  1813214162UL, 2048132663UL, 1856273143UL, 1598828693UL,
+    439615458UL,  163993702UL,  1671416960UL, 520227341UL,  515379709UL,  343045855UL,  1269651276UL, 53910184UL,
+    1321852306UL, 918577639UL,  1660573268UL, 2064394942UL, 1854029492UL, 1155426396UL, 1266428507UL, 188471419UL,
+    1860857700UL, 1840808143UL, 751913834UL,  891973004UL,  306109483UL,  360307731UL,  635126346UL,  327755325UL,
+    1703586251UL, 1710149250UL, 1837024427UL, 712694659UL,  156373610UL,  1037332495UL, 56721225UL,   1672010193UL,
+    1282890578UL, 1353532064UL, 520478603UL,  1535011503UL, 1892279463UL, 1590803004UL, 1388521697UL, 352414586UL,
+    229744753UL,  1733062013UL, 947053528UL,  1583520810UL, 455464217UL,  1632574602UL, 435348556UL,  481520252UL,
+    867841429UL,  597919034UL,  1721807930UL, 1094462563UL, 724796218UL,  773166068UL,  1551141692UL, 1035054247UL,
+    1766080893UL, 308386708UL,  127393204UL,  797298114UL,  1278319544UL, 1228812972UL, 930530889UL,  1982703145UL,
+    968962236UL,  313150910UL,  1651528396UL, 161228488UL,  1375497462UL, 865036992UL,  1790693922UL, 1317420989UL,
+    1589803966UL, 546951287UL,  1767527453UL, 1607961648UL, 447890744UL,  1382636489UL, 1644411109UL, 145162907UL,
+    1243717216UL, 1919464837UL, 354801953UL,  237761503UL,  1055184309UL, 510464379UL,  163591626UL,  1739541718UL,
+    298608567UL,  387644870UL,  1753797967UL, 1406678174UL, 1901648744UL, 997700221UL,  762855573UL,  906287117UL,
+    1325161855UL, 839551576UL,  1224451167UL, 270747020UL,  1518750023UL, 1381124823UL, 990071494UL,  1491026827UL,
+    866495798UL,  1028145152UL, 1666198978UL, 170262645UL,  1905457502UL, 749530704UL,  108713976UL,  236760650UL,
+    1372700356UL, 632492068UL,  793587026UL,  168035064UL,  313062495UL,  133385591UL,  1607759710UL, 1012899534UL,
+    597061836UL,  35825489UL,   1124188921UL, 839436399UL,  2135720413UL, 1290052108UL, 1925188159UL, 2054856791UL,
+    1641029408UL, 1840452700UL, 1102829117UL, 1417105361UL, 1206866877UL, 506029826UL,  1332740718UL, 992843692UL,
+    1947469123UL, 1993963805UL, 141898710UL,  1361946660UL, 1580501290UL, 1370948685UL, 766311804UL,  995666919UL,
+    40617250UL,   2077689126UL, 1279771148UL, 409340831UL,  246906813UL,  1542815524UL, 1637293595UL, 843965418UL,
+    1529068177UL, 1452302170UL, 1973439050UL, 953045410UL,  1943553082UL, 430872582UL,  233579136UL,  1771785038UL,
+    1871613989UL, 40302392UL,   1694204653UL, 198726839UL,  1509178130UL, 733276148UL,  1012806057UL, 918679306UL,
+    926980362UL,  1483608448UL, 52671498UL,   1901356655UL, 1515201839UL, 1770473506UL, 529514128UL,  1827745466UL,
+    1085636825UL, 1023821645UL, 806205291UL,  414683315UL,  1278208168UL, 1027336046UL, 1729011964UL, 147995199UL,
+    1630261731UL, 556480080UL,  2023686108UL, 1530553334UL, 464907047UL,  684102473UL,  650041375UL,  585314369UL,
+    73341201UL,   778121172UL,  2139697636UL, 184192346UL,  114574068UL,  917421749UL,  133896643UL,  996064390UL,
+    2122415902UL, 919283879UL,  390959054UL,  422625503UL,  1498199151UL, 129981876UL,  1438959281UL, 1786224680UL,
+    1411377149UL, 108061473UL,  1716199816UL, 192742162UL,  2137205003UL, 2134056038UL, 1039839749UL, 359007741UL,
+    242050356UL,  1997390107UL, 1166042035UL, 777222656UL,  310392377UL,  727691406UL,  1859988297UL, 748418753UL,
+    1512121146UL, 282371918UL,  518211580UL,  1838690819UL, 1150388137UL, 261620144UL,  618538392UL,  413244890UL,
+    510439724UL,  898065711UL,  1182427040UL, 1155362405UL, 807412979UL,  564218038UL,  2119051887UL, 925145052UL,
+    519229822UL,  247867682UL,  480050533UL,  1366046301UL, 470032860UL,  113972314UL,  1616279283UL, 996871858UL,
+    386238352UL,  778793146UL,  915816770UL,  1521545401UL, 1026842036UL, 1800403489UL, 447423858UL,  1100064165UL,
+    1289572412UL, 535054530UL,  1796783127UL, 984668479UL,  998058645UL,  1015037727UL, 1650346935UL, 1842496470UL,
+    1102090366UL, 1912698225UL, 1695337341UL, 1418480663UL, 1631197292UL, 1042292546UL, 1275248462UL, 415665924UL,
+    743314600UL,  1394894760UL, 2009983523UL, 1712260240UL, 1969039246UL, 1745072939UL, 1272686940UL, 224146997UL,
+    1981917414UL, 438193331UL,  1854913282UL, 1797865301UL, 1283429549UL, 1976595373UL, 240618989UL,  1642207031UL,
+    1335769334UL, 393130505UL,  1187593140UL, 74547441UL,   1709746963UL, 1392421952UL, 828432145UL,  549750925UL,
+    57957751UL,   753129533UL,  566209211UL,  1277212424UL, 1596808150UL, 980591588UL,  1277075858UL, 1588691103UL,
+    241913860UL,  2037053469UL, 1499909333UL, 865925001UL,  1932971573UL, 285125005UL,  1264844865UL, 1190520335UL,
+    467518462UL,  434802058UL,  730389592UL,  1364603335UL, 1555579625UL, 1717526335UL, 291301452UL,  1049319571UL,
+    243990984UL,  818001764UL,  1253629638UL, 508871663UL,  75060420UL,   907165509UL,  782490882UL,  1945448090UL,
+    862643192UL,  1907807817UL, 1161153562UL, 1909531309UL, 1161732093UL, 1677467064UL, 1067464456UL, 923243697UL,
+    153970482UL,  109839414UL,  1267824678UL, 364881881UL,  764667453UL,  1744697056UL, 678464435UL,  2092264800UL,
+    91634841UL,   94826086UL,   1874546509UL, 1151973544UL, 1971183163UL, 1354815980UL, 1499294860UL, 2026907079UL,
+    530087070UL,  120537945UL,  1706537525UL, 1249241294UL, 314337181UL,  1153737678UL, 1911992262UL, 521319367UL,
+    896693686UL,  1197954458UL, 1121548746UL, 526511396UL,  344270934UL,  1207609708UL, 699957904UL,  1409143495UL,
+    1463552426UL, 931961688UL,  1074711333UL, 731795620UL,  67988182UL,   1359009425UL, 477423201UL,  1371040154UL,
+    51685583UL,   1168104096UL, 315495328UL,  189298626UL,  107564996UL,  401834287UL,  2035596113UL, 1667098253UL,
+    407664994UL,  1661098734UL, 1165517897UL, 738175501UL,  265050493UL,  134419519UL,  416940837UL,  408406789UL,
+    112326446UL,  2126603330UL, 1251581048UL, 171180273UL,  210361517UL,  1251154678UL, 418069193UL,  295649553UL,
+    1144675908UL, 1727708472UL, 749621995UL,  1965300985UL, 1974645933UL, 485830912UL,  1309812790UL, 1918736665UL,
+    185853209UL,  1505322740UL, 1503684525UL, 828940259UL,  329070244UL,  640554171UL,  153177604UL,  1454343179UL,
+    690600788UL,  896623835UL,  1630510870UL, 1330994831UL, 636045335UL,  1516827796UL, 713805421UL,  1883913158UL,
+    424598409UL,  1604284201UL, 1723979532UL, 68845886UL,   646433755UL,  1784127550UL, 1169680777UL, 334182951UL,
+    1226650035UL, 243155863UL,  1290337362UL, 556265837UL,  342752839UL,  1820117892UL, 343684843UL,  198862778UL,
+    1345322639UL, 2038005541UL, 336250174UL,  1835631464UL, 1186821557UL, 976596477UL,  1421249319UL, 2040807543UL,
+    1258084751UL, 2101015135UL, 1108519796UL, 2051123725UL, 148693225UL,  1557809047UL, 191569593UL,  403546394UL,
+};
+uint32_t rand_arr_32_b32_w32_arr[1024] = {
+    3152282913UL, 1879021569UL, 1892859779UL, 3019222576UL, 1996093785UL, 1095269794UL, 2393585344UL, 1276091941UL,
+    1584878544UL, 1411769637UL, 4055526742UL, 1092570438UL, 3416537566UL, 1804222745UL, 1480666899UL, 96712452UL,
+    4034748790UL, 6633319UL,    851269320UL,  1051220594UL, 3674557819UL, 12867050UL,   1379408372UL, 2690330443UL,
+    1781815485UL, 4104745135UL, 2125864104UL, 1827787338UL, 2907728967UL, 787628102UL,  2164584269UL, 3618931889UL,
+    506497402UL,  781729057UL,  302864002UL,  3625454260UL, 1055277589UL, 3752765947UL, 1071629229UL, 706850640UL,
+    2846943135UL, 2642194833UL, 3884426506UL, 2953495754UL, 3077117183UL, 405645204UL,  2202778072UL, 3988765313UL,
+    127348493UL,  1933397226UL, 2333855436UL, 480174999UL,  1736509094UL, 1225112359UL, 2234164726UL, 264582672UL,
+    683493880UL,  3731501361UL, 2801572683UL, 2480212609UL, 3919692578UL, 807491525UL,  2720313539UL, 3297074525UL,
+    3084216230UL, 466715695UL,  2820867197UL, 1935614542UL, 2164858849UL, 361684920UL,  1320986938UL, 218123575UL,
+    2182096463UL, 2463460766UL, 1646557490UL, 4267381466UL, 756480785UL,  4212121601UL, 3215538382UL, 1844397839UL,
+    2734724439UL, 2854232483UL, 1931880648UL, 2856616043UL, 2425122664UL, 2872729451UL, 449284251UL,  1912797735UL,
+    978062358UL,  2202634175UL, 2447430113UL, 360284449UL,  1157998587UL, 1999515835UL, 3340884204UL, 921064992UL,
+    135405162UL,  600733273UL,  3691504641UL, 2458134935UL, 3456641519UL, 560364525UL,  235164042UL,  1879407364UL,
+    3632645136UL, 1174036826UL, 583301885UL,  2648751427UL, 2674071551UL, 3212703842UL, 1322243248UL, 721844438UL,
+    2771644063UL, 2255386454UL, 1486405101UL, 2316057054UL, 2473470371UL, 33352096UL,   2843922249UL, 3203459474UL,
+    429867863UL,  2618749113UL, 1561036133UL, 2095350186UL, 1116294103UL, 2832302081UL, 2969890245UL, 2662675079UL,
+    1179191499UL, 677403490UL,  3725269701UL, 4027220774UL, 1246128343UL, 2945209223UL, 479912928UL,  2752347148UL,
+    710909435UL,  1917763203UL, 2110864138UL, 883384992UL,  2121405574UL, 1186537632UL, 2907435688UL, 2027369551UL,
+    66002197UL,   1924379274UL, 1130430880UL, 3193625063UL, 3897619076UL, 2390427916UL, 2011255861UL, 3240184695UL,
+    2429903983UL, 1024779234UL, 2002626377UL, 1103213708UL, 3177149084UL, 1810319949UL, 3265603783UL, 2873948920UL,
+    1568770840UL, 3676520534UL, 3772611024UL, 32099560UL,   630701875UL,  747788998UL,  2366579196UL, 3911367729UL,
+    310986641UL,  991782863UL,  869079642UL,  3719280236UL, 3771622269UL, 312301065UL,  394353004UL,  3927349095UL,
+    2338917239UL, 2463189114UL, 433653143UL,  2434798510UL, 2390645512UL, 356302337UL,  3642405697UL, 2965860648UL,
+    2838695493UL, 790552129UL,  385077810UL,  2396134500UL, 2294348282UL, 3618085223UL, 3906391966UL, 595827482UL,
+    1308535123UL, 3857498713UL, 740117627UL,  1003141241UL, 1206231583UL, 655401900UL,  1944117363UL, 4247397241UL,
+    3345130125UL, 3026777296UL, 3937542951UL, 453243196UL,  153403032UL,  1207491583UL, 216113675UL,  3769516367UL,
+    3483013951UL, 2636060706UL, 2319511538UL, 1407875618UL, 2705556223UL, 2126988729UL, 1459241162UL, 388643853UL,
+    233600000UL,  3562608792UL, 3471659682UL, 3908348322UL, 4228878489UL, 3374041350UL, 3219095687UL, 2144965991UL,
+    4149413787UL, 4045145432UL, 275103177UL,  2966028823UL, 430494967UL,  3474179758UL, 2682165969UL, 1561943140UL,
+    3109410808UL, 3527718562UL, 3344636054UL, 459320277UL,  2034350495UL, 2436508851UL, 1520389114UL, 1078547051UL,
+    693924916UL,  2772728729UL, 3874772172UL, 418869376UL,  2546947102UL, 3695631827UL, 3038377061UL, 2021834034UL,
+    1332099028UL, 4195618526UL, 1891156902UL, 1902859567UL, 870445780UL,  2188994913UL, 414746341UL,  2676769652UL,
+    3872950860UL, 902969749UL,  2967428170UL, 3814673503UL, 2347896274UL, 1638371626UL, 2024105566UL, 2434295658UL,
+    1729403142UL, 1742631692UL, 2326963888UL, 2184037278UL, 2507525200UL, 2746229832UL, 4058777670UL, 1998710962UL,
+    4101962450UL, 3122179034UL, 4079105747UL, 2427266992UL, 977883909UL,  3429505078UL, 1894003140UL, 1950811630UL,
+    2676311750UL, 2804031036UL, 2703946779UL, 1297996021UL, 2482101845UL, 3244144520UL, 4007738844UL, 424383409UL,
+    2189267019UL, 2672198699UL, 3436965671UL, 42398140UL,   376770227UL,  1244904127UL, 198456172UL,  1281685223UL,
+    3770152390UL, 4081827037UL, 2305075179UL, 2272150614UL, 3570896351UL, 3855271218UL, 1082627466UL, 2125202416UL,
+    4130468389UL, 2335793569UL, 347211262UL,  3380035821UL, 3893301576UL, 3539074858UL, 4107403062UL, 2496529097UL,
+    4010024055UL, 1659092486UL, 635459152UL,  1153864365UL, 1392460674UL, 1363619775UL, 2424145552UL, 3059207871UL,
+    3762091976UL, 3643108422UL, 1591053940UL, 2084694779UL, 2273911020UL, 352419634UL,  4240144176UL, 363844992UL,
+    2238903753UL, 3339096473UL, 2550710991UL, 1636378613UL, 2939206257UL, 2501042687UL, 3978030030UL, 731592210UL,
+    2505881841UL, 4116123604UL, 3395758004UL, 2402716197UL, 4029167940UL, 2229676609UL, 820090602UL,  3438027710UL,
+    1079591116UL, 3745217999UL, 499129024UL,  248920559UL,  1342369493UL, 3504338854UL, 3983251522UL, 441901193UL,
+    2044883550UL, 2621484966UL, 3141445588UL, 1424249315UL, 361419040UL,  856325500UL,  3750377910UL, 1183871716UL,
+    3923023636UL, 2334080078UL, 3210367092UL, 71573749UL,   183320309UL,  1828068286UL, 3087928850UL, 3351900767UL,
+    490720925UL,  3310197565UL, 2525266120UL, 3140515863UL, 55315047UL,   1755683025UL, 141214487UL,  1608151260UL,
+    2682283550UL, 238518106UL,  1361575730UL, 2504333848UL, 2109375779UL, 3797931963UL, 4005031861UL, 465922069UL,
+    3690668955UL, 696002850UL,  2166897916UL, 4149600180UL, 2265912525UL, 389733534UL,  1253134937UL, 2728527685UL,
+    1916007381UL, 3992693900UL, 1271049516UL, 3326703077UL, 1562607711UL, 1389604475UL, 3677921330UL, 544622957UL,
+    2709985217UL, 2924210473UL, 1699275490UL, 1160539549UL, 2825035265UL, 3124138393UL, 4250188390UL, 3095904564UL,
+    1755562613UL, 1997575880UL, 2552597855UL, 324102784UL,  1642354974UL, 2770932779UL, 3120899385UL, 254628496UL,
+    4200913271UL, 2576620448UL, 4215882685UL, 3961084506UL, 590498396UL,  4187866058UL, 3979783566UL, 2346571674UL,
+    454298399UL,  2073177150UL, 1182164660UL, 1839769909UL, 3811719600UL, 1271154778UL, 4017565834UL, 3825916296UL,
+    598083280UL,  2931836349UL, 3003782290UL, 3015294293UL, 218778093UL,  4192374621UL, 1164766774UL, 333265194UL,
+    2167196843UL, 2293622921UL, 2079724521UL, 2488083569UL, 2710026564UL, 4037260187UL, 402941173UL,  2627283090UL,
+    3886704023UL, 1786791409UL, 245371951UL,  4246744764UL, 1493555214UL, 3154366881UL, 4006144529UL, 4226366207UL,
+    2596395073UL, 2953400107UL, 1209227586UL, 2396085488UL, 1893686139UL, 3160928656UL, 2854727217UL, 3080550283UL,
+    463866072UL,  1963950019UL, 3086237325UL, 580335195UL,  3238056880UL, 380801736UL,  789186245UL,  554780535UL,
+    2331564786UL, 1825343957UL, 2281926460UL, 3855304334UL, 4202565503UL, 1103671586UL, 682236132UL,  1774847383UL,
+    4071327397UL, 1753793562UL, 4208531187UL, 3903088861UL, 993724615UL,  4108618911UL, 564258328UL,  1591488435UL,
+    1490431074UL, 1058254747UL, 341032770UL,  3027058763UL, 1111227773UL, 2822363313UL, 1555432150UL, 1747193126UL,
+    1957764837UL, 2224370822UL, 2837351780UL, 2982463751UL, 3750737464UL, 2217055995UL, 2952965803UL, 1757036032UL,
+    4004806100UL, 2808339118UL, 4093512447UL, 173496162UL,  3291014241UL, 1980183999UL, 3601030606UL, 2436519530UL,
+    3171137305UL, 1907433259UL, 2179237969UL, 2182583868UL, 4186979394UL, 64233206UL,   2761969423UL, 267481066UL,
+    2024644042UL, 4291791154UL, 3704619289UL, 4196575505UL, 3573176846UL, 1540840764UL, 2058886121UL, 707218048UL,
+    3739843440UL, 2001016911UL, 1624139677UL, 315632355UL,  525164647UL,  1446502561UL, 3215159913UL, 2245456786UL,
+    3164994253UL, 4043191323UL, 3634282327UL, 3436787605UL, 4213499759UL, 2551664984UL, 468407771UL,  1207364554UL,
+    605665519UL,  2329984740UL, 1647683319UL, 618300911UL,  674602257UL,  1326250575UL, 1453011735UL, 478626270UL,
+    111794687UL,  221758157UL,  2038358607UL, 3742632407UL, 3026015161UL, 42584252UL,   32190388UL,   70395699UL,
+    4123340401UL, 1362437915UL, 336917124UL,  2342907913UL, 1357566624UL, 486048297UL,  3115650810UL, 245021163UL,
+    4185925015UL, 894938695UL,  3143611051UL, 2257303609UL, 3306181112UL, 583235506UL,  3801814843UL, 5276821UL,
+    566482491UL,  1233887295UL, 3857564951UL, 1687045766UL, 316898359UL,  203306777UL,  4057176398UL, 3039192405UL,
+    1595251509UL, 2102106779UL, 3696374798UL, 3286362228UL, 1825912827UL, 1552497862UL, 2469153531UL, 2503171281UL,
+    1669058778UL, 4034506483UL, 2329902370UL, 2687844069UL, 3795880156UL, 490528715UL,  388309263UL,  2749979130UL,
+    3445629153UL, 588498920UL,  979214553UL,  1753643410UL, 2620843649UL, 2960362643UL, 2803278537UL, 4156217454UL,
+    65778477UL,   2995856512UL, 1488596145UL, 611745899UL,  1273080565UL, 1964929010UL, 1456907466UL, 3914074404UL,
+    3933865567UL, 3986397219UL, 606994107UL,  1576687545UL, 1791391851UL, 3259138799UL, 2374692044UL, 1568158956UL,
+    3422106702UL, 1896563692UL, 3474618329UL, 4097653845UL, 3277745629UL, 3453819634UL, 962537691UL,  2413298825UL,
+    135613447UL,  1947597994UL, 2827619319UL, 758037325UL,  2495351383UL, 1908973100UL, 2039112517UL, 205996491UL,
+    701047909UL,  3172511757UL, 2177038990UL, 948451473UL,  3742736740UL, 42267322UL,   27891672UL,   3275513671UL,
+    510511357UL,  3446345746UL, 349849952UL,  2884657263UL, 3935470666UL, 2303764146UL, 1649911652UL, 896890586UL,
+    3303108880UL, 3387319591UL, 1366345231UL, 2984884444UL, 4247179013UL, 1629213371UL, 1797454038UL, 236693081UL,
+    4279104513UL, 1442507247UL, 2303876077UL, 454347454UL,  2095988880UL, 179405757UL,  618423403UL,  838742574UL,
+    1688453760UL, 2112642627UL, 1741004116UL, 2243327610UL, 1391858582UL, 557819359UL,  1635589714UL, 2686849645UL,
+    2822834797UL, 3098254323UL, 1712101308UL, 1660722978UL, 4240274921UL, 1827985815UL, 1192956149UL, 468452631UL,
+    2153053816UL, 1000939560UL, 26525957UL,   450824926UL,  243387844UL,  809144893UL,  1029188435UL, 2640369421UL,
+    101552123UL,  2992921987UL, 1823970232UL, 2311501591UL, 3649665893UL, 836142043UL,  1537464878UL, 2821641641UL,
+    3062030757UL, 1681319114UL, 119644825UL,  3249528577UL, 1403828545UL, 609283780UL,  2675707832UL, 2678322346UL,
+    74521164UL,   718409473UL,  253609020UL,  446635620UL,  412432779UL,  3060793110UL, 2661819254UL, 879108628UL,
+    3702195977UL, 3305999672UL, 1001818000UL, 1504584148UL, 4018371225UL, 3334687263UL, 1314451819UL, 309757271UL,
+    383145500UL,  434534845UL,  2325983762UL, 2111629533UL, 4066360374UL, 2004461727UL, 1015452500UL, 3094170713UL,
+    3609129622UL, 842708100UL,  4281021696UL, 1261621289UL, 2216166254UL, 1343386545UL, 2760450117UL, 3268140044UL,
+    298174949UL,  910908975UL,  4056226798UL, 1693338144UL, 2349316307UL, 2680997205UL, 3576717504UL, 4149706560UL,
+    3660373046UL, 912715709UL,  1126077205UL, 2098822959UL, 3356477414UL, 1456864433UL, 2315784923UL, 3462140892UL,
+    2284541799UL, 781782225UL,  1219027456UL, 4115835325UL, 3136028569UL, 3429564254UL, 561284419UL,  801108087UL,
+    271743115UL,  1002986534UL, 3632237392UL, 3181382528UL, 2126309235UL, 637420195UL,  3120919047UL, 3499410141UL,
+    2706008836UL, 3640474888UL, 2104669917UL, 637284111UL,  4016144130UL, 4025517842UL, 148536561UL,  2855031632UL,
+    2556601728UL, 912985646UL,  1864313464UL, 585880849UL,  1426826028UL, 8289025UL,    1276603344UL, 1367198421UL,
+    83600868UL,   485257713UL,  4247515405UL, 2755415775UL, 2162677429UL, 851189653UL,  1095935934UL, 2895091209UL,
+    23014907UL,   1019227220UL, 2250641970UL, 3087780290UL, 1558805513UL, 3507371557UL, 3891558877UL, 4118618845UL,
+    3234847271UL, 3445824549UL, 3916352742UL, 3685249590UL, 189493468UL,  1853816793UL, 1399041120UL, 1434545592UL,
+    1141706742UL, 3140992579UL, 3213482163UL, 1641574401UL, 2605460547UL, 2910653007UL, 3681944300UL, 960791287UL,
+    2178757087UL, 2852230922UL, 863645289UL,  3882595586UL, 961696113UL,  1146684161UL, 3504203647UL, 3933716285UL,
+    477053286UL,  2303498094UL, 4034269112UL, 428435746UL,  3395852218UL, 3855458591UL, 1687619641UL, 2907952164UL,
+    3486933838UL, 3718533850UL, 2280442826UL, 3920344550UL, 3258182432UL, 2372689747UL, 3809523979UL, 959124798UL,
+    1436278358UL, 1529980099UL, 1645858863UL, 1076918962UL, 3883058247UL, 4263406731UL, 1434201181UL, 1726476581UL,
+    1904957502UL, 1769535602UL, 3301242049UL, 2875929526UL, 2629307126UL, 4199204009UL, 101068027UL,  4174365683UL,
+    2101511274UL, 3496597132UL, 1679702154UL, 772280682UL,  3177586874UL, 2888594852UL, 3928824113UL, 3828849898UL,
+    21302254UL,   3656887819UL, 781281439UL,  3416656793UL, 3056138237UL, 1204686080UL, 692287304UL,  950703545UL,
+    1770408446UL, 1052063010UL, 3698513631UL, 4225441993UL, 1372059470UL, 4284836977UL, 208994093UL,  1407497236UL,
+    84599298UL,   769568359UL,  3435124595UL, 3792417743UL, 3608798725UL, 3988354441UL, 334524090UL,  47692017UL,
+    3197679897UL, 530271095UL,  1304750125UL, 3472360221UL, 1733428346UL, 2123419839UL, 2960321204UL, 2564959497UL,
+    4270596480UL, 709777142UL,  1281192299UL, 967564035UL,  182449160UL,  904055868UL,  4194762340UL, 4271806475UL,
+    58978103UL,   3629066273UL, 412994892UL,  4163185779UL, 390517706UL,  1456299286UL, 1218449726UL, 4020215551UL,
+    2590923119UL, 3635821681UL, 1526284324UL, 321108748UL,  3241881460UL, 2117868815UL, 3183902970UL, 2115774557UL,
+    4154244983UL, 1972973568UL, 322530868UL,  626466869UL,  3231619298UL, 2275849840UL, 2905910106UL, 1674321483UL,
+    1492913968UL, 698238356UL,  2606616923UL, 226746094UL,  2635404238UL, 2012602312UL, 2447069417UL, 4165460693UL,
+    163470373UL,  465608332UL,  1099515923UL, 4163292973UL, 1649201176UL, 4020487763UL, 2577523042UL, 3341863670UL,
+    3823372154UL, 88965256UL,   2806687578UL, 843274919UL,  1455097079UL, 3341925499UL, 1034836923UL, 608009140UL,
+    955119411UL,  690779624UL,  2355391919UL, 2151920742UL, 2311570375UL, 345121614UL,  317856274UL,  4216881235UL,
+    1812855286UL, 3016266244UL, 2422653194UL, 3267641414UL, 1429157431UL, 3292747338UL, 3723478823UL, 2133703562UL,
+    2921625021UL, 3981528725UL, 55137450UL,   3357123098UL, 2022800406UL, 3687543363UL, 2924030555UL, 4030739020UL,
+    2020060496UL, 2429589109UL, 1867469418UL, 404683981UL,  1671285336UL, 1739327680UL, 1496907640UL, 481445478UL,
+    2074549786UL, 2996395525UL, 2069093089UL, 2838801369UL, 3910221989UL, 4293624691UL, 3305815030UL, 254648932UL,
+    3700228637UL, 594599207UL,  3619420603UL, 2535725817UL, 2153878969UL, 3667751210UL, 1152019553UL, 3087374633UL,
+    223597380UL,  1564035666UL, 312914946UL,  4146357590UL, 3168780642UL, 108170521UL,  4219893941UL, 2595492655UL,
+    441961202UL,  4093033359UL, 3088346996UL, 1946731226UL, 3007747772UL, 3006611901UL, 3809277666UL, 2896046715UL,
+};
+} // namespace helper
diff --git a/fastlanes/generated/cuda/fused_t32_uf1/cuda_fused_t32_1024_uf1_unpack_src.cu b/fastlanes/generated/cuda/fused_t32_uf1/cuda_fused_t32_1024_uf1_unpack_src.cu
new file mode 100644
index 0000000..e69de29
diff --git a/fastlanes/generated/cuda/fused_t32_uf1/cuda_fused_t32_1024_uf1_unpack_test.cu b/fastlanes/generated/cuda/fused_t32_uf1/cuda_fused_t32_1024_uf1_unpack_test.cu
new file mode 100644
index 0000000..d48cda2
--- /dev/null
+++ b/fastlanes/generated/cuda/fused_t32_uf1/cuda_fused_t32_1024_uf1_unpack_test.cu
@@ -0,0 +1,366 @@
+// generated!
+#include "cuda_fused_t32_1024_uf1_unpack_helper.hpp"
+#include "fls_gen/pack/pack.hpp"
+#include "fls_gen/unpack/unpack.cuh"
+#include "gtest/gtest.h"
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <fastlanes.cuh>
+// Test fixture for the fused 32-bit, 1024-value unpack tests; owns the host/device buffers the tests share.
+class cuda_fused_t32_1024_uf1_unpack : public ::testing::Test {
+public:
+	uint64_t  warp_sz {};
+	uint64_t  n_vec {};
+	uint64_t  vec_sz {};
+	uint64_t  n_tup {1024};            // number of values in each test vector
+	uint64_t  v_blc_sz {};
+	uint64_t  n_blc {1};               // grid size the tests pass to the kernel launch
+	uint64_t  n_trd {32};              // block size the tests pass to the kernel launch
+	uint32_t* d_decoded_arr {nullptr}; // device output buffer, allocated with cudaMalloc in SetUp()
+	uint32_t* h_decoded_arr {nullptr}; // host buffer the kernel output is copied into
+	uint32_t* packed32 {nullptr};      // host buffer that receives the packed input
+	uint32_t* unpacked32 {nullptr};    // allocated in SetUp() but not read by the tests in this file
+	uint32_t* d_encoded_arr {nullptr}; // assigned by each test from load_arr(); NOTE(review): never released here - confirm ownership
+	void SetUp() override { // gtest calls this before every TEST_F body
+		packed32      = new uint32_t[1024]();
+		unpacked32    = new uint32_t[1024]();
+		h_decoded_arr = new uint32_t[1024]();
+		CUDA_SAFE_CALL(cudaMalloc((void**)&d_decoded_arr, sizeof(uint32_t) * n_tup));
+	}
+	void TearDown() override { // gtest calls this after every TEST_F body; releases what SetUp() acquired
+		delete[] packed32;
+		delete[] unpacked32;
+		delete[] h_decoded_arr;
+		CUDA_SAFE_CALL(cudaFree(d_decoded_arr));
+	}
+};
+TEST_F(cuda_fused_t32_1024_uf1_unpack, test_0_bw_0_ow_32) {
+	// Round trip at bit width 0: scalar pack on the host, unpack_global on the GPU, compare with the input.
+	generated::pack::fallback::scalar::pack(helper::rand_arr_0_b0_w32_arr, packed32, 0);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8); // 4096 = byte size of packed32
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 0);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) { // unsigned index, same type as n_tup
+		ASSERT_EQ(helper::rand_arr_0_b0_w32_arr[i], h_decoded_arr[i]) << "bit width 0: mismatch at tuple " << i;
+	}
+}
+TEST_F(cuda_fused_t32_1024_uf1_unpack, test_1_bw_1_ow_32) {
+	// Round trip at bit width 1: scalar pack on the host, unpack_global on the GPU, compare with the input.
+	generated::pack::fallback::scalar::pack(helper::rand_arr_1_b1_w32_arr, packed32, 1);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8); // 4096 = byte size of packed32
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 1);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) { // unsigned index, same type as n_tup
+		ASSERT_EQ(helper::rand_arr_1_b1_w32_arr[i], h_decoded_arr[i]) << "bit width 1: mismatch at tuple " << i;
+	}
+}
+TEST_F(cuda_fused_t32_1024_uf1_unpack, test_2_bw_2_ow_32) {
+	// Round trip at bit width 2: scalar pack on the host, unpack_global on the GPU, compare with the input.
+	generated::pack::fallback::scalar::pack(helper::rand_arr_2_b2_w32_arr, packed32, 2);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8); // 4096 = byte size of packed32
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 2);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) { // unsigned index, same type as n_tup
+		ASSERT_EQ(helper::rand_arr_2_b2_w32_arr[i], h_decoded_arr[i]) << "bit width 2: mismatch at tuple " << i;
+	}
+}
+TEST_F(cuda_fused_t32_1024_uf1_unpack, test_3_bw_3_ow_32) {
+	// Round trip at bit width 3: scalar pack on the host, unpack_global on the GPU, compare with the input.
+	generated::pack::fallback::scalar::pack(helper::rand_arr_3_b3_w32_arr, packed32, 3);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8); // 4096 = byte size of packed32
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 3);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) { // unsigned index, same type as n_tup
+		ASSERT_EQ(helper::rand_arr_3_b3_w32_arr[i], h_decoded_arr[i]) << "bit width 3: mismatch at tuple " << i;
+	}
+}
+TEST_F(cuda_fused_t32_1024_uf1_unpack, test_4_bw_4_ow_32) {
+	// Round trip at bit width 4: scalar pack on the host, unpack_global on the GPU, compare with the input.
+	generated::pack::fallback::scalar::pack(helper::rand_arr_4_b4_w32_arr, packed32, 4);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8); // 4096 = byte size of packed32
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 4);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) { // unsigned index, same type as n_tup
+		ASSERT_EQ(helper::rand_arr_4_b4_w32_arr[i], h_decoded_arr[i]) << "bit width 4: mismatch at tuple " << i;
+	}
+}
+TEST_F(cuda_fused_t32_1024_uf1_unpack, test_5_bw_5_ow_32) {
+	// Round trip at bit width 5: scalar pack on the host, unpack_global on the GPU, compare with the input.
+	generated::pack::fallback::scalar::pack(helper::rand_arr_5_b5_w32_arr, packed32, 5);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8); // 4096 = byte size of packed32
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 5);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) { // unsigned index, same type as n_tup
+		ASSERT_EQ(helper::rand_arr_5_b5_w32_arr[i], h_decoded_arr[i]) << "bit width 5: mismatch at tuple " << i;
+	}
+}
+TEST_F(cuda_fused_t32_1024_uf1_unpack, test_6_bw_6_ow_32) {
+	// Round trip at bit width 6: scalar pack on the host, unpack_global on the GPU, compare with the input.
+	generated::pack::fallback::scalar::pack(helper::rand_arr_6_b6_w32_arr, packed32, 6);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8); // 4096 = byte size of packed32
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 6);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) { // unsigned index, same type as n_tup
+		ASSERT_EQ(helper::rand_arr_6_b6_w32_arr[i], h_decoded_arr[i]) << "bit width 6: mismatch at tuple " << i;
+	}
+}
+TEST_F(cuda_fused_t32_1024_uf1_unpack, test_7_bw_7_ow_32) {
+	// Round trip at bit width 7: scalar pack on the host, unpack_global on the GPU, compare with the input.
+	generated::pack::fallback::scalar::pack(helper::rand_arr_7_b7_w32_arr, packed32, 7);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8); // 4096 = byte size of packed32
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 7);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) { // unsigned index, same type as n_tup
+		ASSERT_EQ(helper::rand_arr_7_b7_w32_arr[i], h_decoded_arr[i]) << "bit width 7: mismatch at tuple " << i;
+	}
+}
+TEST_F(cuda_fused_t32_1024_uf1_unpack, test_8_bw_8_ow_32) {
+	// Round trip at bit width 8: scalar pack on the host, unpack_global on the GPU, compare with the input.
+	generated::pack::fallback::scalar::pack(helper::rand_arr_8_b8_w32_arr, packed32, 8);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8); // 4096 = byte size of packed32
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 8);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) { // unsigned index, same type as n_tup
+		ASSERT_EQ(helper::rand_arr_8_b8_w32_arr[i], h_decoded_arr[i]) << "bit width 8: mismatch at tuple " << i;
+	}
+}
+TEST_F(cuda_fused_t32_1024_uf1_unpack, test_9_bw_9_ow_32) {
+	// Round trip at bit width 9: scalar pack on the host, unpack_global on the GPU, compare with the input.
+	generated::pack::fallback::scalar::pack(helper::rand_arr_9_b9_w32_arr, packed32, 9);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8); // 4096 = byte size of packed32
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 9);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) { // unsigned index, same type as n_tup
+		ASSERT_EQ(helper::rand_arr_9_b9_w32_arr[i], h_decoded_arr[i]) << "bit width 9: mismatch at tuple " << i;
+	}
+}
+TEST_F(cuda_fused_t32_1024_uf1_unpack, test_10_bw_10_ow_32) {
+	// Round trip at bit width 10: scalar pack on the host, unpack_global on the GPU, compare with the input.
+	generated::pack::fallback::scalar::pack(helper::rand_arr_10_b10_w32_arr, packed32, 10);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8); // 4096 = byte size of packed32
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 10);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) { // unsigned index, same type as n_tup
+		ASSERT_EQ(helper::rand_arr_10_b10_w32_arr[i], h_decoded_arr[i]) << "bit width 10: mismatch at tuple " << i;
+	}
+}
+TEST_F(cuda_fused_t32_1024_uf1_unpack, test_11_bw_11_ow_32) {
+	// Round trip at bit width 11: scalar pack on the host, unpack_global on the GPU, compare with the input.
+	generated::pack::fallback::scalar::pack(helper::rand_arr_11_b11_w32_arr, packed32, 11);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8); // 4096 = byte size of packed32
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 11);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) { // unsigned index, same type as n_tup
+		ASSERT_EQ(helper::rand_arr_11_b11_w32_arr[i], h_decoded_arr[i]) << "bit width 11: mismatch at tuple " << i;
+	}
+}
+TEST_F(cuda_fused_t32_1024_uf1_unpack, test_12_bw_12_ow_32) {
+	// Round trip at bit width 12: scalar pack on the host, unpack_global on the GPU, compare with the input.
+	generated::pack::fallback::scalar::pack(helper::rand_arr_12_b12_w32_arr, packed32, 12);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8); // 4096 = byte size of packed32
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 12);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) { // unsigned index, same type as n_tup
+		ASSERT_EQ(helper::rand_arr_12_b12_w32_arr[i], h_decoded_arr[i]) << "bit width 12: mismatch at tuple " << i;
+	}
+}
+TEST_F(cuda_fused_t32_1024_uf1_unpack, test_13_bw_13_ow_32) {
+	// Round trip at bit width 13: scalar pack on the host, unpack_global on the GPU, compare with the input.
+	generated::pack::fallback::scalar::pack(helper::rand_arr_13_b13_w32_arr, packed32, 13);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8); // 4096 = byte size of packed32
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 13);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) { // unsigned index, same type as n_tup
+		ASSERT_EQ(helper::rand_arr_13_b13_w32_arr[i], h_decoded_arr[i]) << "bit width 13: mismatch at tuple " << i;
+	}
+}
+TEST_F(cuda_fused_t32_1024_uf1_unpack, test_14_bw_14_ow_32) {
+	// Round trip at bit width 14: scalar pack on the host, unpack_global on the GPU, compare with the input.
+	generated::pack::fallback::scalar::pack(helper::rand_arr_14_b14_w32_arr, packed32, 14);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8); // 4096 = byte size of packed32
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 14);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) { // unsigned index, same type as n_tup
+		ASSERT_EQ(helper::rand_arr_14_b14_w32_arr[i], h_decoded_arr[i]) << "bit width 14: mismatch at tuple " << i;
+	}
+}
+TEST_F(cuda_fused_t32_1024_uf1_unpack, test_15_bw_15_ow_32) {
+	// Round trip at bit width 15: scalar pack on the host, unpack_global on the GPU, compare with the input.
+	generated::pack::fallback::scalar::pack(helper::rand_arr_15_b15_w32_arr, packed32, 15);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8); // 4096 = byte size of packed32
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 15);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) { // unsigned index, same type as n_tup
+		ASSERT_EQ(helper::rand_arr_15_b15_w32_arr[i], h_decoded_arr[i]) << "bit width 15: mismatch at tuple " << i;
+	}
+}
+TEST_F(cuda_fused_t32_1024_uf1_unpack, test_16_bw_16_ow_32) {
+	// Round trip at bit width 16: scalar pack on the host, unpack_global on the GPU, compare with the input.
+	generated::pack::fallback::scalar::pack(helper::rand_arr_16_b16_w32_arr, packed32, 16);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8); // 4096 = byte size of packed32
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 16);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) { // unsigned index, same type as n_tup
+		ASSERT_EQ(helper::rand_arr_16_b16_w32_arr[i], h_decoded_arr[i]) << "bit width 16: mismatch at tuple " << i;
+	}
+}
+TEST_F(cuda_fused_t32_1024_uf1_unpack, test_17_bw_17_ow_32) {
+	// Round trip at bit width 17: scalar pack on the host, unpack_global on the GPU, compare with the input.
+	generated::pack::fallback::scalar::pack(helper::rand_arr_17_b17_w32_arr, packed32, 17);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8); // 4096 = byte size of packed32
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 17);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) { // unsigned index, same type as n_tup
+		ASSERT_EQ(helper::rand_arr_17_b17_w32_arr[i], h_decoded_arr[i]) << "bit width 17: mismatch at tuple " << i;
+	}
+}
+TEST_F(cuda_fused_t32_1024_uf1_unpack, test_18_bw_18_ow_32) {
+	// Round trip at bit width 18: scalar pack on the host, unpack_global on the GPU, compare with the input.
+	generated::pack::fallback::scalar::pack(helper::rand_arr_18_b18_w32_arr, packed32, 18);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8); // 4096 = byte size of packed32
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 18);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) { // unsigned index, same type as n_tup
+		ASSERT_EQ(helper::rand_arr_18_b18_w32_arr[i], h_decoded_arr[i]) << "bit width 18: mismatch at tuple " << i;
+	}
+}
+TEST_F(cuda_fused_t32_1024_uf1_unpack, test_19_bw_19_ow_32) {
+	// Round trip at bit width 19: scalar pack on the host, unpack_global on the GPU, compare with the input.
+	generated::pack::fallback::scalar::pack(helper::rand_arr_19_b19_w32_arr, packed32, 19);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8); // 4096 = byte size of packed32
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 19);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) { // unsigned index, same type as n_tup
+		ASSERT_EQ(helper::rand_arr_19_b19_w32_arr[i], h_decoded_arr[i]) << "bit width 19: mismatch at tuple " << i;
+	}
+}
+TEST_F(cuda_fused_t32_1024_uf1_unpack, test_20_bw_20_ow_32) {
+	// Round trip at bit width 20: scalar pack on the host, unpack_global on the GPU, compare with the input.
+	generated::pack::fallback::scalar::pack(helper::rand_arr_20_b20_w32_arr, packed32, 20);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8); // 4096 = byte size of packed32
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 20);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) { // unsigned index, same type as n_tup
+		ASSERT_EQ(helper::rand_arr_20_b20_w32_arr[i], h_decoded_arr[i]) << "bit width 20: mismatch at tuple " << i;
+	}
+}
+TEST_F(cuda_fused_t32_1024_uf1_unpack, test_21_bw_21_ow_32) {
+	// Round trip at bit width 21: scalar pack on the host, unpack_global on the GPU, compare with the input.
+	generated::pack::fallback::scalar::pack(helper::rand_arr_21_b21_w32_arr, packed32, 21);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8); // 4096 = byte size of packed32
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 21);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) { // unsigned index, same type as n_tup
+		ASSERT_EQ(helper::rand_arr_21_b21_w32_arr[i], h_decoded_arr[i]) << "bit width 21: mismatch at tuple " << i;
+	}
+}
+TEST_F(cuda_fused_t32_1024_uf1_unpack, test_22_bw_22_ow_32) {
+	// Round trip at bit width 22: scalar pack on the host, unpack_global on the GPU, compare with the input.
+	generated::pack::fallback::scalar::pack(helper::rand_arr_22_b22_w32_arr, packed32, 22);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8); // 4096 = byte size of packed32
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 22);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) { // unsigned index, same type as n_tup
+		ASSERT_EQ(helper::rand_arr_22_b22_w32_arr[i], h_decoded_arr[i]) << "bit width 22: mismatch at tuple " << i;
+	}
+}
+TEST_F(cuda_fused_t32_1024_uf1_unpack, test_23_bw_23_ow_32) {
+	// Round trip at bit width 23: scalar pack on the host, unpack_global on the GPU, compare with the input.
+	generated::pack::fallback::scalar::pack(helper::rand_arr_23_b23_w32_arr, packed32, 23);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8); // 4096 = byte size of packed32
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 23);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) { // unsigned index, same type as n_tup
+		ASSERT_EQ(helper::rand_arr_23_b23_w32_arr[i], h_decoded_arr[i]) << "bit width 23: mismatch at tuple " << i;
+	}
+}
+TEST_F(cuda_fused_t32_1024_uf1_unpack, test_24_bw_24_ow_32) {
+	// Round trip at bit width 24: scalar pack on the host, unpack_global on the GPU, compare with the input.
+	generated::pack::fallback::scalar::pack(helper::rand_arr_24_b24_w32_arr, packed32, 24);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8); // 4096 = byte size of packed32
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 24);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) { // unsigned index, same type as n_tup
+		ASSERT_EQ(helper::rand_arr_24_b24_w32_arr[i], h_decoded_arr[i]) << "bit width 24: mismatch at tuple " << i;
+	}
+}
+TEST_F(cuda_fused_t32_1024_uf1_unpack, test_25_bw_25_ow_32) {
+	// Round trip at bit width 25: scalar pack on the host, unpack_global on the GPU, compare with the input.
+	generated::pack::fallback::scalar::pack(helper::rand_arr_25_b25_w32_arr, packed32, 25);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8); // 4096 = byte size of packed32
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 25);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) { // unsigned index, same type as n_tup
+		ASSERT_EQ(helper::rand_arr_25_b25_w32_arr[i], h_decoded_arr[i]) << "bit width 25: mismatch at tuple " << i;
+	}
+}
+TEST_F(cuda_fused_t32_1024_uf1_unpack, test_26_bw_26_ow_32) {
+	// Round trip at bit width 26: scalar pack on the host, unpack_global on the GPU, compare with the input.
+	generated::pack::fallback::scalar::pack(helper::rand_arr_26_b26_w32_arr, packed32, 26);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8); // 4096 = byte size of packed32
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 26);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) { // unsigned index, same type as n_tup
+		ASSERT_EQ(helper::rand_arr_26_b26_w32_arr[i], h_decoded_arr[i]) << "bit width 26: mismatch at tuple " << i;
+	}
+}
+TEST_F(cuda_fused_t32_1024_uf1_unpack, test_27_bw_27_ow_32) {
+	// Round trip at bit width 27: scalar pack on the host, unpack_global on the GPU, compare with the input.
+	generated::pack::fallback::scalar::pack(helper::rand_arr_27_b27_w32_arr, packed32, 27);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8); // 4096 = byte size of packed32
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 27);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) { // unsigned index, same type as n_tup
+		ASSERT_EQ(helper::rand_arr_27_b27_w32_arr[i], h_decoded_arr[i]) << "bit width 27: mismatch at tuple " << i;
+	}
+}
+TEST_F(cuda_fused_t32_1024_uf1_unpack, test_28_bw_28_ow_32) {
+	// Round trip at bit width 28: scalar pack on the host, unpack_global on the GPU, compare with the input.
+	generated::pack::fallback::scalar::pack(helper::rand_arr_28_b28_w32_arr, packed32, 28);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8); // 4096 = byte size of packed32
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 28);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) { // unsigned index, same type as n_tup
+		ASSERT_EQ(helper::rand_arr_28_b28_w32_arr[i], h_decoded_arr[i]) << "bit width 28: mismatch at tuple " << i;
+	}
+}
+TEST_F(cuda_fused_t32_1024_uf1_unpack, test_29_bw_29_ow_32) {
+	// Round trip at bit width 29: scalar pack on the host, unpack_global on the GPU, compare with the input.
+	generated::pack::fallback::scalar::pack(helper::rand_arr_29_b29_w32_arr, packed32, 29);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8); // 4096 = byte size of packed32
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 29);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) { // unsigned index, same type as n_tup
+		ASSERT_EQ(helper::rand_arr_29_b29_w32_arr[i], h_decoded_arr[i]) << "bit width 29: mismatch at tuple " << i;
+	}
+}
+TEST_F(cuda_fused_t32_1024_uf1_unpack, test_30_bw_30_ow_32) {
+	// Round trip at bit width 30: scalar pack on the host, unpack_global on the GPU, compare with the input.
+	generated::pack::fallback::scalar::pack(helper::rand_arr_30_b30_w32_arr, packed32, 30);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8); // 4096 = byte size of packed32
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 30);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) { // unsigned index, same type as n_tup
+		ASSERT_EQ(helper::rand_arr_30_b30_w32_arr[i], h_decoded_arr[i]) << "bit width 30: mismatch at tuple " << i;
+	}
+}
+TEST_F(cuda_fused_t32_1024_uf1_unpack, test_31_bw_31_ow_32) {
+	// Round trip at bit width 31: scalar pack on the host, unpack_global on the GPU, compare with the input.
+	generated::pack::fallback::scalar::pack(helper::rand_arr_31_b31_w32_arr, packed32, 31);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8); // 4096 = byte size of packed32
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 31);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) { // unsigned index, same type as n_tup
+		ASSERT_EQ(helper::rand_arr_31_b31_w32_arr[i], h_decoded_arr[i]) << "bit width 31: mismatch at tuple " << i;
+	}
+}
+TEST_F(cuda_fused_t32_1024_uf1_unpack, test_32_bw_32_ow_32) {
+	// Round trip at bit width 32: scalar pack on the host, unpack_global on the GPU, compare with the input.
+	generated::pack::fallback::scalar::pack(helper::rand_arr_32_b32_w32_arr, packed32, 32);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8); // 4096 = byte size of packed32
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 32);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) { // unsigned index, same type as n_tup
+		ASSERT_EQ(helper::rand_arr_32_b32_w32_arr[i], h_decoded_arr[i]) << "bit width 32: mismatch at tuple " << i;
+	}
+}
diff --git a/fastlanes/generated/cuda/fused_t32_uf1/unpack.cmake b/fastlanes/generated/cuda/fused_t32_uf1/unpack.cmake
new file mode 100644
index 0000000..c949a39
--- /dev/null
+++ b/fastlanes/generated/cuda/fused_t32_uf1/unpack.cmake
@@ -0,0 +1,19 @@
+# Build rules for the fused 32-bit unpack kernel: object library, gtest executable and benchmark executable.
+add_library(cuda_fused_t32_1024_uf1_unpack OBJECT cuda_fused_t32_1024_uf1_unpack_src.cu)
+target_compile_options(cuda_fused_t32_1024_uf1_unpack PUBLIC ${FLAG})
+target_compile_definitions(cuda_fused_t32_1024_uf1_unpack PRIVATE IS_SCALAR)
+# Print the target's compile definitions and options while configuring.
+cmake_print_properties(TARGETS cuda_fused_t32_1024_uf1_unpack
+                       PROPERTIES COMPILE_DEFINITIONS
+                       PROPERTIES COMPILE_OPTIONS)
+# Append this target's object files to FLS_GENERATED_OBJECT_FILES.
+list(APPEND FLS_GENERATED_OBJECT_FILES $<TARGET_OBJECTS:cuda_fused_t32_1024_uf1_unpack>)
+get_target_property(TARGET_NAME cuda_fused_t32_1024_uf1_unpack NAME)
+get_target_property(TARGET_COMPILE_OPTIONS cuda_fused_t32_1024_uf1_unpack COMPILE_OPTIONS)
+#------------------------------------------------------------------------------------------------------
+add_executable(cuda_fused_t32_1024_uf1_unpack_test cuda_fused_t32_1024_uf1_unpack_test.cu)
+target_link_libraries(cuda_fused_t32_1024_uf1_unpack_test
+                      PRIVATE cuda_fused_t32_1024_uf1_unpack gtest_main fastlanes_gpu)
+#------------------------------------------------------------------------------------------------------
+add_executable(cuda_fused_t32_1024_uf1_unpack_bench cuda_fused_t32_1024_uf1_unpack_bench.cu)
+target_link_libraries(cuda_fused_t32_1024_uf1_unpack_bench PRIVATE cuda_fused_t32_1024_uf1_unpack fastlanes_gpu)
diff --git a/fastlanes/generated/cuda/normal_t32_uf1/CMakeLists.txt b/fastlanes/generated/cuda/normal_t32_uf1/CMakeLists.txt
new file mode 100644
index 0000000..239ffed
--- /dev/null
+++ b/fastlanes/generated/cuda/normal_t32_uf1/CMakeLists.txt
@@ -0,0 +1,38 @@
+# Include each per-operation fragment that exists in this directory, then export the collected object list.
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/pack.cmake")
+ include("${CMAKE_CURRENT_SOURCE_DIR}/pack.cmake")
+endif()
+# Optional fragment: unpack
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/unpack.cmake")
+ include("${CMAKE_CURRENT_SOURCE_DIR}/unpack.cmake")
+endif()
+# Optional fragment: unffor
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/unffor.cmake")
+ include("${CMAKE_CURRENT_SOURCE_DIR}/unffor.cmake")
+endif()
+# Optional fragment: ffor
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/ffor.cmake")
+ include("${CMAKE_CURRENT_SOURCE_DIR}/ffor.cmake")
+endif()
+# Optional fragment: unrsum
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/unrsum.cmake")
+ include("${CMAKE_CURRENT_SOURCE_DIR}/unrsum.cmake")
+endif()
+# Optional fragment: rsum
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/rsum.cmake")
+ include("${CMAKE_CURRENT_SOURCE_DIR}/rsum.cmake")
+endif()
+# Optional fragment: untranspose
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/untranspose.cmake")
+ include("${CMAKE_CURRENT_SOURCE_DIR}/untranspose.cmake")
+endif()
+# Optional fragment: transpose
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/transpose.cmake")
+ include("${CMAKE_CURRENT_SOURCE_DIR}/transpose.cmake")
+endif()
+# Optional fragment: falp
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/falp.cmake")
+ include("${CMAKE_CURRENT_SOURCE_DIR}/falp.cmake")
+endif()
+set(FLS_GENERATED_OBJECT_FILES
+    ${FLS_GENERATED_OBJECT_FILES} PARENT_SCOPE) # make the list visible to the parent directory scope
diff --git a/fastlanes/generated/cuda/normal_t32_uf1/cuda_normal_t32_1024_uf1_unpack_bench.cu b/fastlanes/generated/cuda/normal_t32_uf1/cuda_normal_t32_1024_uf1_unpack_bench.cu
new file mode 100644
index 0000000..523ad2a
--- /dev/null
+++ b/fastlanes/generated/cuda/normal_t32_uf1/cuda_normal_t32_1024_uf1_unpack_bench.cu
@@ -0,0 +1,1047 @@
+// generated!
+
+#include "fastlanes.cuh"
+#include "fls_gen/pack/pack.hpp"
+#include "fls_gen/unpack/unpack.cuh"
+#include <iostream>
+
+constexpr uint64_t warp_sz  = 32;                 // multiplied by v_blc_sz to get threads per block
+constexpr uint64_t n_vec    = 256 * 1024;         // number of vectors packed and unpacked per bench
+constexpr uint64_t vec_sz   = 1024;               // values per vector (host pointer stride while packing)
+constexpr uint64_t n_tup    = vec_sz * n_vec;     // total number of values in each host buffer
+constexpr uint64_t v_blc_sz = 1;                  // vectors per block
+constexpr uint64_t n_blc    = n_vec / v_blc_sz;   // grid size used in the kernel launches
+constexpr uint64_t n_trd    = v_blc_sz * warp_sz; // block size used in the kernel launches
+uint32_t* h_org_arr       = new uint32_t[n_tup]; // host: original random values
+uint32_t* h_encoded_data  = new uint32_t[n_tup]; // host: destination of the scalar pack
+uint64_t  encoded_arr_bsz = n_tup * sizeof(int); // byte count handed to load_to_gpu
+uint32_t* d_decoded_arr   = nullptr;             // kernel output; NOTE(review): not assigned in the visible code - confirm it is allocated before the benches run
+uint32_t* h_decoded_arr   = new uint32_t[n_tup]; // host: receives the cudaMemcpy of d_decoded_arr
+
+/* Round-trip check at bit width 0: pack random values on the host, unpack via unpack_global, compare with the originals. */
+static void bench0_unpack_0bw_32ow_32crw_1uf() {
+	auto bitwidth = 0;
+	/* generate random numbers in [0, 2^bitwidth); 1U keeps the shift unsigned. */
+	for (uint64_t i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1U << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	/* free the device input before comparing so the early return on a mismatch cannot leak it. */
+	CLEANUP(d_encoded_arr);
+
+	for (uint64_t i = 0; i < n_tup; i++) {
+		if (h_org_arr[i] != h_decoded_arr[i]) {
+			std::cout << bitwidth << " failed!" << std::endl;
+			return;
+		}
+	}
+	std::cout << bitwidth << " succes!" << std::endl;
+}
+/* Round-trip check at bit width 1: pack random values on the host, unpack via unpack_global, compare with the originals. */
+static void bench1_unpack_1bw_32ow_32crw_1uf() {
+	auto bitwidth = 1;
+	/* generate random numbers in [0, 2^bitwidth); 1U keeps the shift unsigned. */
+	for (uint64_t i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1U << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	/* free the device input before comparing so the early return on a mismatch cannot leak it. */
+	CLEANUP(d_encoded_arr);
+
+	for (uint64_t i = 0; i < n_tup; i++) {
+		if (h_org_arr[i] != h_decoded_arr[i]) {
+			std::cout << bitwidth << " failed!" << std::endl;
+			return;
+		}
+	}
+	std::cout << bitwidth << " succes!" << std::endl;
+}
+/* Round-trip check at bit width 2: pack random values on the host, unpack via unpack_global, compare with the originals. */
+static void bench2_unpack_2bw_32ow_32crw_1uf() {
+	auto bitwidth = 2;
+	/* generate random numbers in [0, 2^bitwidth); 1U keeps the shift unsigned. */
+	for (uint64_t i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1U << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	/* free the device input before comparing so the early return on a mismatch cannot leak it. */
+	CLEANUP(d_encoded_arr);
+
+	for (uint64_t i = 0; i < n_tup; i++) {
+		if (h_org_arr[i] != h_decoded_arr[i]) {
+			std::cout << bitwidth << " failed!" << std::endl;
+			return;
+		}
+	}
+	std::cout << bitwidth << " succes!" << std::endl;
+}
+/* Round-trip check at bit width 3: pack random values on the host, unpack via unpack_global, compare with the originals. */
+static void bench3_unpack_3bw_32ow_32crw_1uf() {
+	auto bitwidth = 3;
+	/* generate random numbers in [0, 2^bitwidth); 1U keeps the shift unsigned. */
+	for (uint64_t i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1U << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	/* free the device input before comparing so the early return on a mismatch cannot leak it. */
+	CLEANUP(d_encoded_arr);
+
+	for (uint64_t i = 0; i < n_tup; i++) {
+		if (h_org_arr[i] != h_decoded_arr[i]) {
+			std::cout << bitwidth << " failed!" << std::endl;
+			return;
+		}
+	}
+	std::cout << bitwidth << " succes!" << std::endl;
+}
+/* Round-trip check at bit width 4: pack random values on the host, unpack via unpack_global, compare with the originals. */
+static void bench4_unpack_4bw_32ow_32crw_1uf() {
+	auto bitwidth = 4;
+	/* generate random numbers in [0, 2^bitwidth); 1U keeps the shift unsigned. */
+	for (uint64_t i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1U << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	/* free the device input before comparing so the early return on a mismatch cannot leak it. */
+	CLEANUP(d_encoded_arr);
+
+	for (uint64_t i = 0; i < n_tup; i++) {
+		if (h_org_arr[i] != h_decoded_arr[i]) {
+			std::cout << bitwidth << " failed!" << std::endl;
+			return;
+		}
+	}
+	std::cout << bitwidth << " succes!" << std::endl;
+}
+/* Round-trip check at bit width 5: pack random values on the host, unpack via unpack_global, compare with the originals. */
+static void bench5_unpack_5bw_32ow_32crw_1uf() {
+	auto bitwidth = 5;
+	/* generate random numbers in [0, 2^bitwidth); 1U keeps the shift unsigned. */
+	for (uint64_t i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1U << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	/* free the device input before comparing so the early return on a mismatch cannot leak it. */
+	CLEANUP(d_encoded_arr);
+
+	for (uint64_t i = 0; i < n_tup; i++) {
+		if (h_org_arr[i] != h_decoded_arr[i]) {
+			std::cout << bitwidth << " failed!" << std::endl;
+			return;
+		}
+	}
+	std::cout << bitwidth << " succes!" << std::endl;
+}
+/* Round-trip check at bit width 6: pack random values on the host, unpack via unpack_global, compare with the originals. */
+static void bench6_unpack_6bw_32ow_32crw_1uf() {
+	auto bitwidth = 6;
+	/* generate random numbers in [0, 2^bitwidth); 1U keeps the shift unsigned. */
+	for (uint64_t i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1U << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	/* free the device input before comparing so the early return on a mismatch cannot leak it. */
+	CLEANUP(d_encoded_arr);
+
+	for (uint64_t i = 0; i < n_tup; i++) {
+		if (h_org_arr[i] != h_decoded_arr[i]) {
+			std::cout << bitwidth << " failed!" << std::endl;
+			return;
+		}
+	}
+	std::cout << bitwidth << " succes!" << std::endl;
+}
+/* Round-trip check at bit width 7: pack random values on the host, unpack via unpack_global, compare with the originals. */
+static void bench7_unpack_7bw_32ow_32crw_1uf() {
+	auto bitwidth = 7;
+	/* generate random numbers in [0, 2^bitwidth); 1U keeps the shift unsigned. */
+	for (uint64_t i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1U << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	/* free the device input before comparing so the early return on a mismatch cannot leak it. */
+	CLEANUP(d_encoded_arr);
+
+	for (uint64_t i = 0; i < n_tup; i++) {
+		if (h_org_arr[i] != h_decoded_arr[i]) {
+			std::cout << bitwidth << " failed!" << std::endl;
+			return;
+		}
+	}
+	std::cout << bitwidth << " succes!" << std::endl;
+}
+/* Round-trip check at bit width 8: pack random values on the host, unpack via unpack_global, compare with the originals. */
+static void bench8_unpack_8bw_32ow_32crw_1uf() {
+	auto bitwidth = 8;
+	/* generate random numbers in [0, 2^bitwidth); 1U keeps the shift unsigned. */
+	for (uint64_t i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1U << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	/* free the device input before comparing so the early return on a mismatch cannot leak it. */
+	CLEANUP(d_encoded_arr);
+
+	for (uint64_t i = 0; i < n_tup; i++) {
+		if (h_org_arr[i] != h_decoded_arr[i]) {
+			std::cout << bitwidth << " failed!" << std::endl;
+			return;
+		}
+	}
+	std::cout << bitwidth << " succes!" << std::endl;
+}
+/* Round-trip check at bit width 9: pack random values on the host, unpack via unpack_global, compare with the originals. */
+static void bench9_unpack_9bw_32ow_32crw_1uf() {
+	auto bitwidth = 9;
+	/* generate random numbers in [0, 2^bitwidth); 1U keeps the shift unsigned. */
+	for (uint64_t i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1U << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	/* free the device input before comparing so the early return on a mismatch cannot leak it. */
+	CLEANUP(d_encoded_arr);
+
+	for (uint64_t i = 0; i < n_tup; i++) {
+		if (h_org_arr[i] != h_decoded_arr[i]) {
+			std::cout << bitwidth << " failed!" << std::endl;
+			return;
+		}
+	}
+	std::cout << bitwidth << " succes!" << std::endl;
+}
+/* Round-trip check at bit width 10: pack random values on the host, unpack via unpack_global, compare with the originals. */
+static void bench10_unpack_10bw_32ow_32crw_1uf() {
+	auto bitwidth = 10;
+	/* generate random numbers in [0, 2^bitwidth); 1U keeps the shift unsigned. */
+	for (uint64_t i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1U << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	/* free the device input before comparing so the early return on a mismatch cannot leak it. */
+	CLEANUP(d_encoded_arr);
+
+	for (uint64_t i = 0; i < n_tup; i++) {
+		if (h_org_arr[i] != h_decoded_arr[i]) {
+			std::cout << bitwidth << " failed!" << std::endl;
+			return;
+		}
+	}
+	std::cout << bitwidth << " succes!" << std::endl;
+}
+/* Round-trip check at bit width 11: pack random values on the host, unpack via unpack_global, compare with the originals. */
+static void bench11_unpack_11bw_32ow_32crw_1uf() {
+	auto bitwidth = 11;
+	/* generate random numbers in [0, 2^bitwidth); 1U keeps the shift unsigned. */
+	for (uint64_t i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1U << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	/* free the device input before comparing so the early return on a mismatch cannot leak it. */
+	CLEANUP(d_encoded_arr);
+
+	for (uint64_t i = 0; i < n_tup; i++) {
+		if (h_org_arr[i] != h_decoded_arr[i]) {
+			std::cout << bitwidth << " failed!" << std::endl;
+			return;
+		}
+	}
+	std::cout << bitwidth << " succes!" << std::endl;
+}
+/* Round-trip check at bit width 12: pack random values on the host, unpack via unpack_global, compare with the originals. */
+static void bench12_unpack_12bw_32ow_32crw_1uf() {
+	auto bitwidth = 12;
+	/* generate random numbers in [0, 2^bitwidth); 1U keeps the shift unsigned. */
+	for (uint64_t i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1U << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	/* free the device input before comparing so the early return on a mismatch cannot leak it. */
+	CLEANUP(d_encoded_arr);
+
+	for (uint64_t i = 0; i < n_tup; i++) {
+		if (h_org_arr[i] != h_decoded_arr[i]) {
+			std::cout << bitwidth << " failed!" << std::endl;
+			return;
+		}
+	}
+	std::cout << bitwidth << " succes!" << std::endl;
+}
+/* Round-trip check at bit width 13: pack random values on the host, unpack via unpack_global, compare with the originals. */
+static void bench13_unpack_13bw_32ow_32crw_1uf() {
+	auto bitwidth = 13;
+	/* generate random numbers in [0, 2^bitwidth); 1U keeps the shift unsigned. */
+	for (uint64_t i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1U << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	/* free the device input before comparing so the early return on a mismatch cannot leak it. */
+	CLEANUP(d_encoded_arr);
+
+	for (uint64_t i = 0; i < n_tup; i++) {
+		if (h_org_arr[i] != h_decoded_arr[i]) {
+			std::cout << bitwidth << " failed!" << std::endl;
+			return;
+		}
+	}
+	std::cout << bitwidth << " succes!" << std::endl;
+}
+/* Round-trip check at bit width 14: pack random values on the host, unpack via unpack_global, compare with the originals. */
+static void bench14_unpack_14bw_32ow_32crw_1uf() {
+	auto bitwidth = 14;
+	/* generate random numbers in [0, 2^bitwidth); 1U keeps the shift unsigned. */
+	for (uint64_t i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1U << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	/* free the device input before comparing so the early return on a mismatch cannot leak it. */
+	CLEANUP(d_encoded_arr);
+
+	for (uint64_t i = 0; i < n_tup; i++) {
+		if (h_org_arr[i] != h_decoded_arr[i]) {
+			std::cout << bitwidth << " failed!" << std::endl;
+			return;
+		}
+	}
+	std::cout << bitwidth << " succes!" << std::endl;
+}
+/* Round-trip check at bit width 15: pack random values on the host, unpack via unpack_global, compare with the originals. */
+static void bench15_unpack_15bw_32ow_32crw_1uf() {
+	auto bitwidth = 15;
+	/* generate random numbers in [0, 2^bitwidth); 1U keeps the shift unsigned. */
+	for (uint64_t i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1U << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	/* free the device input before comparing so the early return on a mismatch cannot leak it. */
+	CLEANUP(d_encoded_arr);
+
+	for (uint64_t i = 0; i < n_tup; i++) {
+		if (h_org_arr[i] != h_decoded_arr[i]) {
+			std::cout << bitwidth << " failed!" << std::endl;
+			return;
+		}
+	}
+	std::cout << bitwidth << " succes!" << std::endl;
+}
+/* Round-trip check at bit width 16: pack random values on the host, unpack via unpack_global, compare with the originals. */
+static void bench16_unpack_16bw_32ow_32crw_1uf() {
+	auto bitwidth = 16;
+	/* generate random numbers in [0, 2^bitwidth); 1U keeps the shift unsigned. */
+	for (uint64_t i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1U << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	/* free the device input before comparing so the early return on a mismatch cannot leak it. */
+	CLEANUP(d_encoded_arr);
+
+	for (uint64_t i = 0; i < n_tup; i++) {
+		if (h_org_arr[i] != h_decoded_arr[i]) {
+			std::cout << bitwidth << " failed!" << std::endl;
+			return;
+		}
+	}
+	std::cout << bitwidth << " succes!" << std::endl;
+}
+/* Round-trip check at bit width 17: pack random values on the host, unpack via unpack_global, compare with the originals. */
+static void bench17_unpack_17bw_32ow_32crw_1uf() {
+	auto bitwidth = 17;
+	/* generate random numbers in [0, 2^bitwidth); 1U keeps the shift unsigned. */
+	for (uint64_t i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1U << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	/* free the device input before comparing so the early return on a mismatch cannot leak it. */
+	CLEANUP(d_encoded_arr);
+
+	for (uint64_t i = 0; i < n_tup; i++) {
+		if (h_org_arr[i] != h_decoded_arr[i]) {
+			std::cout << bitwidth << " failed!" << std::endl;
+			return;
+		}
+	}
+	std::cout << bitwidth << " succes!" << std::endl;
+}
+/* Round-trip check at bit width 18: pack random values on the host, unpack via unpack_global, compare with the originals. */
+static void bench18_unpack_18bw_32ow_32crw_1uf() {
+	auto bitwidth = 18;
+	/* generate random numbers in [0, 2^bitwidth); 1U keeps the shift unsigned. */
+	for (uint64_t i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1U << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	/* free the device input before comparing so the early return on a mismatch cannot leak it. */
+	CLEANUP(d_encoded_arr);
+
+	for (uint64_t i = 0; i < n_tup; i++) {
+		if (h_org_arr[i] != h_decoded_arr[i]) {
+			std::cout << bitwidth << " failed!" << std::endl;
+			return;
+		}
+	}
+	std::cout << bitwidth << " succes!" << std::endl;
+}
+/* Round-trip check at bit width 19: pack random values on the host, unpack via unpack_global, compare with the originals. */
+static void bench19_unpack_19bw_32ow_32crw_1uf() {
+	auto bitwidth = 19;
+	/* generate random numbers in [0, 2^bitwidth); 1U keeps the shift unsigned. */
+	for (uint64_t i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1U << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	/* free the device input before comparing so the early return on a mismatch cannot leak it. */
+	CLEANUP(d_encoded_arr);
+
+	for (uint64_t i = 0; i < n_tup; i++) {
+		if (h_org_arr[i] != h_decoded_arr[i]) {
+			std::cout << bitwidth << " failed!" << std::endl;
+			return;
+		}
+	}
+	std::cout << bitwidth << " succes!" << std::endl;
+}
+/* Round-trip check at bit width 20: pack random values on the host, unpack via unpack_global, compare with the originals. */
+static void bench20_unpack_20bw_32ow_32crw_1uf() {
+	auto bitwidth = 20;
+	/* generate random numbers in [0, 2^bitwidth); 1U keeps the shift unsigned. */
+	for (uint64_t i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1U << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	/* free the device input before comparing so the early return on a mismatch cannot leak it. */
+	CLEANUP(d_encoded_arr);
+
+	for (uint64_t i = 0; i < n_tup; i++) {
+		if (h_org_arr[i] != h_decoded_arr[i]) {
+			std::cout << bitwidth << " failed!" << std::endl;
+			return;
+		}
+	}
+	std::cout << bitwidth << " succes!" << std::endl;
+}
+/* Round-trip check at bit width 21: pack random values on the host, unpack via unpack_global, compare with the originals. */
+static void bench21_unpack_21bw_32ow_32crw_1uf() {
+	auto bitwidth = 21;
+	/* generate random numbers in [0, 2^bitwidth); 1U keeps the shift unsigned. */
+	for (uint64_t i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1U << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	/* free the device input before comparing so the early return on a mismatch cannot leak it. */
+	CLEANUP(d_encoded_arr);
+
+	for (uint64_t i = 0; i < n_tup; i++) {
+		if (h_org_arr[i] != h_decoded_arr[i]) {
+			std::cout << bitwidth << " failed!" << std::endl;
+			return;
+		}
+	}
+	std::cout << bitwidth << " succes!" << std::endl;
+}
+/* Round-trip check at bit width 22: pack random values on the host, unpack via unpack_global, compare with the originals. */
+static void bench22_unpack_22bw_32ow_32crw_1uf() {
+	auto bitwidth = 22;
+	/* generate random numbers in [0, 2^bitwidth); 1U keeps the shift unsigned. */
+	for (uint64_t i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1U << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	/* free the device input before comparing so the early return on a mismatch cannot leak it. */
+	CLEANUP(d_encoded_arr);
+
+	for (uint64_t i = 0; i < n_tup; i++) {
+		if (h_org_arr[i] != h_decoded_arr[i]) {
+			std::cout << bitwidth << " failed!" << std::endl;
+			return;
+		}
+	}
+	std::cout << bitwidth << " succes!" << std::endl;
+}
+/* Round-trip check at bit width 23: pack random values on the host, unpack via unpack_global, compare with the originals. */
+static void bench23_unpack_23bw_32ow_32crw_1uf() {
+	auto bitwidth = 23;
+	/* generate random numbers in [0, 2^bitwidth); 1U keeps the shift unsigned. */
+	for (uint64_t i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1U << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	/* free the device input before comparing so the early return on a mismatch cannot leak it. */
+	CLEANUP(d_encoded_arr);
+
+	for (uint64_t i = 0; i < n_tup; i++) {
+		if (h_org_arr[i] != h_decoded_arr[i]) {
+			std::cout << bitwidth << " failed!" << std::endl;
+			return;
+		}
+	}
+	std::cout << bitwidth << " succes!" << std::endl;
+}
+/* Round-trip check at bit width 24: pack random values on the host, unpack via unpack_global, compare with the originals. */
+static void bench24_unpack_24bw_32ow_32crw_1uf() {
+	auto bitwidth = 24;
+	/* generate random numbers in [0, 2^bitwidth); 1U keeps the shift unsigned. */
+	for (uint64_t i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1U << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	/* free the device input before comparing so the early return on a mismatch cannot leak it. */
+	CLEANUP(d_encoded_arr);
+
+	for (uint64_t i = 0; i < n_tup; i++) {
+		if (h_org_arr[i] != h_decoded_arr[i]) {
+			std::cout << bitwidth << " failed!" << std::endl;
+			return;
+		}
+	}
+	std::cout << bitwidth << " succes!" << std::endl;
+}
+/* Round-trip check at bit width 25: pack random values on the host, unpack via unpack_global, compare with the originals. */
+static void bench25_unpack_25bw_32ow_32crw_1uf() {
+	auto bitwidth = 25;
+	/* generate random numbers in [0, 2^bitwidth); 1U keeps the shift unsigned. */
+	for (uint64_t i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1U << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	/* free the device input before comparing so the early return on a mismatch cannot leak it. */
+	CLEANUP(d_encoded_arr);
+
+	for (uint64_t i = 0; i < n_tup; i++) {
+		if (h_org_arr[i] != h_decoded_arr[i]) {
+			std::cout << bitwidth << " failed!" << std::endl;
+			return;
+		}
+	}
+	std::cout << bitwidth << " succes!" << std::endl;
+}
+/* Round-trip check at bit width 26: pack random values on the host, unpack via unpack_global, compare with the originals. */
+static void bench26_unpack_26bw_32ow_32crw_1uf() {
+	auto bitwidth = 26;
+	/* generate random numbers in [0, 2^bitwidth); 1U keeps the shift unsigned. */
+	for (uint64_t i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1U << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	/* free the device input before comparing so the early return on a mismatch cannot leak it. */
+	CLEANUP(d_encoded_arr);
+
+	for (uint64_t i = 0; i < n_tup; i++) {
+		if (h_org_arr[i] != h_decoded_arr[i]) {
+			std::cout << bitwidth << " failed!" << std::endl;
+			return;
+		}
+	}
+	std::cout << bitwidth << " succes!" << std::endl;
+}
+/* Round-trip check at bit width 27: pack random values on the host, unpack via unpack_global, compare with the originals. */
+static void bench27_unpack_27bw_32ow_32crw_1uf() {
+	auto bitwidth = 27;
+	/* generate random numbers in [0, 2^bitwidth); 1U keeps the shift unsigned. */
+	for (uint64_t i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1U << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	/* free the device input before comparing so the early return on a mismatch cannot leak it. */
+	CLEANUP(d_encoded_arr);
+
+	for (uint64_t i = 0; i < n_tup; i++) {
+		if (h_org_arr[i] != h_decoded_arr[i]) {
+			std::cout << bitwidth << " failed!" << std::endl;
+			return;
+		}
+	}
+	std::cout << bitwidth << " succes!" << std::endl;
+}
+/* Round-trip check at bit width 28: pack random values on the host, unpack via unpack_global, compare with the originals. */
+static void bench28_unpack_28bw_32ow_32crw_1uf() {
+	auto bitwidth = 28;
+	/* generate random numbers in [0, 2^bitwidth); 1U keeps the shift unsigned. */
+	for (uint64_t i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1U << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	/* free the device input before comparing so the early return on a mismatch cannot leak it. */
+	CLEANUP(d_encoded_arr);
+
+	for (uint64_t i = 0; i < n_tup; i++) {
+		if (h_org_arr[i] != h_decoded_arr[i]) {
+			std::cout << bitwidth << " failed!" << std::endl;
+			return;
+		}
+	}
+	std::cout << bitwidth << " succes!" << std::endl;
+}
+/* Round-trip check at bit width 29: pack random values on the host, unpack via unpack_global, compare with the originals. */
+static void bench29_unpack_29bw_32ow_32crw_1uf() {
+	auto bitwidth = 29;
+	/* generate random numbers in [0, 2^bitwidth); 1U keeps the shift unsigned. */
+	for (uint64_t i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1U << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	/* free the device input before comparing so the early return on a mismatch cannot leak it. */
+	CLEANUP(d_encoded_arr);
+
+	for (uint64_t i = 0; i < n_tup; i++) {
+		if (h_org_arr[i] != h_decoded_arr[i]) {
+			std::cout << bitwidth << " failed!" << std::endl;
+			return;
+		}
+	}
+	std::cout << bitwidth << " succes!" << std::endl;
+}
+/* Round-trip check at bit width 30: pack random values on the host, unpack via unpack_global, compare with the originals. */
+static void bench30_unpack_30bw_32ow_32crw_1uf() {
+	auto bitwidth = 30;
+	/* generate random numbers in [0, 2^bitwidth); 1U keeps the shift unsigned. */
+	for (uint64_t i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1U << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	/* free the device input before comparing so the early return on a mismatch cannot leak it. */
+	CLEANUP(d_encoded_arr);
+
+	for (uint64_t i = 0; i < n_tup; i++) {
+		if (h_org_arr[i] != h_decoded_arr[i]) {
+			std::cout << bitwidth << " failed!" << std::endl;
+			return;
+		}
+	}
+	std::cout << bitwidth << " succes!" << std::endl;
+}
+/* Round-trip check for 31-bit values: pack on the host, unpack with the unpack_global kernel, compare with the input. */
+static void bench31_unpack_31bw_32ow_32crw_1uf() {
+	auto bitwidth = 31;
+	/* generate random numbers below 2^bitwidth; 1U because a signed 1 << 31 does not fit in int. */
+	for (int i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1U << bitwidth);
+	}
+
+	/* pack vector by vector; the cursors advance by vec_sz inputs and bitwidth * vec_sz / 32 outputs per vector. */
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	CLEANUP(d_encoded_arr); /* runs before the comparison so the early return on a mismatch cannot skip it. */
+
+	for (int i = 0; i < n_tup; i++) {
+		if (h_org_arr[i] != h_decoded_arr[i]) {
+			std::cout << bitwidth << " failed!" << std::endl;
+			return;
+		}
+	}
+	std::cout << bitwidth << " succes!" << std::endl;
+}
+/* Nominal 32-bit slot; the width is clamped to 31 below, so this runs (and reports) a 31-bit round trip. */
+static void bench32_unpack_32bw_32ow_32crw_1uf() {
+	auto bitwidth = 32;
+	if (bitwidth == 32) { bitwidth = 31; }; /* NOTE(review): presumably because rand() cannot fill 32 bits - confirm. */
+	/* generate random numbers below 2^bitwidth; 1U because a signed 1 << 31 does not fit in int. */
+	for (int i = 0; i < n_tup; i++) {
+		h_org_arr[i] = rand() % (1U << bitwidth);
+	}
+
+	auto in  = h_org_arr;
+	auto out = h_encoded_data;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(in, out, bitwidth);
+		in  = in + vec_sz;
+		out = out + (bitwidth * vec_sz / 32);
+	}
+
+	auto* d_encoded_arr = fastlanes::gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, fastlanes::gpu::g_allocator);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, bitwidth);
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	CLEANUP(d_encoded_arr); /* runs before the comparison so the early return on a mismatch cannot skip it. */
+
+	for (int i = 0; i < n_tup; i++) {
+		if (h_org_arr[i] != h_decoded_arr[i]) {
+			std::cout << bitwidth << " failed!" << std::endl;
+			return;
+		}
+	}
+	std::cout << bitwidth << " succes!" << std::endl;
+}
+/*
+ * Runs the pack -> GPU unpack round trip of every bench function, bit widths 0 to 32.
+ *
+ * d_decoded_arr is allocated before the first benchmark and released after the last
+ * one; the bench functions copy their decoded output back from it.
+ */
+void benchmark_all() {
+	using bench_fn = void (*)();
+	/* one entry per bit width, executed in ascending order. */
+	static const bench_fn benches[] = {
+	    bench0_unpack_0bw_32ow_32crw_1uf,   bench1_unpack_1bw_32ow_32crw_1uf,
+	    bench2_unpack_2bw_32ow_32crw_1uf,   bench3_unpack_3bw_32ow_32crw_1uf,
+	    bench4_unpack_4bw_32ow_32crw_1uf,   bench5_unpack_5bw_32ow_32crw_1uf,
+	    bench6_unpack_6bw_32ow_32crw_1uf,   bench7_unpack_7bw_32ow_32crw_1uf,
+	    bench8_unpack_8bw_32ow_32crw_1uf,   bench9_unpack_9bw_32ow_32crw_1uf,
+	    bench10_unpack_10bw_32ow_32crw_1uf, bench11_unpack_11bw_32ow_32crw_1uf,
+	    bench12_unpack_12bw_32ow_32crw_1uf, bench13_unpack_13bw_32ow_32crw_1uf,
+	    bench14_unpack_14bw_32ow_32crw_1uf, bench15_unpack_15bw_32ow_32crw_1uf,
+	    bench16_unpack_16bw_32ow_32crw_1uf, bench17_unpack_17bw_32ow_32crw_1uf,
+	    bench18_unpack_18bw_32ow_32crw_1uf, bench19_unpack_19bw_32ow_32crw_1uf,
+	    bench20_unpack_20bw_32ow_32crw_1uf, bench21_unpack_21bw_32ow_32crw_1uf,
+	    bench22_unpack_22bw_32ow_32crw_1uf, bench23_unpack_23bw_32ow_32crw_1uf,
+	    bench24_unpack_24bw_32ow_32crw_1uf, bench25_unpack_25bw_32ow_32crw_1uf,
+	    bench26_unpack_26bw_32ow_32crw_1uf, bench27_unpack_27bw_32ow_32crw_1uf,
+	    bench28_unpack_28bw_32ow_32crw_1uf, bench29_unpack_29bw_32ow_32crw_1uf,
+	    bench30_unpack_30bw_32ow_32crw_1uf, bench31_unpack_31bw_32ow_32crw_1uf,
+	    bench32_unpack_32bw_32ow_32crw_1uf,
+	};
+
+	CUDA_SAFE_CALL(cudaMalloc((void**)&d_decoded_arr, sizeof(uint32_t) * n_tup));
+	for (const bench_fn bench : benches) {
+		bench();
+	}
+	/* release the output buffer once the last benchmark has finished with it. */
+	CUDA_SAFE_CALL(cudaFree(d_decoded_arr));
+}
+int main() { benchmark_all(); return 0; } /* entry point: run benchmark_all(), then exit with status 0. */
diff --git a/fastlanes/generated/cuda/normal_t32_uf1/cuda_normal_t32_1024_uf1_unpack_helper.hpp b/fastlanes/generated/cuda/normal_t32_uf1/cuda_normal_t32_1024_uf1_unpack_helper.hpp
new file mode 100644
index 0000000..1c0421d
--- /dev/null
+++ b/fastlanes/generated/cuda/normal_t32_uf1/cuda_normal_t32_1024_uf1_unpack_helper.hpp
@@ -0,0 +1,2899 @@
+// generated!
+#include "fls_gen/unpack/unpack.hpp"
+namespace helper {
+uint32_t rand_arr_0_b0_w32_arr[1024] = {
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+};
+uint32_t rand_arr_1_b1_w32_arr[1024] = {
+    1UL, 1UL, 0UL, 1UL, 0UL, 0UL, 1UL, 0UL, 1UL, 0UL, 1UL, 0UL, 0UL, 1UL, 0UL, 1UL, 1UL, 0UL, 0UL, 1UL, 0UL, 0UL, 1UL,
+    1UL, 1UL, 0UL, 0UL, 1UL, 0UL, 1UL, 0UL, 0UL, 1UL, 1UL, 0UL, 1UL, 1UL, 0UL, 1UL, 1UL, 0UL, 0UL, 1UL, 0UL, 1UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 1UL, 0UL, 1UL, 0UL, 1UL, 0UL, 0UL, 1UL, 0UL, 1UL, 1UL, 1UL, 1UL, 0UL, 0UL, 0UL, 1UL,
+    1UL, 0UL, 1UL, 1UL, 1UL, 0UL, 0UL, 1UL, 0UL, 1UL, 1UL, 1UL, 0UL, 1UL, 0UL, 1UL, 0UL, 1UL, 1UL, 0UL, 1UL, 1UL, 1UL,
+    0UL, 1UL, 1UL, 1UL, 0UL, 0UL, 0UL, 1UL, 0UL, 0UL, 1UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 1UL, 0UL, 1UL, 1UL, 0UL, 0UL,
+    1UL, 0UL, 0UL, 0UL, 1UL, 1UL, 1UL, 1UL, 1UL, 1UL, 0UL, 1UL, 1UL, 0UL, 0UL, 0UL, 1UL, 0UL, 0UL, 1UL, 0UL, 0UL, 1UL,
+    0UL, 0UL, 1UL, 1UL, 1UL, 0UL, 0UL, 1UL, 1UL, 1UL, 0UL, 0UL, 1UL, 1UL, 1UL, 0UL, 1UL, 1UL, 0UL, 1UL, 1UL, 1UL, 0UL,
+    1UL, 1UL, 1UL, 0UL, 1UL, 0UL, 0UL, 0UL, 1UL, 1UL, 0UL, 1UL, 1UL, 1UL, 1UL, 1UL, 1UL, 1UL, 1UL, 0UL, 0UL, 0UL, 0UL,
+    1UL, 1UL, 0UL, 1UL, 0UL, 0UL, 1UL, 1UL, 0UL, 0UL, 0UL, 0UL, 1UL, 1UL, 0UL, 1UL, 0UL, 0UL, 1UL, 0UL, 1UL, 1UL, 0UL,
+    1UL, 0UL, 1UL, 1UL, 1UL, 1UL, 1UL, 0UL, 1UL, 0UL, 1UL, 1UL, 1UL, 1UL, 1UL, 0UL, 0UL, 1UL, 0UL, 0UL, 0UL, 1UL, 1UL,
+    1UL, 0UL, 1UL, 1UL, 1UL, 1UL, 0UL, 1UL, 0UL, 1UL, 0UL, 0UL, 1UL, 1UL, 0UL, 0UL, 0UL, 1UL, 1UL, 0UL, 0UL, 0UL, 0UL,
+    0UL, 1UL, 0UL, 1UL, 0UL, 1UL, 0UL, 0UL, 1UL, 1UL, 0UL, 0UL, 0UL, 0UL, 1UL, 0UL, 0UL, 0UL, 1UL, 0UL, 1UL, 1UL, 1UL,
+    1UL, 0UL, 1UL, 1UL, 1UL, 1UL, 0UL, 0UL, 0UL, 0UL, 0UL, 1UL, 0UL, 0UL, 0UL, 1UL, 1UL, 1UL, 0UL, 0UL, 0UL, 1UL, 0UL,
+    0UL, 1UL, 0UL, 1UL, 1UL, 0UL, 1UL, 0UL, 0UL, 0UL, 1UL, 1UL, 0UL, 0UL, 1UL, 0UL, 1UL, 0UL, 0UL, 1UL, 1UL, 0UL, 1UL,
+    0UL, 0UL, 0UL, 1UL, 0UL, 1UL, 0UL, 1UL, 0UL, 0UL, 1UL, 0UL, 0UL, 0UL, 1UL, 0UL, 1UL, 0UL, 1UL, 0UL, 1UL, 1UL, 1UL,
+    0UL, 0UL, 1UL, 0UL, 1UL, 1UL, 1UL, 0UL, 0UL, 1UL, 0UL, 0UL, 1UL, 1UL, 0UL, 0UL, 0UL, 1UL, 0UL, 1UL, 1UL, 0UL, 1UL,
+    1UL, 0UL, 1UL, 1UL, 0UL, 1UL, 0UL, 0UL, 1UL, 0UL, 0UL, 0UL, 0UL, 1UL, 1UL, 1UL, 0UL, 0UL, 1UL, 1UL, 1UL, 1UL, 0UL,
+    1UL, 1UL, 1UL, 0UL, 1UL, 0UL, 1UL, 0UL, 0UL, 0UL, 0UL, 1UL, 0UL, 1UL, 0UL, 0UL, 0UL, 0UL, 1UL, 0UL, 1UL, 1UL, 1UL,
+    0UL, 1UL, 0UL, 1UL, 0UL, 0UL, 0UL, 0UL, 1UL, 0UL, 1UL, 0UL, 0UL, 1UL, 0UL, 1UL, 0UL, 1UL, 0UL, 1UL, 0UL, 0UL, 1UL,
+    0UL, 1UL, 0UL, 1UL, 0UL, 0UL, 1UL, 1UL, 1UL, 0UL, 0UL, 1UL, 1UL, 1UL, 1UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 1UL, 1UL,
+    1UL, 0UL, 0UL, 1UL, 0UL, 1UL, 1UL, 1UL, 0UL, 1UL, 1UL, 0UL, 1UL, 0UL, 0UL, 1UL, 1UL, 1UL, 0UL, 1UL, 1UL, 0UL, 1UL,
+    0UL, 0UL, 1UL, 1UL, 0UL, 0UL, 1UL, 1UL, 1UL, 1UL, 0UL, 0UL, 1UL, 1UL, 1UL, 0UL, 1UL, 0UL, 0UL, 1UL, 0UL, 0UL, 0UL,
+    0UL, 1UL, 1UL, 0UL, 1UL, 1UL, 0UL, 0UL, 0UL, 0UL, 1UL, 1UL, 1UL, 1UL, 1UL, 0UL, 1UL, 0UL, 0UL, 1UL, 1UL, 1UL, 1UL,
+    0UL, 0UL, 1UL, 1UL, 0UL, 1UL, 1UL, 0UL, 0UL, 0UL, 1UL, 1UL, 1UL, 0UL, 0UL, 1UL, 0UL, 1UL, 0UL, 1UL, 1UL, 1UL, 0UL,
+    1UL, 1UL, 0UL, 1UL, 1UL, 1UL, 0UL, 0UL, 0UL, 0UL, 1UL, 0UL, 1UL, 0UL, 1UL, 1UL, 1UL, 1UL, 0UL, 0UL, 0UL, 0UL, 1UL,
+    1UL, 1UL, 1UL, 1UL, 1UL, 1UL, 0UL, 0UL, 1UL, 1UL, 1UL, 1UL, 1UL, 0UL, 0UL, 0UL, 0UL, 1UL, 0UL, 1UL, 1UL, 0UL, 1UL,
+    1UL, 1UL, 0UL, 1UL, 1UL, 0UL, 0UL, 1UL, 1UL, 1UL, 1UL, 1UL, 1UL, 0UL, 1UL, 0UL, 0UL, 1UL, 0UL, 0UL, 1UL, 1UL, 0UL,
+    0UL, 0UL, 0UL, 1UL, 1UL, 1UL, 1UL, 1UL, 0UL, 1UL, 0UL, 1UL, 0UL, 1UL, 1UL, 1UL, 1UL, 0UL, 0UL, 1UL, 0UL, 1UL, 1UL,
+    1UL, 1UL, 0UL, 0UL, 1UL, 0UL, 1UL, 1UL, 0UL, 1UL, 0UL, 1UL, 1UL, 0UL, 1UL, 1UL, 1UL, 1UL, 0UL, 1UL, 0UL, 1UL, 1UL,
+    1UL, 1UL, 0UL, 1UL, 1UL, 0UL, 1UL, 0UL, 0UL, 0UL, 1UL, 0UL, 0UL, 0UL, 0UL, 1UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 1UL,
+    0UL, 0UL, 0UL, 1UL, 0UL, 0UL, 1UL, 1UL, 0UL, 0UL, 1UL, 0UL, 0UL, 0UL, 1UL, 0UL, 1UL, 0UL, 0UL, 0UL, 0UL, 0UL, 1UL,
+    0UL, 0UL, 1UL, 1UL, 1UL, 1UL, 0UL, 0UL, 0UL, 0UL, 0UL, 1UL, 1UL, 1UL, 0UL, 0UL, 0UL, 0UL, 0UL, 1UL, 0UL, 1UL, 0UL,
+    0UL, 0UL, 1UL, 0UL, 1UL, 1UL, 1UL, 1UL, 0UL, 1UL, 1UL, 1UL, 1UL, 0UL, 0UL, 0UL, 1UL, 1UL, 1UL, 0UL, 0UL, 0UL, 1UL,
+    1UL, 0UL, 1UL, 0UL, 0UL, 1UL, 1UL, 0UL, 1UL, 0UL, 1UL, 1UL, 1UL, 0UL, 1UL, 1UL, 1UL, 0UL, 1UL, 1UL, 0UL, 0UL, 1UL,
+    0UL, 1UL, 1UL, 1UL, 1UL, 1UL, 0UL, 1UL, 0UL, 0UL, 0UL, 1UL, 0UL, 1UL, 1UL, 0UL, 1UL, 1UL, 0UL, 1UL, 1UL, 0UL, 0UL,
+    0UL, 1UL, 1UL, 1UL, 1UL, 0UL, 1UL, 0UL, 0UL, 0UL, 0UL, 1UL, 1UL, 1UL, 0UL, 0UL, 1UL, 0UL, 1UL, 1UL, 0UL, 1UL, 0UL,
+    0UL, 0UL, 0UL, 1UL, 0UL, 0UL, 1UL, 1UL, 0UL, 1UL, 1UL, 0UL, 0UL, 1UL, 0UL, 0UL, 1UL, 1UL, 0UL, 0UL, 0UL, 0UL, 1UL,
+    0UL, 1UL, 0UL, 0UL, 1UL, 0UL, 1UL, 0UL, 0UL, 0UL, 1UL, 0UL, 1UL, 1UL, 1UL, 0UL, 0UL, 0UL, 1UL, 0UL, 0UL, 1UL, 1UL,
+    1UL, 1UL, 1UL, 1UL, 0UL, 0UL, 1UL, 1UL, 1UL, 1UL, 1UL, 1UL, 1UL, 1UL, 0UL, 1UL, 0UL, 1UL, 0UL, 0UL, 1UL, 1UL, 0UL,
+    0UL, 0UL, 1UL, 1UL, 1UL, 1UL, 1UL, 1UL, 1UL, 1UL, 1UL, 1UL, 1UL, 0UL, 0UL, 1UL, 0UL, 0UL, 1UL, 1UL, 1UL, 0UL, 0UL,
+    0UL, 0UL, 0UL, 0UL, 1UL, 0UL, 1UL, 1UL, 1UL, 0UL, 0UL, 1UL, 1UL, 0UL, 1UL, 1UL, 0UL, 0UL, 1UL, 0UL, 1UL, 0UL, 1UL,
+    1UL, 0UL, 1UL, 1UL, 1UL, 0UL, 0UL, 1UL, 1UL, 0UL, 1UL, 0UL, 1UL, 0UL, 0UL, 0UL, 0UL, 1UL, 0UL, 0UL, 1UL, 0UL, 1UL,
+    1UL, 1UL, 1UL, 0UL, 1UL, 1UL, 1UL, 0UL, 1UL, 0UL, 1UL, 0UL, 1UL, 1UL, 0UL, 0UL, 0UL, 1UL, 0UL, 1UL, 1UL, 0UL, 0UL,
+    1UL, 1UL, 0UL, 0UL, 1UL, 0UL, 0UL, 1UL, 0UL, 0UL, 1UL, 0UL, 0UL, 1UL, 1UL, 1UL, 1UL, 0UL, 1UL, 1UL, 1UL, 1UL, 0UL,
+    0UL, 0UL, 0UL, 1UL, 1UL, 1UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL,
+};
+uint32_t rand_arr_2_b2_w32_arr[1024] = {
+    0UL, 1UL, 0UL, 0UL, 3UL, 1UL, 1UL, 0UL, 3UL, 2UL, 0UL, 1UL, 2UL, 1UL, 2UL, 1UL, 2UL, 0UL, 3UL, 3UL, 1UL, 0UL, 3UL,
+    1UL, 1UL, 0UL, 0UL, 1UL, 3UL, 3UL, 3UL, 3UL, 3UL, 1UL, 2UL, 1UL, 0UL, 3UL, 1UL, 2UL, 3UL, 3UL, 3UL, 3UL, 1UL, 0UL,
+    3UL, 0UL, 1UL, 1UL, 3UL, 3UL, 1UL, 0UL, 2UL, 2UL, 3UL, 0UL, 3UL, 3UL, 1UL, 0UL, 3UL, 0UL, 3UL, 3UL, 1UL, 0UL, 0UL,
+    1UL, 1UL, 3UL, 2UL, 2UL, 3UL, 1UL, 2UL, 0UL, 0UL, 2UL, 0UL, 2UL, 3UL, 1UL, 0UL, 0UL, 2UL, 3UL, 1UL, 2UL, 2UL, 2UL,
+    3UL, 0UL, 3UL, 1UL, 1UL, 3UL, 1UL, 0UL, 0UL, 0UL, 1UL, 3UL, 1UL, 1UL, 2UL, 0UL, 3UL, 1UL, 3UL, 3UL, 2UL, 2UL, 2UL,
+    3UL, 0UL, 3UL, 3UL, 3UL, 1UL, 0UL, 2UL, 2UL, 0UL, 1UL, 1UL, 3UL, 3UL, 1UL, 1UL, 2UL, 1UL, 1UL, 3UL, 1UL, 1UL, 1UL,
+    3UL, 1UL, 1UL, 3UL, 3UL, 0UL, 2UL, 2UL, 3UL, 1UL, 3UL, 2UL, 1UL, 2UL, 1UL, 2UL, 1UL, 0UL, 3UL, 1UL, 0UL, 0UL, 0UL,
+    2UL, 2UL, 3UL, 3UL, 1UL, 0UL, 2UL, 2UL, 0UL, 0UL, 2UL, 2UL, 2UL, 0UL, 1UL, 3UL, 3UL, 0UL, 2UL, 0UL, 2UL, 3UL, 0UL,
+    2UL, 2UL, 2UL, 1UL, 1UL, 3UL, 2UL, 3UL, 0UL, 0UL, 0UL, 2UL, 0UL, 3UL, 1UL, 0UL, 3UL, 0UL, 0UL, 1UL, 1UL, 1UL, 0UL,
+    0UL, 3UL, 1UL, 0UL, 2UL, 1UL, 1UL, 2UL, 2UL, 2UL, 0UL, 0UL, 3UL, 1UL, 0UL, 3UL, 0UL, 3UL, 3UL, 3UL, 0UL, 3UL, 2UL,
+    2UL, 2UL, 0UL, 2UL, 1UL, 3UL, 2UL, 3UL, 0UL, 2UL, 2UL, 0UL, 0UL, 3UL, 1UL, 1UL, 3UL, 1UL, 0UL, 2UL, 1UL, 3UL, 1UL,
+    1UL, 2UL, 3UL, 0UL, 1UL, 3UL, 3UL, 2UL, 1UL, 3UL, 1UL, 0UL, 1UL, 1UL, 2UL, 1UL, 3UL, 0UL, 2UL, 0UL, 0UL, 2UL, 1UL,
+    3UL, 1UL, 1UL, 1UL, 1UL, 2UL, 0UL, 1UL, 1UL, 2UL, 2UL, 0UL, 1UL, 1UL, 2UL, 1UL, 1UL, 3UL, 3UL, 2UL, 2UL, 0UL, 1UL,
+    0UL, 2UL, 0UL, 0UL, 0UL, 0UL, 0UL, 3UL, 0UL, 2UL, 0UL, 2UL, 0UL, 1UL, 1UL, 2UL, 3UL, 3UL, 0UL, 1UL, 3UL, 1UL, 0UL,
+    2UL, 1UL, 1UL, 2UL, 3UL, 1UL, 1UL, 2UL, 3UL, 3UL, 1UL, 3UL, 2UL, 1UL, 3UL, 3UL, 1UL, 0UL, 1UL, 0UL, 3UL, 1UL, 3UL,
+    1UL, 1UL, 3UL, 1UL, 0UL, 3UL, 3UL, 2UL, 2UL, 3UL, 0UL, 2UL, 0UL, 2UL, 0UL, 2UL, 2UL, 3UL, 1UL, 2UL, 3UL, 1UL, 0UL,
+    0UL, 1UL, 1UL, 0UL, 1UL, 1UL, 1UL, 1UL, 0UL, 3UL, 1UL, 1UL, 1UL, 3UL, 1UL, 1UL, 1UL, 2UL, 0UL, 3UL, 1UL, 2UL, 0UL,
+    3UL, 2UL, 3UL, 3UL, 0UL, 2UL, 2UL, 3UL, 0UL, 0UL, 3UL, 1UL, 3UL, 0UL, 0UL, 0UL, 0UL, 0UL, 2UL, 0UL, 2UL, 2UL, 2UL,
+    3UL, 3UL, 1UL, 2UL, 0UL, 0UL, 3UL, 2UL, 0UL, 1UL, 2UL, 1UL, 1UL, 3UL, 2UL, 1UL, 3UL, 2UL, 3UL, 1UL, 1UL, 2UL, 1UL,
+    1UL, 3UL, 0UL, 1UL, 0UL, 2UL, 3UL, 1UL, 2UL, 0UL, 1UL, 2UL, 2UL, 0UL, 3UL, 0UL, 2UL, 3UL, 1UL, 2UL, 1UL, 1UL, 2UL,
+    1UL, 1UL, 1UL, 2UL, 3UL, 3UL, 3UL, 3UL, 0UL, 3UL, 0UL, 0UL, 2UL, 2UL, 0UL, 1UL, 3UL, 0UL, 1UL, 3UL, 1UL, 3UL, 3UL,
+    2UL, 2UL, 2UL, 3UL, 2UL, 2UL, 2UL, 2UL, 1UL, 3UL, 0UL, 1UL, 3UL, 2UL, 1UL, 1UL, 3UL, 1UL, 0UL, 1UL, 1UL, 1UL, 0UL,
+    1UL, 0UL, 1UL, 2UL, 2UL, 0UL, 3UL, 1UL, 2UL, 2UL, 1UL, 0UL, 1UL, 3UL, 0UL, 0UL, 1UL, 0UL, 1UL, 1UL, 1UL, 0UL, 3UL,
+    0UL, 0UL, 2UL, 1UL, 0UL, 1UL, 0UL, 1UL, 3UL, 0UL, 3UL, 1UL, 3UL, 1UL, 3UL, 1UL, 1UL, 1UL, 2UL, 0UL, 2UL, 0UL, 3UL,
+    0UL, 0UL, 0UL, 3UL, 1UL, 2UL, 3UL, 3UL, 3UL, 2UL, 3UL, 0UL, 1UL, 1UL, 2UL, 0UL, 1UL, 0UL, 2UL, 2UL, 2UL, 2UL, 3UL,
+    1UL, 1UL, 0UL, 3UL, 2UL, 2UL, 3UL, 1UL, 3UL, 0UL, 0UL, 3UL, 0UL, 0UL, 1UL, 1UL, 0UL, 2UL, 1UL, 2UL, 3UL, 0UL, 2UL,
+    0UL, 0UL, 3UL, 0UL, 0UL, 0UL, 2UL, 1UL, 1UL, 1UL, 3UL, 3UL, 3UL, 0UL, 1UL, 2UL, 2UL, 2UL, 3UL, 2UL, 0UL, 0UL, 3UL,
+    1UL, 2UL, 0UL, 2UL, 0UL, 1UL, 0UL, 1UL, 1UL, 2UL, 3UL, 0UL, 0UL, 3UL, 3UL, 2UL, 3UL, 0UL, 3UL, 2UL, 0UL, 0UL, 1UL,
+    2UL, 1UL, 3UL, 2UL, 0UL, 2UL, 2UL, 3UL, 2UL, 0UL, 1UL, 0UL, 2UL, 2UL, 1UL, 1UL, 0UL, 3UL, 1UL, 3UL, 2UL, 1UL, 3UL,
+    0UL, 2UL, 0UL, 3UL, 3UL, 1UL, 2UL, 3UL, 3UL, 3UL, 1UL, 1UL, 2UL, 2UL, 2UL, 3UL, 0UL, 0UL, 0UL, 3UL, 3UL, 0UL, 0UL,
+    0UL, 0UL, 2UL, 2UL, 1UL, 0UL, 3UL, 0UL, 0UL, 0UL, 0UL, 1UL, 1UL, 2UL, 3UL, 3UL, 1UL, 3UL, 3UL, 3UL, 0UL, 2UL, 0UL,
+    0UL, 0UL, 0UL, 1UL, 1UL, 0UL, 2UL, 3UL, 1UL, 2UL, 2UL, 3UL, 0UL, 0UL, 2UL, 0UL, 1UL, 0UL, 3UL, 1UL, 2UL, 0UL, 3UL,
+    1UL, 0UL, 2UL, 3UL, 3UL, 2UL, 2UL, 0UL, 2UL, 3UL, 1UL, 2UL, 3UL, 1UL, 3UL, 0UL, 2UL, 0UL, 3UL, 1UL, 0UL, 3UL, 3UL,
+    2UL, 3UL, 2UL, 0UL, 1UL, 2UL, 3UL, 3UL, 1UL, 3UL, 0UL, 0UL, 0UL, 0UL, 0UL, 2UL, 2UL, 1UL, 2UL, 1UL, 2UL, 3UL, 0UL,
+    0UL, 2UL, 0UL, 0UL, 0UL, 0UL, 1UL, 0UL, 1UL, 3UL, 2UL, 1UL, 3UL, 2UL, 2UL, 3UL, 2UL, 2UL, 3UL, 3UL, 3UL, 1UL, 3UL,
+    1UL, 3UL, 3UL, 0UL, 3UL, 0UL, 2UL, 2UL, 0UL, 0UL, 3UL, 0UL, 1UL, 1UL, 0UL, 0UL, 1UL, 3UL, 3UL, 2UL, 1UL, 0UL, 0UL,
+    2UL, 2UL, 2UL, 3UL, 0UL, 2UL, 0UL, 3UL, 2UL, 3UL, 0UL, 1UL, 0UL, 3UL, 3UL, 3UL, 1UL, 2UL, 3UL, 3UL, 1UL, 0UL, 2UL,
+    3UL, 0UL, 2UL, 3UL, 3UL, 0UL, 1UL, 3UL, 2UL, 2UL, 2UL, 0UL, 1UL, 2UL, 0UL, 0UL, 3UL, 0UL, 0UL, 0UL, 3UL, 2UL, 2UL,
+    1UL, 1UL, 2UL, 2UL, 0UL, 3UL, 1UL, 2UL, 0UL, 0UL, 3UL, 2UL, 3UL, 2UL, 3UL, 2UL, 0UL, 2UL, 2UL, 3UL, 1UL, 1UL, 2UL,
+    0UL, 3UL, 1UL, 1UL, 2UL, 2UL, 1UL, 3UL, 1UL, 2UL, 3UL, 3UL, 0UL, 1UL, 0UL, 1UL, 3UL, 0UL, 1UL, 1UL, 3UL, 1UL, 0UL,
+    0UL, 2UL, 3UL, 1UL, 0UL, 1UL, 2UL, 3UL, 0UL, 2UL, 1UL, 3UL, 2UL, 1UL, 1UL, 3UL, 2UL, 2UL, 2UL, 1UL, 3UL, 2UL, 1UL,
+    2UL, 0UL, 1UL, 3UL, 2UL, 2UL, 1UL, 0UL, 3UL, 2UL, 2UL, 0UL, 0UL, 3UL, 3UL, 1UL, 0UL, 0UL, 1UL, 0UL, 3UL, 3UL, 1UL,
+    1UL, 3UL, 1UL, 3UL, 2UL, 1UL, 1UL, 0UL, 0UL, 0UL, 0UL, 2UL, 1UL, 2UL, 3UL, 1UL, 0UL, 2UL, 2UL, 3UL, 2UL, 1UL, 2UL,
+    0UL, 1UL, 0UL, 1UL, 2UL, 0UL, 1UL, 3UL, 0UL, 1UL, 3UL, 3UL, 2UL, 1UL, 0UL, 1UL, 1UL, 1UL, 2UL, 2UL, 0UL, 3UL, 0UL,
+    1UL, 3UL, 3UL, 0UL, 3UL, 1UL, 2UL, 3UL, 2UL, 1UL, 1UL, 3UL,
+};
+uint32_t rand_arr_3_b3_w32_arr[1024] = {
+    6UL, 0UL, 2UL, 4UL, 1UL, 4UL, 6UL, 1UL, 1UL, 2UL, 2UL, 3UL, 2UL, 7UL, 1UL, 0UL, 1UL, 2UL, 0UL, 7UL, 5UL, 3UL, 0UL,
+    2UL, 1UL, 4UL, 5UL, 5UL, 1UL, 0UL, 6UL, 6UL, 5UL, 6UL, 4UL, 4UL, 0UL, 0UL, 0UL, 5UL, 0UL, 6UL, 4UL, 7UL, 5UL, 5UL,
+    7UL, 0UL, 7UL, 6UL, 3UL, 4UL, 3UL, 0UL, 2UL, 1UL, 0UL, 2UL, 4UL, 1UL, 5UL, 0UL, 7UL, 4UL, 1UL, 4UL, 3UL, 4UL, 6UL,
+    5UL, 0UL, 3UL, 7UL, 2UL, 6UL, 3UL, 7UL, 6UL, 4UL, 0UL, 0UL, 0UL, 5UL, 1UL, 5UL, 0UL, 7UL, 0UL, 3UL, 6UL, 6UL, 5UL,
+    3UL, 5UL, 4UL, 2UL, 1UL, 5UL, 1UL, 2UL, 6UL, 0UL, 2UL, 0UL, 3UL, 0UL, 4UL, 3UL, 3UL, 1UL, 5UL, 7UL, 3UL, 1UL, 3UL,
+    7UL, 4UL, 3UL, 6UL, 5UL, 0UL, 1UL, 1UL, 5UL, 7UL, 4UL, 4UL, 3UL, 3UL, 7UL, 4UL, 7UL, 1UL, 0UL, 1UL, 5UL, 1UL, 0UL,
+    3UL, 7UL, 3UL, 7UL, 6UL, 6UL, 4UL, 6UL, 1UL, 7UL, 3UL, 5UL, 6UL, 3UL, 5UL, 1UL, 5UL, 6UL, 6UL, 3UL, 2UL, 5UL, 3UL,
+    3UL, 6UL, 0UL, 7UL, 6UL, 2UL, 0UL, 6UL, 6UL, 3UL, 0UL, 3UL, 5UL, 2UL, 5UL, 2UL, 0UL, 1UL, 5UL, 1UL, 1UL, 4UL, 4UL,
+    2UL, 3UL, 5UL, 1UL, 1UL, 5UL, 0UL, 0UL, 7UL, 2UL, 6UL, 5UL, 5UL, 5UL, 3UL, 5UL, 2UL, 1UL, 1UL, 5UL, 0UL, 4UL, 0UL,
+    4UL, 1UL, 6UL, 5UL, 3UL, 4UL, 6UL, 3UL, 6UL, 1UL, 5UL, 7UL, 0UL, 1UL, 4UL, 7UL, 7UL, 0UL, 0UL, 3UL, 2UL, 1UL, 1UL,
+    0UL, 1UL, 3UL, 4UL, 0UL, 4UL, 1UL, 0UL, 0UL, 1UL, 5UL, 1UL, 2UL, 3UL, 0UL, 4UL, 6UL, 5UL, 1UL, 5UL, 3UL, 5UL, 4UL,
+    5UL, 7UL, 2UL, 0UL, 5UL, 2UL, 5UL, 5UL, 0UL, 1UL, 0UL, 3UL, 4UL, 6UL, 6UL, 7UL, 6UL, 1UL, 6UL, 5UL, 4UL, 5UL, 7UL,
+    4UL, 6UL, 1UL, 0UL, 3UL, 3UL, 4UL, 7UL, 7UL, 7UL, 0UL, 3UL, 4UL, 7UL, 0UL, 4UL, 6UL, 3UL, 6UL, 5UL, 4UL, 5UL, 5UL,
+    4UL, 1UL, 1UL, 6UL, 6UL, 4UL, 1UL, 3UL, 3UL, 5UL, 0UL, 0UL, 3UL, 4UL, 1UL, 6UL, 2UL, 1UL, 6UL, 2UL, 7UL, 1UL, 0UL,
+    5UL, 6UL, 2UL, 2UL, 1UL, 3UL, 1UL, 4UL, 5UL, 3UL, 7UL, 1UL, 6UL, 2UL, 6UL, 5UL, 4UL, 0UL, 5UL, 0UL, 2UL, 0UL, 6UL,
+    2UL, 2UL, 6UL, 2UL, 6UL, 0UL, 1UL, 4UL, 6UL, 2UL, 0UL, 3UL, 7UL, 5UL, 2UL, 2UL, 1UL, 1UL, 1UL, 0UL, 1UL, 1UL, 4UL,
+    0UL, 4UL, 3UL, 5UL, 0UL, 1UL, 2UL, 4UL, 6UL, 3UL, 6UL, 3UL, 4UL, 5UL, 2UL, 2UL, 4UL, 0UL, 2UL, 7UL, 4UL, 0UL, 3UL,
+    2UL, 4UL, 3UL, 2UL, 6UL, 3UL, 0UL, 4UL, 7UL, 5UL, 0UL, 4UL, 5UL, 3UL, 5UL, 1UL, 2UL, 4UL, 1UL, 4UL, 3UL, 6UL, 1UL,
+    3UL, 0UL, 7UL, 4UL, 1UL, 1UL, 1UL, 1UL, 5UL, 7UL, 6UL, 3UL, 0UL, 7UL, 5UL, 0UL, 0UL, 7UL, 4UL, 6UL, 3UL, 5UL, 5UL,
+    2UL, 3UL, 7UL, 3UL, 3UL, 2UL, 5UL, 4UL, 6UL, 2UL, 5UL, 0UL, 5UL, 1UL, 1UL, 4UL, 5UL, 7UL, 5UL, 0UL, 5UL, 5UL, 0UL,
+    4UL, 7UL, 2UL, 6UL, 6UL, 2UL, 1UL, 2UL, 2UL, 2UL, 4UL, 7UL, 6UL, 2UL, 0UL, 1UL, 6UL, 0UL, 7UL, 2UL, 6UL, 4UL, 5UL,
+    5UL, 5UL, 1UL, 6UL, 3UL, 3UL, 1UL, 3UL, 5UL, 3UL, 5UL, 5UL, 2UL, 6UL, 6UL, 1UL, 6UL, 5UL, 1UL, 0UL, 5UL, 7UL, 7UL,
+    0UL, 4UL, 5UL, 0UL, 3UL, 7UL, 2UL, 6UL, 6UL, 3UL, 1UL, 6UL, 5UL, 6UL, 4UL, 2UL, 7UL, 0UL, 1UL, 4UL, 1UL, 7UL, 2UL,
+    0UL, 2UL, 1UL, 0UL, 0UL, 2UL, 5UL, 4UL, 3UL, 5UL, 1UL, 7UL, 6UL, 0UL, 0UL, 3UL, 2UL, 4UL, 4UL, 5UL, 7UL, 0UL, 1UL,
+    1UL, 2UL, 4UL, 4UL, 1UL, 5UL, 5UL, 2UL, 2UL, 1UL, 2UL, 2UL, 7UL, 7UL, 2UL, 3UL, 4UL, 0UL, 1UL, 0UL, 7UL, 6UL, 2UL,
+    3UL, 1UL, 4UL, 4UL, 1UL, 3UL, 3UL, 2UL, 7UL, 5UL, 4UL, 6UL, 0UL, 7UL, 6UL, 6UL, 7UL, 5UL, 1UL, 2UL, 0UL, 2UL, 7UL,
+    7UL, 4UL, 5UL, 5UL, 4UL, 5UL, 2UL, 5UL, 6UL, 3UL, 4UL, 1UL, 0UL, 0UL, 0UL, 7UL, 2UL, 3UL, 2UL, 3UL, 0UL, 5UL, 3UL,
+    7UL, 2UL, 5UL, 6UL, 4UL, 0UL, 0UL, 2UL, 6UL, 6UL, 0UL, 0UL, 1UL, 6UL, 6UL, 5UL, 4UL, 2UL, 1UL, 3UL, 1UL, 5UL, 6UL,
+    5UL, 1UL, 1UL, 7UL, 4UL, 6UL, 3UL, 1UL, 7UL, 5UL, 2UL, 0UL, 3UL, 5UL, 7UL, 1UL, 0UL, 5UL, 6UL, 5UL, 0UL, 4UL, 1UL,
+    6UL, 1UL, 4UL, 2UL, 0UL, 2UL, 4UL, 0UL, 7UL, 4UL, 4UL, 6UL, 5UL, 1UL, 5UL, 3UL, 0UL, 3UL, 1UL, 1UL, 0UL, 4UL, 2UL,
+    4UL, 5UL, 2UL, 5UL, 2UL, 7UL, 4UL, 4UL, 0UL, 3UL, 7UL, 1UL, 2UL, 0UL, 6UL, 2UL, 3UL, 3UL, 5UL, 1UL, 6UL, 3UL, 7UL,
+    0UL, 2UL, 5UL, 2UL, 4UL, 3UL, 5UL, 7UL, 2UL, 5UL, 7UL, 4UL, 4UL, 6UL, 2UL, 2UL, 4UL, 0UL, 0UL, 5UL, 2UL, 2UL, 2UL,
+    2UL, 4UL, 3UL, 5UL, 2UL, 3UL, 2UL, 1UL, 3UL, 4UL, 5UL, 4UL, 5UL, 5UL, 4UL, 7UL, 0UL, 7UL, 6UL, 7UL, 2UL, 1UL, 4UL,
+    5UL, 6UL, 1UL, 5UL, 6UL, 0UL, 3UL, 3UL, 4UL, 5UL, 1UL, 6UL, 4UL, 5UL, 3UL, 1UL, 0UL, 2UL, 5UL, 5UL, 5UL, 7UL, 0UL,
+    1UL, 5UL, 4UL, 5UL, 6UL, 5UL, 1UL, 5UL, 1UL, 7UL, 3UL, 5UL, 0UL, 3UL, 6UL, 7UL, 1UL, 7UL, 5UL, 5UL, 4UL, 2UL, 5UL,
+    2UL, 6UL, 7UL, 4UL, 5UL, 6UL, 0UL, 1UL, 5UL, 1UL, 6UL, 1UL, 7UL, 1UL, 1UL, 7UL, 4UL, 3UL, 4UL, 2UL, 5UL, 7UL, 0UL,
+    3UL, 6UL, 4UL, 4UL, 3UL, 5UL, 3UL, 5UL, 6UL, 2UL, 1UL, 0UL, 5UL, 4UL, 5UL, 0UL, 1UL, 3UL, 7UL, 7UL, 4UL, 4UL, 2UL,
+    0UL, 7UL, 4UL, 1UL, 5UL, 0UL, 1UL, 3UL, 5UL, 4UL, 7UL, 4UL, 5UL, 5UL, 0UL, 5UL, 7UL, 2UL, 6UL, 2UL, 1UL, 4UL, 2UL,
+    5UL, 0UL, 1UL, 0UL, 2UL, 5UL, 6UL, 4UL, 1UL, 0UL, 5UL, 4UL, 7UL, 7UL, 6UL, 1UL, 5UL, 1UL, 5UL, 4UL, 6UL, 1UL, 3UL,
+    1UL, 6UL, 1UL, 7UL, 0UL, 3UL, 7UL, 7UL, 2UL, 5UL, 1UL, 1UL, 2UL, 5UL, 1UL, 5UL, 6UL, 5UL, 6UL, 4UL, 2UL, 5UL, 0UL,
+    5UL, 3UL, 1UL, 3UL, 6UL, 7UL, 1UL, 7UL, 7UL, 2UL, 3UL, 3UL, 3UL, 3UL, 7UL, 1UL, 0UL, 5UL, 1UL, 7UL, 1UL, 1UL, 5UL,
+    4UL, 2UL, 5UL, 2UL, 2UL, 2UL, 7UL, 0UL, 4UL, 1UL, 1UL, 6UL, 7UL, 3UL, 4UL, 6UL, 1UL, 2UL, 4UL, 7UL, 0UL, 0UL, 5UL,
+    5UL, 1UL, 6UL, 4UL, 6UL, 7UL, 5UL, 0UL, 5UL, 0UL, 7UL, 4UL, 6UL, 0UL, 4UL, 7UL, 1UL, 2UL, 3UL, 4UL, 5UL, 3UL, 1UL,
+    7UL, 3UL, 2UL, 2UL, 6UL, 4UL, 0UL, 2UL, 2UL, 4UL, 3UL, 5UL, 0UL, 7UL, 5UL, 1UL, 3UL, 4UL, 5UL, 7UL, 5UL, 3UL, 1UL,
+    5UL, 3UL, 7UL, 7UL, 3UL, 2UL, 1UL, 1UL, 6UL, 4UL, 4UL, 1UL,
+};
+uint32_t rand_arr_4_b4_w32_arr[1024] = {
+    3UL,  10UL, 7UL,  2UL,  11UL, 6UL,  14UL, 0UL,  1UL,  8UL,  9UL,  9UL,  14UL, 15UL, 13UL, 13UL, 14UL, 11UL, 8UL,
+    6UL,  10UL, 10UL, 7UL,  5UL,  11UL, 2UL,  14UL, 10UL, 7UL,  4UL,  5UL,  12UL, 10UL, 3UL,  11UL, 10UL, 11UL, 8UL,
+    7UL,  7UL,  7UL,  4UL,  11UL, 14UL, 9UL,  6UL,  4UL,  11UL, 13UL, 1UL,  15UL, 0UL,  7UL,  12UL, 14UL, 15UL, 4UL,
+    15UL, 1UL,  4UL,  7UL,  8UL,  7UL,  5UL,  8UL,  7UL,  1UL,  10UL, 8UL,  13UL, 13UL, 6UL,  7UL,  15UL, 14UL, 13UL,
+    8UL,  4UL,  13UL, 11UL, 8UL,  3UL,  3UL,  2UL,  9UL,  4UL,  15UL, 12UL, 10UL, 13UL, 15UL, 7UL,  11UL, 6UL,  6UL,
+    7UL,  7UL,  12UL, 1UL,  3UL,  14UL, 6UL,  8UL,  14UL, 14UL, 4UL,  2UL,  6UL,  6UL,  7UL,  15UL, 10UL, 15UL, 3UL,
+    4UL,  15UL, 1UL,  8UL,  5UL,  13UL, 13UL, 15UL, 7UL,  11UL, 1UL,  12UL, 0UL,  15UL, 11UL, 13UL, 4UL,  10UL, 8UL,
+    7UL,  6UL,  2UL,  0UL,  10UL, 11UL, 3UL,  13UL, 3UL,  2UL,  11UL, 5UL,  9UL,  5UL,  11UL, 9UL,  0UL,  4UL,  7UL,
+    7UL,  12UL, 2UL,  3UL,  8UL,  10UL, 7UL,  12UL, 12UL, 2UL,  4UL,  8UL,  8UL,  5UL,  7UL,  12UL, 8UL,  12UL, 8UL,
+    4UL,  15UL, 13UL, 2UL,  6UL,  8UL,  14UL, 0UL,  6UL,  7UL,  12UL, 10UL, 8UL,  11UL, 9UL,  6UL,  9UL,  12UL, 8UL,
+    5UL,  2UL,  10UL, 11UL, 5UL,  10UL, 7UL,  2UL,  6UL,  12UL, 10UL, 12UL, 13UL, 15UL, 15UL, 0UL,  10UL, 3UL,  3UL,
+    7UL,  6UL,  15UL, 14UL, 11UL, 4UL,  11UL, 9UL,  3UL,  9UL,  5UL,  7UL,  5UL,  12UL, 4UL,  1UL,  4UL,  3UL,  6UL,
+    14UL, 2UL,  4UL,  3UL,  12UL, 11UL, 6UL,  14UL, 12UL, 14UL, 0UL,  8UL,  14UL, 14UL, 2UL,  7UL,  7UL,  0UL,  14UL,
+    7UL,  9UL,  7UL,  15UL, 14UL, 10UL, 2UL,  8UL,  14UL, 9UL,  15UL, 7UL,  12UL, 12UL, 11UL, 8UL,  3UL,  0UL,  0UL,
+    7UL,  8UL,  7UL,  3UL,  11UL, 7UL,  6UL,  15UL, 7UL,  13UL, 6UL,  0UL,  5UL,  8UL,  5UL,  2UL,  2UL,  3UL,  4UL,
+    7UL,  9UL,  3UL,  4UL,  14UL, 14UL, 9UL,  7UL,  3UL,  10UL, 7UL,  15UL, 10UL, 1UL,  6UL,  6UL,  1UL,  1UL,  4UL,
+    13UL, 0UL,  5UL,  4UL,  1UL,  7UL,  9UL,  1UL,  7UL,  1UL,  14UL, 0UL,  7UL,  5UL,  8UL,  9UL,  2UL,  6UL,  9UL,
+    0UL,  2UL,  15UL, 2UL,  10UL, 12UL, 4UL,  3UL,  5UL,  6UL,  14UL, 7UL,  7UL,  11UL, 2UL,  11UL, 3UL,  10UL, 1UL,
+    10UL, 13UL, 11UL, 7UL,  0UL,  2UL,  3UL,  6UL,  12UL, 13UL, 1UL,  13UL, 5UL,  15UL, 1UL,  11UL, 2UL,  13UL, 5UL,
+    13UL, 6UL,  8UL,  15UL, 6UL,  7UL,  0UL,  8UL,  10UL, 4UL,  7UL,  1UL,  3UL,  9UL,  2UL,  1UL,  7UL,  13UL, 13UL,
+    15UL, 6UL,  4UL,  3UL,  7UL,  13UL, 14UL, 10UL, 0UL,  3UL,  12UL, 11UL, 2UL,  1UL,  11UL, 12UL, 14UL, 8UL,  15UL,
+    1UL,  13UL, 14UL, 2UL,  12UL, 5UL,  5UL,  10UL, 3UL,  15UL, 15UL, 0UL,  1UL,  2UL,  13UL, 8UL,  2UL,  8UL,  5UL,
+    8UL,  9UL,  1UL,  1UL,  2UL,  10UL, 13UL, 2UL,  14UL, 13UL, 12UL, 10UL, 11UL, 9UL,  13UL, 2UL,  1UL,  14UL, 2UL,
+    4UL,  10UL, 1UL,  11UL, 15UL, 8UL,  0UL,  2UL,  14UL, 1UL,  4UL,  15UL, 10UL, 13UL, 8UL,  5UL,  14UL, 7UL,  14UL,
+    12UL, 2UL,  6UL,  6UL,  0UL,  2UL,  3UL,  3UL,  9UL,  9UL,  1UL,  6UL,  5UL,  9UL,  12UL, 7UL,  8UL,  15UL, 5UL,
+    5UL,  10UL, 1UL,  0UL,  9UL,  12UL, 11UL, 8UL,  15UL, 11UL, 6UL,  11UL, 14UL, 4UL,  4UL,  8UL,  0UL,  12UL, 8UL,
+    6UL,  11UL, 6UL,  13UL, 4UL,  10UL, 12UL, 10UL, 8UL,  3UL,  13UL, 12UL, 12UL, 1UL,  5UL,  7UL,  10UL, 9UL,  15UL,
+    5UL,  7UL,  12UL, 7UL,  13UL, 14UL, 14UL, 6UL,  8UL,  9UL,  2UL,  3UL,  10UL, 3UL,  11UL, 4UL,  9UL,  12UL, 10UL,
+    15UL, 9UL,  3UL,  14UL, 6UL,  4UL,  1UL,  8UL,  6UL,  7UL,  15UL, 11UL, 1UL,  10UL, 13UL, 5UL,  12UL, 11UL, 7UL,
+    0UL,  8UL,  2UL,  0UL,  12UL, 8UL,  0UL,  0UL,  6UL,  5UL,  5UL,  13UL, 12UL, 11UL, 12UL, 5UL,  5UL,  10UL, 10UL,
+    13UL, 10UL, 13UL, 1UL,  7UL,  1UL,  5UL,  9UL,  7UL,  10UL, 11UL, 5UL,  7UL,  1UL,  12UL, 14UL, 11UL, 6UL,  0UL,
+    5UL,  4UL,  10UL, 5UL,  14UL, 10UL, 7UL,  3UL,  8UL,  0UL,  14UL, 1UL,  6UL,  1UL,  6UL,  14UL, 5UL,  9UL,  2UL,
+    0UL,  15UL, 11UL, 13UL, 0UL,  14UL, 12UL, 15UL, 3UL,  2UL,  13UL, 6UL,  7UL,  6UL,  7UL,  2UL,  10UL, 7UL,  3UL,
+    5UL,  5UL,  3UL,  6UL,  1UL,  12UL, 3UL,  10UL, 8UL,  7UL,  8UL,  15UL, 6UL,  5UL,  15UL, 5UL,  13UL, 10UL, 6UL,
+    4UL,  12UL, 9UL,  6UL,  8UL,  6UL,  7UL,  15UL, 12UL, 12UL, 1UL,  10UL, 15UL, 10UL, 14UL, 6UL,  14UL, 7UL,  1UL,
+    4UL,  9UL,  9UL,  8UL,  11UL, 15UL, 8UL,  8UL,  2UL,  6UL,  12UL, 14UL, 14UL, 5UL,  7UL,  3UL,  0UL,  13UL, 12UL,
+    15UL, 6UL,  13UL, 2UL,  4UL,  4UL,  11UL, 10UL, 4UL,  5UL,  1UL,  10UL, 13UL, 4UL,  9UL,  14UL, 9UL,  10UL, 11UL,
+    5UL,  15UL, 9UL,  2UL,  9UL,  2UL,  6UL,  2UL,  11UL, 4UL,  0UL,  9UL,  11UL, 14UL, 5UL,  11UL, 2UL,  11UL, 5UL,
+    5UL,  13UL, 14UL, 3UL,  13UL, 1UL,  1UL,  14UL, 11UL, 3UL,  14UL, 8UL,  9UL,  5UL,  6UL,  12UL, 5UL,  7UL,  2UL,
+    0UL,  8UL,  4UL,  8UL,  4UL,  9UL,  13UL, 6UL,  15UL, 8UL,  11UL, 11UL, 9UL,  2UL,  6UL,  12UL, 2UL,  9UL,  2UL,
+    6UL,  12UL, 2UL,  6UL,  4UL,  8UL,  4UL,  1UL,  4UL,  8UL,  15UL, 2UL,  8UL,  10UL, 6UL,  1UL,  5UL,  1UL,  10UL,
+    1UL,  3UL,  11UL, 12UL, 10UL, 6UL,  15UL, 1UL,  1UL,  6UL,  13UL, 8UL,  4UL,  3UL,  13UL, 7UL,  4UL,  8UL,  9UL,
+    8UL,  1UL,  12UL, 1UL,  8UL,  8UL,  4UL,  5UL,  15UL, 14UL, 8UL,  11UL, 0UL,  12UL, 14UL, 6UL,  2UL,  2UL,  3UL,
+    10UL, 9UL,  2UL,  10UL, 10UL, 13UL, 15UL, 3UL,  0UL,  7UL,  14UL, 13UL, 0UL,  6UL,  9UL,  15UL, 8UL,  12UL, 5UL,
+    1UL,  1UL,  5UL,  10UL, 11UL, 13UL, 12UL, 4UL,  6UL,  2UL,  12UL, 3UL,  2UL,  5UL,  5UL,  14UL, 6UL,  1UL,  7UL,
+    13UL, 9UL,  4UL,  3UL,  2UL,  10UL, 2UL,  15UL, 11UL, 9UL,  7UL,  1UL,  7UL,  14UL, 5UL,  1UL,  5UL,  14UL, 11UL,
+    6UL,  12UL, 12UL, 3UL,  0UL,  10UL, 11UL, 1UL,  0UL,  12UL, 8UL,  15UL, 15UL, 7UL,  10UL, 1UL,  15UL, 5UL,  1UL,
+    15UL, 3UL,  3UL,  6UL,  2UL,  12UL, 9UL,  7UL,  3UL,  11UL, 13UL, 12UL, 15UL, 12UL, 12UL, 2UL,  4UL,  12UL, 5UL,
+    10UL, 2UL,  13UL, 2UL,  6UL,  6UL,  13UL, 15UL, 1UL,  8UL,  6UL,  3UL,  13UL, 4UL,  9UL,  12UL, 12UL, 13UL, 11UL,
+    6UL,  15UL, 9UL,  7UL,  1UL,  5UL,  7UL,  1UL,  4UL,  5UL,  2UL,  12UL, 11UL, 9UL,  8UL,  9UL,  6UL,  15UL, 7UL,
+    14UL, 1UL,  6UL,  10UL, 12UL, 13UL, 1UL,  1UL,  13UL, 2UL,  2UL,  10UL, 9UL,  9UL,  14UL, 11UL, 9UL,  7UL,  8UL,
+    2UL,  3UL,  2UL,  0UL,  1UL,  14UL, 3UL,  5UL,  5UL,  0UL,  3UL,  1UL,  6UL,  5UL,  0UL,  15UL, 6UL,  14UL, 3UL,
+    10UL, 2UL,  7UL,  6UL,  10UL, 6UL,  4UL,  0UL,  6UL,  10UL, 1UL,  5UL,  4UL,  14UL, 5UL,  14UL, 0UL,  15UL, 15UL,
+    3UL,  12UL, 6UL,  9UL,  11UL, 2UL,  3UL,  8UL,  8UL,  10UL, 10UL, 11UL, 8UL,  5UL,  6UL,  13UL, 9UL,
+};
+uint32_t rand_arr_5_b5_w32_arr[1024] = {
+    1UL,  0UL,  24UL, 3UL,  1UL,  28UL, 29UL, 12UL, 27UL, 31UL, 18UL, 1UL,  31UL, 21UL, 12UL, 8UL,  24UL, 27UL, 26UL,
+    1UL,  26UL, 16UL, 7UL,  5UL,  15UL, 29UL, 29UL, 18UL, 5UL,  29UL, 2UL,  17UL, 16UL, 6UL,  31UL, 14UL, 22UL, 25UL,
+    22UL, 11UL, 9UL,  29UL, 29UL, 24UL, 8UL,  22UL, 16UL, 24UL, 4UL,  1UL,  7UL,  23UL, 0UL,  1UL,  14UL, 9UL,  1UL,
+    23UL, 10UL, 10UL, 18UL, 12UL, 16UL, 4UL,  1UL,  24UL, 30UL, 29UL, 15UL, 21UL, 4UL,  21UL, 26UL, 1UL,  11UL, 0UL,
+    22UL, 18UL, 8UL,  1UL,  20UL, 19UL, 9UL,  25UL, 14UL, 7UL,  28UL, 23UL, 7UL,  25UL, 4UL,  25UL, 3UL,  9UL,  20UL,
+    18UL, 3UL,  12UL, 8UL,  4UL,  25UL, 30UL, 26UL, 9UL,  21UL, 20UL, 15UL, 27UL, 30UL, 9UL,  5UL,  4UL,  30UL, 24UL,
+    2UL,  1UL,  7UL,  22UL, 18UL, 21UL, 6UL,  16UL, 26UL, 24UL, 3UL,  18UL, 17UL, 15UL, 8UL,  20UL, 7UL,  26UL, 24UL,
+    23UL, 5UL,  26UL, 25UL, 25UL, 3UL,  10UL, 21UL, 19UL, 19UL, 3UL,  1UL,  11UL, 25UL, 24UL, 25UL, 0UL,  28UL, 4UL,
+    27UL, 7UL,  9UL,  15UL, 4UL,  14UL, 25UL, 21UL, 7UL,  25UL, 12UL, 9UL,  13UL, 17UL, 5UL,  30UL, 28UL, 15UL, 17UL,
+    1UL,  23UL, 17UL, 27UL, 11UL, 19UL, 19UL, 19UL, 24UL, 30UL, 8UL,  25UL, 6UL,  27UL, 18UL, 15UL, 13UL, 20UL, 11UL,
+    8UL,  7UL,  22UL, 2UL,  5UL,  12UL, 3UL,  16UL, 4UL,  18UL, 4UL,  3UL,  27UL, 8UL,  27UL, 13UL, 5UL,  12UL, 1UL,
+    28UL, 17UL, 16UL, 19UL, 5UL,  6UL,  0UL,  22UL, 2UL,  9UL,  28UL, 17UL, 23UL, 17UL, 11UL, 12UL, 28UL, 17UL, 20UL,
+    7UL,  31UL, 1UL,  0UL,  23UL, 4UL,  1UL,  13UL, 6UL,  31UL, 10UL, 20UL, 10UL, 25UL, 2UL,  9UL,  5UL,  0UL,  5UL,
+    0UL,  19UL, 20UL, 3UL,  31UL, 2UL,  28UL, 2UL,  8UL,  24UL, 4UL,  5UL,  26UL, 26UL, 9UL,  0UL,  9UL,  21UL, 19UL,
+    17UL, 30UL, 7UL,  20UL, 6UL,  5UL,  16UL, 2UL,  22UL, 7UL,  25UL, 17UL, 21UL, 1UL,  4UL,  5UL,  20UL, 10UL, 23UL,
+    22UL, 5UL,  29UL, 11UL, 31UL, 16UL, 17UL, 13UL, 9UL,  25UL, 5UL,  1UL,  17UL, 0UL,  7UL,  15UL, 26UL, 18UL, 17UL,
+    25UL, 16UL, 27UL, 23UL, 31UL, 27UL, 30UL, 27UL, 14UL, 15UL, 20UL, 30UL, 0UL,  25UL, 15UL, 4UL,  23UL, 26UL, 1UL,
+    14UL, 27UL, 28UL, 30UL, 25UL, 25UL, 8UL,  16UL, 21UL, 6UL,  18UL, 29UL, 14UL, 0UL,  27UL, 24UL, 5UL,  8UL,  4UL,
+    13UL, 25UL, 13UL, 15UL, 2UL,  3UL,  0UL,  21UL, 30UL, 7UL,  26UL, 20UL, 29UL, 8UL,  2UL,  5UL,  1UL,  30UL, 19UL,
+    14UL, 12UL, 1UL,  23UL, 25UL, 5UL,  21UL, 28UL, 15UL, 28UL, 31UL, 25UL, 7UL,  7UL,  15UL, 23UL, 4UL,  2UL,  12UL,
+    7UL,  27UL, 8UL,  12UL, 13UL, 6UL,  11UL, 31UL, 15UL, 17UL, 27UL, 31UL, 18UL, 22UL, 16UL, 25UL, 25UL, 8UL,  30UL,
+    5UL,  13UL, 4UL,  7UL,  9UL,  20UL, 31UL, 28UL, 3UL,  8UL,  12UL, 11UL, 31UL, 15UL, 28UL, 4UL,  15UL, 18UL, 29UL,
+    0UL,  22UL, 13UL, 13UL, 4UL,  21UL, 18UL, 6UL,  30UL, 15UL, 11UL, 28UL, 12UL, 10UL, 25UL, 16UL, 0UL,  29UL, 15UL,
+    31UL, 15UL, 20UL, 11UL, 7UL,  11UL, 23UL, 3UL,  3UL,  20UL, 21UL, 4UL,  1UL,  16UL, 8UL,  0UL,  16UL, 23UL, 9UL,
+    8UL,  27UL, 27UL, 12UL, 3UL,  16UL, 17UL, 13UL, 5UL,  30UL, 1UL,  9UL,  23UL, 18UL, 1UL,  22UL, 0UL,  24UL, 13UL,
+    27UL, 5UL,  26UL, 10UL, 19UL, 12UL, 5UL,  13UL, 27UL, 16UL, 25UL, 14UL, 14UL, 20UL, 15UL, 23UL, 26UL, 8UL,  25UL,
+    4UL,  5UL,  28UL, 27UL, 1UL,  1UL,  27UL, 17UL, 19UL, 27UL, 24UL, 6UL,  29UL, 11UL, 15UL, 2UL,  10UL, 12UL, 25UL,
+    31UL, 2UL,  9UL,  18UL, 5UL,  18UL, 21UL, 26UL, 29UL, 13UL, 11UL, 7UL,  31UL, 8UL,  30UL, 18UL, 31UL, 11UL, 1UL,
+    20UL, 0UL,  7UL,  6UL,  18UL, 13UL, 8UL,  6UL,  4UL,  6UL,  22UL, 9UL,  25UL, 19UL, 15UL, 16UL, 26UL, 15UL, 26UL,
+    8UL,  18UL, 6UL,  28UL, 30UL, 4UL,  15UL, 6UL,  12UL, 5UL,  31UL, 27UL, 14UL, 18UL, 12UL, 9UL,  8UL,  4UL,  4UL,
+    1UL,  4UL,  21UL, 22UL, 2UL,  19UL, 31UL, 29UL, 27UL, 14UL, 4UL,  0UL,  22UL, 14UL, 27UL, 3UL,  13UL, 23UL, 27UL,
+    19UL, 13UL, 20UL, 2UL,  23UL, 24UL, 27UL, 16UL, 1UL,  30UL, 2UL,  26UL, 14UL, 21UL, 1UL,  25UL, 5UL,  13UL, 21UL,
+    2UL,  23UL, 18UL, 4UL,  25UL, 20UL, 14UL, 23UL, 7UL,  16UL, 2UL,  22UL, 9UL,  28UL, 28UL, 21UL, 27UL, 9UL,  28UL,
+    28UL, 9UL,  21UL, 11UL, 4UL,  18UL, 13UL, 26UL, 14UL, 7UL,  28UL, 5UL,  5UL,  15UL, 4UL,  23UL, 12UL, 25UL, 17UL,
+    29UL, 6UL,  22UL, 0UL,  19UL, 3UL,  4UL,  24UL, 2UL,  13UL, 20UL, 19UL, 21UL, 22UL, 11UL, 19UL, 1UL,  18UL, 17UL,
+    20UL, 16UL, 23UL, 8UL,  18UL, 10UL, 17UL, 24UL, 30UL, 7UL,  25UL, 27UL, 10UL, 9UL,  12UL, 20UL, 29UL, 7UL,  28UL,
+    23UL, 16UL, 6UL,  22UL, 7UL,  28UL, 13UL, 2UL,  7UL,  7UL,  18UL, 26UL, 30UL, 9UL,  4UL,  11UL, 20UL, 20UL, 26UL,
+    8UL,  5UL,  14UL, 28UL, 14UL, 30UL, 28UL, 9UL,  16UL, 1UL,  23UL, 10UL, 26UL, 10UL, 27UL, 21UL, 1UL,  27UL, 10UL,
+    31UL, 14UL, 9UL,  2UL,  30UL, 1UL,  25UL, 30UL, 7UL,  26UL, 30UL, 27UL, 23UL, 30UL, 2UL,  19UL, 10UL, 30UL, 31UL,
+    2UL,  19UL, 7UL,  11UL, 11UL, 27UL, 6UL,  26UL, 6UL,  15UL, 28UL, 3UL,  2UL,  2UL,  30UL, 22UL, 8UL,  0UL,  24UL,
+    5UL,  28UL, 12UL, 28UL, 5UL,  21UL, 22UL, 27UL, 9UL,  14UL, 17UL, 7UL,  20UL, 24UL, 20UL, 31UL, 25UL, 31UL, 23UL,
+    22UL, 17UL, 20UL, 3UL,  28UL, 19UL, 14UL, 27UL, 0UL,  5UL,  3UL,  10UL, 24UL, 7UL,  19UL, 26UL, 17UL, 10UL, 3UL,
+    21UL, 25UL, 18UL, 18UL, 7UL,  23UL, 11UL, 16UL, 26UL, 14UL, 20UL, 21UL, 19UL, 31UL, 6UL,  22UL, 2UL,  20UL, 31UL,
+    12UL, 24UL, 31UL, 27UL, 10UL, 25UL, 21UL, 18UL, 13UL, 2UL,  19UL, 7UL,  26UL, 25UL, 1UL,  3UL,  0UL,  7UL,  20UL,
+    7UL,  23UL, 26UL, 28UL, 5UL,  29UL, 0UL,  9UL,  24UL, 4UL,  31UL, 7UL,  7UL,  28UL, 12UL, 17UL, 9UL,  27UL, 6UL,
+    5UL,  28UL, 27UL, 26UL, 27UL, 14UL, 22UL, 7UL,  24UL, 2UL,  11UL, 15UL, 26UL, 1UL,  10UL, 29UL, 21UL, 20UL, 29UL,
+    4UL,  9UL,  13UL, 18UL, 17UL, 10UL, 26UL, 22UL, 22UL, 14UL, 28UL, 24UL, 10UL, 24UL, 13UL, 18UL, 18UL, 22UL, 21UL,
+    22UL, 3UL,  26UL, 2UL,  23UL, 2UL,  27UL, 19UL, 13UL, 22UL, 6UL,  22UL, 23UL, 21UL, 28UL, 26UL, 18UL, 28UL, 20UL,
+    30UL, 22UL, 3UL,  0UL,  18UL, 5UL,  25UL, 30UL, 23UL, 26UL, 19UL, 1UL,  15UL, 25UL, 28UL, 1UL,  0UL,  29UL, 31UL,
+    27UL, 22UL, 19UL, 15UL, 16UL, 5UL,  28UL, 28UL, 31UL, 15UL, 4UL,  30UL, 25UL, 14UL, 24UL, 6UL,  20UL, 9UL,  3UL,
+    20UL, 14UL, 5UL,  2UL,  27UL, 6UL,  6UL,  4UL,  22UL, 13UL, 5UL,  12UL, 6UL,  11UL, 3UL,  20UL, 25UL, 7UL,  10UL,
+    23UL, 31UL, 4UL,  21UL, 6UL,  1UL,  25UL, 30UL, 26UL, 17UL, 4UL,  14UL, 24UL, 28UL, 31UL, 24UL, 1UL,  20UL, 3UL,
+    6UL,  23UL, 24UL, 13UL, 30UL, 28UL, 7UL,  24UL, 30UL, 22UL, 27UL, 23UL, 24UL, 7UL,  9UL,  14UL, 5UL,  25UL, 24UL,
+    3UL,  5UL,  23UL, 10UL, 0UL,  18UL, 21UL, 10UL, 17UL, 9UL,  26UL, 20UL, 17UL, 27UL, 24UL, 27UL, 28UL,
+};
+uint32_t rand_arr_6_b6_w32_arr[1024] = {
+    54UL, 22UL, 30UL, 25UL, 58UL, 44UL, 20UL, 14UL, 33UL, 31UL, 61UL, 15UL, 31UL, 38UL, 52UL, 22UL, 41UL, 56UL, 36UL,
+    18UL, 28UL, 19UL, 34UL, 55UL, 28UL, 27UL, 15UL, 58UL, 62UL, 57UL, 15UL, 1UL,  38UL, 22UL, 48UL, 1UL,  35UL, 31UL,
+    33UL, 25UL, 63UL, 54UL, 11UL, 14UL, 31UL, 36UL, 7UL,  16UL, 58UL, 34UL, 21UL, 32UL, 16UL, 29UL, 54UL, 3UL,  14UL,
+    1UL,  37UL, 44UL, 7UL,  62UL, 28UL, 10UL, 16UL, 21UL, 21UL, 8UL,  31UL, 2UL,  19UL, 19UL, 41UL, 25UL, 18UL, 33UL,
+    5UL,  38UL, 0UL,  49UL, 46UL, 25UL, 53UL, 30UL, 31UL, 8UL,  48UL, 29UL, 39UL, 11UL, 52UL, 12UL, 5UL,  16UL, 63UL,
+    24UL, 59UL, 18UL, 29UL, 27UL, 36UL, 63UL, 60UL, 22UL, 25UL, 2UL,  55UL, 51UL, 60UL, 60UL, 55UL, 26UL, 58UL, 25UL,
+    63UL, 2UL,  26UL, 11UL, 11UL, 34UL, 12UL, 26UL, 48UL, 17UL, 32UL, 61UL, 46UL, 35UL, 58UL, 30UL, 46UL, 48UL, 33UL,
+    8UL,  26UL, 19UL, 42UL, 57UL, 43UL, 44UL, 52UL, 43UL, 39UL, 33UL, 13UL, 55UL, 3UL,  31UL, 59UL, 14UL, 10UL, 45UL,
+    22UL, 47UL, 22UL, 35UL, 51UL, 24UL, 42UL, 1UL,  46UL, 38UL, 0UL,  25UL, 14UL, 38UL, 19UL, 36UL, 40UL, 60UL, 12UL,
+    14UL, 21UL, 7UL,  52UL, 56UL, 27UL, 22UL, 11UL, 42UL, 34UL, 25UL, 11UL, 2UL,  36UL, 48UL, 57UL, 48UL, 23UL, 57UL,
+    56UL, 8UL,  61UL, 35UL, 38UL, 8UL,  57UL, 35UL, 55UL, 47UL, 42UL, 32UL, 32UL, 44UL, 22UL, 6UL,  0UL,  34UL, 22UL,
+    16UL, 7UL,  42UL, 62UL, 0UL,  56UL, 56UL, 26UL, 16UL, 25UL, 26UL, 15UL, 16UL, 60UL, 50UL, 19UL, 11UL, 47UL, 6UL,
+    36UL, 25UL, 25UL, 40UL, 14UL, 45UL, 21UL, 29UL, 54UL, 22UL, 43UL, 50UL, 31UL, 21UL, 11UL, 35UL, 14UL, 53UL, 1UL,
+    25UL, 51UL, 9UL,  49UL, 45UL, 58UL, 19UL, 21UL, 36UL, 45UL, 52UL, 54UL, 34UL, 13UL, 47UL, 46UL, 49UL, 24UL, 37UL,
+    1UL,  23UL, 59UL, 49UL, 12UL, 26UL, 7UL,  26UL, 3UL,  24UL, 23UL, 34UL, 4UL,  27UL, 4UL,  36UL, 48UL, 56UL, 38UL,
+    26UL, 51UL, 7UL,  11UL, 17UL, 24UL, 41UL, 19UL, 25UL, 5UL,  42UL, 41UL, 15UL, 14UL, 27UL, 19UL, 27UL, 19UL, 49UL,
+    36UL, 26UL, 14UL, 52UL, 26UL, 33UL, 11UL, 11UL, 59UL, 10UL, 21UL, 51UL, 57UL, 34UL, 49UL, 34UL, 0UL,  62UL, 59UL,
+    24UL, 19UL, 23UL, 60UL, 28UL, 9UL,  37UL, 56UL, 18UL, 62UL, 12UL, 24UL, 26UL, 61UL, 54UL, 12UL, 14UL, 42UL, 56UL,
+    15UL, 12UL, 59UL, 10UL, 2UL,  32UL, 60UL, 30UL, 10UL, 61UL, 40UL, 17UL, 30UL, 0UL,  32UL, 26UL, 59UL, 61UL, 53UL,
+    49UL, 33UL, 9UL,  41UL, 10UL, 43UL, 45UL, 7UL,  20UL, 9UL,  57UL, 50UL, 40UL, 36UL, 1UL,  41UL, 25UL, 6UL,  6UL,
+    50UL, 1UL,  12UL, 20UL, 18UL, 21UL, 36UL, 54UL, 0UL,  30UL, 56UL, 31UL, 38UL, 28UL, 11UL, 15UL, 10UL, 8UL,  47UL,
+    33UL, 42UL, 18UL, 1UL,  17UL, 3UL,  5UL,  30UL, 28UL, 62UL, 37UL, 27UL, 24UL, 17UL, 12UL, 49UL, 38UL, 7UL,  63UL,
+    40UL, 55UL, 21UL, 20UL, 51UL, 40UL, 33UL, 16UL, 46UL, 56UL, 35UL, 26UL, 28UL, 0UL,  33UL, 55UL, 11UL, 30UL, 43UL,
+    41UL, 48UL, 42UL, 7UL,  39UL, 15UL, 32UL, 46UL, 51UL, 11UL, 13UL, 45UL, 60UL, 56UL, 31UL, 49UL, 37UL, 40UL, 4UL,
+    2UL,  26UL, 28UL, 0UL,  48UL, 17UL, 11UL, 58UL, 21UL, 50UL, 7UL,  11UL, 45UL, 38UL, 62UL, 44UL, 37UL, 1UL,  58UL,
+    56UL, 33UL, 21UL, 49UL, 51UL, 48UL, 30UL, 7UL,  10UL, 28UL, 57UL, 17UL, 3UL,  4UL,  34UL, 55UL, 25UL, 5UL,  30UL,
+    17UL, 25UL, 43UL, 27UL, 57UL, 16UL, 7UL,  10UL, 13UL, 38UL, 49UL, 21UL, 52UL, 34UL, 2UL,  40UL, 13UL, 24UL, 29UL,
+    50UL, 43UL, 16UL, 13UL, 45UL, 59UL, 1UL,  35UL, 56UL, 50UL, 60UL, 14UL, 7UL,  41UL, 5UL,  51UL, 39UL, 33UL, 29UL,
+    32UL, 22UL, 56UL, 33UL, 18UL, 28UL, 49UL, 10UL, 43UL, 3UL,  59UL, 60UL, 57UL, 9UL,  4UL,  18UL, 38UL, 59UL, 54UL,
+    7UL,  14UL, 0UL,  45UL, 60UL, 18UL, 19UL, 11UL, 35UL, 39UL, 18UL, 51UL, 31UL, 33UL, 36UL, 39UL, 12UL, 22UL, 43UL,
+    41UL, 7UL,  62UL, 35UL, 0UL,  12UL, 35UL, 58UL, 7UL,  14UL, 30UL, 61UL, 16UL, 15UL, 35UL, 48UL, 6UL,  34UL, 31UL,
+    39UL, 46UL, 42UL, 34UL, 41UL, 7UL,  20UL, 58UL, 46UL, 31UL, 51UL, 28UL, 56UL, 22UL, 52UL, 39UL, 6UL,  6UL,  12UL,
+    20UL, 16UL, 31UL, 3UL,  41UL, 32UL, 58UL, 10UL, 41UL, 8UL,  52UL, 37UL, 3UL,  60UL, 39UL, 5UL,  25UL, 12UL, 22UL,
+    9UL,  16UL, 55UL, 52UL, 51UL, 35UL, 17UL, 14UL, 46UL, 37UL, 63UL, 46UL, 51UL, 11UL, 10UL, 1UL,  61UL, 58UL, 44UL,
+    2UL,  4UL,  52UL, 20UL, 39UL, 25UL, 37UL, 40UL, 2UL,  19UL, 34UL, 38UL, 11UL, 17UL, 47UL, 51UL, 31UL, 7UL,  3UL,
+    16UL, 28UL, 4UL,  29UL, 52UL, 23UL, 13UL, 55UL, 12UL, 2UL,  5UL,  31UL, 24UL, 17UL, 53UL, 7UL,  54UL, 12UL, 61UL,
+    2UL,  8UL,  27UL, 28UL, 43UL, 45UL, 32UL, 21UL, 52UL, 20UL, 62UL, 45UL, 14UL, 30UL, 55UL, 32UL, 1UL,  36UL, 16UL,
+    44UL, 24UL, 22UL, 55UL, 30UL, 22UL, 13UL, 58UL, 13UL, 16UL, 2UL,  54UL, 19UL, 3UL,  44UL, 35UL, 29UL, 11UL, 53UL,
+    7UL,  50UL, 62UL, 10UL, 31UL, 61UL, 10UL, 22UL, 47UL, 24UL, 10UL, 47UL, 18UL, 14UL, 38UL, 34UL, 33UL, 32UL, 62UL,
+    0UL,  43UL, 57UL, 7UL,  32UL, 60UL, 8UL,  57UL, 40UL, 51UL, 26UL, 45UL, 58UL, 25UL, 17UL, 7UL,  46UL, 0UL,  35UL,
+    27UL, 46UL, 51UL, 46UL, 15UL, 9UL,  27UL, 50UL, 37UL, 38UL, 62UL, 43UL, 40UL, 56UL, 55UL, 31UL, 11UL, 37UL, 63UL,
+    32UL, 13UL, 44UL, 60UL, 61UL, 49UL, 22UL, 25UL, 14UL, 11UL, 27UL, 17UL, 50UL, 8UL,  51UL, 30UL, 31UL, 57UL, 32UL,
+    3UL,  50UL, 32UL, 30UL, 4UL,  42UL, 9UL,  45UL, 19UL, 27UL, 56UL, 34UL, 2UL,  19UL, 33UL, 11UL, 1UL,  5UL,  10UL,
+    20UL, 31UL, 31UL, 32UL, 50UL, 33UL, 17UL, 47UL, 34UL, 33UL, 15UL, 8UL,  12UL, 38UL, 31UL, 11UL, 31UL, 11UL, 63UL,
+    1UL,  40UL, 13UL, 56UL, 63UL, 50UL, 35UL, 42UL, 59UL, 7UL,  46UL, 21UL, 39UL, 54UL, 57UL, 32UL, 46UL, 25UL, 11UL,
+    49UL, 57UL, 29UL, 38UL, 51UL, 18UL, 51UL, 58UL, 41UL, 4UL,  39UL, 23UL, 23UL, 43UL, 39UL, 49UL, 0UL,  6UL,  0UL,
+    52UL, 15UL, 25UL, 34UL, 9UL,  40UL, 5UL,  57UL, 63UL, 13UL, 37UL, 41UL, 35UL, 55UL, 43UL, 58UL, 28UL, 42UL, 49UL,
+    17UL, 59UL, 29UL, 23UL, 17UL, 58UL, 19UL, 20UL, 46UL, 29UL, 2UL,  48UL, 56UL, 32UL, 34UL, 27UL, 28UL, 11UL, 11UL,
+    63UL, 38UL, 27UL, 56UL, 34UL, 51UL, 50UL, 0UL,  34UL, 11UL, 6UL,  23UL, 45UL, 39UL, 48UL, 4UL,  11UL, 59UL, 15UL,
+    43UL, 37UL, 5UL,  62UL, 11UL, 62UL, 15UL, 57UL, 55UL, 20UL, 40UL, 23UL, 24UL, 28UL, 49UL, 23UL, 36UL, 7UL,  20UL,
+    42UL, 24UL, 29UL, 31UL, 6UL,  56UL, 42UL, 13UL, 50UL, 14UL, 17UL, 44UL, 62UL, 41UL, 20UL, 1UL,  6UL,  43UL, 28UL,
+    27UL, 54UL, 21UL, 50UL, 16UL, 36UL, 38UL, 45UL, 43UL, 61UL, 43UL, 26UL, 55UL, 61UL, 39UL, 36UL, 58UL, 49UL, 38UL,
+    51UL, 50UL, 9UL,  6UL,  7UL,  40UL, 53UL, 51UL, 45UL, 58UL, 32UL, 37UL, 6UL,  11UL, 29UL, 43UL, 60UL, 45UL, 25UL,
+    62UL, 16UL, 29UL, 19UL, 10UL, 60UL, 47UL, 47UL, 23UL, 19UL, 57UL, 9UL,  14UL, 14UL, 34UL, 17UL, 46UL,
+};
+uint32_t rand_arr_7_b7_w32_arr[1024] = {
+    108UL, 56UL,  59UL,  20UL,  118UL, 99UL,  59UL,  78UL,  93UL,  2UL,   66UL,  3UL,   72UL,  79UL,  82UL,  31UL,
+    19UL,  4UL,   122UL, 81UL,  57UL,  92UL,  126UL, 88UL,  10UL,  116UL, 22UL,  124UL, 57UL,  27UL,  35UL,  8UL,
+    85UL,  107UL, 101UL, 26UL,  96UL,  108UL, 35UL,  14UL,  123UL, 109UL, 118UL, 117UL, 90UL,  107UL, 65UL,  82UL,
+    47UL,  118UL, 123UL, 25UL,  110UL, 57UL,  77UL,  8UL,   58UL,  119UL, 22UL,  77UL,  9UL,   58UL,  88UL,  34UL,
+    9UL,   58UL,  7UL,   17UL,  82UL,  111UL, 96UL,  43UL,  103UL, 69UL,  8UL,   37UL,  75UL,  82UL,  26UL,  3UL,
+    94UL,  79UL,  81UL,  51UL,  100UL, 32UL,  43UL,  122UL, 78UL,  103UL, 48UL,  99UL,  26UL,  65UL,  22UL,  62UL,
+    86UL,  98UL,  88UL,  73UL,  90UL,  107UL, 6UL,   3UL,   78UL,  23UL,  68UL,  108UL, 83UL,  17UL,  52UL,  18UL,
+    62UL,  6UL,   26UL,  43UL,  98UL,  27UL,  101UL, 48UL,  112UL, 5UL,   6UL,   119UL, 27UL,  65UL,  125UL, 109UL,
+    89UL,  89UL,  40UL,  66UL,  88UL,  86UL,  49UL,  109UL, 27UL,  51UL,  5UL,   104UL, 28UL,  75UL,  68UL,  9UL,
+    33UL,  33UL,  87UL,  22UL,  69UL,  119UL, 61UL,  28UL,  10UL,  23UL,  16UL,  68UL,  57UL,  65UL,  62UL,  31UL,
+    35UL,  76UL,  52UL,  67UL,  102UL, 118UL, 29UL,  127UL, 88UL,  15UL,  21UL,  56UL,  62UL,  102UL, 72UL,  59UL,
+    97UL,  98UL,  122UL, 90UL,  87UL,  73UL,  83UL,  125UL, 52UL,  92UL,  125UL, 90UL,  100UL, 63UL,  5UL,   126UL,
+    49UL,  54UL,  36UL,  38UL,  49UL,  106UL, 112UL, 23UL,  38UL,  64UL,  54UL,  11UL,  117UL, 41UL,  105UL, 62UL,
+    72UL,  93UL,  87UL,  34UL,  92UL,  58UL,  36UL,  62UL,  99UL,  2UL,   4UL,   110UL, 87UL,  96UL,  7UL,   114UL,
+    29UL,  95UL,  42UL,  74UL,  116UL, 62UL,  76UL,  0UL,   43UL,  91UL,  124UL, 12UL,  27UL,  31UL,  60UL,  118UL,
+    50UL,  58UL,  74UL,  1UL,   118UL, 26UL,  111UL, 54UL,  114UL, 107UL, 27UL,  99UL,  90UL,  54UL,  77UL,  52UL,
+    35UL,  82UL,  40UL,  35UL,  39UL,  24UL,  46UL,  57UL,  22UL,  60UL,  40UL,  15UL,  127UL, 96UL,  98UL,  99UL,
+    6UL,   81UL,  14UL,  80UL,  101UL, 8UL,   71UL,  8UL,   32UL,  63UL,  10UL,  14UL,  7UL,   59UL,  104UL, 44UL,
+    40UL,  87UL,  54UL,  60UL,  75UL,  104UL, 113UL, 26UL,  4UL,   58UL,  115UL, 119UL, 89UL,  62UL,  20UL,  14UL,
+    24UL,  51UL,  41UL,  18UL,  104UL, 58UL,  94UL,  42UL,  83UL,  54UL,  70UL,  93UL,  34UL,  63UL,  42UL,  122UL,
+    71UL,  37UL,  110UL, 41UL,  88UL,  91UL,  56UL,  64UL,  39UL,  50UL,  74UL,  37UL,  69UL,  28UL,  123UL, 6UL,
+    7UL,   52UL,  31UL,  21UL,  3UL,   67UL,  97UL,  78UL,  34UL,  54UL,  80UL,  80UL,  64UL,  54UL,  74UL,  88UL,
+    15UL,  50UL,  122UL, 61UL,  4UL,   54UL,  61UL,  97UL,  127UL, 118UL, 62UL,  58UL,  34UL,  77UL,  81UL,  2UL,
+    98UL,  24UL,  48UL,  13UL,  122UL, 63UL,  56UL,  80UL,  94UL,  114UL, 18UL,  25UL,  49UL,  94UL,  123UL, 45UL,
+    115UL, 19UL,  121UL, 44UL,  124UL, 103UL, 120UL, 60UL,  7UL,   96UL,  104UL, 16UL,  9UL,   127UL, 44UL,  47UL,
+    96UL,  63UL,  126UL, 2UL,   38UL,  119UL, 126UL, 35UL,  63UL,  63UL,  35UL,  99UL,  18UL,  126UL, 27UL,  3UL,
+    83UL,  75UL,  109UL, 51UL,  28UL,  69UL,  26UL,  3UL,   101UL, 72UL,  0UL,   85UL,  93UL,  7UL,   38UL,  123UL,
+    79UL,  1UL,   74UL,  36UL,  106UL, 76UL,  55UL,  126UL, 91UL,  4UL,   22UL,  86UL,  61UL,  103UL, 120UL, 127UL,
+    10UL,  67UL,  120UL, 126UL, 62UL,  94UL,  7UL,   29UL,  6UL,   106UL, 84UL,  122UL, 87UL,  8UL,   100UL, 47UL,
+    27UL,  17UL,  39UL,  81UL,  111UL, 9UL,   21UL,  23UL,  42UL,  12UL,  10UL,  59UL,  115UL, 122UL, 102UL, 53UL,
+    45UL,  56UL,  61UL,  27UL,  7UL,   84UL,  103UL, 31UL,  101UL, 121UL, 28UL,  48UL,  75UL,  85UL,  8UL,   57UL,
+    16UL,  41UL,  21UL,  120UL, 97UL,  102UL, 63UL,  112UL, 47UL,  6UL,   47UL,  18UL,  21UL,  92UL,  5UL,   40UL,
+    13UL,  77UL,  109UL, 76UL,  40UL,  43UL,  49UL,  55UL,  78UL,  53UL,  103UL, 103UL, 115UL, 12UL,  59UL,  46UL,
+    104UL, 45UL,  20UL,  97UL,  39UL,  19UL,  121UL, 86UL,  28UL,  34UL,  34UL,  23UL,  16UL,  56UL,  9UL,   126UL,
+    28UL,  111UL, 4UL,   89UL,  112UL, 58UL,  28UL,  40UL,  112UL, 7UL,   47UL,  16UL,  127UL, 77UL,  20UL,  83UL,
+    32UL,  7UL,   12UL,  85UL,  26UL,  29UL,  121UL, 4UL,   62UL,  60UL,  8UL,   24UL,  84UL,  121UL, 44UL,  62UL,
+    82UL,  13UL,  29UL,  96UL,  108UL, 8UL,   39UL,  54UL,  78UL,  25UL,  23UL,  31UL,  82UL,  49UL,  47UL,  114UL,
+    105UL, 124UL, 105UL, 111UL, 50UL,  101UL, 39UL,  48UL,  17UL,  27UL,  3UL,   56UL,  102UL, 72UL,  83UL,  53UL,
+    13UL,  56UL,  68UL,  53UL,  97UL,  4UL,   18UL,  106UL, 49UL,  75UL,  93UL,  108UL, 96UL,  72UL,  78UL,  68UL,
+    32UL,  104UL, 95UL,  112UL, 89UL,  80UL,  37UL,  86UL,  101UL, 52UL,  53UL,  87UL,  30UL,  21UL,  72UL,  79UL,
+    0UL,   125UL, 107UL, 81UL,  108UL, 100UL, 4UL,   126UL, 107UL, 106UL, 38UL,  56UL,  32UL,  27UL,  108UL, 15UL,
+    0UL,   48UL,  90UL,  75UL,  45UL,  54UL,  76UL,  1UL,   55UL,  46UL,  111UL, 12UL,  14UL,  45UL,  81UL,  48UL,
+    93UL,  11UL,  124UL, 104UL, 25UL,  118UL, 85UL,  18UL,  6UL,   0UL,   68UL,  42UL,  11UL,  74UL,  42UL,  47UL,
+    39UL,  65UL,  104UL, 16UL,  119UL, 44UL,  64UL,  2UL,   81UL,  31UL,  24UL,  72UL,  118UL, 89UL,  78UL,  36UL,
+    58UL,  79UL,  23UL,  68UL,  76UL,  84UL,  84UL,  26UL,  117UL, 65UL,  107UL, 63UL,  54UL,  69UL,  117UL, 20UL,
+    10UL,  60UL,  51UL,  3UL,   35UL,  74UL,  108UL, 112UL, 11UL,  102UL, 40UL,  64UL,  61UL,  117UL, 42UL,  124UL,
+    27UL,  119UL, 86UL,  88UL,  117UL, 115UL, 94UL,  18UL,  48UL,  30UL,  63UL,  77UL,  106UL, 97UL,  106UL, 85UL,
+    25UL,  93UL,  40UL,  108UL, 62UL,  59UL,  77UL,  34UL,  30UL,  107UL, 27UL,  99UL,  15UL,  77UL,  47UL,  14UL,
+    93UL,  105UL, 28UL,  23UL,  14UL,  43UL,  14UL,  109UL, 57UL,  83UL,  46UL,  48UL,  18UL,  94UL,  73UL,  60UL,
+    72UL,  24UL,  48UL,  93UL,  89UL,  59UL,  58UL,  14UL,  38UL,  82UL,  5UL,   84UL,  96UL,  33UL,  66UL,  60UL,
+    123UL, 93UL,  101UL, 17UL,  111UL, 78UL,  118UL, 1UL,   63UL,  116UL, 14UL,  97UL,  87UL,  71UL,  63UL,  92UL,
+    86UL,  47UL,  3UL,   8UL,   123UL, 13UL,  7UL,   27UL,  71UL,  110UL, 3UL,   90UL,  99UL,  26UL,  78UL,  88UL,
+    101UL, 16UL,  108UL, 103UL, 89UL,  100UL, 116UL, 34UL,  101UL, 20UL,  41UL,  48UL,  25UL,  30UL,  105UL, 108UL,
+    77UL,  19UL,  67UL,  119UL, 6UL,   9UL,   56UL,  78UL,  24UL,  1UL,   9UL,   85UL,  6UL,   42UL,  68UL,  55UL,
+    61UL,  120UL, 32UL,  79UL,  23UL,  59UL,  9UL,   112UL, 54UL,  95UL,  17UL,  34UL,  123UL, 63UL,  16UL,  50UL,
+    39UL,  62UL,  10UL,  5UL,   30UL,  106UL, 93UL,  85UL,  0UL,   44UL,  45UL,  106UL, 117UL, 1UL,   73UL,  50UL,
+    30UL,  88UL,  110UL, 72UL,  16UL,  93UL,  65UL,  19UL,  91UL,  56UL,  45UL,  18UL,  41UL,  73UL,  3UL,   96UL,
+    49UL,  118UL, 115UL, 102UL, 24UL,  115UL, 10UL,  4UL,   108UL, 82UL,  32UL,  50UL,  6UL,   44UL,  37UL,  79UL,
+    105UL, 104UL, 1UL,   101UL, 65UL,  86UL,  65UL,  112UL, 12UL,  3UL,   85UL,  116UL, 72UL,  65UL,  88UL,  33UL,
+    58UL,  10UL,  64UL,  75UL,  43UL,  122UL, 124UL, 117UL, 7UL,   37UL,  81UL,  123UL, 52UL,  114UL, 19UL,  9UL,
+    22UL,  21UL,  34UL,  27UL,  74UL,  86UL,  3UL,   104UL, 75UL,  67UL,  6UL,   89UL,  67UL,  70UL,  67UL,  101UL,
+    2UL,   7UL,   88UL,  35UL,  89UL,  90UL,  118UL, 115UL, 94UL,  66UL,  72UL,  56UL,  117UL, 67UL,  50UL,  70UL,
+    19UL,  74UL,  18UL,  60UL,  20UL,  116UL, 49UL,  88UL,  108UL, 117UL, 14UL,  126UL, 65UL,  54UL,  71UL,  18UL,
+    97UL,  85UL,  21UL,  14UL,  121UL, 64UL,  81UL,  100UL, 29UL,  32UL,  1UL,   116UL, 96UL,  6UL,   127UL, 97UL,
+};
+uint32_t rand_arr_8_b8_w32_arr[1024] = {
+    204UL, 196UL, 69UL,  154UL, 205UL, 214UL, 164UL, 21UL,  126UL, 58UL,  79UL,  6UL,   33UL,  130UL, 171UL, 113UL,
+    240UL, 226UL, 107UL, 18UL,  179UL, 173UL, 203UL, 58UL,  75UL,  68UL,  222UL, 64UL,  91UL,  47UL,  80UL,  193UL,
+    56UL,  75UL,  123UL, 23UL,  153UL, 159UL, 171UL, 204UL, 68UL,  72UL,  243UL, 140UL, 178UL, 79UL,  186UL, 31UL,
+    151UL, 62UL,  141UL, 93UL,  32UL,  26UL,  153UL, 211UL, 103UL, 4UL,   28UL,  13UL,  69UL,  253UL, 59UL,  63UL,
+    33UL,  62UL,  54UL,  163UL, 132UL, 43UL,  86UL,  190UL, 142UL, 112UL, 84UL,  248UL, 100UL, 74UL,  76UL,  51UL,
+    206UL, 51UL,  11UL,  14UL,  71UL,  185UL, 118UL, 24UL,  201UL, 148UL, 132UL, 95UL,  89UL,  212UL, 188UL, 85UL,
+    171UL, 70UL,  172UL, 116UL, 145UL, 116UL, 11UL,  20UL,  228UL, 211UL, 159UL, 213UL, 248UL, 86UL,  127UL, 55UL,
+    211UL, 166UL, 248UL, 228UL, 195UL, 0UL,   38UL,  18UL,  139UL, 236UL, 158UL, 6UL,   187UL, 26UL,  167UL, 62UL,
+    78UL,  68UL,  202UL, 43UL,  253UL, 198UL, 25UL,  10UL,  144UL, 144UL, 111UL, 253UL, 118UL, 161UL, 8UL,   61UL,
+    18UL,  215UL, 242UL, 137UL, 217UL, 57UL,  24UL,  227UL, 183UL, 37UL,  228UL, 213UL, 108UL, 175UL, 129UL, 27UL,
+    228UL, 94UL,  193UL, 201UL, 145UL, 255UL, 158UL, 233UL, 38UL,  37UL,  108UL, 20UL,  132UL, 93UL,  211UL, 53UL,
+    160UL, 44UL,  184UL, 141UL, 146UL, 21UL,  186UL, 228UL, 39UL,  75UL,  179UL, 188UL, 26UL,  153UL, 145UL, 178UL,
+    189UL, 51UL,  180UL, 27UL,  255UL, 18UL,  236UL, 79UL,  125UL, 170UL, 132UL, 168UL, 185UL, 143UL, 0UL,   255UL,
+    81UL,  57UL,  74UL,  189UL, 51UL,  146UL, 38UL,  246UL, 212UL, 137UL, 81UL,  181UL, 140UL, 201UL, 87UL,  169UL,
+    28UL,  66UL,  71UL,  76UL,  193UL, 105UL, 7UL,   58UL,  220UL, 72UL,  72UL,  85UL,  35UL,  71UL,  221UL, 158UL,
+    17UL,  115UL, 243UL, 156UL, 187UL, 184UL, 4UL,   138UL, 132UL, 72UL,  128UL, 73UL,  17UL,  130UL, 122UL, 174UL,
+    41UL,  230UL, 248UL, 234UL, 145UL, 223UL, 232UL, 116UL, 90UL,  45UL,  184UL, 121UL, 8UL,   237UL, 96UL,  188UL,
+    8UL,   234UL, 229UL, 21UL,  199UL, 40UL,  185UL, 181UL, 108UL, 64UL,  176UL, 95UL,  42UL,  99UL,  35UL,  214UL,
+    30UL,  151UL, 68UL,  168UL, 233UL, 165UL, 169UL, 233UL, 220UL, 14UL,  165UL, 170UL, 70UL,  169UL, 147UL, 114UL,
+    18UL,  139UL, 52UL,  254UL, 169UL, 63UL,  235UL, 154UL, 192UL, 34UL,  93UL,  115UL, 213UL, 163UL, 50UL,  201UL,
+    120UL, 226UL, 250UL, 151UL, 41UL,  147UL, 176UL, 93UL,  92UL,  134UL, 61UL,  13UL,  127UL, 24UL,  0UL,   131UL,
+    210UL, 217UL, 245UL, 13UL,  127UL, 31UL,  135UL, 212UL, 30UL,  147UL, 133UL, 30UL,  194UL, 110UL, 130UL, 66UL,
+    22UL,  198UL, 179UL, 199UL, 157UL, 12UL,  102UL, 166UL, 161UL, 242UL, 19UL,  118UL, 50UL,  150UL, 126UL, 203UL,
+    205UL, 101UL, 119UL, 51UL,  253UL, 180UL, 35UL,  50UL,  184UL, 41UL,  188UL, 162UL, 46UL,  117UL, 102UL, 94UL,
+    177UL, 199UL, 198UL, 152UL, 243UL, 130UL, 72UL,  117UL, 254UL, 104UL, 199UL, 150UL, 59UL,  22UL,  88UL,  238UL,
+    21UL,  122UL, 152UL, 245UL, 123UL, 191UL, 96UL,  215UL, 193UL, 103UL, 193UL, 7UL,   21UL,  196UL, 61UL,  54UL,
+    178UL, 152UL, 80UL,  77UL,  24UL,  244UL, 22UL,  227UL, 9UL,   236UL, 110UL, 236UL, 248UL, 81UL,  172UL, 207UL,
+    48UL,  22UL,  248UL, 204UL, 97UL,  223UL, 74UL,  111UL, 97UL,  129UL, 13UL,  50UL,  66UL,  104UL, 169UL, 142UL,
+    238UL, 175UL, 247UL, 6UL,   122UL, 83UL,  54UL,  201UL, 60UL,  5UL,   161UL, 111UL, 148UL, 162UL, 38UL,  116UL,
+    153UL, 196UL, 252UL, 95UL,  2UL,   3UL,   33UL,  193UL, 194UL, 240UL, 15UL,  79UL,  182UL, 180UL, 168UL, 228UL,
+    245UL, 89UL,  200UL, 85UL,  26UL,  240UL, 195UL, 36UL,  174UL, 14UL,  92UL,  124UL, 159UL, 201UL, 213UL, 157UL,
+    204UL, 41UL,  54UL,  107UL, 224UL, 192UL, 132UL, 69UL,  136UL, 181UL, 92UL,  120UL, 223UL, 247UL, 224UL, 112UL,
+    134UL, 205UL, 190UL, 43UL,  66UL,  224UL, 117UL, 26UL,  199UL, 94UL,  72UL,  145UL, 10UL,  28UL,  64UL,  166UL,
+    44UL,  84UL,  250UL, 92UL,  220UL, 83UL,  12UL,  54UL,  239UL, 180UL, 225UL, 220UL, 55UL,  173UL, 183UL, 51UL,
+    46UL,  39UL,  233UL, 165UL, 46UL,  236UL, 125UL, 34UL,  34UL,  139UL, 150UL, 184UL, 77UL,  10UL,  53UL,  128UL,
+    126UL, 191UL, 167UL, 169UL, 14UL,  209UL, 172UL, 174UL, 185UL, 203UL, 188UL, 71UL,  21UL,  14UL,  182UL, 254UL,
+    186UL, 170UL, 145UL, 217UL, 152UL, 76UL,  209UL, 103UL, 16UL,  19UL,  117UL, 0UL,   129UL, 216UL, 69UL,  0UL,
+    16UL,  190UL, 78UL,  170UL, 180UL, 206UL, 129UL, 33UL,  71UL,  5UL,   92UL,  182UL, 206UL, 233UL, 56UL,  183UL,
+    68UL,  149UL, 232UL, 16UL,  117UL, 184UL, 182UL, 216UL, 7UL,   28UL,  192UL, 225UL, 125UL, 32UL,  45UL,  132UL,
+    211UL, 190UL, 94UL,  40UL,  245UL, 182UL, 226UL, 77UL,  62UL,  237UL, 60UL,  157UL, 89UL,  184UL, 181UL, 26UL,
+    138UL, 10UL,  154UL, 70UL,  59UL,  249UL, 255UL, 121UL, 37UL,  179UL, 252UL, 20UL,  37UL,  50UL,  59UL,  69UL,
+    183UL, 20UL,  253UL, 75UL,  85UL,  42UL,  154UL, 45UL,  149UL, 58UL,  84UL,  129UL, 82UL,  210UL, 134UL, 81UL,
+    216UL, 37UL,  62UL,  140UL, 232UL, 174UL, 242UL, 28UL,  212UL, 77UL,  12UL,  128UL, 202UL, 62UL,  32UL,  172UL,
+    243UL, 31UL,  228UL, 64UL,  167UL, 65UL,  91UL,  133UL, 56UL,  2UL,   133UL, 1UL,   255UL, 181UL, 149UL, 206UL,
+    167UL, 25UL,  34UL,  124UL, 242UL, 23UL,  237UL, 88UL,  1UL,   37UL,  132UL, 238UL, 229UL, 239UL, 66UL,  107UL,
+    253UL, 58UL,  233UL, 190UL, 13UL,  77UL,  204UL, 188UL, 237UL, 14UL,  103UL, 158UL, 127UL, 206UL, 70UL,  33UL,
+    158UL, 62UL,  208UL, 250UL, 7UL,   199UL, 36UL,  203UL, 134UL, 18UL,  108UL, 61UL,  62UL,  199UL, 248UL, 15UL,
+    86UL,  51UL,  32UL,  74UL,  47UL,  245UL, 131UL, 246UL, 92UL,  45UL,  198UL, 60UL,  48UL,  141UL, 241UL, 220UL,
+    71UL,  172UL, 190UL, 202UL, 61UL,  76UL,  99UL,  206UL, 224UL, 170UL, 73UL,  139UL, 196UL, 228UL, 218UL, 183UL,
+    239UL, 41UL,  140UL, 112UL, 172UL, 49UL,  186UL, 213UL, 232UL, 151UL, 96UL,  43UL,  220UL, 108UL, 214UL, 239UL,
+    163UL, 43UL,  214UL, 64UL,  113UL, 192UL, 64UL,  81UL,  3UL,   198UL, 127UL, 20UL,  133UL, 215UL, 106UL, 34UL,
+    109UL, 97UL,  156UL, 209UL, 187UL, 53UL,  63UL,  2UL,   63UL,  46UL,  141UL, 198UL, 67UL,  60UL,  76UL,  220UL,
+    72UL,  5UL,   177UL, 2UL,   70UL,  10UL,  56UL,  99UL,  62UL,  109UL, 114UL, 48UL,  240UL, 157UL, 247UL, 147UL,
+    13UL,  193UL, 124UL, 56UL,  87UL,  214UL, 76UL,  22UL,  162UL, 155UL, 4UL,   190UL, 157UL, 163UL, 110UL, 15UL,
+    78UL,  51UL,  158UL, 195UL, 20UL,  152UL, 185UL, 9UL,   123UL, 129UL, 45UL,  178UL, 171UL, 26UL,  128UL, 237UL,
+    129UL, 123UL, 185UL, 10UL,  81UL,  202UL, 116UL, 245UL, 233UL, 208UL, 88UL,  2UL,   231UL, 36UL,  153UL, 119UL,
+    254UL, 131UL, 112UL, 187UL, 22UL,  75UL,  14UL,  157UL, 40UL,  105UL, 217UL, 13UL,  146UL, 4UL,   212UL, 39UL,
+    204UL, 3UL,   103UL, 41UL,  231UL, 149UL, 116UL, 104UL, 195UL, 71UL,  46UL,  224UL, 246UL, 13UL,  71UL,  116UL,
+    91UL,  197UL, 79UL,  255UL, 136UL, 196UL, 175UL, 153UL, 74UL,  42UL,  158UL, 11UL,  112UL, 9UL,   202UL, 233UL,
+    199UL, 177UL, 103UL, 177UL, 109UL, 230UL, 31UL,  175UL, 15UL,  102UL, 125UL, 199UL, 149UL, 134UL, 250UL, 62UL,
+    170UL, 40UL,  170UL, 126UL, 12UL,  55UL,  232UL, 92UL,  108UL, 67UL,  36UL,  10UL,  72UL,  52UL,  47UL,  55UL,
+    212UL, 145UL, 169UL, 114UL, 255UL, 176UL, 204UL, 138UL, 247UL, 1UL,   237UL, 102UL, 19UL,  8UL,   244UL, 233UL,
+    126UL, 148UL, 78UL,  20UL,  233UL, 206UL, 197UL, 2UL,   79UL,  238UL, 14UL,  89UL,  151UL, 7UL,   162UL, 200UL,
+    54UL,  221UL, 159UL, 155UL, 124UL, 32UL,  9UL,   134UL, 167UL, 78UL,  131UL, 166UL, 229UL, 147UL, 46UL,  165UL,
+};
+uint32_t rand_arr_9_b9_w32_arr[1024] = {
+    29UL,  452UL, 211UL, 361UL, 71UL,  478UL, 345UL, 445UL, 35UL,  21UL,  457UL, 80UL,  489UL, 136UL, 246UL, 136UL,
+    82UL,  504UL, 421UL, 213UL, 22UL,  335UL, 138UL, 272UL, 361UL, 263UL, 145UL, 242UL, 247UL, 460UL, 118UL, 48UL,
+    415UL, 341UL, 471UL, 49UL,  157UL, 227UL, 141UL, 237UL, 199UL, 420UL, 118UL, 381UL, 373UL, 229UL, 262UL, 507UL,
+    493UL, 386UL, 496UL, 98UL,  468UL, 445UL, 427UL, 348UL, 367UL, 474UL, 1UL,   246UL, 107UL, 180UL, 124UL, 49UL,
+    371UL, 103UL, 84UL,  253UL, 508UL, 406UL, 374UL, 110UL, 414UL, 475UL, 202UL, 67UL,  363UL, 91UL,  259UL, 229UL,
+    486UL, 412UL, 13UL,  96UL,  445UL, 266UL, 75UL,  118UL, 374UL, 191UL, 507UL, 247UL, 276UL, 389UL, 338UL, 61UL,
+    3UL,   387UL, 384UL, 462UL, 227UL, 106UL, 475UL, 448UL, 27UL,  61UL,  165UL, 181UL, 401UL, 210UL, 371UL, 232UL,
+    122UL, 505UL, 286UL, 328UL, 310UL, 141UL, 282UL, 146UL, 510UL, 132UL, 429UL, 23UL,  216UL, 240UL, 213UL, 441UL,
+    502UL, 391UL, 253UL, 248UL, 431UL, 21UL,  437UL, 450UL, 506UL, 130UL, 319UL, 385UL, 70UL,  311UL, 165UL, 331UL,
+    505UL, 174UL, 330UL, 501UL, 118UL, 156UL, 342UL, 251UL, 295UL, 445UL, 163UL, 88UL,  311UL, 454UL, 58UL,  330UL,
+    98UL,  220UL, 445UL, 128UL, 173UL, 23UL,  479UL, 284UL, 0UL,   145UL, 393UL, 183UL, 72UL,  232UL, 310UL, 475UL,
+    77UL,  414UL, 110UL, 124UL, 367UL, 269UL, 187UL, 0UL,   157UL, 319UL, 319UL, 351UL, 289UL, 498UL, 148UL, 389UL,
+    118UL, 224UL, 15UL,  410UL, 288UL, 411UL, 308UL, 248UL, 445UL, 379UL, 82UL,  202UL, 436UL, 158UL, 375UL, 490UL,
+    458UL, 74UL,  287UL, 256UL, 399UL, 453UL, 154UL, 387UL, 504UL, 180UL, 383UL, 0UL,   283UL, 107UL, 387UL, 398UL,
+    144UL, 101UL, 155UL, 431UL, 62UL,  181UL, 184UL, 228UL, 176UL, 127UL, 335UL, 391UL, 214UL, 350UL, 304UL, 461UL,
+    98UL,  27UL,  361UL, 2UL,   119UL, 428UL, 498UL, 462UL, 39UL,  409UL, 427UL, 418UL, 247UL, 108UL, 413UL, 46UL,
+    169UL, 56UL,  186UL, 472UL, 345UL, 303UL, 425UL, 182UL, 83UL,  238UL, 288UL, 479UL, 374UL, 351UL, 450UL, 246UL,
+    154UL, 125UL, 457UL, 53UL,  493UL, 67UL,  506UL, 41UL,  326UL, 130UL, 223UL, 161UL, 57UL,  52UL,  201UL, 445UL,
+    374UL, 109UL, 291UL, 101UL, 227UL, 111UL, 81UL,  176UL, 442UL, 366UL, 297UL, 206UL, 441UL, 54UL,  120UL, 503UL,
+    113UL, 474UL, 319UL, 203UL, 314UL, 324UL, 19UL,  302UL, 21UL,  496UL, 360UL, 479UL, 495UL, 13UL,  70UL,  124UL,
+    257UL, 281UL, 375UL, 509UL, 428UL, 286UL, 83UL,  196UL, 359UL, 261UL, 154UL, 290UL, 168UL, 231UL, 329UL, 177UL,
+    310UL, 180UL, 402UL, 239UL, 104UL, 269UL, 474UL, 33UL,  365UL, 183UL, 510UL, 260UL, 303UL, 324UL, 458UL, 38UL,
+    150UL, 191UL, 301UL, 156UL, 145UL, 447UL, 480UL, 278UL, 350UL, 428UL, 430UL, 170UL, 469UL, 244UL, 231UL, 11UL,
+    274UL, 3UL,   62UL,  484UL, 5UL,   265UL, 239UL, 419UL, 148UL, 11UL,  45UL,  226UL, 390UL, 229UL, 90UL,  7UL,
+    494UL, 338UL, 223UL, 326UL, 401UL, 144UL, 431UL, 216UL, 146UL, 6UL,   355UL, 202UL, 372UL, 371UL, 27UL,  60UL,
+    214UL, 87UL,  359UL, 506UL, 434UL, 510UL, 310UL, 71UL,  190UL, 68UL,  464UL, 268UL, 34UL,  361UL, 238UL, 279UL,
+    495UL, 336UL, 118UL, 145UL, 441UL, 389UL, 371UL, 24UL,  419UL, 170UL, 252UL, 174UL, 291UL, 85UL,  78UL,  30UL,
+    212UL, 155UL, 181UL, 332UL, 125UL, 455UL, 108UL, 49UL,  127UL, 285UL, 423UL, 315UL, 400UL, 273UL, 317UL, 133UL,
+    54UL,  303UL, 116UL, 274UL, 233UL, 495UL, 229UL, 112UL, 381UL, 458UL, 119UL, 424UL, 350UL, 501UL, 478UL, 279UL,
+    107UL, 426UL, 145UL, 366UL, 471UL, 117UL, 444UL, 28UL,  325UL, 129UL, 472UL, 436UL, 41UL,  330UL, 354UL, 162UL,
+    218UL, 461UL, 437UL, 370UL, 416UL, 125UL, 12UL,  327UL, 122UL, 60UL,  432UL, 183UL, 236UL, 227UL, 315UL, 400UL,
+    112UL, 423UL, 392UL, 479UL, 504UL, 450UL, 331UL, 283UL, 290UL, 52UL,  207UL, 329UL, 251UL, 301UL, 9UL,   329UL,
+    128UL, 154UL, 81UL,  57UL,  470UL, 421UL, 128UL, 243UL, 238UL, 85UL,  222UL, 6UL,   347UL, 367UL, 442UL, 450UL,
+    67UL,  359UL, 409UL, 64UL,  391UL, 182UL, 409UL, 321UL, 73UL,  308UL, 508UL, 384UL, 266UL, 327UL, 160UL, 175UL,
+    408UL, 442UL, 173UL, 88UL,  403UL, 46UL,  324UL, 231UL, 185UL, 137UL, 440UL, 490UL, 510UL, 49UL,  262UL, 286UL,
+    219UL, 283UL, 68UL,  328UL, 330UL, 242UL, 17UL,  337UL, 395UL, 150UL, 130UL, 406UL, 460UL, 473UL, 459UL, 165UL,
+    420UL, 499UL, 497UL, 434UL, 411UL, 168UL, 280UL, 79UL,  39UL,  40UL,  353UL, 286UL, 140UL, 419UL, 340UL, 95UL,
+    59UL,  301UL, 447UL, 403UL, 477UL, 289UL, 1UL,   263UL, 128UL, 406UL, 500UL, 195UL, 300UL, 175UL, 364UL, 60UL,
+    469UL, 356UL, 105UL, 85UL,  80UL,  434UL, 477UL, 318UL, 64UL,  393UL, 61UL,  480UL, 3UL,   507UL, 81UL,  432UL,
+    152UL, 368UL, 362UL, 408UL, 92UL,  144UL, 510UL, 281UL, 421UL, 302UL, 258UL, 476UL, 248UL, 404UL, 221UL, 11UL,
+    408UL, 97UL,  318UL, 74UL,  231UL, 209UL, 191UL, 29UL,  32UL,  16UL,  329UL, 97UL,  408UL, 148UL, 210UL, 350UL,
+    102UL, 113UL, 182UL, 359UL, 454UL, 5UL,   105UL, 63UL,  314UL, 22UL,  7UL,   39UL,  217UL, 46UL,  484UL, 318UL,
+    299UL, 76UL,  308UL, 47UL,  363UL, 74UL,  275UL, 229UL, 353UL, 466UL, 77UL,  362UL, 247UL, 14UL,  262UL, 85UL,
+    236UL, 165UL, 91UL,  98UL,  201UL, 127UL, 435UL, 51UL,  53UL,  511UL, 277UL, 162UL, 332UL, 220UL, 165UL, 258UL,
+    468UL, 264UL, 340UL, 315UL, 11UL,  319UL, 148UL, 270UL, 391UL, 369UL, 435UL, 327UL, 43UL,  446UL, 468UL, 404UL,
+    99UL,  309UL, 503UL, 285UL, 448UL, 451UL, 461UL, 36UL,  100UL, 327UL, 432UL, 171UL, 289UL, 90UL,  391UL, 400UL,
+    489UL, 145UL, 337UL, 300UL, 449UL, 450UL, 37UL,  82UL,  166UL, 492UL, 72UL,  469UL, 392UL, 185UL, 109UL, 190UL,
+    409UL, 405UL, 35UL,  88UL,  94UL,  132UL, 55UL,  235UL, 355UL, 398UL, 140UL, 466UL, 142UL, 220UL, 420UL, 184UL,
+    245UL, 190UL, 131UL, 391UL, 67UL,  136UL, 269UL, 92UL,  87UL,  199UL, 210UL, 416UL, 414UL, 334UL, 57UL,  471UL,
+    506UL, 73UL,  456UL, 28UL,  348UL, 118UL, 448UL, 63UL,  213UL, 381UL, 233UL, 342UL, 178UL, 55UL,  20UL,  323UL,
+    478UL, 145UL, 471UL, 119UL, 435UL, 386UL, 368UL, 76UL,  40UL,  469UL, 26UL,  390UL, 144UL, 145UL, 391UL, 189UL,
+    505UL, 271UL, 161UL, 504UL, 394UL, 201UL, 302UL, 187UL, 303UL, 476UL, 369UL, 259UL, 236UL, 400UL, 175UL, 315UL,
+    415UL, 86UL,  297UL, 294UL, 132UL, 329UL, 87UL,  341UL, 174UL, 260UL, 157UL, 65UL,  511UL, 139UL, 179UL, 317UL,
+    442UL, 495UL, 437UL, 102UL, 299UL, 58UL,  504UL, 509UL, 194UL, 250UL, 197UL, 404UL, 433UL, 140UL, 477UL, 398UL,
+    445UL, 218UL, 59UL,  214UL, 383UL, 473UL, 437UL, 427UL, 402UL, 116UL, 291UL, 241UL, 174UL, 396UL, 221UL, 168UL,
+    279UL, 268UL, 8UL,   159UL, 218UL, 380UL, 191UL, 448UL, 442UL, 219UL, 509UL, 448UL, 288UL, 280UL, 129UL, 30UL,
+    129UL, 259UL, 134UL, 278UL, 184UL, 184UL, 254UL, 386UL, 46UL,  60UL,  19UL,  505UL, 170UL, 481UL, 161UL, 249UL,
+    211UL, 297UL, 211UL, 389UL, 33UL,  122UL, 465UL, 493UL, 484UL, 153UL, 316UL, 364UL, 245UL, 162UL, 20UL,  44UL,
+    399UL, 470UL, 275UL, 232UL, 87UL,  53UL,  333UL, 46UL,  362UL, 171UL, 54UL,  177UL, 502UL, 94UL,  195UL, 233UL,
+    85UL,  247UL, 495UL, 84UL,  22UL,  336UL, 150UL, 60UL,  498UL, 412UL, 448UL, 372UL, 270UL, 11UL,  188UL, 200UL,
+    223UL, 255UL, 247UL, 98UL,  243UL, 449UL, 438UL, 17UL,  301UL, 211UL, 13UL,  140UL, 478UL, 493UL, 216UL, 150UL,
+    44UL,  357UL, 55UL,  494UL, 412UL, 167UL, 58UL,  239UL, 214UL, 306UL, 245UL, 270UL, 186UL, 236UL, 124UL, 307UL,
+    494UL, 398UL, 56UL,  406UL, 473UL, 195UL, 289UL, 369UL, 219UL, 238UL, 58UL,  24UL,  437UL, 427UL, 252UL, 471UL,
+    358UL, 211UL, 121UL, 374UL, 244UL, 120UL, 251UL, 437UL, 321UL, 189UL, 413UL, 217UL, 163UL, 374UL, 36UL,  264UL,
+};
+uint32_t rand_arr_10_b10_w32_arr[1024] = {
+    924UL,  338UL,  130UL,  360UL, 628UL,  770UL,  391UL,  104UL,  326UL,  763UL,  162UL,  343UL,  556UL, 764UL,
+    178UL,  670UL,  89UL,   912UL, 914UL,  709UL,  187UL,  646UL,  862UL,  246UL,  24UL,   701UL,  838UL, 422UL,
+    381UL,  768UL,  380UL,  480UL, 929UL,  257UL,  139UL,  660UL,  142UL,  236UL,  234UL,  752UL,  835UL, 902UL,
+    560UL,  719UL,  162UL,  331UL, 554UL,  681UL,  863UL,  159UL,  427UL,  669UL,  341UL,  53UL,   858UL, 661UL,
+    152UL,  364UL,  627UL,  317UL, 683UL,  758UL,  9UL,    669UL,  128UL,  3UL,    869UL,  116UL,  161UL, 946UL,
+    914UL,  207UL,  370UL,  90UL,  22UL,   602UL,  583UL,  37UL,   948UL,  547UL,  608UL,  554UL,  793UL, 843UL,
+    212UL,  163UL,  922UL,  748UL, 610UL,  49UL,   679UL,  360UL,  963UL,  873UL,  193UL,  691UL,  324UL, 965UL,
+    137UL,  494UL,  594UL,  844UL, 899UL,  539UL,  555UL,  443UL,  97UL,   981UL,  84UL,   548UL,  951UL, 48UL,
+    86UL,   770UL,  966UL,  42UL,  155UL,  848UL,  474UL,  817UL,  624UL,  242UL,  538UL,  261UL,  637UL, 104UL,
+    301UL,  1002UL, 504UL,  880UL, 940UL,  740UL,  139UL,  763UL,  25UL,   1010UL, 64UL,   904UL,  710UL, 638UL,
+    353UL,  832UL,  977UL,  901UL, 751UL,  266UL,  632UL,  927UL,  387UL,  485UL,  666UL,  207UL,  690UL, 185UL,
+    857UL,  559UL,  1UL,    363UL, 839UL,  732UL,  650UL,  606UL,  248UL,  874UL,  979UL,  732UL,  746UL, 694UL,
+    460UL,  795UL,  348UL,  865UL, 36UL,   270UL,  953UL,  684UL,  290UL,  850UL,  953UL,  1000UL, 102UL, 797UL,
+    1000UL, 915UL,  601UL,  912UL, 228UL,  108UL,  941UL,  209UL,  407UL,  535UL,  39UL,   894UL,  62UL,  838UL,
+    604UL,  712UL,  674UL,  816UL, 525UL,  657UL,  620UL,  275UL,  415UL,  18UL,   270UL,  763UL,  71UL,  481UL,
+    452UL,  750UL,  293UL,  432UL, 71UL,   986UL,  315UL,  153UL,  213UL,  631UL,  756UL,  655UL,  601UL, 18UL,
+    683UL,  84UL,   391UL,  232UL, 878UL,  97UL,   143UL,  128UL,  1022UL, 928UL,  391UL,  362UL,  581UL, 43UL,
+    296UL,  966UL,  190UL,  274UL, 297UL,  634UL,  164UL,  234UL,  196UL,  808UL,  983UL,  586UL,  697UL, 784UL,
+    50UL,   138UL,  71UL,   397UL, 929UL,  733UL,  479UL,  843UL,  150UL,  523UL,  809UL,  543UL,  949UL, 713UL,
+    269UL,  733UL,  617UL,  732UL, 596UL,  97UL,   224UL,  829UL,  699UL,  895UL,  477UL,  757UL,  768UL, 665UL,
+    297UL,  504UL,  262UL,  151UL, 436UL,  781UL,  553UL,  160UL,  894UL,  919UL,  879UL,  813UL,  889UL, 860UL,
+    420UL,  807UL,  250UL,  525UL, 596UL,  984UL,  456UL,  897UL,  736UL,  776UL,  704UL,  896UL,  413UL, 371UL,
+    533UL,  495UL,  333UL,  896UL, 674UL,  577UL,  515UL,  922UL,  939UL,  508UL,  925UL,  415UL,  854UL, 66UL,
+    124UL,  265UL,  395UL,  477UL, 155UL,  158UL,  874UL,  81UL,   21UL,   629UL,  868UL,  1005UL, 65UL,  926UL,
+    765UL,  647UL,  884UL,  345UL, 880UL,  364UL,  430UL,  756UL,  52UL,   810UL,  568UL,  227UL,  561UL, 63UL,
+    389UL,  728UL,  284UL,  946UL, 207UL,  580UL,  339UL,  918UL,  962UL,  879UL,  899UL,  237UL,  44UL,  624UL,
+    242UL,  994UL,  114UL,  948UL, 901UL,  40UL,   706UL,  71UL,   345UL,  180UL,  560UL,  1015UL, 123UL, 689UL,
+    576UL,  598UL,  247UL,  810UL, 791UL,  276UL,  646UL,  103UL,  746UL,  374UL,  1012UL, 439UL,  689UL, 1003UL,
+    34UL,   33UL,   413UL,  807UL, 812UL,  324UL,  329UL,  698UL,  695UL,  790UL,  856UL,  291UL,  260UL, 570UL,
+    373UL,  406UL,  699UL,  113UL, 266UL,  622UL,  96UL,   769UL,  51UL,   129UL,  982UL,  744UL,  217UL, 155UL,
+    169UL,  441UL,  516UL,  920UL, 309UL,  356UL,  798UL,  912UL,  91UL,   437UL,  15UL,   568UL,  664UL, 270UL,
+    784UL,  157UL,  668UL,  249UL, 36UL,   511UL,  508UL,  480UL,  990UL,  399UL,  163UL,  961UL,  491UL, 373UL,
+    428UL,  704UL,  561UL,  418UL, 977UL,  184UL,  857UL,  1017UL, 879UL,  377UL,  438UL,  560UL,  587UL, 655UL,
+    232UL,  141UL,  425UL,  819UL, 540UL,  688UL,  562UL,  67UL,   561UL,  529UL,  720UL,  870UL,  442UL, 76UL,
+    680UL,  94UL,   759UL,  759UL, 686UL,  51UL,   472UL,  122UL,  982UL,  642UL,  898UL,  583UL,  940UL, 192UL,
+    531UL,  758UL,  840UL,  684UL, 649UL,  464UL,  591UL,  862UL,  918UL,  971UL,  587UL,  523UL,  173UL, 720UL,
+    198UL,  86UL,   538UL,  694UL, 904UL,  294UL,  502UL,  521UL,  990UL,  789UL,  856UL,  103UL,  47UL,  908UL,
+    491UL,  112UL,  205UL,  645UL, 517UL,  806UL,  892UL,  740UL,  447UL,  80UL,   466UL,  957UL,  887UL, 439UL,
+    126UL,  124UL,  25UL,   433UL, 544UL,  963UL,  299UL,  276UL,  652UL,  594UL,  912UL,  1023UL, 631UL, 484UL,
+    250UL,  293UL,  449UL,  782UL, 294UL,  368UL,  439UL,  181UL,  373UL,  1021UL, 623UL,  918UL,  623UL, 584UL,
+    963UL,  373UL,  12UL,   735UL, 83UL,   834UL,  612UL,  692UL,  368UL,  777UL,  7UL,    915UL,  640UL, 457UL,
+    136UL,  838UL,  413UL,  856UL, 227UL,  152UL,  231UL,  376UL,  850UL,  268UL,  667UL,  902UL,  60UL,  644UL,
+    773UL,  495UL,  381UL,  24UL,  390UL,  202UL,  14UL,   627UL,  1021UL, 256UL,  651UL,  456UL,  879UL, 71UL,
+    648UL,  4UL,    824UL,  990UL, 807UL,  774UL,  687UL,  668UL,  485UL,  333UL,  206UL,  237UL,  171UL, 855UL,
+    131UL,  547UL,  140UL,  823UL, 630UL,  9UL,    44UL,   7UL,    889UL,  918UL,  505UL,  159UL,  837UL, 127UL,
+    534UL,  202UL,  458UL,  130UL, 434UL,  710UL,  874UL,  816UL,  8UL,    968UL,  583UL,  672UL,  170UL, 858UL,
+    500UL,  237UL,  342UL,  510UL, 33UL,   679UL,  170UL,  532UL,  168UL,  762UL,  214UL,  1004UL, 667UL, 871UL,
+    948UL,  394UL,  44UL,   700UL, 656UL,  102UL,  25UL,   945UL,  528UL,  651UL,  979UL,  668UL,  443UL, 97UL,
+    777UL,  1008UL, 359UL,  209UL, 411UL,  643UL,  492UL,  861UL,  360UL,  385UL,  902UL,  819UL,  504UL, 801UL,
+    862UL,  808UL,  827UL,  27UL,  384UL,  465UL,  927UL,  529UL,  787UL,  577UL,  918UL,  897UL,  341UL, 369UL,
+    717UL,  101UL,  1011UL, 392UL, 652UL,  601UL,  203UL,  414UL,  71UL,   441UL,  554UL,  349UL,  171UL, 1023UL,
+    539UL,  25UL,   52UL,   302UL, 760UL,  1021UL, 365UL,  728UL,  321UL,  709UL,  221UL,  582UL,  284UL, 999UL,
+    40UL,   217UL,  350UL,  268UL, 518UL,  781UL,  14UL,   523UL,  300UL,  913UL,  777UL,  382UL,  247UL, 140UL,
+    986UL,  115UL,  256UL,  124UL, 853UL,  727UL,  702UL,  938UL,  31UL,   21UL,   789UL,  547UL,  19UL,  142UL,
+    17UL,   124UL,  717UL,  664UL, 976UL,  667UL,  1001UL, 626UL,  240UL,  431UL,  390UL,  393UL,  425UL, 914UL,
+    645UL,  807UL,  985UL,  855UL, 490UL,  167UL,  984UL,  661UL,  376UL,  787UL,  893UL,  70UL,   295UL, 643UL,
+    51UL,   930UL,  963UL,  114UL, 337UL,  895UL,  802UL,  403UL,  340UL,  103UL,  894UL,  784UL,  221UL, 160UL,
+    192UL,  408UL,  480UL,  836UL, 414UL,  520UL,  474UL,  613UL,  648UL,  937UL,  313UL,  821UL,  963UL, 599UL,
+    385UL,  540UL,  830UL,  502UL, 29UL,   416UL,  559UL,  802UL,  734UL,  132UL,  890UL,  45UL,   434UL, 935UL,
+    148UL,  18UL,   853UL,  152UL, 906UL,  237UL,  447UL,  877UL,  1UL,    413UL,  966UL,  348UL,  311UL, 749UL,
+    533UL,  18UL,   930UL,  648UL, 29UL,   406UL,  500UL,  716UL,  424UL,  763UL,  521UL,  76UL,   377UL, 378UL,
+    192UL,  277UL,  881UL,  989UL, 975UL,  922UL,  588UL,  755UL,  916UL,  406UL,  971UL,  743UL,  521UL, 537UL,
+    147UL,  1019UL, 60UL,   213UL, 1022UL, 984UL,  312UL,  937UL,  957UL,  635UL,  503UL,  697UL,  996UL, 691UL,
+    107UL,  743UL,  111UL,  24UL,  239UL,  244UL,  547UL,  923UL,  81UL,   273UL,  750UL,  637UL,  961UL, 84UL,
+    130UL,  90UL,   744UL,  477UL, 452UL,  360UL,  185UL,  740UL,  986UL,  130UL,  183UL,  812UL,  67UL,  1008UL,
+    641UL,  970UL,  1006UL, 92UL,  175UL,  615UL,  46UL,   420UL,  559UL,  252UL,  705UL,  33UL,   721UL, 854UL,
+    288UL,  479UL,  783UL,  827UL, 301UL,  18UL,   253UL,  268UL,  158UL,  461UL,  865UL,  931UL,  899UL, 850UL,
+    676UL,  912UL,  468UL,  293UL, 881UL,  76UL,   229UL,  662UL,  947UL,  150UL,  47UL,   354UL,  676UL, 879UL,
+    707UL,  1017UL, 114UL,  584UL, 589UL,  433UL,  569UL,  335UL,  473UL,  755UL,  376UL,  496UL,  545UL, 559UL,
+    639UL,  601UL,  917UL,  912UL, 281UL,  334UL,  8UL,    923UL,  93UL,   903UL,  395UL,  819UL,  55UL,  640UL,
+    585UL,  501UL,  205UL,  300UL, 105UL,  913UL,  844UL,  265UL,  384UL,  496UL,  714UL,  708UL,  655UL, 331UL,
+    386UL,  843UL,  952UL,  215UL, 699UL,  395UL,  835UL,  366UL,  379UL,  866UL,  1019UL, 0UL,    753UL, 983UL,
+    841UL,  739UL,  133UL,  381UL, 472UL,  410UL,  557UL,  575UL,  189UL,  855UL,  553UL,  463UL,  928UL, 1005UL,
+    729UL,  149UL,
+};
+uint32_t rand_arr_11_b11_w32_arr[1024] = {
+    72UL,   1028UL, 514UL,  301UL,  1944UL, 372UL,  627UL,  1034UL, 174UL,  1546UL, 62UL,   82UL,   1835UL, 1749UL,
+    1503UL, 726UL,  597UL,  1630UL, 102UL,  1181UL, 1474UL, 1953UL, 146UL,  1966UL, 1496UL, 1819UL, 1933UL, 997UL,
+    651UL,  1636UL, 1871UL, 57UL,   601UL,  1935UL, 1288UL, 173UL,  1617UL, 1696UL, 72UL,   480UL,  1095UL, 1200UL,
+    860UL,  541UL,  1143UL, 1962UL, 229UL,  523UL,  1745UL, 808UL,  134UL,  797UL,  283UL,  2029UL, 743UL,  1790UL,
+    1176UL, 350UL,  1809UL, 565UL,  1545UL, 26UL,   1589UL, 161UL,  1993UL, 1257UL, 817UL,  1558UL, 250UL,  307UL,
+    828UL,  1300UL, 706UL,  596UL,  2045UL, 1972UL, 951UL,  1472UL, 193UL,  47UL,   1069UL, 90UL,   760UL,  336UL,
+    1303UL, 1280UL, 1883UL, 102UL,  1609UL, 656UL,  1119UL, 1569UL, 624UL,  1038UL, 1520UL, 1778UL, 804UL,  1464UL,
+    1600UL, 1829UL, 1658UL, 1691UL, 190UL,  1470UL, 1837UL, 386UL,  1261UL, 445UL,  621UL,  1184UL, 1124UL, 734UL,
+    878UL,  1865UL, 1320UL, 1942UL, 1458UL, 301UL,  1840UL, 538UL,  869UL,  1310UL, 519UL,  1255UL, 887UL,  919UL,
+    280UL,  1634UL, 196UL,  621UL,  347UL,  719UL,  1926UL, 1983UL, 1262UL, 933UL,  1899UL, 720UL,  1657UL, 494UL,
+    36UL,   1710UL, 321UL,  897UL,  1744UL, 543UL,  1048UL, 1158UL, 727UL,  144UL,  343UL,  1272UL, 798UL,  171UL,
+    509UL,  810UL,  1825UL, 80UL,   1757UL, 862UL,  1416UL, 1960UL, 1737UL, 1346UL, 1600UL, 603UL,  126UL,  310UL,
+    1210UL, 122UL,  48UL,   210UL,  219UL,  1422UL, 1979UL, 1550UL, 1689UL, 1462UL, 1630UL, 1676UL, 364UL,  1007UL,
+    1694UL, 221UL,  872UL,  1952UL, 1277UL, 578UL,  480UL,  1002UL, 1075UL, 1534UL, 417UL,  28UL,   792UL,  1812UL,
+    1426UL, 1270UL, 1957UL, 943UL,  1650UL, 263UL,  771UL,  1539UL, 614UL,  1044UL, 1477UL, 630UL,  183UL,  113UL,
+    29UL,   1303UL, 275UL,  34UL,   1822UL, 907UL,  930UL,  2009UL, 152UL,  336UL,  388UL,  1153UL, 211UL,  1255UL,
+    33UL,   1149UL, 580UL,  1707UL, 972UL,  513UL,  1704UL, 1470UL, 404UL,  1371UL, 1927UL, 810UL,  777UL,  934UL,
+    1773UL, 1311UL, 408UL,  1596UL, 1116UL, 1297UL, 1682UL, 196UL,  1048UL, 163UL,  1830UL, 1007UL, 737UL,  243UL,
+    819UL,  205UL,  1261UL, 1421UL, 560UL,  1910UL, 23UL,   1493UL, 1204UL, 1424UL, 119UL,  241UL,  1221UL, 642UL,
+    409UL,  223UL,  690UL,  1168UL, 1976UL, 500UL,  859UL,  1177UL, 659UL,  1475UL, 1671UL, 1721UL, 406UL,  1845UL,
+    1643UL, 929UL,  1576UL, 1247UL, 954UL,  710UL,  1513UL, 1900UL, 443UL,  1200UL, 682UL,  1696UL, 400UL,  952UL,
+    486UL,  1945UL, 1838UL, 1587UL, 1786UL, 1473UL, 774UL,  986UL,  594UL,  1871UL, 925UL,  545UL,  951UL,  1280UL,
+    485UL,  1167UL, 1300UL, 1573UL, 611UL,  1003UL, 691UL,  1736UL, 744UL,  989UL,  979UL,  1411UL, 1808UL, 1152UL,
+    1684UL, 1725UL, 1729UL, 1216UL, 807UL,  1995UL, 397UL,  257UL,  901UL,  911UL,  1895UL, 1511UL, 431UL,  1246UL,
+    13UL,   0UL,    709UL,  1344UL, 144UL,  409UL,  766UL,  461UL,  918UL,  174UL,  201UL,  360UL,  581UL,  907UL,
+    2045UL, 3UL,    56UL,   450UL,  1254UL, 1263UL, 61UL,   1430UL, 532UL,  408UL,  409UL,  1986UL, 1346UL, 2029UL,
+    788UL,  438UL,  585UL,  442UL,  1967UL, 1239UL, 636UL,  1935UL, 952UL,  948UL,  962UL,  1346UL, 365UL,  529UL,
+    147UL,  1591UL, 1995UL, 453UL,  1051UL, 919UL,  1118UL, 58UL,   1636UL, 2037UL, 194UL,  1199UL, 1757UL, 1318UL,
+    454UL,  324UL,  639UL,  764UL,  1001UL, 442UL,  508UL,  1791UL, 678UL,  15UL,   898UL,  138UL,  1419UL, 1819UL,
+    1190UL, 1330UL, 183UL,  575UL,  239UL,  1374UL, 1730UL, 1723UL, 1933UL, 81UL,   1880UL, 1665UL, 1659UL, 398UL,
+    301UL,  915UL,  1080UL, 39UL,   1431UL, 1824UL, 1723UL, 1122UL, 1773UL, 1341UL, 1729UL, 565UL,  1882UL, 301UL,
+    1375UL, 1170UL, 1416UL, 1450UL, 1185UL, 1090UL, 427UL,  503UL,  1446UL, 498UL,  1226UL, 642UL,  1505UL, 1063UL,
+    946UL,  760UL,  624UL,  1835UL, 888UL,  1885UL, 14UL,   138UL,  1968UL, 1156UL, 1896UL, 1392UL, 243UL,  2021UL,
+    127UL,  1709UL, 1083UL, 998UL,  1925UL, 1609UL, 1586UL, 702UL,  1633UL, 473UL,  1459UL, 465UL,  1587UL, 1902UL,
+    950UL,  1805UL, 198UL,  1226UL, 1694UL, 940UL,  1889UL, 1484UL, 1043UL, 1361UL, 2047UL, 513UL,  407UL,  1048UL,
+    1820UL, 912UL,  707UL,  1222UL, 1430UL, 1906UL, 7UL,    1634UL, 461UL,  1914UL, 813UL,  1517UL, 1022UL, 22UL,
+    1061UL, 1324UL, 1382UL, 50UL,   1088UL, 6UL,    1596UL, 87UL,   419UL,  180UL,  36UL,   1988UL, 1240UL, 378UL,
+    84UL,   1638UL, 758UL,  1070UL, 267UL,  932UL,  1862UL, 1291UL, 1977UL, 802UL,  1617UL, 1030UL, 691UL,  750UL,
+    577UL,  1011UL, 1330UL, 1900UL, 1876UL, 958UL,  333UL,  879UL,  1401UL, 324UL,  706UL,  273UL,  774UL,  977UL,
+    480UL,  558UL,  1630UL, 1027UL, 1926UL, 1942UL, 1219UL, 1621UL, 1606UL, 291UL,  2020UL, 687UL,  1631UL, 412UL,
+    201UL,  1774UL, 686UL,  897UL,  656UL,  1755UL, 192UL,  774UL,  169UL,  835UL,  1645UL, 1614UL, 935UL,  4UL,
+    337UL,  2019UL, 714UL,  97UL,   735UL,  1173UL, 514UL,  1658UL, 967UL,  651UL,  501UL,  421UL,  142UL,  1645UL,
+    184UL,  1999UL, 146UL,  574UL,  1738UL, 199UL,  352UL,  1810UL, 1300UL, 1171UL, 1726UL, 1080UL, 1332UL, 1956UL,
+    714UL,  1680UL, 1613UL, 2021UL, 523UL,  481UL,  1269UL, 653UL,  76UL,   434UL,  266UL,  789UL,  211UL,  122UL,
+    1353UL, 1693UL, 1420UL, 1752UL, 340UL,  1195UL, 680UL,  294UL,  1659UL, 1308UL, 787UL,  995UL,  99UL,   631UL,
+    1273UL, 1154UL, 1560UL, 1858UL, 723UL,  1762UL, 451UL,  1088UL, 1908UL, 976UL,  1322UL, 1283UL, 1235UL, 672UL,
+    975UL,  901UL,  845UL,  1424UL, 1754UL, 1813UL, 1863UL, 1960UL, 1838UL, 1959UL, 1289UL, 655UL,  1138UL, 115UL,
+    394UL,  116UL,  1315UL, 1246UL, 1545UL, 834UL,  763UL,  631UL,  1476UL, 352UL,  442UL,  293UL,  1008UL, 1942UL,
+    1826UL, 32UL,   872UL,  1507UL, 878UL,  714UL,  1961UL, 1696UL, 1240UL, 1081UL, 395UL,  964UL,  1982UL, 305UL,
+    1900UL, 1901UL, 1568UL, 670UL,  698UL,  474UL,  238UL,  888UL,  1127UL, 474UL,  1813UL, 1804UL, 1105UL, 728UL,
+    1820UL, 1522UL, 104UL,  1122UL, 1642UL, 339UL,  538UL,  1601UL, 56UL,   182UL,  799UL,  304UL,  1528UL, 1250UL,
+    1398UL, 67UL,   808UL,  791UL,  140UL,  1903UL, 1859UL, 1238UL, 610UL,  1627UL, 976UL,  1468UL, 409UL,  1160UL,
+    350UL,  381UL,  1812UL, 225UL,  637UL,  884UL,  1132UL, 217UL,  1207UL, 856UL,  1828UL, 881UL,  194UL,  2034UL,
+    520UL,  936UL,  1509UL, 1727UL, 539UL,  792UL,  940UL,  322UL,  1566UL, 702UL,  1901UL, 1631UL, 1092UL, 665UL,
+    606UL,  829UL,  1216UL, 1717UL, 407UL,  1153UL, 2012UL, 1538UL, 289UL,  1666UL, 1214UL, 243UL,  634UL,  593UL,
+    372UL,  557UL,  140UL,  1892UL, 1716UL, 290UL,  1730UL, 1276UL, 409UL,  1132UL, 1940UL, 797UL,  803UL,  2046UL,
+    2032UL, 185UL,  1900UL, 975UL,  1333UL, 451UL,  55UL,   575UL,  2044UL, 991UL,  1077UL, 73UL,   1202UL, 822UL,
+    1318UL, 1685UL, 1159UL, 1285UL, 49UL,   861UL,  921UL,  2029UL, 1737UL, 1580UL, 123UL,  350UL,  283UL,  1369UL,
+    927UL,  1020UL, 1908UL, 207UL,  177UL,  1118UL, 1791UL, 548UL,  557UL,  452UL,  1209UL, 1129UL, 385UL,  1368UL,
+    275UL,  450UL,  316UL,  545UL,  1240UL, 958UL,  1224UL, 467UL,  1971UL, 1252UL, 162UL,  244UL,  416UL,  1334UL,
+    1920UL, 1897UL, 468UL,  1571UL, 1850UL, 684UL,  1783UL, 243UL,  486UL,  1719UL, 458UL,  56UL,   1318UL, 483UL,
+    448UL,  1527UL, 646UL,  1628UL, 1853UL, 1626UL, 14UL,   1821UL, 1911UL, 1686UL, 1203UL, 4UL,    1605UL, 786UL,
+    665UL,  1789UL, 236UL,  1684UL, 285UL,  1621UL, 532UL,  791UL,  382UL,  1989UL, 1796UL, 53UL,   771UL,  454UL,
+    1398UL, 606UL,  2022UL, 1541UL, 850UL,  1821UL, 127UL,  506UL,  355UL,  1510UL, 1947UL, 395UL,  277UL,  81UL,
+    563UL,  1206UL, 1425UL, 631UL,  1598UL, 572UL,  925UL,  1637UL, 180UL,  502UL,  1267UL, 767UL,  2009UL, 955UL,
+    1889UL, 74UL,   948UL,  307UL,  1899UL, 488UL,  1140UL, 857UL,  965UL,  1992UL, 1273UL, 167UL,  2038UL, 118UL,
+    850UL,  545UL,  1795UL, 397UL,  22UL,   706UL,  1611UL, 1609UL, 1112UL, 572UL,  1938UL, 1864UL, 838UL,  2046UL,
+    1767UL, 321UL,  39UL,   555UL,  1992UL, 537UL,  486UL,  1345UL, 1626UL, 907UL,  1422UL, 1590UL, 1315UL, 754UL,
+    691UL,  753UL,  1847UL, 73UL,   1849UL, 1787UL, 1056UL, 377UL,  1630UL, 38UL,   460UL,  978UL,  1150UL, 659UL,
+    1841UL, 715UL,  1422UL, 493UL,  1301UL, 1029UL, 1387UL, 437UL,  1043UL, 1116UL, 54UL,   1590UL, 1313UL, 1141UL,
+    962UL,  1544UL, 178UL,  15UL,   1628UL, 471UL,  1750UL, 1959UL, 807UL,  269UL,  1593UL, 503UL,  165UL,  1066UL,
+    1137UL, 988UL,  586UL,  1365UL, 1974UL, 1673UL, 235UL,  942UL,  32UL,   1493UL, 463UL,  1279UL, 1441UL, 698UL,
+    761UL,  405UL,  163UL,  1234UL, 447UL,  795UL,  454UL,  118UL,  1513UL, 1780UL, 26UL,   1173UL, 299UL,  1439UL,
+    1885UL, 1550UL,
+};
+uint32_t rand_arr_12_b12_w32_arr[1024] = {
+    942UL,  917UL,  3982UL, 3540UL, 3841UL, 497UL,  511UL,  3694UL, 3827UL, 1026UL, 2571UL, 2422UL, 2757UL, 1834UL,
+    59UL,   1685UL, 1718UL, 2598UL, 2204UL, 835UL,  3819UL, 642UL,  2454UL, 585UL,  1515UL, 220UL,  2806UL, 11UL,
+    2625UL, 609UL,  2129UL, 826UL,  3551UL, 3264UL, 4047UL, 376UL,  1780UL, 274UL,  3194UL, 175UL,  1443UL, 1026UL,
+    3790UL, 3207UL, 1581UL, 2112UL, 2987UL, 2235UL, 2199UL, 3341UL, 3981UL, 1271UL, 2034UL, 1542UL, 2953UL, 903UL,
+    2899UL, 657UL,  756UL,  829UL,  1348UL, 3930UL, 263UL,  2902UL, 929UL,  39UL,   2290UL, 2403UL, 3980UL, 1947UL,
+    2546UL, 1923UL, 1515UL, 2757UL, 3531UL, 2047UL, 2140UL, 99UL,   2636UL, 3165UL, 1607UL, 292UL,  478UL,  2319UL,
+    721UL,  2591UL, 3896UL, 2172UL, 2225UL, 361UL,  2998UL, 800UL,  3767UL, 900UL,  2626UL, 2277UL, 734UL,  359UL,
+    1375UL, 2960UL, 1253UL, 1190UL, 3781UL, 2382UL, 2328UL, 3078UL, 2904UL, 2888UL, 2825UL, 1863UL, 1199UL, 108UL,
+    3083UL, 3044UL, 2566UL, 740UL,  757UL,  828UL,  2311UL, 3453UL, 1772UL, 2907UL, 698UL,  3704UL, 950UL,  2628UL,
+    2517UL, 2693UL, 400UL,  30UL,   2937UL, 793UL,  57UL,   485UL,  3623UL, 1791UL, 3680UL, 1317UL, 2678UL, 3303UL,
+    3075UL, 2800UL, 3383UL, 874UL,  788UL,  2494UL, 2255UL, 2805UL, 1745UL, 1363UL, 443UL,  3224UL, 3326UL, 1725UL,
+    661UL,  1890UL, 1803UL, 2293UL, 3749UL, 1854UL, 3166UL, 1487UL, 1531UL, 4082UL, 3088UL, 2207UL, 3879UL, 3253UL,
+    1352UL, 1409UL, 1973UL, 3235UL, 3330UL, 3951UL, 3957UL, 3596UL, 223UL,  2759UL, 840UL,  2470UL, 1380UL, 849UL,
+    155UL,  2267UL, 3398UL, 3002UL, 558UL,  3999UL, 3862UL, 3757UL, 2490UL, 4031UL, 248UL,  787UL,  1326UL, 2990UL,
+    2575UL, 2066UL, 4003UL, 2065UL, 172UL,  3314UL, 1268UL, 698UL,  2440UL, 610UL,  155UL,  114UL,  3253UL, 408UL,
+    1891UL, 2612UL, 1179UL, 1442UL, 4015UL, 3302UL, 3476UL, 3278UL, 1583UL, 1027UL, 2421UL, 934UL,  3592UL, 1728UL,
+    2800UL, 691UL,  882UL,  819UL,  1997UL, 3734UL, 590UL,  12UL,   1942UL, 2879UL, 2480UL, 1405UL, 987UL,  3647UL,
+    392UL,  3783UL, 3135UL, 121UL,  1546UL, 3563UL, 306UL,  711UL,  2555UL, 2639UL, 1608UL, 2507UL, 2949UL, 1918UL,
+    2632UL, 2562UL, 624UL,  2183UL, 185UL,  347UL,  1879UL, 2783UL, 3813UL, 3080UL, 542UL,  1513UL, 1146UL, 1302UL,
+    3412UL, 881UL,  3777UL, 3848UL, 2152UL, 1005UL, 325UL,  2616UL, 998UL,  2208UL, 2466UL, 2764UL, 1804UL, 3272UL,
+    632UL,  3790UL, 3922UL, 1538UL, 2590UL, 2373UL, 3617UL, 2607UL, 2564UL, 3702UL, 2265UL, 479UL,  2478UL, 1781UL,
+    3429UL, 415UL,  1482UL, 1888UL, 3722UL, 3776UL, 1986UL, 2295UL, 1572UL, 211UL,  3207UL, 1848UL, 4094UL, 580UL,
+    518UL,  758UL,  2762UL, 91UL,   32UL,   2876UL, 286UL,  988UL,  1375UL, 1953UL, 1690UL, 494UL,  2260UL, 1710UL,
+    2450UL, 757UL,  396UL,  828UL,  2061UL, 3050UL, 3702UL, 1188UL, 3043UL, 3396UL, 850UL,  2136UL, 103UL,  2791UL,
+    2207UL, 505UL,  1166UL, 2534UL, 937UL,  900UL,  81UL,   2180UL, 2401UL, 3358UL, 755UL,  750UL,  667UL,  1065UL,
+    2234UL, 161UL,  66UL,   394UL,  280UL,  2701UL, 3846UL, 1486UL, 2636UL, 125UL,  2841UL, 2365UL, 1183UL, 2281UL,
+    1729UL, 412UL,  1478UL, 391UL,  1900UL, 265UL,  2798UL, 765UL,  1726UL, 3614UL, 3569UL, 479UL,  1433UL, 2062UL,
+    2898UL, 3948UL, 2055UL, 3227UL, 3366UL, 563UL,  2039UL, 2338UL, 1534UL, 3016UL, 2784UL, 2907UL, 807UL,  710UL,
+    3330UL, 470UL,  3776UL, 1043UL, 299UL,  81UL,   3607UL, 1673UL, 3069UL, 1282UL, 596UL,  1905UL, 1715UL, 3179UL,
+    2885UL, 2629UL, 1311UL, 3086UL, 3098UL, 1176UL, 874UL,  915UL,  2528UL, 266UL,  3632UL, 3047UL, 1435UL, 453UL,
+    2798UL, 4078UL, 785UL,  1673UL, 3072UL, 3447UL, 2131UL, 1928UL, 2657UL, 131UL,  1072UL, 2336UL, 2517UL, 2406UL,
+    1831UL, 1377UL, 3295UL, 206UL,  3289UL, 1152UL, 993UL,  2229UL, 1481UL, 508UL,  2582UL, 463UL,  86UL,   1154UL,
+    537UL,  2988UL, 1445UL, 1681UL, 792UL,  2954UL, 806UL,  3665UL, 2515UL, 2287UL, 1364UL, 3631UL, 413UL,  3728UL,
+    3617UL, 3952UL, 1666UL, 3877UL, 1473UL, 755UL,  3224UL, 2826UL, 1745UL, 2561UL, 3101UL, 125UL,  893UL,  2464UL,
+    3308UL, 1816UL, 122UL,  763UL,  3939UL, 1350UL, 2470UL, 1535UL, 3824UL, 2743UL, 3870UL, 3413UL, 3165UL, 2689UL,
+    4066UL, 3382UL, 1936UL, 3304UL, 1551UL, 405UL,  2264UL, 476UL,  1113UL, 2719UL, 3587UL, 3439UL, 3993UL, 3584UL,
+    1816UL, 1608UL, 1648UL, 2483UL, 3284UL, 1702UL, 3676UL, 786UL,  4054UL, 1692UL, 1635UL, 379UL,  3969UL, 3675UL,
+    1930UL, 2081UL, 392UL,  1898UL, 1473UL, 3125UL, 946UL,  428UL,  3996UL, 1300UL, 359UL,  3947UL, 2069UL, 2258UL,
+    452UL,  1782UL, 3268UL, 277UL,  2300UL, 369UL,  2205UL, 3207UL, 2492UL, 3097UL, 1930UL, 1460UL, 2212UL, 330UL,
+    1447UL, 895UL,  2712UL, 355UL,  260UL,  1028UL, 2781UL, 2983UL, 2486UL, 3487UL, 1851UL, 3072UL, 239UL,  3642UL,
+    753UL,  1270UL, 2909UL, 324UL,  3932UL, 1569UL, 2368UL, 3004UL, 3626UL, 2813UL, 1221UL, 1514UL, 441UL,  3268UL,
+    2307UL, 2461UL, 1645UL, 2934UL, 3760UL, 1293UL, 2047UL, 2806UL, 2686UL, 1829UL, 907UL,  2037UL, 1856UL, 208UL,
+    4056UL, 1321UL, 1861UL, 2587UL, 3289UL, 1853UL, 2269UL, 1630UL, 82UL,   1478UL, 1992UL, 2540UL, 2816UL, 608UL,
+    2417UL, 1324UL, 802UL,  2308UL, 803UL,  18UL,   1144UL, 206UL,  795UL,  866UL,  1286UL, 3665UL, 3822UL, 3865UL,
+    1106UL, 2365UL, 1514UL, 2968UL, 1322UL, 2751UL, 759UL,  1346UL, 107UL,  3142UL, 99UL,   2682UL, 331UL,  265UL,
+    3192UL, 2551UL, 1921UL, 1411UL, 2578UL, 831UL,  2694UL, 1981UL, 2385UL, 2827UL, 999UL,  940UL,  891UL,  1869UL,
+    4008UL, 1005UL, 2461UL, 2304UL, 3074UL, 1551UL, 1115UL, 1549UL, 453UL,  438UL,  3993UL, 2618UL, 3840UL, 1129UL,
+    324UL,  3117UL, 16UL,   1789UL, 3885UL, 302UL,  449UL,  2387UL, 1961UL, 3599UL, 2474UL, 1920UL, 28UL,   3091UL,
+    1180UL, 3795UL, 3803UL, 1981UL, 2424UL, 23UL,   1743UL, 2969UL, 547UL,  3340UL, 987UL,  3610UL, 3579UL, 276UL,
+    339UL,  2104UL, 253UL,  2812UL, 2491UL, 1837UL, 59UL,   2814UL, 3630UL, 360UL,  1775UL, 2002UL, 1718UL, 3574UL,
+    1965UL, 1899UL, 1876UL, 738UL,  51UL,   2778UL, 2668UL, 1430UL, 2526UL, 737UL,  2817UL, 1680UL, 2023UL, 1220UL,
+    1007UL, 3628UL, 1385UL, 519UL,  937UL,  443UL,  1537UL, 1292UL, 87UL,   2151UL, 539UL,  3983UL, 1160UL, 605UL,
+    360UL,  2690UL, 2228UL, 3800UL, 2064UL, 3195UL, 29UL,   3470UL, 1816UL, 646UL,  3561UL, 1644UL, 365UL,  867UL,
+    1775UL, 1535UL, 134UL,  1333UL, 497UL,  3016UL, 846UL,  1903UL, 3343UL, 3051UL, 891UL,  1752UL, 379UL,  676UL,
+    3032UL, 55UL,   2125UL, 680UL,  836UL,  710UL,  3399UL, 2553UL, 3002UL, 350UL,  2427UL, 49UL,   3197UL, 2458UL,
+    386UL,  2418UL, 298UL,  2693UL, 1256UL, 361UL,  1569UL, 200UL,  2830UL, 1842UL, 434UL,  1011UL, 850UL,  2460UL,
+    2265UL, 2663UL, 1255UL, 3364UL, 2595UL, 3050UL, 1945UL, 2315UL, 2129UL, 3254UL, 1819UL, 1468UL, 2301UL, 2412UL,
+    644UL,  78UL,   1517UL, 1391UL, 1787UL, 2504UL, 816UL,  16UL,   1200UL, 2203UL, 1259UL, 3592UL, 3908UL, 3457UL,
+    1854UL, 934UL,  1893UL, 1740UL, 3496UL, 462UL,  1575UL, 2929UL, 1950UL, 1362UL, 3474UL, 3739UL, 2145UL, 1882UL,
+    914UL,  1152UL, 522UL,  1287UL, 3719UL, 2001UL, 3158UL, 1315UL, 2826UL, 1256UL, 3616UL, 283UL,  2506UL, 661UL,
+    2977UL, 3143UL, 1993UL, 3986UL, 3054UL, 915UL,  3581UL, 3557UL, 3148UL, 1209UL, 2861UL, 2944UL, 4092UL, 3184UL,
+    1642UL, 1855UL, 1540UL, 2315UL, 2059UL, 3052UL, 2410UL, 3233UL, 1556UL, 45UL,   3225UL, 3909UL, 3335UL, 351UL,
+    1816UL, 883UL,  3221UL, 2510UL, 2939UL, 2159UL, 2529UL, 2989UL, 588UL,  46UL,   3203UL, 2296UL, 503UL,  2059UL,
+    3743UL, 272UL,  3461UL, 3699UL, 2104UL, 1662UL, 2930UL, 2741UL, 883UL,  981UL,  2397UL, 3317UL, 3756UL, 3721UL,
+    3621UL, 217UL,  2206UL, 4007UL, 1299UL, 2099UL, 710UL,  2292UL, 2151UL, 1037UL, 1329UL, 1614UL, 3582UL, 2957UL,
+    1906UL, 2938UL, 3900UL, 1374UL, 3049UL, 2459UL, 3510UL, 587UL,  963UL,  2894UL, 2215UL, 1918UL, 3910UL, 2726UL,
+    2900UL, 2536UL, 1704UL, 193UL,  3268UL, 406UL,  3879UL, 219UL,  2905UL, 1978UL, 1167UL, 2378UL, 3575UL, 3966UL,
+    1918UL, 472UL,  949UL,  1469UL, 3648UL, 3973UL, 1053UL, 3682UL, 3366UL, 2703UL, 507UL,  2820UL, 1679UL, 2985UL,
+    1979UL, 967UL,  1395UL, 935UL,  1402UL, 2447UL, 2894UL, 1817UL, 2683UL, 125UL,  3578UL, 1965UL, 598UL,  3177UL,
+    3813UL, 1091UL, 896UL,  3436UL, 2053UL, 1834UL, 51UL,   3462UL, 2100UL, 3805UL, 124UL,  797UL,  1085UL, 4073UL,
+    877UL,  395UL,  4089UL, 3300UL, 1765UL, 3425UL, 2354UL, 3619UL, 2288UL, 1355UL, 1599UL, 2849UL, 3945UL, 3483UL,
+    1982UL, 3352UL, 1663UL, 2862UL, 3042UL, 4005UL, 2817UL, 3363UL, 1789UL, 3713UL, 1774UL, 889UL,  2752UL, 874UL,
+    1032UL, 23UL,   2759UL, 1795UL, 1021UL, 2041UL, 4094UL, 2328UL, 188UL,  1536UL, 1753UL, 415UL,  2831UL, 281UL,
+    3650UL, 2155UL,
+};
+uint32_t rand_arr_13_b13_w32_arr[1024] = {
+    553UL,  3668UL, 3873UL, 1890UL, 2816UL, 1684UL, 4179UL, 515UL,  4344UL, 3994UL, 4524UL, 3653UL, 6033UL, 4393UL,
+    4038UL, 1237UL, 588UL,  7223UL, 7077UL, 6474UL, 5198UL, 2595UL, 27UL,   2617UL, 3465UL, 6040UL, 3175UL, 2452UL,
+    2611UL, 2847UL, 5903UL, 762UL,  5086UL, 998UL,  8135UL, 775UL,  148UL,  2066UL, 2377UL, 490UL,  4257UL, 6419UL,
+    7960UL, 4218UL, 7398UL, 1508UL, 7703UL, 663UL,  5894UL, 2266UL, 4655UL, 2501UL, 7907UL, 6271UL, 1703UL, 6240UL,
+    3402UL, 2534UL, 6973UL, 1938UL, 6544UL, 1264UL, 2374UL, 6930UL, 5644UL, 4086UL, 6319UL, 4602UL, 7415UL, 8161UL,
+    7968UL, 6608UL, 4964UL, 2418UL, 4188UL, 1285UL, 4516UL, 1282UL, 8148UL, 3148UL, 1081UL, 3199UL, 6847UL, 1675UL,
+    554UL,  1158UL, 930UL,  2318UL, 2659UL, 4156UL, 6171UL, 6254UL, 1331UL, 4105UL, 4903UL, 3126UL, 6096UL, 3048UL,
+    3008UL, 2940UL, 420UL,  7569UL, 2425UL, 6783UL, 2918UL, 6697UL, 1111UL, 4867UL, 1686UL, 3915UL, 1431UL, 6807UL,
+    7040UL, 7755UL, 1265UL, 2811UL, 7430UL, 4893UL, 902UL,  2698UL, 3485UL, 3949UL, 5712UL, 1953UL, 1210UL, 7758UL,
+    77UL,   5413UL, 4484UL, 884UL,  7959UL, 4508UL, 5968UL, 1939UL, 3930UL, 2555UL, 4026UL, 5611UL, 6400UL, 24UL,
+    7546UL, 5937UL, 7665UL, 5157UL, 1249UL, 5908UL, 3584UL, 5135UL, 8165UL, 5547UL, 7484UL, 4435UL, 741UL,  6167UL,
+    4927UL, 2293UL, 1131UL, 3744UL, 7776UL, 1329UL, 2225UL, 6705UL, 4570UL, 4639UL, 499UL,  405UL,  5969UL, 1499UL,
+    4520UL, 6320UL, 4833UL, 1632UL, 2292UL, 8078UL, 4178UL, 6421UL, 1812UL, 3702UL, 6290UL, 2211UL, 662UL,  6947UL,
+    2755UL, 1510UL, 448UL,  1117UL, 4931UL, 930UL,  280UL,  3616UL, 1081UL, 4319UL, 3131UL, 5593UL, 1857UL, 3448UL,
+    677UL,  2940UL, 1920UL, 5110UL, 6278UL, 170UL,  111UL,  7264UL, 6482UL, 5400UL, 5846UL, 4677UL, 6700UL, 7675UL,
+    3092UL, 7117UL, 5662UL, 6029UL, 3327UL, 7827UL, 6901UL, 4975UL, 3982UL, 4160UL, 3130UL, 1941UL, 1303UL, 7845UL,
+    1189UL, 1638UL, 5766UL, 882UL,  8157UL, 492UL,  7028UL, 47UL,   6303UL, 4146UL, 1227UL, 2509UL, 3869UL, 2706UL,
+    5178UL, 5073UL, 2415UL, 1369UL, 4445UL, 7674UL, 4012UL, 6518UL, 2968UL, 4687UL, 6539UL, 3802UL, 3498UL, 7660UL,
+    4255UL, 2329UL, 7052UL, 6244UL, 7478UL, 7624UL, 4199UL, 7124UL, 1878UL, 4991UL, 5301UL, 3012UL, 2743UL, 4561UL,
+    5113UL, 5932UL, 1585UL, 3212UL, 6281UL, 5497UL, 7980UL, 2298UL, 1802UL, 3856UL, 7040UL, 2234UL, 659UL,  6113UL,
+    3756UL, 7808UL, 3839UL, 1849UL, 4669UL, 7606UL, 7339UL, 4730UL, 1106UL, 667UL,  1817UL, 1706UL, 5594UL, 8062UL,
+    5208UL, 1741UL, 904UL,  4853UL, 4668UL, 7508UL, 1038UL, 500UL,  8119UL, 4496UL, 3608UL, 5526UL, 1751UL, 5421UL,
+    4122UL, 6910UL, 3427UL, 2385UL, 3666UL, 3807UL, 5282UL, 3991UL, 6874UL, 1015UL, 2960UL, 7595UL, 6847UL, 5541UL,
+    2532UL, 5165UL, 5066UL, 8047UL, 2243UL, 2656UL, 3633UL, 1296UL, 3442UL, 6772UL, 6972UL, 7964UL, 3001UL, 1036UL,
+    1157UL, 5008UL, 609UL,  2711UL, 5535UL, 470UL,  4036UL, 6147UL, 4126UL, 6564UL, 3608UL, 6661UL, 6832UL, 4512UL,
+    4844UL, 5611UL, 5863UL, 6594UL, 1358UL, 2699UL, 1802UL, 2678UL, 4090UL, 7161UL, 2208UL, 6212UL, 5275UL, 4676UL,
+    584UL,  6561UL, 6258UL, 4575UL, 688UL,  2097UL, 6222UL, 1875UL, 6443UL, 6721UL, 3353UL, 892UL,  1791UL, 3670UL,
+    8080UL, 6816UL, 8156UL, 5900UL, 4902UL, 7925UL, 3873UL, 1960UL, 4776UL, 7446UL, 3969UL, 5263UL, 4235UL, 755UL,
+    7272UL, 3547UL, 7805UL, 3979UL, 4425UL, 5705UL, 4500UL, 8124UL, 117UL,  1196UL, 5097UL, 7974UL, 592UL,  548UL,
+    2125UL, 2842UL, 2708UL, 4282UL, 5821UL, 2480UL, 3777UL, 97UL,   2818UL, 3129UL, 2359UL, 2713UL, 910UL,  1699UL,
+    1867UL, 7275UL, 1090UL, 2831UL, 5085UL, 3429UL, 7432UL, 2174UL, 3819UL, 7732UL, 3517UL, 1711UL, 1496UL, 6719UL,
+    5726UL, 6549UL, 4974UL, 5938UL, 1342UL, 1476UL, 3480UL, 5183UL, 5305UL, 6743UL, 5075UL, 6460UL, 5614UL, 4226UL,
+    6449UL, 3445UL, 5745UL, 5962UL, 2869UL, 4472UL, 62UL,   754UL,  6726UL, 5236UL, 3832UL, 976UL,  1238UL, 3895UL,
+    3620UL, 3260UL, 5149UL, 6720UL, 2976UL, 7393UL, 5693UL, 6742UL, 3670UL, 5658UL, 6336UL, 7437UL, 124UL,  414UL,
+    1769UL, 2092UL, 6536UL, 6174UL, 6325UL, 3050UL, 7086UL, 8046UL, 5001UL, 5726UL, 1514UL, 5071UL, 3882UL, 2807UL,
+    3361UL, 7639UL, 4823UL, 1258UL, 3931UL, 7795UL, 3039UL, 3569UL, 5882UL, 7551UL, 7324UL, 7677UL, 1800UL, 5555UL,
+    2649UL, 7385UL, 1737UL, 6331UL, 2846UL, 1296UL, 4247UL, 7536UL, 7755UL, 7769UL, 1288UL, 784UL,  253UL,  7099UL,
+    4241UL, 8157UL, 3478UL, 6953UL, 752UL,  5548UL, 7074UL, 2118UL, 6516UL, 3448UL, 8105UL, 1657UL, 5534UL, 5318UL,
+    6466UL, 4761UL, 6963UL, 5927UL, 246UL,  5808UL, 7706UL, 1777UL, 6890UL, 366UL,  1094UL, 4151UL, 2348UL, 692UL,
+    1875UL, 4194UL, 4300UL, 2526UL, 6865UL, 485UL,  3890UL, 6597UL, 3188UL, 6719UL, 5980UL, 2466UL, 1851UL, 5877UL,
+    840UL,  4641UL, 7221UL, 2226UL, 696UL,  273UL,  7870UL, 7238UL, 1849UL, 6781UL, 7810UL, 2198UL, 5395UL, 5367UL,
+    2206UL, 7778UL, 6240UL, 5539UL, 4256UL, 5243UL, 5175UL, 3699UL, 1137UL, 4596UL, 1224UL, 4497UL, 6753UL, 3890UL,
+    4382UL, 775UL,  63UL,   4446UL, 6078UL, 6731UL, 260UL,  156UL,  6592UL, 1113UL, 3609UL, 4611UL, 7965UL, 3720UL,
+    5076UL, 5324UL, 3337UL, 5835UL, 1682UL, 3456UL, 6669UL, 5126UL, 3765UL, 5831UL, 788UL,  2410UL, 2531UL, 7159UL,
+    3369UL, 3369UL, 176UL,  198UL,  4434UL, 1743UL, 4518UL, 3184UL, 3655UL, 190UL,  4089UL, 609UL,  5495UL, 3907UL,
+    2715UL, 1836UL, 3001UL, 2267UL, 1478UL, 8045UL, 7104UL, 355UL,  2177UL, 7910UL, 6495UL, 1911UL, 5411UL, 6541UL,
+    1840UL, 5688UL, 6641UL, 1035UL, 6593UL, 2895UL, 5126UL, 7906UL, 1401UL, 4652UL, 559UL,  7124UL, 5768UL, 133UL,
+    3224UL, 381UL,  4267UL, 4173UL, 1619UL, 1785UL, 3867UL, 1051UL, 5050UL, 1793UL, 7226UL, 1219UL, 4560UL, 4428UL,
+    2213UL, 7484UL, 3213UL, 3871UL, 2123UL, 7171UL, 3793UL, 449UL,  2170UL, 1937UL, 3634UL, 5086UL, 7765UL, 3241UL,
+    5685UL, 2843UL, 7786UL, 584UL,  535UL,  871UL,  890UL,  4450UL, 6893UL, 5655UL, 3503UL, 2644UL, 6274UL, 3287UL,
+    5758UL, 1995UL, 5440UL, 2437UL, 5185UL, 7227UL, 1132UL, 7111UL, 5248UL, 632UL,  6773UL, 4046UL, 3902UL, 3565UL,
+    3357UL, 3370UL, 1678UL, 293UL,  4896UL, 5565UL, 1385UL, 7430UL, 4678UL, 2784UL, 1630UL, 7400UL, 6888UL, 3015UL,
+    260UL,  5251UL, 5916UL, 6680UL, 323UL,  7991UL, 682UL,  8125UL, 3780UL, 2924UL, 7263UL, 4440UL, 6631UL, 591UL,
+    126UL,  3382UL, 5417UL, 2063UL, 2929UL, 7408UL, 218UL,  3693UL, 3892UL, 7549UL, 7139UL, 3984UL, 1235UL, 200UL,
+    4555UL, 2400UL, 1374UL, 2422UL, 4044UL, 7846UL, 4605UL, 1401UL, 7552UL, 4920UL, 4618UL, 2571UL, 889UL,  1159UL,
+    302UL,  1712UL, 5905UL, 7909UL, 2019UL, 2244UL, 897UL,  2716UL, 207UL,  0UL,    2937UL, 2730UL, 2738UL, 7552UL,
+    4127UL, 4063UL, 4336UL, 6721UL, 4081UL, 5714UL, 1123UL, 6845UL, 6768UL, 3857UL, 3462UL, 4617UL, 4686UL, 3586UL,
+    4386UL, 978UL,  1723UL, 3236UL, 2868UL, 5160UL, 8141UL, 5834UL, 556UL,  8017UL, 4675UL, 5413UL, 780UL,  7751UL,
+    5437UL, 568UL,  4446UL, 6235UL, 1941UL, 6517UL, 1085UL, 1493UL, 854UL,  5224UL, 2409UL, 4381UL, 7195UL, 4817UL,
+    6282UL, 7087UL, 5458UL, 3314UL, 3138UL, 694UL,  6824UL, 3242UL, 2465UL, 4542UL, 3996UL, 8037UL, 6707UL, 7977UL,
+    7912UL, 6254UL, 4207UL, 2653UL, 2995UL, 5278UL, 1939UL, 3281UL, 4927UL, 1047UL, 7961UL, 5495UL, 7453UL, 4334UL,
+    4287UL, 2693UL, 3672UL, 5347UL, 7148UL, 4131UL, 3963UL, 5892UL, 3540UL, 1534UL, 464UL,  4137UL, 441UL,  501UL,
+    5841UL, 3593UL, 5507UL, 8145UL, 7470UL, 7340UL, 3189UL, 785UL,  4696UL, 1043UL, 633UL,  1456UL, 7100UL, 5670UL,
+    2126UL, 2127UL, 6079UL, 2682UL, 8072UL, 583UL,  5289UL, 1891UL, 5962UL, 4279UL, 7284UL, 400UL,  4686UL, 4066UL,
+    6147UL, 4289UL, 5618UL, 8084UL, 2892UL, 7502UL, 6414UL, 6766UL, 7410UL, 5263UL, 7322UL, 2619UL, 4807UL, 2028UL,
+    7568UL, 6303UL, 1995UL, 5309UL, 8144UL, 4706UL, 3279UL, 7666UL, 3373UL, 7386UL, 6316UL, 3347UL, 4783UL, 7924UL,
+    2006UL, 4741UL, 6494UL, 4132UL, 7303UL, 1125UL, 6479UL, 550UL,  5788UL, 1161UL, 5159UL, 705UL,  1131UL, 6080UL,
+    4429UL, 613UL,  2714UL, 7853UL, 1755UL, 2147UL, 3905UL, 3800UL, 3133UL, 4701UL, 4104UL, 624UL,  3910UL, 2394UL,
+    5569UL, 5116UL, 5507UL, 731UL,  5330UL, 1641UL, 4088UL, 1853UL, 1959UL, 1915UL, 3867UL, 3785UL, 4628UL, 6170UL,
+    6460UL, 8063UL, 2635UL, 1580UL, 3408UL, 7556UL, 2915UL, 7788UL, 2500UL, 3027UL, 1479UL, 3796UL, 5449UL, 3373UL,
+    2603UL, 3342UL, 6035UL, 7318UL, 6058UL, 2743UL, 614UL,  5414UL, 7184UL, 6510UL, 7323UL, 1708UL, 3323UL, 3768UL,
+    400UL,  7919UL, 2215UL, 4409UL, 5027UL, 2713UL, 3762UL, 5986UL, 7451UL, 7137UL, 857UL,  4134UL, 786UL,  64UL,
+    2757UL, 5130UL, 7221UL, 2992UL, 7081UL, 588UL,  6052UL, 3680UL, 4468UL, 4263UL, 3030UL, 7274UL, 6474UL, 6359UL,
+    6140UL, 600UL,
+};
+uint32_t rand_arr_14_b14_w32_arr[1024] = {
+    13474UL, 6737UL,  14613UL, 2771UL,  4042UL,  4373UL,  15092UL, 7969UL,  9976UL,  6253UL,  11568UL, 11855UL, 13317UL,
+    8061UL,  1434UL,  3551UL,  7623UL,  6324UL,  13589UL, 192UL,   896UL,   11104UL, 9513UL,  9675UL,  5919UL,  11831UL,
+    6235UL,  11330UL, 2412UL,  5609UL,  1729UL,  15626UL, 2243UL,  12862UL, 5129UL,  12550UL, 5230UL,  9916UL,  2864UL,
+    3616UL,  13249UL, 9188UL,  14464UL, 5511UL,  2759UL,  12191UL, 2526UL,  3584UL,  9889UL,  6600UL,  2864UL,  6856UL,
+    1616UL,  6148UL,  10368UL, 8631UL,  5127UL,  14748UL, 3291UL,  583UL,   12036UL, 1366UL,  4049UL,  2072UL,  15657UL,
+    15734UL, 15536UL, 6770UL,  10822UL, 1493UL,  2728UL,  2911UL,  11488UL, 12571UL, 14262UL, 249UL,   4993UL,  6183UL,
+    182UL,   8790UL,  2200UL,  8462UL,  12732UL, 11225UL, 14799UL, 5232UL,  5524UL,  5818UL,  14589UL, 6918UL,  15984UL,
+    5959UL,  597UL,   4848UL,  5912UL,  1164UL,  10828UL, 12763UL, 10724UL, 5817UL,  14731UL, 1667UL,  8651UL,  978UL,
+    7313UL,  11105UL, 2770UL,  9235UL,  5582UL,  5457UL,  10273UL, 8897UL,  10427UL, 16213UL, 7085UL,  12891UL, 4148UL,
+    5786UL,  1801UL,  5634UL,  1623UL,  626UL,   9452UL,  464UL,   2729UL,  9066UL,  12680UL, 6775UL,  16158UL, 13580UL,
+    2916UL,  11027UL, 3033UL,  10390UL, 9213UL,  14650UL, 8154UL,  2090UL,  12300UL, 2083UL,  15930UL, 6445UL,  1410UL,
+    5830UL,  2300UL,  4771UL,  9231UL,  14966UL, 13295UL, 812UL,   9957UL,  7350UL,  13751UL, 15176UL, 11009UL, 9764UL,
+    10583UL, 3851UL,  11078UL, 13748UL, 4763UL,  15921UL, 7930UL,  10570UL, 7855UL,  6292UL,  16362UL, 6696UL,  6526UL,
+    2272UL,  2821UL,  12644UL, 9835UL,  11606UL, 11742UL, 16183UL, 12574UL, 13679UL, 16061UL, 14983UL, 8306UL,  4041UL,
+    1010UL,  7228UL,  15114UL, 9302UL,  6274UL,  7131UL,  568UL,   14373UL, 6053UL,  2420UL,  7029UL,  3351UL,  13964UL,
+    9759UL,  16247UL, 12626UL, 14810UL, 4438UL,  2493UL,  8647UL,  12877UL, 8904UL,  12549UL, 14184UL, 7089UL,  12496UL,
+    8600UL,  8731UL,  9323UL,  10149UL, 13392UL, 14684UL, 6754UL,  3978UL,  11848UL, 2450UL,  14124UL, 4793UL,  7233UL,
+    9633UL,  15015UL, 6863UL,  560UL,   7524UL,  16313UL, 11799UL, 12182UL, 862UL,   12797UL, 14484UL, 2308UL,  11733UL,
+    12031UL, 10915UL, 12734UL, 14141UL, 1445UL,  4498UL,  126UL,   7567UL,  12188UL, 15824UL, 14363UL, 11423UL, 9480UL,
+    12118UL, 8421UL,  8866UL,  8900UL,  6562UL,  15148UL, 1269UL,  9962UL,  2847UL,  1588UL,  9972UL,  6368UL,  14812UL,
+    3236UL,  11096UL, 1185UL,  11516UL, 3259UL,  7086UL,  9821UL,  8847UL,  9750UL,  15218UL, 3729UL,  15210UL, 15016UL,
+    2023UL,  9936UL,  10129UL, 8623UL,  1636UL,  2674UL,  4518UL,  14200UL, 3316UL,  6484UL,  16164UL, 4961UL,  1457UL,
+    2053UL,  6012UL,  13773UL, 181UL,   11201UL, 8748UL,  7184UL,  12306UL, 2690UL,  12943UL, 2350UL,  9699UL,  168UL,
+    13380UL, 9915UL,  3718UL,  12711UL, 15945UL, 2313UL,  15859UL, 1186UL,  5085UL,  12408UL, 9231UL,  12798UL, 5422UL,
+    2763UL,  611UL,   9439UL,  7223UL,  3840UL,  14777UL, 14661UL, 13885UL, 1885UL,  54UL,    2104UL,  9942UL,  5325UL,
+    16125UL, 3102UL,  4095UL,  4008UL,  7399UL,  14458UL, 11897UL, 4452UL,  7519UL,  3430UL,  10975UL, 10387UL, 8303UL,
+    2202UL,  5905UL,  14273UL, 9229UL,  6752UL,  10872UL, 7467UL,  1041UL,  11117UL, 12622UL, 1532UL,  14335UL, 1941UL,
+    9641UL,  12640UL, 10241UL, 2524UL,  1809UL,  15548UL, 10213UL, 5996UL,  2133UL,  13528UL, 2524UL,  14407UL, 14796UL,
+    1202UL,  10316UL, 7964UL,  5637UL,  10368UL, 7260UL,  14433UL, 2142UL,  13648UL, 15405UL, 15735UL, 16133UL, 5838UL,
+    1297UL,  5294UL,  2359UL,  13179UL, 15652UL, 1921UL,  3947UL,  8815UL,  8508UL,  3429UL,  3821UL,  2171UL,  6457UL,
+    6408UL,  7440UL,  1227UL,  14098UL, 3298UL,  6751UL,  4065UL,  1129UL,  4594UL,  12727UL, 1872UL,  13743UL, 4868UL,
+    3023UL,  10859UL, 3143UL,  2750UL,  2680UL,  7348UL,  13985UL, 15732UL, 16312UL, 12367UL, 14423UL, 10858UL, 14278UL,
+    11789UL, 12209UL, 14167UL, 13975UL, 15817UL, 35UL,    15574UL, 13729UL, 4019UL,  15562UL, 5252UL,  3893UL,  6239UL,
+    13369UL, 60UL,    6666UL,  3624UL,  13774UL, 8756UL,  1805UL,  11064UL, 10179UL, 2208UL,  9439UL,  10668UL, 5125UL,
+    3193UL,  9293UL,  7209UL,  4293UL,  5095UL,  12044UL, 2683UL,  12083UL, 6128UL,  8083UL,  10668UL, 15172UL, 6170UL,
+    2690UL,  11110UL, 11415UL, 718UL,   1722UL,  10516UL, 14954UL, 4318UL,  7845UL,  2967UL,  14937UL, 14672UL, 7202UL,
+    3588UL,  13573UL, 4019UL,  2395UL,  6126UL,  12090UL, 12034UL, 460UL,   10704UL, 3996UL,  12120UL, 3883UL,  12159UL,
+    15302UL, 16309UL, 4170UL,  15688UL, 4329UL,  13999UL, 7434UL,  14795UL, 1677UL,  16116UL, 16136UL, 12193UL, 9634UL,
+    12553UL, 6392UL,  390UL,   4512UL,  9044UL,  9377UL,  1181UL,  4513UL,  5602UL,  13827UL, 9536UL,  7531UL,  10112UL,
+    4716UL,  13026UL, 12356UL, 4128UL,  4589UL,  5768UL,  10332UL, 9923UL,  13378UL, 6716UL,  8461UL,  11561UL, 3326UL,
+    4330UL,  7868UL,  2324UL,  8231UL,  10869UL, 2621UL,  11833UL, 809UL,   1897UL,  2841UL,  10692UL, 8905UL,  16108UL,
+    2831UL,  1148UL,  200UL,   14467UL, 10624UL, 343UL,   51UL,    11700UL, 14808UL, 6121UL,  6462UL,  3027UL,  10452UL,
+    11007UL, 8710UL,  11082UL, 5761UL,  4058UL,  10260UL, 9401UL,  4634UL,  2488UL,  2804UL,  8664UL,  1321UL,  15002UL,
+    13074UL, 1864UL,  3733UL,  7126UL,  16276UL, 13306UL, 3293UL,  7886UL,  9626UL,  11300UL, 14375UL, 15880UL, 2619UL,
+    5342UL,  1225UL,  11024UL, 6075UL,  1384UL,  13659UL, 12544UL, 10652UL, 15647UL, 16295UL, 14109UL, 13090UL, 12310UL,
+    2585UL,  2316UL,  12000UL, 6702UL,  8418UL,  15540UL, 14352UL, 1439UL,  12574UL, 10882UL, 7342UL,  7914UL,  7148UL,
+    2249UL,  1831UL,  9759UL,  2232UL,  10398UL, 13764UL, 1315UL,  1679UL,  12704UL, 11036UL, 9413UL,  14668UL, 3496UL,
+    9258UL,  566UL,   15963UL, 6934UL,  7014UL,  14810UL, 947UL,   1679UL,  12103UL, 1881UL,  16135UL, 11511UL, 9009UL,
+    1846UL,  14199UL, 15646UL, 16105UL, 14994UL, 16125UL, 1158UL,  6309UL,  3595UL,  2900UL,  6309UL,  3094UL,  1077UL,
+    6895UL,  10327UL, 5883UL,  5618UL,  7544UL,  614UL,   12410UL, 4673UL,  3736UL,  1169UL,  15014UL, 6027UL,  5552UL,
+    9381UL,  10525UL, 10840UL, 3909UL,  11601UL, 5229UL,  14759UL, 11097UL, 4196UL,  16292UL, 15223UL, 6612UL,  11227UL,
+    13712UL, 12614UL, 13013UL, 12821UL, 15235UL, 10737UL, 2977UL,  1348UL,  651UL,   1811UL,  7775UL,  11650UL, 10804UL,
+    15723UL, 2209UL,  14288UL, 8844UL,  7643UL,  11515UL, 11678UL, 14128UL, 192UL,   12919UL, 14879UL, 5761UL,  13560UL,
+    7547UL,  7084UL,  3299UL,  1217UL,  159UL,   3841UL,  11515UL, 13197UL, 5410UL,  3674UL,  5444UL,  15457UL, 7355UL,
+    9914UL,  7644UL,  11601UL, 6994UL,  1064UL,  3073UL,  9886UL,  1416UL,  15971UL, 7627UL,  2824UL,  10918UL, 928UL,
+    5296UL,  561UL,   6903UL,  16077UL, 4792UL,  7977UL,  15017UL, 1413UL,  9625UL,  16197UL, 11006UL, 7786UL,  15381UL,
+    9288UL,  15740UL, 11128UL, 12138UL, 9495UL,  4983UL,  2104UL,  5601UL,  6812UL,  6684UL,  10733UL, 4829UL,  6803UL,
+    10866UL, 16110UL, 13890UL, 1923UL,  11082UL, 15842UL, 14762UL, 12275UL, 3834UL,  9363UL,  14484UL, 8794UL,  2472UL,
+    12963UL, 8578UL,  5681UL,  15149UL, 10167UL, 8533UL,  5214UL,  7696UL,  5724UL,  12549UL, 13769UL, 4834UL,  6905UL,
+    7721UL,  12427UL, 3034UL,  3640UL,  2067UL,  867UL,   9385UL,  13628UL, 9168UL,  4065UL,  2512UL,  10871UL, 8823UL,
+    12149UL, 12532UL, 10439UL, 13025UL, 2508UL,  6397UL,  5478UL,  8687UL,  7927UL,  15479UL, 10195UL, 3343UL,  7923UL,
+    6173UL,  5972UL,  4095UL,  13906UL, 11172UL, 7664UL,  15970UL, 14538UL, 8893UL,  14861UL, 1237UL,  5715UL,  5685UL,
+    14857UL, 80UL,    12566UL, 1815UL,  13072UL, 14603UL, 6838UL,  8826UL,  9835UL,  13398UL, 10042UL, 15479UL, 11441UL,
+    10965UL, 8090UL,  14361UL, 79UL,    11466UL, 15761UL, 57UL,    11500UL, 14595UL, 15849UL, 221UL,   12068UL, 4723UL,
+    9774UL,  2892UL,  14615UL, 1738UL,  5089UL,  2002UL,  3915UL,  8673UL,  897UL,   614UL,   1272UL,  2447UL,  11280UL,
+    4389UL,  13202UL, 2699UL,  7025UL,  15520UL, 8134UL,  12534UL, 5139UL,  10125UL, 9795UL,  3050UL,  7722UL,  15001UL,
+    343UL,   14684UL, 5603UL,  911UL,   5285UL,  11673UL, 4593UL,  6136UL,  15514UL, 3620UL,  5113UL,  6017UL,  3120UL,
+    1938UL,  14218UL, 9866UL,  13904UL, 330UL,   7698UL,  11085UL, 16132UL, 14779UL, 3988UL,  15997UL, 10027UL, 5085UL,
+    5917UL,  6641UL,  123UL,   15448UL, 5879UL,  14398UL, 11307UL, 1004UL,  10457UL, 8551UL,  13259UL, 11387UL, 11430UL,
+    5120UL,  6110UL,  7627UL,  11233UL, 11267UL, 12359UL, 4123UL,  14956UL, 5762UL,  13347UL, 260UL,   7896UL,  16122UL,
+    1881UL,  2914UL,  10118UL, 1578UL,  6214UL,  4978UL,  3580UL,  9302UL,  12375UL, 12187UL, 10188UL, 10709UL, 13955UL,
+    14142UL, 6411UL,  4185UL,  6557UL,  8020UL,  7775UL,  1995UL,  12913UL, 15667UL, 12923UL, 7224UL,  11027UL, 13628UL,
+    1044UL,  60UL,    14264UL, 16140UL, 15199UL, 5524UL,  60UL,    1151UL,  6472UL,  198UL,   5813UL,  2882UL,  3606UL,
+    7449UL,  2453UL,  12225UL, 9040UL,  4620UL,  11130UL, 7188UL,  2007UL,  12148UL, 8397UL,  1123UL,  12868UL, 10298UL,
+    3008UL,  3619UL,  12672UL, 2467UL,  2813UL,  111UL,   13881UL, 70UL,    12306UL, 3168UL,  6728UL,  11844UL, 10336UL,
+    2898UL,  6345UL,  4746UL,  2948UL,  6804UL,  16356UL, 8556UL,  59UL,    3927UL,  9098UL,  13142UL, 7599UL,  2307UL,
+    12889UL, 12531UL, 13245UL, 9171UL,  4067UL,  2318UL,  1015UL,  7454UL,  14553UL, 3229UL,  12741UL, 14857UL, 14801UL,
+    14885UL, 810UL,   3768UL,  1220UL,  185UL,   5321UL,  4513UL,  12117UL, 12159UL, 15984UL, 3986UL,  3386UL,  5174UL,
+    3883UL,  4396UL,  8985UL,  13435UL, 10928UL, 15108UL, 14165UL, 13182UL, 14627UL, 5837UL,
+};
+uint32_t rand_arr_15_b15_w32_arr[1024] = {
+    27070UL, 28090UL, 7727UL,  8783UL,  12889UL, 6967UL,  17977UL, 5032UL,  14768UL, 13768UL, 15271UL, 6046UL,  23504UL,
+    5477UL,  18437UL, 19183UL, 17646UL, 21367UL, 14965UL, 5038UL,  6986UL,  7752UL,  18562UL, 11921UL, 6572UL,  30065UL,
+    29868UL, 8035UL,  25799UL, 17025UL, 21026UL, 5051UL,  30663UL, 23856UL, 10313UL, 10097UL, 29396UL, 25096UL, 918UL,
+    8442UL,  4646UL,  2114UL,  18171UL, 5594UL,  17892UL, 16257UL, 3785UL,  30120UL, 13263UL, 19923UL, 18852UL, 22408UL,
+    9328UL,  5573UL,  28776UL, 20182UL, 21751UL, 26209UL, 30633UL, 5143UL,  23928UL, 30026UL, 14680UL, 22530UL, 31777UL,
+    27183UL, 1305UL,  3205UL,  10576UL, 22900UL, 9926UL,  2984UL,  13115UL, 14357UL, 32563UL, 19093UL, 25504UL, 6071UL,
+    14655UL, 12317UL, 590UL,   23515UL, 29884UL, 1484UL,  8098UL,  24215UL, 31526UL, 18380UL, 5852UL,  32106UL, 2617UL,
+    15600UL, 23323UL, 10383UL, 21552UL, 6077UL,  18959UL, 14518UL, 22364UL, 13577UL, 7463UL,  28548UL, 9913UL,  9411UL,
+    8238UL,  15056UL, 30655UL, 15799UL, 17435UL, 25349UL, 8096UL,  9847UL,  23576UL, 28400UL, 26649UL, 10115UL, 25762UL,
+    12852UL, 10818UL, 25465UL, 2117UL,  13293UL, 19098UL, 24920UL, 1472UL,  1116UL,  19556UL, 20840UL, 18555UL, 29464UL,
+    5894UL,  14522UL, 19819UL, 3143UL,  7460UL,  5180UL,  20473UL, 12912UL, 25062UL, 6617UL,  4102UL,  22579UL, 29603UL,
+    25498UL, 2871UL,  21382UL, 12020UL, 28621UL, 20510UL, 30023UL, 15583UL, 25240UL, 12046UL, 25836UL, 2909UL,  26245UL,
+    10481UL, 28447UL, 14065UL, 30109UL, 16045UL, 29995UL, 4005UL,  18695UL, 12674UL, 12520UL, 25964UL, 14571UL, 11340UL,
+    27924UL, 8588UL,  16173UL, 1565UL,  29894UL, 13737UL, 4089UL,  7987UL,  29565UL, 28797UL, 14235UL, 16555UL, 22663UL,
+    29759UL, 26584UL, 23968UL, 25986UL, 9518UL,  3355UL,  22953UL, 20548UL, 17332UL, 12816UL, 18220UL, 13497UL, 3755UL,
+    5703UL,  20540UL, 13289UL, 23585UL, 24525UL, 21820UL, 23222UL, 29991UL, 6999UL,  5050UL,  26434UL, 26968UL, 7003UL,
+    29405UL, 20960UL, 6259UL,  21923UL, 16930UL, 31075UL, 637UL,   23309UL, 12973UL, 12598UL, 1691UL,  27748UL, 9663UL,
+    8555UL,  32592UL, 2691UL,  1007UL,  18889UL, 11883UL, 32637UL, 22117UL, 11679UL, 31122UL, 1859UL,  29423UL, 24300UL,
+    8585UL,  11477UL, 17085UL, 7780UL,  22333UL, 19390UL, 24288UL, 32571UL, 10341UL, 12104UL, 9953UL,  22986UL, 19336UL,
+    3949UL,  24533UL, 2861UL,  18582UL, 30932UL, 4637UL,  29132UL, 32194UL, 25770UL, 23165UL, 4129UL,  28772UL, 2904UL,
+    29323UL, 1163UL,  14312UL, 4580UL,  16517UL, 19943UL, 9981UL,  18929UL, 4836UL,  14681UL, 12380UL, 22542UL, 11831UL,
+    8245UL,  20705UL, 21297UL, 31902UL, 21971UL, 17680UL, 6504UL,  29114UL, 20065UL, 11680UL, 2756UL,  28063UL, 9801UL,
+    18043UL, 32042UL, 2164UL,  23472UL, 19559UL, 22692UL, 4999UL,  25048UL, 27240UL, 2223UL,  22428UL, 405UL,   22879UL,
+    870UL,   26630UL, 2287UL,  24949UL, 12697UL, 32579UL, 15082UL, 24108UL, 31649UL, 26124UL, 24885UL, 11194UL, 8558UL,
+    3790UL,  14803UL, 10154UL, 11725UL, 6273UL,  23410UL, 28272UL, 29596UL, 22748UL, 27723UL, 31518UL, 8939UL,  32421UL,
+    5434UL,  26126UL, 28213UL, 28468UL, 16074UL, 3505UL,  20171UL, 30026UL, 20800UL, 10648UL, 6310UL,  14984UL, 31695UL,
+    32016UL, 12732UL, 1266UL,  20007UL, 21377UL, 10477UL, 9764UL,  8719UL,  29605UL, 25888UL, 13633UL, 16014UL, 286UL,
+    9378UL,  7500UL,  25388UL, 29257UL, 21603UL, 28262UL, 21421UL, 23876UL, 20173UL, 27083UL, 28847UL, 1567UL,  6505UL,
+    26765UL, 18841UL, 4325UL,  30480UL, 18194UL, 30656UL, 31620UL, 18001UL, 10875UL, 9073UL,  25834UL, 24395UL, 440UL,
+    14165UL, 20793UL, 22183UL, 8045UL,  3815UL,  9411UL,  5588UL,  7662UL,  18962UL, 2812UL,  11974UL, 18963UL, 21589UL,
+    15113UL, 31257UL, 16786UL, 8380UL,  21823UL, 30386UL, 14985UL, 22029UL, 31308UL, 17669UL, 20655UL, 18953UL, 14811UL,
+    4684UL,  1096UL,  10513UL, 2621UL,  10860UL, 5766UL,  7495UL,  9063UL,  25646UL, 27271UL, 955UL,   29655UL, 20681UL,
+    7860UL,  25694UL, 18366UL, 14743UL, 10926UL, 813UL,   13136UL, 14508UL, 30139UL, 19119UL, 15419UL, 12397UL, 26819UL,
+    25856UL, 2849UL,  25442UL, 668UL,   23419UL, 24877UL, 29452UL, 4943UL,  21483UL, 18989UL, 4862UL,  27714UL, 29194UL,
+    1987UL,  14427UL, 15612UL, 4369UL,  537UL,   24176UL, 6210UL,  5217UL,  22226UL, 16361UL, 21886UL, 21990UL, 17423UL,
+    25730UL, 21856UL, 6371UL,  28100UL, 6999UL,  6136UL,  19278UL, 7213UL,  1719UL,  10199UL, 27129UL, 14199UL, 8993UL,
+    8014UL,  16560UL, 28742UL, 24789UL, 10453UL, 26265UL, 9580UL,  30027UL, 19267UL, 2419UL,  4873UL,  19083UL, 22513UL,
+    30408UL, 9159UL,  23826UL, 5865UL,  17466UL, 29380UL, 6077UL,  10428UL, 2846UL,  28030UL, 22094UL, 16843UL, 11402UL,
+    30531UL, 23219UL, 32143UL, 3998UL,  28924UL, 21904UL, 13734UL, 32636UL, 29718UL, 19470UL, 3904UL,  28594UL, 28480UL,
+    19926UL, 3418UL,  28550UL, 5443UL,  10925UL, 24278UL, 1927UL,  28937UL, 13418UL, 12218UL, 1759UL,  1547UL,  17825UL,
+    12056UL, 24262UL, 11580UL, 8036UL,  14165UL, 31494UL, 23107UL, 10510UL, 4363UL,  20414UL, 2803UL,  22193UL, 27730UL,
+    27167UL, 27818UL, 8300UL,  28319UL, 18680UL, 18790UL, 26010UL, 29182UL, 25569UL, 6436UL,  23300UL, 4801UL,  23021UL,
+    26325UL, 13324UL, 5352UL,  5622UL,  27041UL, 29796UL, 30610UL, 16306UL, 11956UL, 9597UL,  28321UL, 14223UL, 27029UL,
+    5532UL,  13133UL, 6302UL,  1610UL,  4471UL,  19443UL, 15482UL, 25421UL, 4120UL,  24492UL, 15843UL, 19920UL, 8311UL,
+    10406UL, 5573UL,  3673UL,  29999UL, 8934UL,  24215UL, 9676UL,  24996UL, 24654UL, 17834UL, 30209UL, 19058UL, 14610UL,
+    21089UL, 22530UL, 7611UL,  16190UL, 4884UL,  25760UL, 2233UL,  11619UL, 9599UL,  26720UL, 5605UL,  32275UL, 12401UL,
+    13233UL, 8752UL,  26594UL, 9196UL,  2463UL,  25832UL, 1576UL,  32489UL, 21140UL, 22976UL, 13286UL, 13928UL, 14664UL,
+    18783UL, 14187UL, 11571UL, 2262UL,  26983UL, 10837UL, 26370UL, 152UL,   20959UL, 28025UL, 6133UL,  5982UL,  3032UL,
+    11215UL, 29407UL, 25165UL, 14217UL, 32566UL, 3680UL,  23190UL, 18116UL, 24445UL, 3568UL,  6399UL,  13505UL, 4054UL,
+    6977UL,  22614UL, 10423UL, 29707UL, 22461UL, 25770UL, 14529UL, 14055UL, 32683UL, 20526UL, 3991UL,  17455UL, 13772UL,
+    28349UL, 10131UL, 29048UL, 30213UL, 8001UL,  20903UL, 3360UL,  13951UL, 24869UL, 10066UL, 17091UL, 26420UL, 13880UL,
+    11339UL, 5376UL,  11510UL, 6716UL,  32527UL, 30288UL, 20251UL, 27169UL, 26419UL, 28618UL, 642UL,   31893UL, 29916UL,
+    29349UL, 14668UL, 7044UL,  18281UL, 3135UL,  7594UL,  12110UL, 27873UL, 11311UL, 20595UL, 24522UL, 8778UL,  20269UL,
+    3787UL,  30475UL, 28390UL, 25455UL, 4612UL,  19714UL, 16765UL, 3860UL,  2898UL,  16919UL, 23837UL, 10480UL, 24486UL,
+    7564UL,  9769UL,  25728UL, 5580UL,  30912UL, 27039UL, 29984UL, 23498UL, 23950UL, 3499UL,  16146UL, 2046UL,  15833UL,
+    2579UL,  32376UL, 14797UL, 5406UL,  28986UL, 19547UL, 30835UL, 26030UL, 31023UL, 16686UL, 14412UL, 10714UL, 12810UL,
+    3261UL,  31317UL, 26521UL, 8489UL,  5858UL,  2971UL,  28783UL, 20768UL, 14733UL, 8752UL,  20295UL, 28725UL, 3814UL,
+    27230UL, 3324UL,  26098UL, 1839UL,  13320UL, 1525UL,  21089UL, 9691UL,  10847UL, 8322UL,  5975UL,  31589UL, 19348UL,
+    24957UL, 2262UL,  27506UL, 27958UL, 9784UL,  23839UL, 31681UL, 21912UL, 508UL,   2159UL,  28133UL, 26111UL, 21436UL,
+    10584UL, 24989UL, 2290UL,  26826UL, 26855UL, 31854UL, 15663UL, 21698UL, 13736UL, 17238UL, 3193UL,  27210UL, 11771UL,
+    5186UL,  21326UL, 31359UL, 13540UL, 21456UL, 974UL,   5288UL,  16272UL, 9294UL,  13077UL, 24913UL, 4774UL,  1198UL,
+    19175UL, 12799UL, 21055UL, 25796UL, 18861UL, 23198UL, 25034UL, 2865UL,  23012UL, 21028UL, 18719UL, 3410UL,  17664UL,
+    20188UL, 12624UL, 6384UL,  27998UL, 15402UL, 21326UL, 26014UL, 10527UL, 13334UL, 7253UL,  8223UL,  8368UL,  7484UL,
+    20579UL, 15383UL, 14442UL, 19780UL, 3085UL,  21860UL, 14783UL, 7796UL,  11738UL, 24710UL, 9393UL,  11846UL, 19481UL,
+    19200UL, 2413UL,  17595UL, 22785UL, 14867UL, 4042UL,  8118UL,  32107UL, 27199UL, 2581UL,  1918UL,  6482UL,  1581UL,
+    27581UL, 30715UL, 10252UL, 20454UL, 29914UL, 11470UL, 17540UL, 12004UL, 31601UL, 12513UL, 20189UL, 1757UL,  8628UL,
+    5738UL,  32758UL, 9496UL,  25607UL, 888UL,   6310UL,  28536UL, 2630UL,  16752UL, 28443UL, 1593UL,  14059UL, 7168UL,
+    1977UL,  12156UL, 24794UL, 24507UL, 20184UL, 20103UL, 30134UL, 7557UL,  8759UL,  12753UL, 16481UL, 3700UL,  18759UL,
+    18654UL, 29350UL, 8604UL,  1028UL,  29551UL, 24488UL, 14691UL, 20628UL, 30760UL, 14783UL, 21245UL, 18704UL, 26538UL,
+    31585UL, 2601UL,  13680UL, 8761UL,  14703UL, 9312UL,  27153UL, 9089UL,  17820UL, 18627UL, 1927UL,  29418UL, 4861UL,
+    28646UL, 5729UL,  14432UL, 6539UL,  3580UL,  20940UL, 8431UL,  7500UL,  22716UL, 5831UL,  10790UL, 10611UL, 6920UL,
+    32643UL, 26523UL, 5379UL,  27989UL, 8347UL,  2070UL,  5712UL,  24412UL, 26463UL, 9203UL,  26168UL, 10262UL, 22859UL,
+    22140UL, 9957UL,  24488UL, 7169UL,  8717UL,  3819UL,  29950UL, 19361UL, 31807UL, 32008UL, 1151UL,  28549UL, 15658UL,
+    18262UL, 13915UL, 8476UL,  25013UL, 9756UL,  3249UL,  10170UL, 13500UL, 9496UL,  7179UL,  26346UL, 13152UL, 21584UL,
+    5600UL,  8661UL,  12326UL, 31974UL, 9511UL,  18582UL, 10060UL, 14696UL, 12729UL, 5186UL,  12752UL, 26679UL, 14688UL,
+    10549UL, 14483UL, 28279UL, 30279UL, 11068UL, 7595UL,  9844UL,  14954UL, 25827UL, 19237UL, 16539UL, 23036UL, 4824UL,
+    4111UL,  17932UL, 19393UL, 18836UL, 20146UL, 1924UL,  20568UL, 17027UL, 2047UL,  21576UL, 3379UL,  30168UL, 3770UL,
+    7626UL,  12364UL, 30102UL, 14241UL, 508UL,   14964UL, 29687UL, 30828UL, 17423UL, 7403UL,  23586UL, 9433UL,  15696UL,
+    15153UL, 13912UL, 30685UL, 20797UL, 5807UL,  22955UL, 27603UL, 128UL,   5905UL,  3266UL,
+};
+uint32_t rand_arr_16_b16_w32_arr[1024] = {
+    4966UL,  6480UL,  56306UL, 38681UL, 24193UL, 23484UL, 57353UL, 62238UL, 53477UL, 13950UL, 10258UL, 61992UL, 46963UL,
+    39700UL, 55023UL, 32526UL, 55133UL, 42673UL, 27557UL, 30006UL, 21407UL, 45452UL, 33536UL, 43376UL, 22332UL, 3284UL,
+    23352UL, 53858UL, 4073UL,  42593UL, 21747UL, 18879UL, 49767UL, 62759UL, 4547UL,  48150UL, 51093UL, 16244UL, 50215UL,
+    23536UL, 4535UL,  48026UL, 6987UL,  12510UL, 18619UL, 13240UL, 17432UL, 21123UL, 25036UL, 18442UL, 51253UL, 48101UL,
+    65014UL, 40478UL, 17870UL, 29350UL, 9464UL,  13207UL, 29428UL, 29940UL, 6685UL,  33473UL, 63547UL, 56740UL, 63614UL,
+    35708UL, 33948UL, 24466UL, 52456UL, 31254UL, 3258UL,  51995UL, 55207UL, 58311UL, 62210UL, 18891UL, 7513UL,  42409UL,
+    42578UL, 25424UL, 25761UL, 9970UL,  25843UL, 26056UL, 48803UL, 3338UL,  47654UL, 29845UL, 1085UL,  22697UL, 2251UL,
+    41015UL, 48126UL, 54972UL, 63054UL, 14432UL, 827UL,   6069UL,  23678UL, 21651UL, 13152UL, 27264UL, 18268UL, 19863UL,
+    25360UL, 11349UL, 36296UL, 45167UL, 55177UL, 15747UL, 15742UL, 7441UL,  12160UL, 26233UL, 54427UL, 54958UL, 23568UL,
+    9091UL,  33313UL, 15566UL, 15547UL, 148UL,   43222UL, 18684UL, 9611UL,  25004UL, 49010UL, 47124UL, 49019UL, 48967UL,
+    49990UL, 64826UL, 24795UL, 9252UL,  28078UL, 17057UL, 17325UL, 60977UL, 34018UL, 11912UL, 29219UL, 20554UL, 30150UL,
+    49933UL, 13607UL, 11297UL, 47361UL, 41250UL, 43463UL, 13007UL, 6454UL,  17515UL, 32204UL, 10901UL, 16UL,    27775UL,
+    2563UL,  1078UL,  7354UL,  33050UL, 32047UL, 60862UL, 15142UL, 23987UL, 27423UL, 47440UL, 45215UL, 14485UL, 12341UL,
+    41141UL, 2649UL,  21059UL, 58194UL, 62438UL, 7653UL,  55774UL, 55385UL, 13724UL, 6688UL,  23235UL, 20750UL, 28962UL,
+    1546UL,  38474UL, 16427UL, 13916UL, 38442UL, 47978UL, 6778UL,  28304UL, 10656UL, 57941UL, 31264UL, 42056UL, 20147UL,
+    49393UL, 19493UL, 33903UL, 5411UL,  21652UL, 54291UL, 24044UL, 100UL,   60510UL, 57663UL, 23543UL, 30706UL, 55977UL,
+    32669UL, 20358UL, 20081UL, 5439UL,  10580UL, 25335UL, 13831UL, 21140UL, 61128UL, 50344UL, 58249UL, 63633UL, 11722UL,
+    51053UL, 4197UL,  8197UL,  37517UL, 59524UL, 23891UL, 29737UL, 23535UL, 14719UL, 53272UL, 34103UL, 23178UL, 12747UL,
+    40470UL, 1987UL,  23411UL, 47672UL, 40507UL, 29304UL, 16662UL, 32059UL, 3275UL,  59827UL, 47694UL, 36746UL, 20477UL,
+    3236UL,  46386UL, 12044UL, 50645UL, 1232UL,  8225UL,  11684UL, 5742UL,  10156UL, 19343UL, 19420UL, 57557UL, 62513UL,
+    42709UL, 18582UL, 45215UL, 20362UL, 57600UL, 20789UL, 20436UL, 52037UL, 21585UL, 53227UL, 36280UL, 44831UL, 19571UL,
+    49035UL, 28598UL, 16554UL, 48810UL, 59126UL, 63824UL, 24213UL, 64047UL, 9285UL,  28119UL, 22135UL, 1074UL,  9631UL,
+    27791UL, 23731UL, 24538UL, 65260UL, 3097UL,  24176UL, 23426UL, 9545UL,  16001UL, 4012UL,  25045UL, 5088UL,  55739UL,
+    12104UL, 30802UL, 9567UL,  428UL,   59297UL, 36004UL, 40309UL, 7005UL,  11063UL, 56663UL, 12847UL, 53938UL, 5333UL,
+    60519UL, 34991UL, 43349UL, 26021UL, 4811UL,  40052UL, 7682UL,  219UL,   30728UL, 42844UL, 11551UL, 29458UL, 53979UL,
+    10119UL, 2620UL,  1023UL,  22314UL, 2640UL,  58941UL, 60111UL, 21122UL, 44184UL, 17588UL, 55775UL, 35986UL, 9017UL,
+    7242UL,  42343UL, 22978UL, 45585UL, 42441UL, 11763UL, 8088UL,  38682UL, 30131UL, 53074UL, 26985UL, 11860UL, 53707UL,
+    54324UL, 62764UL, 48656UL, 26368UL, 39302UL, 33579UL, 16084UL, 16146UL, 32498UL, 40048UL, 48779UL, 5333UL,  48677UL,
+    11623UL, 59979UL, 27999UL, 59861UL, 12879UL, 20289UL, 17548UL, 21846UL, 59288UL, 12050UL, 20120UL, 10527UL, 2586UL,
+    22171UL, 27742UL, 26137UL, 39435UL, 25098UL, 29055UL, 11652UL, 7845UL,  30219UL, 47090UL, 51396UL, 58596UL, 47788UL,
+    7632UL,  24357UL, 48490UL, 59392UL, 18343UL, 63921UL, 8839UL,  50825UL, 8014UL,  52351UL, 21577UL, 47125UL, 31223UL,
+    25937UL, 15779UL, 3923UL,  10467UL, 50128UL, 42077UL, 45211UL, 54611UL, 64695UL, 48871UL, 22707UL, 24506UL, 16581UL,
+    47213UL, 22839UL, 32537UL, 2478UL,  8414UL,  6592UL,  20748UL, 31637UL, 34508UL, 3429UL,  8372UL,  12299UL, 54645UL,
+    18428UL, 44289UL, 43301UL, 47665UL, 13451UL, 34206UL, 53378UL, 15861UL, 31555UL, 715UL,   63841UL, 20941UL, 25914UL,
+    6428UL,  55587UL, 35271UL, 46466UL, 49145UL, 29079UL, 4620UL,  38435UL, 28329UL, 64921UL, 35452UL, 29822UL, 20598UL,
+    48458UL, 60986UL, 21904UL, 31178UL, 21822UL, 59948UL, 63847UL, 2989UL,  35824UL, 37227UL, 12805UL, 1151UL,  41459UL,
+    48436UL, 38299UL, 32869UL, 57164UL, 36364UL, 19128UL, 46671UL, 7023UL,  24722UL, 32474UL, 24808UL, 36512UL, 27182UL,
+    11327UL, 52534UL, 18972UL, 10805UL, 38832UL, 4260UL,  22447UL, 14209UL, 16058UL, 31605UL, 25143UL, 39083UL, 29621UL,
+    58295UL, 23254UL, 49920UL, 32281UL, 3183UL,  41096UL, 64864UL, 29819UL, 7586UL,  20293UL, 16965UL, 48713UL, 37575UL,
+    36313UL, 26401UL, 16826UL, 4337UL,  34597UL, 47436UL, 56053UL, 3269UL,  1253UL,  6231UL,  14297UL, 43183UL, 34190UL,
+    21894UL, 33159UL, 62038UL, 17734UL, 5935UL,  22831UL, 45340UL, 51908UL, 23498UL, 2804UL,  47153UL, 37193UL, 6596UL,
+    4118UL,  43984UL, 44828UL, 60454UL, 50158UL, 25349UL, 26999UL, 51238UL, 4779UL,  15218UL, 16702UL, 60759UL, 20386UL,
+    52230UL, 8393UL,  35891UL, 4656UL,  53360UL, 44083UL, 29569UL, 17564UL, 13328UL, 15382UL, 20362UL, 26040UL, 45704UL,
+    42429UL, 43226UL, 47456UL, 49954UL, 29747UL, 16434UL, 62247UL, 2912UL,  8528UL,  39027UL, 47021UL, 61167UL, 48545UL,
+    43194UL, 60933UL, 18466UL, 31944UL, 25886UL, 62520UL, 31223UL, 20007UL, 562UL,   20080UL, 24905UL, 42375UL, 37103UL,
+    55906UL, 52009UL, 63246UL, 53352UL, 60070UL, 63543UL, 44815UL, 64736UL, 25434UL, 58434UL, 22377UL, 745UL,   44673UL,
+    37935UL, 64020UL, 17856UL, 6509UL,  24937UL, 61112UL, 23257UL, 41107UL, 24771UL, 53921UL, 28925UL, 54603UL, 9938UL,
+    9897UL,  47970UL, 22679UL, 44107UL, 19708UL, 56955UL, 8732UL,  3271UL,  6066UL,  42818UL, 52348UL, 17546UL, 6818UL,
+    60716UL, 24322UL, 23972UL, 13804UL, 34039UL, 4301UL,  37502UL, 22808UL, 58224UL, 32576UL, 22930UL, 37697UL, 10345UL,
+    19694UL, 16004UL, 20333UL, 43440UL, 61831UL, 6747UL,  48409UL, 46062UL, 56989UL, 4555UL,  37516UL, 37812UL, 41935UL,
+    21688UL, 50468UL, 28718UL, 49751UL, 17438UL, 3570UL,  1800UL,  55338UL, 48791UL, 858UL,   5076UL,  59800UL, 10868UL,
+    4813UL,  40468UL, 25705UL, 34860UL, 35514UL, 29929UL, 25797UL, 34531UL, 47553UL, 979UL,   30225UL, 10800UL, 10766UL,
+    12281UL, 14646UL, 20959UL, 36597UL, 32565UL, 21439UL, 16280UL, 14418UL, 40632UL, 44583UL, 23855UL, 23929UL, 40815UL,
+    2673UL,  61249UL, 18177UL, 36546UL, 27788UL, 25170UL, 32400UL, 13564UL, 24328UL, 20291UL, 10987UL, 9496UL,  10192UL,
+    1380UL,  4366UL,  21489UL, 60404UL, 44910UL, 46937UL, 14570UL, 17334UL, 14443UL, 60409UL, 22862UL, 24782UL, 11503UL,
+    63770UL, 6969UL,  63407UL, 65154UL, 10255UL, 63376UL, 2644UL,  57935UL, 63428UL, 41567UL, 40951UL, 8207UL,  17885UL,
+    9668UL,  14621UL, 29556UL, 10437UL, 26192UL, 48666UL, 13569UL, 21507UL, 24724UL, 23700UL, 61594UL, 22762UL, 51195UL,
+    53481UL, 44223UL, 47225UL, 8424UL,  64682UL, 12745UL, 28861UL, 44275UL, 41850UL, 57334UL, 13444UL, 1389UL,  2719UL,
+    57772UL, 31043UL, 32273UL, 47761UL, 20923UL, 60441UL, 17865UL, 49358UL, 17586UL, 18705UL, 27644UL, 43957UL, 22157UL,
+    64411UL, 28307UL, 55371UL, 24427UL, 57553UL, 40070UL, 18554UL, 31795UL, 24190UL, 3449UL,  50804UL, 56997UL, 16885UL,
+    44083UL, 45382UL, 47596UL, 40609UL, 26916UL, 61935UL, 33678UL, 43877UL, 10346UL, 36039UL, 49297UL, 47580UL, 2058UL,
+    3545UL,  49278UL, 57632UL, 11675UL, 36481UL, 42497UL, 62668UL, 36128UL, 15038UL, 2308UL,  16459UL, 29115UL, 38394UL,
+    30295UL, 16294UL, 34192UL, 37606UL, 5434UL,  53036UL, 40438UL, 1154UL,  31861UL, 22858UL, 47247UL, 60764UL, 355UL,
+    61702UL, 894UL,   39728UL, 33038UL, 2921UL,  37216UL, 19978UL, 65195UL, 44064UL, 26991UL, 46230UL, 44616UL, 18554UL,
+    5631UL,  8277UL,  55454UL, 39604UL, 58942UL, 2697UL,  47089UL, 48083UL, 25531UL, 56793UL, 45231UL, 49229UL, 9202UL,
+    48843UL, 41843UL, 18304UL, 14399UL, 15623UL, 41463UL, 54991UL, 6746UL,  10975UL, 50174UL, 21967UL, 60448UL, 46988UL,
+    29785UL, 21787UL, 41922UL, 10856UL, 48798UL, 49262UL, 535UL,   64418UL, 18746UL, 21533UL, 29042UL, 42573UL, 65412UL,
+    2542UL,  28275UL, 56224UL, 31577UL, 59453UL, 58137UL, 45395UL, 4874UL,  21229UL, 33396UL, 31545UL, 12066UL, 55777UL,
+    39378UL, 48070UL, 46931UL, 6868UL,  37130UL, 16163UL, 32320UL, 54803UL, 6125UL,  43501UL, 25474UL, 11311UL, 46261UL,
+    35636UL, 555UL,   58387UL, 41808UL, 19301UL, 7665UL,  21688UL, 9333UL,  7569UL,  23226UL, 52022UL, 22468UL, 24631UL,
+    41228UL, 32288UL, 28568UL, 59402UL, 15147UL, 52827UL, 54995UL, 16902UL, 28379UL, 11771UL, 13020UL, 48759UL, 59548UL,
+    15512UL, 6880UL,  65447UL, 19558UL, 3593UL,  42865UL, 60393UL, 294UL,   63554UL, 13884UL, 60481UL, 64691UL, 61860UL,
+    13207UL, 6031UL,  26892UL, 712UL,   9979UL,  31944UL, 37068UL, 56232UL, 11415UL, 38605UL, 55617UL, 42791UL, 38201UL,
+    13367UL, 23477UL, 30857UL, 28194UL, 31801UL, 16864UL, 29776UL, 7325UL,  32697UL, 38074UL, 37141UL, 59671UL, 10413UL,
+    63641UL, 39874UL, 52483UL, 1709UL,  59285UL, 9781UL,  21465UL, 39588UL, 1248UL,  58009UL, 11341UL, 65253UL, 62716UL,
+    24411UL, 55631UL, 42404UL, 13163UL, 6958UL,  21173UL, 16737UL, 44112UL, 63755UL, 4774UL,  6613UL,  60941UL, 56388UL,
+    41818UL, 20583UL, 36449UL, 45162UL, 31903UL, 10481UL, 12281UL, 52045UL, 6002UL,  54215UL, 30025UL, 12373UL, 41682UL,
+    19323UL, 14694UL, 2845UL,  44757UL, 380UL,   50065UL, 45597UL, 58402UL, 54744UL, 59501UL, 58124UL, 47167UL, 19770UL,
+    40848UL, 41789UL, 4506UL,  27824UL, 38623UL, 615UL,   302UL,   31461UL, 44182UL, 19712UL,
+};
+uint32_t rand_arr_17_b17_w32_arr[1024] = {
+    10652UL,  125157UL, 69938UL,  100079UL, 96337UL,  101593UL, 3207UL,   42415UL,  110190UL, 119223UL, 121232UL,
+    116316UL, 97349UL,  44399UL,  67917UL,  71385UL,  67523UL,  32068UL,  93430UL,  62063UL,  52883UL,  47004UL,
+    124551UL, 125400UL, 129679UL, 56874UL,  34633UL,  46087UL,  22625UL,  44680UL,  114096UL, 35828UL,  93133UL,
+    18423UL,  8308UL,   82494UL,  85345UL,  3703UL,   57890UL,  63472UL,  115490UL, 102731UL, 83575UL,  75539UL,
+    22178UL,  920UL,    51100UL,  15672UL,  104473UL, 115632UL, 56355UL,  57244UL,  100152UL, 118392UL, 10346UL,
+    110378UL, 107866UL, 123057UL, 65503UL,  15652UL,  62221UL,  43954UL,  116077UL, 118387UL, 102527UL, 88629UL,
+    88150UL,  54775UL,  58693UL,  38412UL,  5739UL,   130493UL, 71820UL,  123139UL, 13490UL,  89288UL,  88733UL,
+    45161UL,  57858UL,  72422UL,  28064UL,  61742UL,  3654UL,   26057UL,  53324UL,  10015UL,  121422UL, 39023UL,
+    49844UL,  55995UL,  91144UL,  108522UL, 38275UL,  31907UL,  3418UL,   104027UL, 25341UL,  8296UL,   77382UL,
+    60202UL,  33355UL,  36999UL,  121503UL, 8814UL,   18504UL,  10738UL,  90937UL,  23967UL,  121045UL, 77236UL,
+    113026UL, 57201UL,  94651UL,  36148UL,  64545UL,  114659UL, 63735UL,  32950UL,  117990UL, 59160UL,  55979UL,
+    51429UL,  107430UL, 48867UL,  106743UL, 127996UL, 58848UL,  33498UL,  107144UL, 33256UL,  43814UL,  29958UL,
+    34112UL,  52726UL,  83539UL,  52934UL,  105021UL, 22186UL,  11505UL,  12309UL,  85335UL,  81650UL,  115483UL,
+    74037UL,  88716UL,  9297UL,   18090UL,  68760UL,  108691UL, 120410UL, 64980UL,  4779UL,   79214UL,  104416UL,
+    79202UL,  61381UL,  7041UL,   84661UL,  65118UL,  51813UL,  19462UL,  51711UL,  48690UL,  114439UL, 113395UL,
+    25614UL,  58747UL,  78879UL,  57250UL,  63163UL,  53989UL,  38842UL,  124471UL, 32443UL,  110015UL, 29981UL,
+    5901UL,   7557UL,   22995UL,  54983UL,  100523UL, 99906UL,  63649UL,  104483UL, 54475UL,  76167UL,  34854UL,
+    27792UL,  112645UL, 30432UL,  116822UL, 124039UL, 16305UL,  16903UL,  114779UL, 5642UL,   77528UL,  130588UL,
+    87200UL,  7223UL,   13107UL,  102700UL, 19422UL,  53811UL,  39617UL,  93556UL,  92327UL,  71762UL,  102282UL,
+    4266UL,   126928UL, 128852UL, 113258UL, 79013UL,  102281UL, 56066UL,  106938UL, 47669UL,  16147UL,  49467UL,
+    25220UL,  66207UL,  70231UL,  24058UL,  5952UL,   97836UL,  106168UL, 3054UL,   28256UL,  13725UL,  107507UL,
+    28816UL,  59289UL,  93927UL,  59086UL,  114520UL, 41597UL,  30496UL,  121538UL, 64281UL,  62561UL,  91027UL,
+    84762UL,  7694UL,   103203UL, 75911UL,  80272UL,  8904UL,   53753UL,  67994UL,  81490UL,  66043UL,  10370UL,
+    25580UL,  121210UL, 35970UL,  6155UL,   88675UL,  11505UL,  50241UL,  58870UL,  112180UL, 79369UL,  23973UL,
+    120504UL, 62943UL,  58544UL,  108124UL, 125551UL, 70652UL,  97203UL,  128300UL, 122974UL, 104087UL, 125207UL,
+    108797UL, 111815UL, 31808UL,  41407UL,  97843UL,  9046UL,   31389UL,  38634UL,  53692UL,  83531UL,  71080UL,
+    95467UL,  105972UL, 103104UL, 9581UL,   51845UL,  61275UL,  22958UL,  84954UL,  124389UL, 36181UL,  24880UL,
+    79195UL,  92649UL,  98415UL,  70572UL,  124561UL, 28131UL,  41414UL,  76787UL,  34723UL,  47011UL,  63903UL,
+    32540UL,  26738UL,  56978UL,  27413UL,  43326UL,  103401UL, 120262UL, 106060UL, 29430UL,  11829UL,  34461UL,
+    88651UL,  46095UL,  55180UL,  122918UL, 123358UL, 32738UL,  11318UL,  45687UL,  118444UL, 60997UL,  36763UL,
+    50739UL,  107977UL, 102805UL, 112330UL, 119655UL, 107886UL, 5042UL,   80299UL,  128932UL, 129016UL, 40301UL,
+    3145UL,   49520UL,  71877UL,  70066UL,  124255UL, 9665UL,   54748UL,  76851UL,  114391UL, 59549UL,  54032UL,
+    107537UL, 7573UL,   87659UL,  67472UL,  59713UL,  52595UL,  18411UL,  89964UL,  41548UL,  91806UL,  109469UL,
+    91057UL,  1009UL,   76181UL,  58910UL,  61723UL,  12061UL,  107187UL, 110169UL, 16519UL,  111409UL, 4939UL,
+    80387UL,  93231UL,  101550UL, 129048UL, 1151UL,   59565UL,  79929UL,  35500UL,  39019UL,  22474UL,  84194UL,
+    13209UL,  7187UL,   2018UL,   117189UL, 34375UL,  57640UL,  119739UL, 116432UL, 8441UL,   92945UL,  54219UL,
+    35572UL,  57303UL,  45964UL,  57920UL,  19074UL,  2603UL,   47513UL,  49309UL,  73218UL,  119781UL, 122579UL,
+    121669UL, 23615UL,  127525UL, 108519UL, 54208UL,  115168UL, 99228UL,  39954UL,  45834UL,  113610UL, 58282UL,
+    80513UL,  36807UL,  54848UL,  10323UL,  112176UL, 67990UL,  111982UL, 41040UL,  54331UL,  7717UL,   7812UL,
+    34539UL,  109017UL, 69148UL,  42742UL,  33876UL,  16618UL,  43096UL,  74128UL,  47370UL,  83176UL,  46548UL,
+    115291UL, 125622UL, 58040UL,  45289UL,  108571UL, 44753UL,  49261UL,  96671UL,  107762UL, 32517UL,  48235UL,
+    58119UL,  126851UL, 14486UL,  64433UL,  30137UL,  7777UL,   51373UL,  47366UL,  65473UL,  115855UL, 67860UL,
+    95312UL,  96644UL,  79756UL,  46361UL,  59788UL,  52781UL,  60097UL,  122653UL, 78860UL,  92810UL,  45981UL,
+    40954UL,  61238UL,  34655UL,  103285UL, 24335UL,  54682UL,  79932UL,  100862UL, 26458UL,  56279UL,  12178UL,
+    124734UL, 68268UL,  122626UL, 53791UL,  102483UL, 104778UL, 32195UL,  60642UL,  77346UL,  38484UL,  100926UL,
+    112109UL, 42560UL,  114037UL, 954UL,    20736UL,  60589UL,  30115UL,  36830UL,  41287UL,  108007UL, 545UL,
+    61663UL,  39781UL,  63137UL,  59254UL,  63657UL,  11033UL,  12615UL,  58137UL,  111757UL, 94041UL,  70048UL,
+    98248UL,  112725UL, 105201UL, 88095UL,  91768UL,  49720UL,  95224UL,  13284UL,  104577UL, 88592UL,  116313UL,
+    27034UL,  74770UL,  112860UL, 103942UL, 63922UL,  29177UL,  59788UL,  78562UL,  44528UL,  191UL,    49248UL,
+    29353UL,  6091UL,   102706UL, 33112UL,  96009UL,  75573UL,  123210UL, 22600UL,  120194UL, 12100UL,  28680UL,
+    46090UL,  61108UL,  74855UL,  8163UL,   90459UL,  55723UL,  120336UL, 15199UL,  25798UL,  76949UL,  35879UL,
+    19977UL,  60204UL,  28248UL,  103153UL, 72213UL,  17175UL,  37020UL,  21264UL,  110825UL, 70615UL,  89132UL,
+    51263UL,  38114UL,  52017UL,  90118UL,  83209UL,  11673UL,  121752UL, 75189UL,  1413UL,   120917UL, 24706UL,
+    119586UL, 43837UL,  105595UL, 67645UL,  82058UL,  33908UL,  51773UL,  609UL,    120103UL, 48769UL,  107262UL,
+    122516UL, 36347UL,  87096UL,  39653UL,  14687UL,  65619UL,  107149UL, 111048UL, 4600UL,   113897UL, 114662UL,
+    126585UL, 83275UL,  117990UL, 31947UL,  19596UL,  79014UL,  41152UL,  117665UL, 45845UL,  107069UL, 103219UL,
+    108810UL, 115221UL, 13641UL,  18387UL,  27941UL,  70430UL,  103274UL, 69705UL,  50879UL,  65242UL,  54070UL,
+    119412UL, 73875UL,  33193UL,  19728UL,  27655UL,  32081UL,  58248UL,  32233UL,  48250UL,  68498UL,  90207UL,
+    110218UL, 33927UL,  56874UL,  64423UL,  61263UL,  47271UL,  5420UL,   79554UL,  86692UL,  123395UL, 42486UL,
+    48875UL,  123081UL, 19444UL,  2239UL,   118188UL, 53642UL,  75832UL,  109196UL, 10753UL,  58626UL,  62329UL,
+    8506UL,   65317UL,  72303UL,  89368UL,  40508UL,  80862UL,  39613UL,  113408UL, 23233UL,  76304UL,  58597UL,
+    104097UL, 85957UL,  93443UL,  119091UL, 83774UL,  92837UL,  48471UL,  95653UL,  16842UL,  16996UL,  100206UL,
+    58459UL,  102457UL, 20748UL,  87690UL,  102885UL, 5496UL,   7355UL,   123910UL, 35465UL,  126857UL, 63638UL,
+    31708UL,  112244UL, 38267UL,  15830UL,  72265UL,  113808UL, 37845UL,  19776UL,  90044UL,  124645UL, 4433UL,
+    12868UL,  25766UL,  81285UL,  39552UL,  109529UL, 53801UL,  64561UL,  56606UL,  43586UL,  99620UL,  109932UL,
+    4003UL,   32637UL,  9863UL,   44634UL,  83034UL,  10938UL,  637UL,    36243UL,  37464UL,  94324UL,  123018UL,
+    89774UL,  92950UL,  40989UL,  124462UL, 127594UL, 1854UL,   104592UL, 126538UL, 55955UL,  109557UL, 80562UL,
+    61087UL,  40672UL,  74165UL,  28328UL,  85320UL,  38784UL,  90149UL,  69218UL,  10448UL,  20374UL,  110366UL,
+    3650UL,   64620UL,  20760UL,  33265UL,  37027UL,  63390UL,  121449UL, 72560UL,  3798UL,   117986UL, 73534UL,
+    63434UL,  36135UL,  94028UL,  26811UL,  26912UL,  128245UL, 15149UL,  116312UL, 21112UL,  130201UL, 64416UL,
+    8543UL,   79881UL,  101790UL, 1683UL,   95200UL,  68598UL,  65948UL,  7827UL,   16251UL,  6513UL,   11376UL,
+    100485UL, 54998UL,  47881UL,  9248UL,   102875UL, 69553UL,  122168UL, 89321UL,  124424UL, 98518UL,  41581UL,
+    102117UL, 89025UL,  114720UL, 77982UL,  112343UL, 111270UL, 109809UL, 61912UL,  11893UL,  125868UL, 130880UL,
+    124280UL, 101992UL, 38629UL,  109581UL, 4739UL,   12584UL,  49971UL,  112763UL, 16125UL,  4469UL,   2171UL,
+    77299UL,  18422UL,  121986UL, 65724UL,  121239UL, 93286UL,  82140UL,  116629UL, 14647UL,  98930UL,  122394UL,
+    177UL,    66761UL,  101331UL, 63861UL,  109372UL, 5567UL,   19756UL,  61743UL,  103989UL, 80016UL,  66930UL,
+    6080UL,   34724UL,  94185UL,  59154UL,  99115UL,  118393UL, 1207UL,   78644UL,  48405UL,  4578UL,   98260UL,
+    2291UL,   98700UL,  63793UL,  92286UL,  67049UL,  107318UL, 104115UL, 77434UL,  69808UL,  118875UL, 97338UL,
+    102905UL, 2317UL,   23740UL,  33139UL,  48849UL,  115753UL, 82547UL,  64497UL,  49176UL,  107269UL, 63190UL,
+    114318UL, 14528UL,  20854UL,  102878UL, 42692UL,  33874UL,  37434UL,  50701UL,  63548UL,  60695UL,  31418UL,
+    55242UL,  46508UL,  83327UL,  75852UL,  58874UL,  62971UL,  116216UL, 41646UL,  113105UL, 72496UL,  20268UL,
+    46649UL,  129292UL, 3101UL,   128096UL, 58441UL,  47217UL,  69651UL,  4503UL,   54932UL,  97051UL,  14153UL,
+    7341UL,   59952UL,  92531UL,  120344UL, 83472UL,  122213UL, 94969UL,  31715UL,  122110UL, 65652UL,  41685UL,
+    113872UL, 65464UL,  122513UL, 28431UL,  49923UL,  86960UL,  87710UL,  48682UL,  101382UL, 114390UL, 105250UL,
+    79689UL,  118678UL, 20891UL,  119244UL, 62446UL,  14981UL,  81538UL,  2594UL,   33670UL,  74101UL,  113694UL,
+    62534UL,  31549UL,  51771UL,  67514UL,  89189UL,  121700UL, 103918UL, 103867UL, 20263UL,  14887UL,  114967UL,
+    80371UL,  47616UL,  8518UL,   92761UL,  68971UL,  2919UL,   78643UL,  107339UL, 81826UL,  37137UL,  105935UL,
+    11584UL,  77343UL,  124349UL, 44253UL,  15471UL,  67271UL,  42798UL,  12272UL,  42425UL,  26321UL,  60965UL,
+    101381UL, 76395UL,  45851UL,  26008UL,  92130UL,  60002UL,  25895UL,  30236UL,  121314UL, 46852UL,  68588UL,
+    36828UL,  47489UL,  113751UL, 29494UL,  102626UL, 124386UL, 126067UL, 106592UL, 83625UL,  3531UL,   5079UL,
+    58520UL,  40993UL,  31635UL,  96722UL,  74294UL,  16988UL,  7695UL,   25085UL,  22936UL,  73582UL,  63511UL,
+    50281UL,  81349UL,  104099UL, 71018UL,  46912UL,  27208UL,  114863UL, 71545UL,  107948UL, 14450UL,  53240UL,
+    25753UL,  6559UL,   14431UL,  90379UL,  112001UL, 42095UL,  95582UL,  112606UL, 47582UL,  89122UL,  60459UL,
+    20613UL,
+};
+uint32_t rand_arr_18_b18_w32_arr[1024] = {
+    249538UL, 16085UL,  196470UL, 250143UL, 101133UL, 148943UL, 119325UL, 147271UL, 44588UL,  168667UL, 164775UL,
+    83909UL,  152158UL, 135209UL, 111970UL, 75451UL,  62875UL,  129845UL, 54991UL,  130079UL, 251250UL, 202931UL,
+    80141UL,  251113UL, 159558UL, 256880UL, 214606UL, 48942UL,  209505UL, 33696UL,  117307UL, 174275UL, 149022UL,
+    223192UL, 5247UL,   212927UL, 109086UL, 183464UL, 20760UL,  171307UL, 209049UL, 129752UL, 73617UL,  252693UL,
+    190183UL, 217096UL, 125868UL, 157601UL, 107116UL, 262107UL, 97665UL,  13659UL,  234673UL, 259737UL, 108486UL,
+    217602UL, 19431UL,  220679UL, 36557UL,  193388UL, 161580UL, 233525UL, 233213UL, 256181UL, 148390UL, 203398UL,
+    35520UL,  162602UL, 59689UL,  219016UL, 149239UL, 186684UL, 100047UL, 35023UL,  157351UL, 291UL,    161496UL,
+    96852UL,  13451UL,  174897UL, 65171UL,  83162UL,  126120UL, 38049UL,  110843UL, 70631UL,  76927UL,  51392UL,
+    94092UL,  104840UL, 169067UL, 93569UL,  241409UL, 95175UL,  209835UL, 195246UL, 68122UL,  184764UL, 107002UL,
+    49289UL,  242833UL, 146527UL, 2369UL,   206563UL, 256346UL, 183236UL, 246080UL, 233149UL, 58036UL,  32045UL,
+    189551UL, 140626UL, 104041UL, 228929UL, 141836UL, 56285UL,  34175UL,  220755UL, 32832UL,  150725UL, 48909UL,
+    7027UL,   159661UL, 165124UL, 201856UL, 118194UL, 144601UL, 152630UL, 121028UL, 203289UL, 117503UL, 209293UL,
+    197799UL, 71850UL,  186914UL, 31150UL,  246322UL, 29109UL,  33911UL,  251726UL, 181285UL, 222996UL, 26518UL,
+    192526UL, 74295UL,  9203UL,   184723UL, 42274UL,  88614UL,  75417UL,  55994UL,  160963UL, 172608UL, 162146UL,
+    140763UL, 167749UL, 156540UL, 67646UL,  119142UL, 62552UL,  68932UL,  173079UL, 158118UL, 93147UL,  94813UL,
+    48221UL,  201399UL, 43340UL,  167325UL, 222914UL, 67625UL,  167007UL, 67087UL,  3956UL,   76034UL,  82382UL,
+    14341UL,  182057UL, 165412UL, 217986UL, 227656UL, 116535UL, 226000UL, 4568UL,   138879UL, 125137UL, 45326UL,
+    48678UL,  153642UL, 150292UL, 78146UL,  165070UL, 254183UL, 182890UL, 21904UL,  31311UL,  61866UL,  172683UL,
+    250512UL, 160114UL, 195773UL, 228403UL, 239148UL, 95688UL,  114630UL, 22550UL,  24638UL,  9621UL,   49517UL,
+    203244UL, 190408UL, 140370UL, 31039UL,  231091UL, 230487UL, 217991UL, 209691UL, 90960UL,  191940UL, 260118UL,
+    170188UL, 10663UL,  226093UL, 144708UL, 26510UL,  233841UL, 198308UL, 103333UL, 137061UL, 183644UL, 224029UL,
+    27087UL,  95206UL,  60264UL,  221816UL, 217749UL, 16417UL,  18407UL,  241902UL, 178496UL, 67783UL,  107532UL,
+    81341UL,  68716UL,  211944UL, 249371UL, 16248UL,  140908UL, 200019UL, 208181UL, 190465UL, 191984UL, 105227UL,
+    84071UL,  48624UL,  181507UL, 46055UL,  49076UL,  164839UL, 144036UL, 46849UL,  185393UL, 226605UL, 247217UL,
+    15727UL,  68060UL,  97223UL,  141726UL, 228983UL, 116784UL, 234447UL, 242530UL, 80509UL,  72005UL,  119802UL,
+    6752UL,   13588UL,  40769UL,  72607UL,  206634UL, 245450UL, 126476UL, 830UL,    167600UL, 92739UL,  30852UL,
+    187770UL, 224819UL, 611UL,    10260UL,  48009UL,  59010UL,  192055UL, 146666UL, 79118UL,  37109UL,  24231UL,
+    11704UL,  233636UL, 19066UL,  61401UL,  172770UL, 238923UL, 182467UL, 219916UL, 203506UL, 66013UL,  229112UL,
+    131136UL, 21163UL,  195150UL, 19783UL,  65794UL,  37037UL,  64311UL,  136395UL, 125725UL, 142985UL, 182326UL,
+    29524UL,  53497UL,  68851UL,  240907UL, 51869UL,  153942UL, 28231UL,  99313UL,  67054UL,  122399UL, 134078UL,
+    51342UL,  259135UL, 226814UL, 145103UL, 193659UL, 93323UL,  9149UL,   151857UL, 175762UL, 168369UL, 255744UL,
+    151849UL, 21059UL,  203412UL, 210161UL, 15253UL,  68268UL,  117995UL, 196361UL, 19021UL,  120746UL, 146256UL,
+    204132UL, 95283UL,  22864UL,  64926UL,  101171UL, 108751UL, 172153UL, 258880UL, 56667UL,  188101UL, 141196UL,
+    190136UL, 258903UL, 74769UL,  23245UL,  91564UL,  27231UL,  107366UL, 70296UL,  46085UL,  128177UL, 95914UL,
+    161109UL, 93172UL,  42455UL,  113893UL, 31854UL,  220921UL, 261699UL, 77662UL,  99907UL,  162592UL, 217758UL,
+    217086UL, 101701UL, 94731UL,  103142UL, 78357UL,  162323UL, 178088UL, 32541UL,  232781UL, 165654UL, 13786UL,
+    247070UL, 7189UL,   141559UL, 10536UL,  102944UL, 95007UL,  70653UL,  96164UL,  82452UL,  248900UL, 109194UL,
+    247551UL, 258955UL, 189243UL, 147379UL, 151184UL, 79107UL,  5480UL,   250816UL, 95847UL,  187067UL, 21722UL,
+    86279UL,  210890UL, 211233UL, 181986UL, 237523UL, 126485UL, 108668UL, 136390UL, 236747UL, 48579UL,  102297UL,
+    77644UL,  190495UL, 168822UL, 28496UL,  88566UL,  69923UL,  129553UL, 221899UL, 147476UL, 170294UL, 247756UL,
+    229691UL, 76119UL,  104282UL, 17511UL,  94778UL,  56182UL,  244515UL, 196309UL, 118002UL, 225591UL, 147287UL,
+    193112UL, 243519UL, 139486UL, 225502UL, 75905UL,  2933UL,   229337UL, 196644UL, 85512UL,  60090UL,  155964UL,
+    201533UL, 47589UL,  165716UL, 124829UL, 74773UL,  21672UL,  10080UL,  115948UL, 5120UL,   169322UL, 1489UL,
+    45714UL,  239010UL, 234224UL, 37307UL,  25530UL,  54026UL,  75157UL,  6086UL,   9818UL,   15524UL,  84865UL,
+    186122UL, 169080UL, 123822UL, 220148UL, 53133UL,  245856UL, 91494UL,  92878UL,  96279UL,  58765UL,  119457UL,
+    85927UL,  116425UL, 186508UL, 218252UL, 4198UL,   159910UL, 200596UL, 167698UL, 36420UL,  767UL,    259604UL,
+    11542UL,  90060UL,  121685UL, 23455UL,  56568UL,  240482UL, 160615UL, 217184UL, 261832UL, 190249UL, 181003UL,
+    232661UL, 211227UL, 262002UL, 216833UL, 130304UL, 39156UL,  66132UL,  121624UL, 247345UL, 63882UL,  20597UL,
+    3520UL,   10884UL,  18008UL,  14891UL,  261610UL, 12246UL,  91029UL,  134629UL, 239944UL, 121272UL, 54469UL,
+    188968UL, 186552UL, 148280UL, 258751UL, 101098UL, 186550UL, 174055UL, 124260UL, 130173UL, 72526UL,  93783UL,
+    144907UL, 183450UL, 73319UL,  8820UL,   69328UL,  245217UL, 237371UL, 99369UL,  32909UL,  181333UL, 175831UL,
+    195202UL, 76430UL,  152793UL, 93743UL,  181145UL, 214136UL, 194106UL, 136680UL, 202769UL, 88672UL,  210053UL,
+    123617UL, 216347UL, 251509UL, 206005UL, 82078UL,  192254UL, 222611UL, 195263UL, 17001UL,  17000UL,  84344UL,
+    126872UL, 151366UL, 261106UL, 67214UL,  82879UL,  45448UL,  110930UL, 37126UL,  236386UL, 231658UL, 143449UL,
+    27458UL,  235983UL, 160150UL, 1580UL,   46629UL,  41369UL,  82228UL,  20654UL,  169243UL, 183665UL, 246190UL,
+    154482UL, 131949UL, 105136UL, 32922UL,  203608UL, 125838UL, 205515UL, 22039UL,  161821UL, 91124UL,  79033UL,
+    109650UL, 52186UL,  218284UL, 155654UL, 187053UL, 39520UL,  127392UL, 96497UL,  26483UL,  175077UL, 149046UL,
+    95076UL,  252347UL, 198050UL, 145349UL, 23920UL,  30066UL,  18518UL,  89201UL,  15613UL,  79808UL,  61076UL,
+    163768UL, 64173UL,  37612UL,  105443UL, 122816UL, 93000UL,  80210UL,  204364UL, 68653UL,  134504UL, 142352UL,
+    202520UL, 20701UL,  252879UL, 7226UL,   257999UL, 144277UL, 17407UL,  186300UL, 59130UL,  127988UL, 201484UL,
+    21176UL,  220923UL, 157405UL, 11506UL,  203385UL, 176465UL, 209332UL, 189012UL, 92134UL,  256053UL, 255244UL,
+    157203UL, 223223UL, 195272UL, 139013UL, 169851UL, 188771UL, 35904UL,  14783UL,  192460UL, 136242UL, 230745UL,
+    101166UL, 238358UL, 157438UL, 220387UL, 25439UL,  149292UL, 98104UL,  168274UL, 73321UL,  218285UL, 239184UL,
+    129935UL, 994UL,    201064UL, 257525UL, 260380UL, 31198UL,  132987UL, 79768UL,  147714UL, 110480UL, 4627UL,
+    48237UL,  19346UL,  209841UL, 62867UL,  87760UL,  146220UL, 192718UL, 108457UL, 51599UL,  78180UL,  174208UL,
+    6429UL,   166378UL, 246169UL, 46026UL,  10625UL,  105367UL, 213814UL, 154225UL, 241538UL, 60981UL,  131874UL,
+    187681UL, 37089UL,  197142UL, 116547UL, 121479UL, 4495UL,   105700UL, 144829UL, 52256UL,  202795UL, 138780UL,
+    241158UL, 87831UL,  41155UL,  79158UL,  52706UL,  153092UL, 186157UL, 229552UL, 223171UL, 162575UL, 170803UL,
+    123342UL, 160355UL, 221169UL, 140034UL, 57836UL,  239642UL, 137680UL, 257855UL, 215630UL, 137949UL, 2870UL,
+    184518UL, 221530UL, 89072UL,  31747UL,  66075UL,  15974UL,  69901UL,  155524UL, 47146UL,  91733UL,  104708UL,
+    132940UL, 21505UL,  147323UL, 197726UL, 174378UL, 204762UL, 19500UL,  7026UL,   124030UL, 96075UL,  75600UL,
+    249455UL, 76754UL,  97717UL,  157471UL, 215199UL, 191057UL, 143323UL, 32661UL,  196690UL, 87445UL,  69651UL,
+    63761UL,  108316UL, 207082UL, 108393UL, 105270UL, 28324UL,  74941UL,  33771UL,  41134UL,  222996UL, 144405UL,
+    191198UL, 93879UL,  254103UL, 75798UL,  67680UL,  224809UL, 247184UL, 259543UL, 30907UL,  34316UL,  226572UL,
+    255019UL, 80187UL,  251941UL, 124960UL, 32144UL,  135934UL, 164163UL, 213325UL, 258286UL, 86546UL,  212702UL,
+    183886UL, 53962UL,  145488UL, 197239UL, 96703UL,  180573UL, 227607UL, 260448UL, 211243UL, 211244UL, 112967UL,
+    136814UL, 103876UL, 215447UL, 188738UL, 109737UL, 109389UL, 23481UL,  207749UL, 64819UL,  190868UL, 22040UL,
+    114386UL, 35283UL,  56295UL,  107233UL, 163868UL, 254001UL, 182034UL, 80409UL,  96356UL,  37403UL,  67800UL,
+    210885UL, 159316UL, 55128UL,  236608UL, 37132UL,  49127UL,  180995UL, 80835UL,  53320UL,  174907UL, 247739UL,
+    232682UL, 217484UL, 80296UL,  5587UL,   228911UL, 216676UL, 100021UL, 141178UL, 98682UL,  44884UL,  121574UL,
+    122169UL, 17382UL,  3459UL,   216257UL, 220773UL, 255465UL, 8103UL,   133987UL, 113663UL, 244880UL, 230642UL,
+    57582UL,  22905UL,  10778UL,  239536UL, 70219UL,  8575UL,   97735UL,  235174UL, 102480UL, 139625UL, 133105UL,
+    211304UL, 83684UL,  242469UL, 60373UL,  209523UL, 23144UL,  163851UL, 43839UL,  27795UL,  90113UL,  159591UL,
+    56406UL,  119088UL, 33854UL,  230937UL, 155179UL, 10247UL,  41861UL,  126903UL, 85937UL,  78867UL,  71724UL,
+    7762UL,   201552UL, 225830UL, 252978UL, 223004UL, 195747UL, 83976UL,  116251UL, 69999UL,  136744UL, 75557UL,
+    107875UL, 80910UL,  178657UL, 28071UL,  129684UL, 33403UL,  254665UL, 250270UL, 16633UL,  228300UL, 218121UL,
+    71615UL,  52522UL,  239784UL, 66260UL,  79295UL,  221500UL, 86735UL,  118008UL, 64273UL,  37995UL,  75677UL,
+    179260UL, 149209UL, 217345UL, 247622UL, 223658UL, 14419UL,  196522UL, 159401UL, 21740UL,  237110UL, 237588UL,
+    49252UL,  124564UL, 63445UL,  109816UL, 232864UL, 121354UL, 18728UL,  171817UL, 12121UL,  31369UL,  107742UL,
+    243852UL, 60546UL,  166967UL, 53994UL,  43516UL,  51096UL,  38059UL,  78641UL,  207594UL, 214935UL, 57442UL,
+    168805UL, 257192UL, 120404UL, 119256UL, 122512UL, 29816UL,  146885UL, 261151UL, 26722UL,  255943UL, 16671UL,
+    159694UL, 92079UL,  219146UL, 125553UL, 139517UL, 65881UL,  196313UL, 54257UL,  110776UL, 258066UL, 985UL,
+    139508UL, 167908UL, 151203UL, 122292UL, 154761UL, 220260UL, 112783UL, 259407UL, 181244UL, 195294UL, 17856UL,
+    209590UL,
+};
+uint32_t rand_arr_19_b19_w32_arr[1024] = {
+    90286UL,  465258UL, 113249UL, 316145UL, 373213UL, 84812UL,  225224UL, 273562UL, 60982UL,  366485UL, 76766UL,
+    93252UL,  279846UL, 25574UL,  45119UL,  231434UL, 62543UL,  158585UL, 108572UL, 47547UL,  191485UL, 288513UL,
+    327055UL, 160430UL, 6409UL,   80285UL,  93585UL,  457602UL, 389382UL, 131424UL, 441255UL, 388273UL, 305770UL,
+    39097UL,  248479UL, 29263UL,  63182UL,  63839UL,  245613UL, 340319UL, 384792UL, 394587UL, 521882UL, 491750UL,
+    194578UL, 386640UL, 53455UL,  60597UL,  455691UL, 342525UL, 49591UL,  329978UL, 194579UL, 116UL,    476271UL,
+    137136UL, 423644UL, 106334UL, 124164UL, 257784UL, 84715UL,  356299UL, 312869UL, 252820UL, 493886UL, 221338UL,
+    333783UL, 516159UL, 47853UL,  28570UL,  306654UL, 422165UL, 284176UL, 257963UL, 242877UL, 147037UL, 390195UL,
+    181703UL, 475381UL, 15934UL,  231519UL, 318547UL, 440052UL, 150654UL, 98683UL,  320515UL, 58536UL,  196689UL,
+    478017UL, 152227UL, 388405UL, 432581UL, 22847UL,  197614UL, 302563UL, 21392UL,  277432UL, 75931UL,  206226UL,
+    222010UL, 382315UL, 345803UL, 467704UL, 397808UL, 385358UL, 329848UL, 420650UL, 85644UL,  337728UL, 508857UL,
+    439832UL, 260976UL, 384308UL, 366718UL, 111248UL, 442861UL, 141233UL, 105697UL, 361112UL, 113376UL, 43420UL,
+    358124UL, 54171UL,  173434UL, 460062UL, 277360UL, 285468UL, 273912UL, 68475UL,  318557UL, 52757UL,  239219UL,
+    302014UL, 262854UL, 444713UL, 143512UL, 366573UL, 8807UL,   344731UL, 174544UL, 515070UL, 175730UL, 190199UL,
+    398554UL, 392544UL, 230466UL, 41404UL,  302993UL, 208961UL, 386575UL, 269865UL, 495993UL, 502155UL, 494074UL,
+    117620UL, 477329UL, 147300UL, 382443UL, 368670UL, 289317UL, 310805UL, 254668UL, 437471UL, 235972UL, 430841UL,
+    92679UL,  22920UL,  169858UL, 395618UL, 276116UL, 444085UL, 51944UL,  127597UL, 465644UL, 231375UL, 410058UL,
+    18200UL,  427025UL, 140915UL, 379112UL, 77125UL,  33161UL,  231047UL, 500079UL, 70311UL,  462640UL, 20271UL,
+    314671UL, 442429UL, 487030UL, 372422UL, 306763UL, 469433UL, 22591UL,  446142UL, 389658UL, 467092UL, 437947UL,
+    133317UL, 40376UL,  27455UL,  417259UL, 280724UL, 126552UL, 181975UL, 278946UL, 117708UL, 16691UL,  103466UL,
+    430047UL, 268497UL, 121961UL, 364309UL, 496942UL, 16652UL,  78359UL,  280928UL, 406284UL, 299677UL, 494908UL,
+    59760UL,  440742UL, 107293UL, 70315UL,  449135UL, 211136UL, 196852UL, 290165UL, 294584UL, 145882UL, 15649UL,
+    272409UL, 69383UL,  161173UL, 515708UL, 97282UL,  446674UL, 282208UL, 263479UL, 409055UL, 127244UL, 31973UL,
+    205500UL, 346905UL, 446582UL, 115611UL, 86689UL,  283057UL, 309711UL, 492137UL, 477820UL, 394543UL, 125777UL,
+    515940UL, 178272UL, 120655UL, 241808UL, 156601UL, 216014UL, 212808UL, 254547UL, 411572UL, 246666UL, 157828UL,
+    495232UL, 299765UL, 251110UL, 413186UL, 498886UL, 294094UL, 246957UL, 104889UL, 433214UL, 357465UL, 130437UL,
+    434812UL, 460021UL, 172596UL, 197566UL, 401105UL, 217188UL, 127794UL, 75742UL,  248833UL, 322021UL, 42652UL,
+    164714UL, 199393UL, 19731UL,  244321UL, 454533UL, 393418UL, 110947UL, 335612UL, 254297UL, 507561UL, 315684UL,
+    230807UL, 13331UL,  266631UL, 244832UL, 495203UL, 205152UL, 177301UL, 2640UL,   499083UL, 222210UL, 210821UL,
+    357537UL, 13926UL,  236871UL, 316574UL, 248926UL, 393613UL, 350197UL, 452241UL, 132153UL, 75074UL,  76038UL,
+    255124UL, 117674UL, 94073UL,  486874UL, 489327UL, 354577UL, 420096UL, 57401UL,  347935UL, 41152UL,  351229UL,
+    4763UL,   8350UL,   411935UL, 35055UL,  413674UL, 355821UL, 347829UL, 476095UL, 317528UL, 33442UL,  121328UL,
+    259077UL, 185892UL, 519944UL, 258232UL, 286744UL, 272928UL, 390475UL, 274849UL, 431825UL, 523994UL, 286197UL,
+    355835UL, 453364UL, 249150UL, 205779UL, 500522UL, 135950UL, 266362UL, 141986UL, 97650UL,  336349UL, 516334UL,
+    488670UL, 417064UL, 430241UL, 280959UL, 52403UL,  48932UL,  64707UL,  228877UL, 309349UL, 233423UL, 463352UL,
+    36041UL,  509088UL, 201081UL, 351839UL, 425742UL, 215908UL, 406536UL, 348629UL, 151665UL, 415178UL, 488039UL,
+    159264UL, 57607UL,  302125UL, 140030UL, 111067UL, 411570UL, 324933UL, 331285UL, 521714UL, 458520UL, 364725UL,
+    229367UL, 408770UL, 171411UL, 125366UL, 271520UL, 29153UL,  496491UL, 506527UL, 40269UL,  320386UL, 370707UL,
+    241666UL, 25859UL,  484854UL, 42926UL,  1637UL,   481395UL, 490337UL, 519680UL, 347740UL, 277234UL, 459803UL,
+    111851UL, 460979UL, 498778UL, 445711UL, 343391UL, 309459UL, 287313UL, 289674UL, 381455UL, 392557UL, 20114UL,
+    109063UL, 287354UL, 149125UL, 162462UL, 289260UL, 117732UL, 124236UL, 110698UL, 388835UL, 230562UL, 252482UL,
+    72265UL,  203642UL, 128861UL, 263843UL, 140549UL, 432660UL, 275778UL, 131411UL, 251915UL, 390730UL, 410168UL,
+    47253UL,  511322UL, 130631UL, 145959UL, 380276UL, 211071UL, 233788UL, 334755UL, 419436UL, 255537UL, 32231UL,
+    289110UL, 174856UL, 451889UL, 431347UL, 187165UL, 376200UL, 104207UL, 267369UL, 256814UL, 212903UL, 284733UL,
+    467242UL, 8434UL,   224055UL, 264423UL, 191603UL, 501409UL, 41249UL,  436685UL, 464248UL, 422398UL, 273645UL,
+    55150UL,  157442UL, 197890UL, 97099UL,  500793UL, 500479UL, 240189UL, 246233UL, 49162UL,  41951UL,  354360UL,
+    412815UL, 435155UL, 265862UL, 512051UL, 326874UL, 226216UL, 236390UL, 432438UL, 440349UL, 121066UL, 165925UL,
+    181332UL, 49624UL,  450294UL, 363172UL, 143684UL, 505893UL, 245028UL, 357910UL, 264137UL, 265520UL, 408788UL,
+    453247UL, 192990UL, 427525UL, 359359UL, 471444UL, 91347UL,  476508UL, 313507UL, 510896UL, 445990UL, 516587UL,
+    489472UL, 208959UL, 355951UL, 269949UL, 87155UL,  1619UL,   393303UL, 440651UL, 423143UL, 410926UL, 370830UL,
+    371961UL, 81349UL,  507180UL, 50868UL,  459293UL, 249729UL, 414705UL, 485891UL, 175763UL, 206614UL, 239183UL,
+    378349UL, 414211UL, 346288UL, 445900UL, 7423UL,   137059UL, 223464UL, 301251UL, 248391UL, 102717UL, 342273UL,
+    131772UL, 171725UL, 500837UL, 194524UL, 441565UL, 260903UL, 168551UL, 173772UL, 267972UL, 195761UL, 44505UL,
+    205646UL, 445005UL, 395717UL, 252370UL, 79703UL,  283940UL, 194938UL, 348985UL, 29915UL,  483240UL, 391275UL,
+    8528UL,   79779UL,  123491UL, 364105UL, 47025UL,  434365UL, 427453UL, 386140UL, 74436UL,  439563UL, 440937UL,
+    36491UL,  314772UL, 45163UL,  178601UL, 116098UL, 169670UL, 271808UL, 67486UL,  105692UL, 53369UL,  18729UL,
+    297985UL, 487708UL, 235758UL, 269421UL, 238347UL, 215129UL, 433237UL, 82870UL,  475627UL, 328965UL, 199808UL,
+    373368UL, 236769UL, 519930UL, 225569UL, 113421UL, 237270UL, 56447UL,  226848UL, 414461UL, 463201UL, 85095UL,
+    357856UL, 253612UL, 348753UL, 373618UL, 417374UL, 367151UL, 318602UL, 185262UL, 279140UL, 3367UL,   73843UL,
+    377761UL, 106860UL, 243942UL, 198883UL, 211116UL, 7682UL,   243782UL, 205292UL, 158412UL, 112546UL, 501123UL,
+    181724UL, 104495UL, 334825UL, 13356UL,  387490UL, 278414UL, 203093UL, 522403UL, 420839UL, 107832UL, 321260UL,
+    143007UL, 98033UL,  339182UL, 453484UL, 31367UL,  193794UL, 333622UL, 22092UL,  281816UL, 104655UL, 22456UL,
+    294539UL, 511852UL, 12681UL,  440414UL, 48106UL,  370285UL, 414580UL, 455459UL, 322937UL, 321403UL, 88158UL,
+    202122UL, 112530UL, 346564UL, 179106UL, 300436UL, 474364UL, 224343UL, 239778UL, 95703UL,  252549UL, 400471UL,
+    177231UL, 28246UL,  521259UL, 509064UL, 337019UL, 379599UL, 243038UL, 40388UL,  521601UL, 221473UL, 249733UL,
+    292362UL, 223872UL, 443357UL, 294940UL, 135623UL, 239777UL, 225676UL, 453188UL, 323552UL, 123821UL, 50448UL,
+    354062UL, 520523UL, 43123UL,  136291UL, 441603UL, 4122UL,   97906UL,  167092UL, 283278UL, 63258UL,  407906UL,
+    194744UL, 447673UL, 459035UL, 307532UL, 121252UL, 504910UL, 320183UL, 398398UL, 162895UL, 356403UL, 319133UL,
+    83631UL,  413794UL, 472939UL, 155369UL, 500827UL, 16341UL,  481104UL, 440309UL, 409646UL, 101525UL, 207671UL,
+    223684UL, 52050UL,  323918UL, 41639UL,  357334UL, 473522UL, 224297UL, 210715UL, 213228UL, 244965UL, 475101UL,
+    243492UL, 135484UL, 281730UL, 325125UL, 246172UL, 451169UL, 437225UL, 517874UL, 164703UL, 501679UL, 34478UL,
+    56198UL,  45861UL,  85985UL,  84339UL,  255715UL, 182843UL, 141895UL, 370543UL, 260339UL, 185303UL, 431912UL,
+    367195UL, 378094UL, 485674UL, 43166UL,  440553UL, 271606UL, 415708UL, 470316UL, 363982UL, 162956UL, 241724UL,
+    333776UL, 475127UL, 367769UL, 329601UL, 168312UL, 319316UL, 268110UL, 220570UL, 200012UL, 321698UL, 523083UL,
+    397027UL, 137773UL, 245244UL, 299302UL, 6364UL,   357994UL, 73620UL,  461000UL, 423280UL, 268513UL, 69407UL,
+    338962UL, 320299UL, 108596UL, 198640UL, 208104UL, 348737UL, 253168UL, 50215UL,  323569UL, 154824UL, 473689UL,
+    263595UL, 32442UL,  67326UL,  212679UL, 136153UL, 480299UL, 271317UL, 104UL,    14011UL,  421041UL, 36890UL,
+    342958UL, 452820UL, 281262UL, 193693UL, 373230UL, 238663UL, 92592UL,  121545UL, 368108UL, 364693UL, 193947UL,
+    425018UL, 393471UL, 164598UL, 117086UL, 124478UL, 88175UL,  498503UL, 218908UL, 66054UL,  22458UL,  508971UL,
+    268838UL, 454698UL, 295035UL, 187652UL, 248763UL, 515675UL, 425979UL, 457458UL, 394374UL, 173700UL, 163969UL,
+    440314UL, 503500UL, 206972UL, 365790UL, 471130UL, 206382UL, 71695UL,  297277UL, 67737UL,  328303UL, 293159UL,
+    378365UL, 408283UL, 75346UL,  227296UL, 241597UL, 52660UL,  519387UL, 149057UL, 478447UL, 212766UL, 223433UL,
+    172426UL, 369431UL, 306148UL, 118458UL, 330219UL, 194565UL, 55427UL,  324867UL, 293470UL, 1040UL,   422766UL,
+    137912UL, 447686UL, 186776UL, 401921UL, 454715UL, 500395UL, 81192UL,  315946UL, 428066UL, 490004UL, 363659UL,
+    16709UL,  440684UL, 43744UL,  436935UL, 304655UL, 69146UL,  201290UL, 267747UL, 460481UL, 343304UL, 38461UL,
+    225228UL, 117084UL, 354043UL, 87262UL,  23156UL,  69134UL,  381033UL, 33271UL,  483238UL, 111799UL, 341288UL,
+    418554UL, 7587UL,   478503UL, 490128UL, 185546UL, 427250UL, 256625UL, 472810UL, 474026UL, 367664UL, 283661UL,
+    159670UL, 501977UL, 305930UL, 330211UL, 191049UL, 27550UL,  354766UL, 330523UL, 241796UL, 243018UL, 59179UL,
+    69743UL,  436259UL, 395677UL, 37579UL,  89633UL,  276833UL, 520931UL, 311061UL, 216084UL, 35766UL,  115300UL,
+    494278UL, 88390UL,  321971UL, 408828UL, 459075UL, 145949UL, 51542UL,  140106UL, 320130UL, 262100UL, 210703UL,
+    224941UL, 241830UL, 228165UL, 163678UL, 321942UL, 18729UL,  480164UL, 265887UL, 26815UL,  160172UL, 230333UL,
+    472113UL, 367928UL, 181769UL, 217974UL, 409493UL, 461123UL, 294692UL, 409138UL, 216241UL, 380594UL, 170367UL,
+    80318UL,  318407UL, 62516UL,  231773UL, 104948UL, 78942UL,  290721UL, 8783UL,   71744UL,  501793UL, 274622UL,
+    460206UL, 283037UL, 154442UL, 434505UL, 10486UL,  440535UL, 210911UL, 486197UL, 125213UL, 463247UL, 122299UL,
+    8889UL,
+};
+uint32_t rand_arr_20_b20_w32_arr[1024] = {
+    471063UL,  83984UL,   431047UL,  553056UL,  928648UL,  515388UL,  54235UL,   1032573UL, 1015296UL, 650360UL,
+    601989UL,  84614UL,   485398UL,  958325UL,  597197UL,  40219UL,   552111UL,  146781UL,  976308UL,  757831UL,
+    69708UL,   701461UL,  587604UL,  1025264UL, 60204UL,   350431UL,  383268UL,  697890UL,  484245UL,  165434UL,
+    442679UL,  643096UL,  189024UL,  914263UL,  462696UL,  688086UL,  561319UL,  628060UL,  608059UL,  521455UL,
+    158042UL,  554650UL,  355275UL,  206140UL,  440716UL,  897957UL,  111817UL,  353857UL,  508931UL,  666292UL,
+    799209UL,  417535UL,  881025UL,  468229UL,  930080UL,  867003UL,  876937UL,  340206UL,  32153UL,   180709UL,
+    971297UL,  449428UL,  520743UL,  786634UL,  972362UL,  597539UL,  454324UL,  43241UL,   357564UL,  1020226UL,
+    306882UL,  880007UL,  486221UL,  813879UL,  981345UL,  300974UL,  415488UL,  297983UL,  79385UL,   601088UL,
+    318249UL,  84801UL,   782703UL,  150966UL,  111729UL,  183745UL,  515746UL,  829091UL,  386146UL,  879034UL,
+    833684UL,  191476UL,  908173UL,  495753UL,  1007730UL, 326159UL,  646910UL,  185185UL,  274586UL,  450266UL,
+    720195UL,  651143UL,  248619UL,  461830UL,  554630UL,  980159UL,  883548UL,  324944UL,  1015728UL, 432808UL,
+    382995UL,  850268UL,  253940UL,  59311UL,   984905UL,  387683UL,  834779UL,  440807UL,  169252UL,  93673UL,
+    782730UL,  739564UL,  364851UL,  314302UL,  451694UL,  516075UL,  90686UL,   884193UL,  268981UL,  118847UL,
+    970368UL,  538746UL,  530281UL,  721835UL,  362088UL,  136501UL,  902969UL,  307307UL,  436513UL,  538176UL,
+    483279UL,  990629UL,  161482UL,  290898UL,  139254UL,  356917UL,  206512UL,  929401UL,  714062UL,  779022UL,
+    112971UL,  145545UL,  866907UL,  654055UL,  727431UL,  914477UL,  473749UL,  703819UL,  409426UL,  526007UL,
+    527040UL,  472560UL,  481940UL,  285871UL,  573295UL,  926254UL,  35880UL,   486886UL,  550128UL,  183957UL,
+    1028208UL, 905872UL,  315930UL,  52999UL,   228201UL,  487866UL,  248688UL,  713429UL,  917101UL,  603316UL,
+    860210UL,  371466UL,  1021657UL, 297376UL,  729057UL,  409808UL,  763232UL,  113416UL,  988299UL,  759935UL,
+    150372UL,  87959UL,   766350UL,  537262UL,  821763UL,  186198UL,  878111UL,  711311UL,  148342UL,  802632UL,
+    92706UL,   1024279UL, 969748UL,  613125UL,  1033092UL, 284286UL,  525649UL,  15265UL,   89512UL,   476699UL,
+    559081UL,  60041UL,   460820UL,  90163UL,   86784UL,   33841UL,   4805UL,    945510UL,  37591UL,   718607UL,
+    1029115UL, 25646UL,   529514UL,  480792UL,  476713UL,  710227UL,  207631UL,  607923UL,  777245UL,  463501UL,
+    419071UL,  50369UL,   751285UL,  978237UL,  198217UL,  243854UL,  579402UL,  95932UL,   313497UL,  996343UL,
+    770010UL,  230626UL,  67018UL,   682096UL,  805938UL,  172119UL,  295857UL,  407182UL,  617531UL,  306280UL,
+    1007529UL, 525225UL,  793796UL,  146740UL,  520689UL,  513049UL,  693328UL,  291203UL,  542118UL,  1028267UL,
+    888637UL,  580952UL,  231052UL,  683476UL,  667998UL,  901877UL,  107158UL,  805205UL,  487024UL,  898034UL,
+    302226UL,  61440UL,   726569UL,  665493UL,  358354UL,  630873UL,  859611UL,  762008UL,  36890UL,   749245UL,
+    327446UL,  400996UL,  457607UL,  806719UL,  452294UL,  515826UL,  20466UL,   551551UL,  842831UL,  16117UL,
+    843208UL,  439992UL,  963542UL,  731584UL,  674931UL,  1038476UL, 463569UL,  8007UL,    836061UL,  778266UL,
+    234812UL,  443871UL,  700330UL,  43814UL,   72707UL,   750190UL,  655803UL,  972021UL,  358973UL,  768831UL,
+    18745UL,   929782UL,  706084UL,  941800UL,  270828UL,  668553UL,  174397UL,  703173UL,  614982UL,  499841UL,
+    615773UL,  329429UL,  889451UL,  240587UL,  923439UL,  799175UL,  766406UL,  842368UL,  222165UL,  536099UL,
+    823874UL,  222943UL,  816122UL,  485282UL,  777765UL,  189655UL,  601746UL,  368263UL,  651203UL,  541901UL,
+    96723UL,   82154UL,   707672UL,  1003159UL, 567736UL,  415137UL,  414390UL,  1001562UL, 225447UL,  975866UL,
+    613547UL,  450722UL,  507741UL,  943501UL,  988217UL,  115239UL,  17461UL,   412249UL,  648576UL,  411654UL,
+    1008194UL, 721209UL,  1013031UL, 917929UL,  643487UL,  326442UL,  187150UL,  568066UL,  386021UL,  329976UL,
+    538828UL,  396829UL,  569429UL,  466046UL,  529241UL,  398036UL,  918289UL,  312805UL,  608454UL,  160409UL,
+    1037811UL, 386342UL,  601171UL,  569860UL,  692228UL,  422742UL,  217379UL,  868307UL,  146652UL,  798034UL,
+    581716UL,  600326UL,  201279UL,  1002663UL, 383571UL,  301391UL,  810196UL,  933127UL,  406843UL,  891122UL,
+    846703UL,  674396UL,  169670UL,  160267UL,  389499UL,  936381UL,  109585UL,  990150UL,  530134UL,  349928UL,
+    992161UL,  798758UL,  571824UL,  238994UL,  722508UL,  927278UL,  370382UL,  103983UL,  137011UL,  163586UL,
+    731861UL,  396903UL,  1008562UL, 748462UL,  453862UL,  647867UL,  454673UL,  200742UL,  411120UL,  466587UL,
+    714420UL,  891120UL,  702834UL,  725651UL,  109569UL,  33271UL,   599117UL,  1020440UL, 501623UL,  788589UL,
+    471559UL,  376616UL,  147311UL,  823760UL,  1046920UL, 638682UL,  819005UL,  51684UL,   833983UL,  779424UL,
+    77186UL,   313968UL,  162979UL,  763492UL,  748523UL,  265963UL,  442513UL,  105119UL,  645796UL,  195469UL,
+    805955UL,  461037UL,  424865UL,  435249UL,  695750UL,  605145UL,  813258UL,  919541UL,  991069UL,  580400UL,
+    868061UL,  187793UL,  184833UL,  890601UL,  932290UL,  448784UL,  764181UL,  439724UL,  300101UL,  592992UL,
+    634077UL,  633318UL,  696798UL,  346613UL,  593901UL,  1031175UL, 1043351UL, 478906UL,  231390UL,  100484UL,
+    426275UL,  793230UL,  306978UL,  1042864UL, 891942UL,  757875UL,  540891UL,  180334UL,  611117UL,  853142UL,
+    978229UL,  357036UL,  264256UL,  12151UL,   927878UL,  78670UL,   343845UL,  347431UL,  455414UL,  390708UL,
+    822892UL,  532977UL,  1001829UL, 15025UL,   2054UL,    791933UL,  406663UL,  21352UL,   749972UL,  191039UL,
+    785991UL,  145875UL,  436985UL,  818687UL,  461983UL,  631996UL,  86441UL,   794294UL,  709640UL,  578191UL,
+    792715UL,  405508UL,  859484UL,  515913UL,  12551UL,   491722UL,  638465UL,  202723UL,  76700UL,   944515UL,
+    200916UL,  1009396UL, 686629UL,  992982UL,  181123UL,  997721UL,  232551UL,  364668UL,  842070UL,  736962UL,
+    177917UL,  359207UL,  967096UL,  645643UL,  676017UL,  254303UL,  195034UL,  939802UL,  322779UL,  525376UL,
+    897661UL,  196477UL,  769320UL,  169242UL,  469792UL,  264454UL,  393725UL,  430550UL,  241683UL,  260555UL,
+    278961UL,  309143UL,  441488UL,  606660UL,  561738UL,  675550UL,  399808UL,  6091UL,    647653UL,  358033UL,
+    150590UL,  43199UL,   623330UL,  695754UL,  864919UL,  476634UL,  964697UL,  873774UL,  67438UL,   706251UL,
+    317301UL,  339786UL,  636425UL,  524045UL,  302445UL,  155679UL,  160371UL,  488247UL,  716684UL,  917960UL,
+    480314UL,  880063UL,  724642UL,  303695UL,  500534UL,  935103UL,  638124UL,  707750UL,  925105UL,  849126UL,
+    770836UL,  15729UL,   394114UL,  265338UL,  808710UL,  1010379UL, 90232UL,   650366UL,  778937UL,  195529UL,
+    508036UL,  814331UL,  425674UL,  780359UL,  280698UL,  768927UL,  29016UL,   541893UL,  419516UL,  102254UL,
+    916262UL,  900974UL,  967187UL,  379713UL,  633011UL,  483726UL,  719736UL,  294354UL,  421806UL,  713730UL,
+    678333UL,  165481UL,  615394UL,  464342UL,  220898UL,  853316UL,  1023479UL, 922626UL,  52642UL,   900762UL,
+    306624UL,  581004UL,  637477UL,  526179UL,  1003666UL, 274542UL,  899899UL,  311688UL,  248343UL,  904357UL,
+    977164UL,  58684UL,   673315UL,  885857UL,  40593UL,   326712UL,  475318UL,  585491UL,  197521UL,  731602UL,
+    790902UL,  11337UL,   585920UL,  1044440UL, 337796UL,  332729UL,  132854UL,  119887UL,  579720UL,  701753UL,
+    899384UL,  1027005UL, 410287UL,  81955UL,   458524UL,  673710UL,  739980UL,  993013UL,  744701UL,  535744UL,
+    389691UL,  629723UL,  960360UL,  972657UL,  481338UL,  6795UL,    763592UL,  373328UL,  122807UL,  983373UL,
+    456853UL,  150550UL,  652700UL,  719351UL,  71953UL,   142748UL,  539858UL,  455913UL,  420208UL,  641975UL,
+    520694UL,  498068UL,  154358UL,  1033595UL, 968177UL,  356481UL,  318748UL,  29088UL,   194672UL,  76285UL,
+    378254UL,  974336UL,  223672UL,  767652UL,  465733UL,  148262UL,  1018604UL, 195612UL,  933242UL,  902696UL,
+    833585UL,  852647UL,  26108UL,   35424UL,   639119UL,  804824UL,  176975UL,  366733UL,  949747UL,  413573UL,
+    502001UL,  415811UL,  209639UL,  387719UL,  1043161UL, 165850UL,  895850UL,  73418UL,   767369UL,  182719UL,
+    709222UL,  779723UL,  1048238UL, 391271UL,  232028UL,  657557UL,  84139UL,   542265UL,  16585UL,   831340UL,
+    717120UL,  446423UL,  220426UL,  345506UL,  598944UL,  700845UL,  476831UL,  991309UL,  60901UL,   982660UL,
+    187566UL,  460357UL,  876832UL,  515177UL,  430046UL,  605893UL,  463738UL,  908793UL,  243080UL,  527667UL,
+    882669UL,  966999UL,  896930UL,  254258UL,  535400UL,  33813UL,   728524UL,  123560UL,  64401UL,   470324UL,
+    267875UL,  619455UL,  175416UL,  314799UL,  917311UL,  613102UL,  180849UL,  663307UL,  356197UL,  683473UL,
+    35271UL,   655027UL,  996604UL,  721806UL,  677310UL,  379071UL,  808927UL,  947300UL,  201614UL,  361162UL,
+    778749UL,  579972UL,  1021558UL, 370349UL,  360764UL,  735517UL,  756471UL,  424236UL,  827795UL,  299267UL,
+    637524UL,  282126UL,  92281UL,   307326UL,  727411UL,  557566UL,  690745UL,  85522UL,   540246UL,  49687UL,
+    757802UL,  443953UL,  760910UL,  527985UL,  922072UL,  162255UL,  364567UL,  223830UL,  379379UL,  342614UL,
+    655561UL,  294092UL,  940325UL,  550577UL,  736103UL,  177101UL,  101244UL,  687015UL,  369968UL,  798201UL,
+    186468UL,  182161UL,  124186UL,  641025UL,  802588UL,  403031UL,  798030UL,  305081UL,  89809UL,   412774UL,
+    813215UL,  95378UL,   264591UL,  305304UL,  310584UL,  967506UL,  655779UL,  671739UL,  105341UL,  649791UL,
+    509199UL,  542715UL,  496953UL,  283819UL,  48080UL,   1039897UL, 1036476UL, 329950UL,  504754UL,  548260UL,
+    560934UL,  251352UL,  770809UL,  1034444UL, 672794UL,  418216UL,  758528UL,  297276UL,  589697UL,  396801UL,
+    750121UL,  505625UL,  172986UL,  259858UL,  395205UL,  446396UL,  402689UL,  266771UL,  198456UL,  866129UL,
+    482134UL,  79377UL,   260024UL,  636295UL,  1007269UL, 357916UL,  648399UL,  742288UL,  64247UL,   962054UL,
+    725537UL,  807513UL,  53145UL,   128139UL,  994373UL,  759264UL,  125042UL,  375121UL,  856570UL,  300891UL,
+    1023569UL, 260496UL,  824574UL,  944605UL,  821392UL,  687004UL,  237589UL,  447750UL,  374495UL,  654720UL,
+    314844UL,  874339UL,  637204UL,  321940UL,  483758UL,  205651UL,  752306UL,  393391UL,  1021232UL, 965154UL,
+    443825UL,  982392UL,  528061UL,  580082UL,  1044227UL, 967144UL,  498046UL,  589781UL,  554466UL,  769650UL,
+    375381UL,  726343UL,  769399UL,  406875UL,  691107UL,  746025UL,  20631UL,   852009UL,  38319UL,   1039819UL,
+    772219UL,  1027057UL, 533437UL,  41224UL,   527330UL,  1007133UL, 432713UL,  486486UL,  143172UL,  777714UL,
+    356253UL,  81868UL,   330196UL,  513537UL,  935577UL,  879348UL,  119681UL,  594074UL,  325869UL,  73776UL,
+    729809UL,  219918UL,  296525UL,  167161UL,  998513UL,  685496UL,  833780UL,  698263UL,  168862UL,  305808UL,
+    710550UL,  93573UL,   966797UL,  333079UL,  32680UL,   434890UL,  25379UL,   606263UL,  575823UL,  343060UL,
+    2611UL,    678157UL,  811114UL,  968278UL,  68710UL,   537124UL,  733319UL,  526246UL,  701394UL,  152829UL,
+    647891UL,  666136UL,  450764UL,  452788UL,  381354UL,  360270UL,  963136UL,  219547UL,  825015UL,  337280UL,
+    248950UL,  711840UL,  736983UL,  194011UL,
+};
+uint32_t rand_arr_21_b21_w32_arr[1024] = {
+    266667UL,  1883346UL, 1148929UL, 2005173UL, 858444UL,  1438619UL, 1727549UL, 1903388UL, 1226488UL, 502536UL,
+    1562257UL, 1715271UL, 1314847UL, 931937UL,  1395712UL, 1238082UL, 252454UL,  1444997UL, 1221842UL, 1074690UL,
+    104104UL,  938615UL,  195020UL,  773635UL,  603889UL,  24282UL,   211048UL,  1849128UL, 1104617UL, 1395750UL,
+    1548897UL, 507091UL,  1888724UL, 331974UL,  1229391UL, 2063249UL, 199090UL,  1691841UL, 2069230UL, 1652544UL,
+    2059771UL, 605130UL,  1533597UL, 31705UL,   1444524UL, 1832884UL, 1699698UL, 796364UL,  838888UL,  121183UL,
+    1610184UL, 1613427UL, 481481UL,  1965681UL, 2047680UL, 981745UL,  282358UL,  248686UL,  205466UL,  977289UL,
+    1404014UL, 1757964UL, 1799502UL, 1668844UL, 1110651UL, 1965364UL, 996149UL,  2045434UL, 113579UL,  1098984UL,
+    984867UL,  1259375UL, 153791UL,  886530UL,  1893460UL, 927847UL,  1428859UL, 2044562UL, 768456UL,  364785UL,
+    279409UL,  874605UL,  466697UL,  1175667UL, 1157159UL, 186679UL,  1301841UL, 1829251UL, 1678312UL, 2003464UL,
+    1629710UL, 960998UL,  901274UL,  838751UL,  1818793UL, 1122404UL, 838044UL,  1803018UL, 1236814UL, 1704931UL,
+    197024UL,  2061000UL, 1633219UL, 1176393UL, 590872UL,  1214709UL, 1584721UL, 1402307UL, 848511UL,  32377UL,
+    369863UL,  196019UL,  1469453UL, 925957UL,  698727UL,  508247UL,  1432642UL, 458319UL,  863918UL,  2009771UL,
+    1417000UL, 1154624UL, 1639590UL, 32600UL,   1802321UL, 1350382UL, 1471076UL, 1443265UL, 1117306UL, 1650853UL,
+    1572802UL, 778393UL,  1094192UL, 118362UL,  1868300UL, 1302266UL, 2078886UL, 129274UL,  1339999UL, 1184751UL,
+    1812507UL, 1755855UL, 35814UL,   1079579UL, 453954UL,  1723103UL, 574531UL,  510873UL,  818705UL,  1012773UL,
+    229649UL,  872205UL,  704775UL,  1746281UL, 656954UL,  1236140UL, 542558UL,  996373UL,  628011UL,  765596UL,
+    1311233UL, 195266UL,  467372UL,  2087722UL, 2013864UL, 29098UL,   1760941UL, 2008089UL, 1513765UL, 542660UL,
+    1177453UL, 414177UL,  401777UL,  1906856UL, 1597091UL, 908666UL,  1806562UL, 931569UL,  862424UL,  1841078UL,
+    994581UL,  803287UL,  1121519UL, 1655588UL, 1778066UL, 21994UL,   431186UL,  2089423UL, 787003UL,  145973UL,
+    826588UL,  223627UL,  512440UL,  1606785UL, 902506UL,  1727728UL, 485501UL,  137020UL,  474854UL,  635585UL,
+    1074028UL, 790693UL,  443795UL,  1473952UL, 205248UL,  40108UL,   1284262UL, 1576483UL, 1757158UL, 275117UL,
+    83931UL,   1850585UL, 1936223UL, 1928016UL, 427939UL,  984087UL,  1425156UL, 1533520UL, 979948UL,  428143UL,
+    1484290UL, 1246557UL, 1101573UL, 1459226UL, 737777UL,  1314135UL, 1124058UL, 1176385UL, 250917UL,  737177UL,
+    57868UL,   1859571UL, 980212UL,  732329UL,  1618276UL, 4100UL,    1178745UL, 947424UL,  1507132UL, 1789919UL,
+    283583UL,  199775UL,  604477UL,  9459UL,    873466UL,  752541UL,  1280504UL, 1628856UL, 1535589UL, 919167UL,
+    246493UL,  1459119UL, 941664UL,  476539UL,  1560157UL, 454018UL,  280577UL,  1548338UL, 1223127UL, 17066UL,
+    1510907UL, 1785451UL, 1559225UL, 705906UL,  808667UL,  39080UL,   967755UL,  2029019UL, 270368UL,  1458371UL,
+    557958UL,  1671253UL, 174770UL,  1614399UL, 1075989UL, 416539UL,  2046062UL, 324446UL,  132157UL,  969419UL,
+    1463059UL, 1977063UL, 256415UL,  740318UL,  1410778UL, 261056UL,  338822UL,  526393UL,  724154UL,  26985UL,
+    1627201UL, 441318UL,  261351UL,  271059UL,  321142UL,  380857UL,  686576UL,  1853299UL, 2014149UL, 1711825UL,
+    1699742UL, 121922UL,  619116UL,  1200226UL, 1531928UL, 2028962UL, 12862UL,   759908UL,  1493691UL, 366966UL,
+    1541701UL, 656599UL,  842398UL,  1946355UL, 1883554UL, 832710UL,  1708832UL, 1217689UL, 1985836UL, 1124791UL,
+    842195UL,  1249425UL, 1349560UL, 1129146UL, 1097162UL, 729161UL,  1642525UL, 267018UL,  971421UL,  1668168UL,
+    27685UL,   1543194UL, 549123UL,  1148442UL, 733293UL,  349158UL,  524100UL,  447213UL,  555801UL,  1741181UL,
+    1120480UL, 1338542UL, 217093UL,  405842UL,  1008358UL, 1736758UL, 1946599UL, 1691865UL, 1930626UL, 1391576UL,
+    1075747UL, 1114736UL, 280913UL,  1617476UL, 987329UL,  759698UL,  1936656UL, 65687UL,   824914UL,  1074228UL,
+    1242772UL, 1710031UL, 1960319UL, 1269182UL, 1525391UL, 2090008UL, 448225UL,  685181UL,  1821713UL, 494308UL,
+    285564UL,  567155UL,  1016530UL, 1710819UL, 2001322UL, 1180985UL, 219580UL,  1542119UL, 397721UL,  1609841UL,
+    1743108UL, 1991405UL, 1846548UL, 458172UL,  932802UL,  559330UL,  1754437UL, 799699UL,  1504324UL, 607307UL,
+    1411495UL, 395895UL,  598895UL,  703643UL,  1249450UL, 201517UL,  1279641UL, 387371UL,  1920708UL, 1256798UL,
+    188375UL,  393867UL,  1253551UL, 672716UL,  859280UL,  1378299UL, 1471452UL, 175779UL,  1204446UL, 214709UL,
+    1958268UL, 1539639UL, 345954UL,  740521UL,  1801660UL, 1444510UL, 85351UL,   695048UL,  230931UL,  79574UL,
+    1829222UL, 1472464UL, 959196UL,  1039260UL, 1221732UL, 693191UL,  727030UL,  482038UL,  1657518UL, 1597463UL,
+    1706135UL, 1272638UL, 841862UL,  1159330UL, 1123610UL, 1997009UL, 129905UL,  1203078UL, 1581359UL, 1955078UL,
+    1444581UL, 1740644UL, 1907522UL, 1621867UL, 1298965UL, 1071501UL, 1938014UL, 261750UL,  1947138UL, 577155UL,
+    804264UL,  774683UL,  1886003UL, 345046UL,  634315UL,  1104423UL, 671978UL,  507157UL,  285672UL,  296019UL,
+    1492995UL, 767266UL,  1817301UL, 1550548UL, 1957791UL, 839482UL,  148931UL,  1794524UL, 1164305UL, 1995765UL,
+    58746UL,   1355848UL, 1458853UL, 272978UL,  667834UL,  1007428UL, 522493UL,  1698017UL, 784531UL,  254281UL,
+    1739517UL, 169097UL,  1897270UL, 2079144UL, 1902820UL, 141632UL,  1704456UL, 318141UL,  309066UL,  1189945UL,
+    1222806UL, 245022UL,  864210UL,  782972UL,  1090724UL, 1734486UL, 1527469UL, 872694UL,  1339155UL, 1376498UL,
+    276963UL,  1089615UL, 269352UL,  46348UL,   693111UL,  120224UL,  2034383UL, 1050267UL, 146868UL,  2019216UL,
+    2064758UL, 1453056UL, 1502656UL, 805484UL,  2039332UL, 949570UL,  616322UL,  1846954UL, 1213786UL, 1286588UL,
+    1274703UL, 1915735UL, 1797218UL, 914915UL,  1861402UL, 465714UL,  1367395UL, 1972771UL, 658373UL,  145085UL,
+    1005600UL, 1125527UL, 1522170UL, 497186UL,  516936UL,  1034184UL, 1151755UL, 1071268UL, 1016437UL, 1328960UL,
+    1699322UL, 1463433UL, 892528UL,  321397UL,  624733UL,  1055524UL, 747316UL,  464823UL,  612996UL,  2000580UL,
+    1707736UL, 519975UL,  19016UL,   973827UL,  1798233UL, 159431UL,  1646427UL, 1942529UL, 1550828UL, 93897UL,
+    1774658UL, 939722UL,  286317UL,  1630009UL, 1517372UL, 1132637UL, 1969275UL, 1244706UL, 273116UL,  802043UL,
+    1383147UL, 1309478UL, 480849UL,  1954674UL, 933560UL,  99402UL,   222031UL,  1951950UL, 2058328UL, 1466691UL,
+    1231676UL, 1268042UL, 1693306UL, 1544889UL, 802646UL,  400396UL,  2053532UL, 1758394UL, 1946264UL, 1934875UL,
+    130489UL,  1466040UL, 235788UL,  1029101UL, 324535UL,  1380922UL, 383442UL,  623725UL,  599131UL,  271219UL,
+    1321284UL, 312925UL,  1188868UL, 346327UL,  1742399UL, 1595553UL, 1915210UL, 986558UL,  1757484UL, 1463380UL,
+    1888454UL, 1314705UL, 88506UL,   502447UL,  927120UL,  544946UL,  368783UL,  492346UL,  1145218UL, 1361732UL,
+    18732UL,   1201226UL, 342156UL,  167371UL,  1051301UL, 1202009UL, 722703UL,  311474UL,  513477UL,  1882532UL,
+    1160697UL, 2087019UL, 700849UL,  235544UL,  1511602UL, 1590224UL, 163037UL,  316397UL,  1685663UL, 1574104UL,
+    1782974UL, 1258101UL, 843193UL,  1053793UL, 531151UL,  1285793UL, 1845605UL, 2013581UL, 324681UL,  665446UL,
+    1460704UL, 1141639UL, 1915193UL, 1379006UL, 1177236UL, 1921283UL, 648001UL,  1868840UL, 225501UL,  792408UL,
+    1525510UL, 1279093UL, 101281UL,  1554830UL, 967605UL,  1480778UL, 1070883UL, 171019UL,  1279075UL, 49711UL,
+    921807UL,  1355554UL, 1502953UL, 1968528UL, 698759UL,  858928UL,  500511UL,  216618UL,  665018UL,  1497480UL,
+    1817450UL, 1074164UL, 1487761UL, 1247258UL, 639588UL,  261101UL,  633094UL,  1639538UL, 1670914UL, 1980217UL,
+    1295983UL, 694353UL,  425964UL,  59129UL,   1514192UL, 1442341UL, 1659698UL, 1553765UL, 1233502UL, 1677621UL,
+    334198UL,  745862UL,  1535803UL, 1951203UL, 1647756UL, 184977UL,  827964UL,  1571460UL, 916698UL,  1007022UL,
+    983046UL,  878669UL,  1141772UL, 368940UL,  1192715UL, 445119UL,  70174UL,   257450UL,  1283082UL, 1917043UL,
+    1181827UL, 1018751UL, 768059UL,  503419UL,  112983UL,  580161UL,  1193420UL, 1438144UL, 1643989UL, 1637190UL,
+    2065044UL, 888093UL,  158858UL,  1033387UL, 28676UL,   430666UL,  1637906UL, 418088UL,  2044971UL, 1152651UL,
+    2082024UL, 1090065UL, 1205302UL, 177923UL,  1497772UL, 1587157UL, 648754UL,  1919106UL, 473265UL,  1000217UL,
+    1605467UL, 1640540UL, 1942953UL, 389702UL,  502693UL,  524020UL,  524567UL,  462922UL,  1183547UL, 76236UL,
+    1458172UL, 303590UL,  1377032UL, 1842576UL, 1480278UL, 1919313UL, 1180913UL, 546763UL,  185334UL,  2070125UL,
+    937629UL,  1821097UL, 1904834UL, 502000UL,  523295UL,  348384UL,  850414UL,  1090413UL, 340257UL,  472807UL,
+    22366UL,   1609591UL, 516186UL,  1002275UL, 1196783UL, 940539UL,  888905UL,  1120011UL, 1601735UL, 393991UL,
+    38860UL,   1711350UL, 442042UL,  728657UL,  126967UL,  1400628UL, 159256UL,  621021UL,  192244UL,  1119987UL,
+    409936UL,  890302UL,  538890UL,  1955654UL, 1428728UL, 777408UL,  660513UL,  477989UL,  1086793UL, 1397707UL,
+    1643644UL, 823618UL,  1269165UL, 1101200UL, 266823UL,  1252232UL, 941637UL,  210708UL,  1607396UL, 268349UL,
+    363268UL,  870480UL,  414992UL,  1249214UL, 565686UL,  712519UL,  652361UL,  437538UL,  1375612UL, 706293UL,
+    2068233UL, 488121UL,  1232470UL, 633711UL,  1750532UL, 876023UL,  1457461UL, 1892277UL, 1698874UL, 340774UL,
+    2021296UL, 1470968UL, 1657223UL, 1394744UL, 51246UL,   1556459UL, 1994125UL, 1321794UL, 492745UL,  822581UL,
+    162650UL,  1065842UL, 2066007UL, 1952082UL, 398662UL,  899537UL,  127075UL,  2482UL,    1580224UL, 841156UL,
+    1620131UL, 819657UL,  315277UL,  292361UL,  1992581UL, 548633UL,  92385UL,   475552UL,  561187UL,  1093068UL,
+    1284275UL, 1238922UL, 1526431UL, 317769UL,  1319821UL, 1219562UL, 1011462UL, 626579UL,  435422UL,  1842846UL,
+    242765UL,  642636UL,  927365UL,  970084UL,  909776UL,  320888UL,  1462681UL, 1346357UL, 1061234UL, 2039296UL,
+    607538UL,  1758317UL, 1980542UL, 971621UL,  163770UL,  1117032UL, 769887UL,  503890UL,  40944UL,   1113451UL,
+    1108435UL, 1061646UL, 1960830UL, 1008182UL, 2050427UL, 1437389UL, 1979420UL, 621842UL,  584085UL,  2063076UL,
+    925622UL,  385341UL,  1010118UL, 1852592UL, 473275UL,  1092069UL, 949844UL,  698006UL,  961141UL,  1414460UL,
+    1072204UL, 1444168UL, 147710UL,  1881260UL, 929092UL,  1555657UL, 1772966UL, 157718UL,  415120UL,  504834UL,
+    870957UL,  1005870UL, 256746UL,  295368UL,  1833852UL, 531060UL,  576545UL,  572053UL,  79934UL,   60464UL,
+    1657512UL, 1448540UL, 982661UL,  2044289UL, 680612UL,  1569339UL, 502104UL,  58343UL,   919478UL,  140718UL,
+    1113665UL, 46771UL,   447207UL,  1064553UL, 567767UL,  2072165UL, 1793865UL, 1411661UL, 1932076UL, 1094169UL,
+    1052421UL, 605749UL,  710630UL,  825759UL,  150752UL,  1813425UL, 735554UL,  1427412UL, 1255936UL, 790837UL,
+    1471451UL, 1825757UL, 1946098UL, 617111UL,  1519928UL, 1407821UL, 389730UL,  1169092UL, 1444535UL, 1016397UL,
+    186721UL,  1326822UL, 1745376UL, 1283065UL, 1790746UL, 155386UL,  1199761UL, 45975UL,   684975UL,  889949UL,
+    1400642UL, 749976UL,  1427764UL, 1035892UL, 939927UL,  256370UL,  398214UL,  2068945UL, 2055822UL, 356414UL,
+    62377UL,   1517185UL, 1040355UL, 1310700UL, 1378360UL, 292406UL,  291630UL,  764544UL,  1977431UL, 232936UL,
+    505620UL,  2049536UL, 297745UL,  2052921UL, 812345UL,  779340UL,  146312UL,  769178UL,  1381805UL, 279621UL,
+    490143UL,  668926UL,  328015UL,  1718580UL,
+};
+uint32_t rand_arr_22_b22_w32_arr[1024] = {
+    3964267UL, 1671129UL, 467930UL,  2333886UL, 161849UL,  1892944UL, 1198323UL, 554443UL,  2447389UL, 643871UL,
+    3866489UL, 2331197UL, 4158639UL, 3771400UL, 2878674UL, 2379709UL, 1974405UL, 1234153UL, 1894872UL, 309410UL,
+    1625804UL, 3216782UL, 3298503UL, 534659UL,  3824990UL, 3339637UL, 3959897UL, 3853185UL, 45803UL,   1882499UL,
+    138124UL,  495709UL,  1359624UL, 933447UL,  1737491UL, 2965223UL, 2882279UL, 293916UL,  3588740UL, 2871235UL,
+    1883894UL, 733433UL,  964086UL,  1223585UL, 3101974UL, 1255879UL, 3111910UL, 2147172UL, 2910851UL, 3920940UL,
+    3846123UL, 2029148UL, 1861425UL, 1662736UL, 3249666UL, 2578927UL, 3477510UL, 2672044UL, 2151659UL, 1709109UL,
+    797698UL,  2267436UL, 2958035UL, 1295393UL, 2408487UL, 3142939UL, 21348UL,   59174UL,   153149UL,  877317UL,
+    2992999UL, 3227715UL, 3445519UL, 4052185UL, 1675829UL, 1709764UL, 1782753UL, 1537605UL, 2809327UL, 2407290UL,
+    2806334UL, 1312490UL, 2419134UL, 1825471UL, 2824586UL, 153606UL,  916183UL,  1881740UL, 450480UL,  2321744UL,
+    3836885UL, 150593UL,  3924335UL, 149291UL,  1218106UL, 726265UL,  2628333UL, 3122905UL, 1628249UL, 676072UL,
+    2538300UL, 2211328UL, 2421111UL, 228421UL,  1585635UL, 896544UL,  3775282UL, 1257554UL, 2020304UL, 1227874UL,
+    1283808UL, 2121507UL, 824184UL,  1173897UL, 1711849UL, 2834194UL, 2783320UL, 1728901UL, 2745700UL, 3618353UL,
+    2681043UL, 128808UL,  1089337UL, 2879460UL, 1264254UL, 41173UL,   3870664UL, 2003825UL, 904363UL,  3588603UL,
+    2911844UL, 650334UL,  4184911UL, 1023963UL, 1469937UL, 3110052UL, 1019517UL, 1678465UL, 3382319UL, 151203UL,
+    3700156UL, 2103185UL, 2594399UL, 1911238UL, 1692468UL, 4178274UL, 1976255UL, 2640014UL, 2958982UL, 198029UL,
+    2940243UL, 3671246UL, 3510604UL, 2471237UL, 704926UL,  2899428UL, 1671389UL, 2295752UL, 3353745UL, 1653552UL,
+    3478327UL, 940151UL,  2600670UL, 823565UL,  2085539UL, 614677UL,  2896054UL, 1697380UL, 2341740UL, 1868658UL,
+    2828934UL, 2990081UL, 2207543UL, 2415754UL, 1174551UL, 1276730UL, 327901UL,  3933462UL, 2276750UL, 2641175UL,
+    3448527UL, 435290UL,  2990214UL, 4011868UL, 1813513UL, 1429133UL, 1601028UL, 1434770UL, 1029291UL, 89336UL,
+    2246646UL, 2030863UL, 2800904UL, 2375049UL, 3339209UL, 248456UL,  2902458UL, 983111UL,  3306289UL, 798668UL,
+    1769968UL, 3433126UL, 1946958UL, 2501627UL, 1709693UL, 743667UL,  1373886UL, 3323825UL, 2183146UL, 1825029UL,
+    2238079UL, 1642836UL, 3637713UL, 3564691UL, 2287198UL, 2583028UL, 4002923UL, 3391205UL, 2800205UL, 3886530UL,
+    68154UL,   1072904UL, 899717UL,  2402049UL, 2774576UL, 2970949UL, 2262522UL, 3787759UL, 1670656UL, 87877UL,
+    3835510UL, 4077216UL, 2871780UL, 3095930UL, 1750944UL, 307362UL,  1928432UL, 2726449UL, 1651694UL, 361499UL,
+    3086837UL, 3997850UL, 1662435UL, 1340705UL, 4115414UL, 1804108UL, 1562998UL, 394967UL,  1819623UL, 2129571UL,
+    3187255UL, 2811257UL, 1276678UL, 4160103UL, 3324521UL, 1971912UL, 1668915UL, 1824974UL, 3929985UL, 1676607UL,
+    804540UL,  3475756UL, 2371667UL, 3193681UL, 3498292UL, 1825399UL, 1937157UL, 3147523UL, 676018UL,  2523952UL,
+    1471659UL, 3240520UL, 3666824UL, 1256004UL, 1018072UL, 3755737UL, 4096009UL, 2097127UL, 2216418UL, 1757097UL,
+    1439323UL, 2286998UL, 737443UL,  3910434UL, 3461782UL, 3063887UL, 4065719UL, 3447879UL, 1252068UL, 110213UL,
+    2034719UL, 2165515UL, 2443668UL, 1716832UL, 1168878UL, 2030817UL, 952015UL,  3022207UL, 3097330UL, 449311UL,
+    2168049UL, 3390749UL, 4184535UL, 502724UL,  625315UL,  2457178UL, 1424026UL, 4002358UL, 4064904UL, 1693335UL,
+    3543222UL, 3792736UL, 3871314UL, 2880182UL, 4128310UL, 3281572UL, 2740896UL, 3541705UL, 572981UL,  4047044UL,
+    2379434UL, 392151UL,  2947624UL, 1076106UL, 1625667UL, 274473UL,  1001440UL, 2247031UL, 989677UL,  542089UL,
+    2306062UL, 1599116UL, 2922046UL, 2626460UL, 1603944UL, 2837309UL, 777619UL,  421595UL,  1184040UL, 380917UL,
+    4109542UL, 1387210UL, 668988UL,  3073335UL, 2265722UL, 3197459UL, 122240UL,  2620883UL, 3529664UL, 3052777UL,
+    2858215UL, 2378023UL, 1093210UL, 94270UL,   4068879UL, 1264433UL, 2417277UL, 1763027UL, 1166690UL, 1726840UL,
+    2775482UL, 3479445UL, 385784UL,  2340025UL, 2935150UL, 2987946UL, 3651772UL, 3524121UL, 39950UL,   3811878UL,
+    1168898UL, 251287UL,  3846911UL, 350651UL,  2684069UL, 1163849UL, 3409702UL, 1385517UL, 727586UL,  1265905UL,
+    672186UL,  2349395UL, 334376UL,  2816752UL, 2887570UL, 1268096UL, 2966869UL, 773684UL,  3148393UL, 3665338UL,
+    602594UL,  3992019UL, 107627UL,  1094809UL, 4165745UL, 3253395UL, 2713565UL, 1491342UL, 2449536UL, 723700UL,
+    3234284UL, 3767478UL, 2659003UL, 1976564UL, 1936937UL, 3083913UL, 1323662UL, 3729693UL, 3252338UL, 2355793UL,
+    2822788UL, 333279UL,  69458UL,   208829UL,  1420628UL, 2217640UL, 1370601UL, 1983475UL, 1820630UL, 900999UL,
+    2228131UL, 3014891UL, 3766222UL, 775102UL,  666768UL,  433670UL,  4178720UL, 3035399UL, 2984605UL, 1483276UL,
+    3517933UL, 4001733UL, 1197404UL, 2003198UL, 2807630UL, 1246772UL, 3758560UL, 2597622UL, 888400UL,  3418897UL,
+    1139946UL, 3528612UL, 4174874UL, 2084676UL, 2151401UL, 1281670UL, 2362898UL, 1691979UL, 2420100UL, 873018UL,
+    1791337UL, 299904UL,  2227861UL, 93819UL,   3540594UL, 2564336UL, 235705UL,  1470452UL, 2627576UL, 755828UL,
+    1854441UL, 3846696UL, 3036085UL, 1917029UL, 582169UL,  441053UL,  3060162UL, 2653344UL, 4185948UL, 3008112UL,
+    1261120UL, 452791UL,  171505UL,  1889346UL, 2858946UL, 148684UL,  2328844UL, 3675288UL, 2920094UL, 554128UL,
+    2661430UL, 861519UL,  3097561UL, 2380174UL, 765907UL,  4190557UL, 822081UL,  2817321UL, 2004710UL, 4150667UL,
+    4099314UL, 3820902UL, 4178145UL, 1143522UL, 2385191UL, 3077020UL, 3908944UL, 2764260UL, 2499235UL, 1579823UL,
+    341156UL,  2993302UL, 2298510UL, 3565887UL, 263133UL,  1039760UL, 4068928UL, 1406195UL, 2553826UL, 3516277UL,
+    1023316UL, 467040UL,  2550175UL, 3132333UL, 1584599UL, 474764UL,  2558223UL, 1045468UL, 1499058UL, 387038UL,
+    1320531UL, 1247999UL, 2655128UL, 2145263UL, 1145610UL, 41465UL,   2367397UL, 673459UL,  2758534UL, 1096211UL,
+    206203UL,  3662006UL, 1953348UL, 3435255UL, 3394787UL, 1381158UL, 81024UL,   2404548UL, 3762008UL, 3398577UL,
+    3478841UL, 2645992UL, 2536054UL, 2442158UL, 1741894UL, 4024807UL, 2641129UL, 4071124UL, 651906UL,  312857UL,
+    2336784UL, 4091050UL, 365162UL,  4169132UL, 506114UL,  2259754UL, 1551513UL, 109843UL,  1601271UL, 3510763UL,
+    2358378UL, 313033UL,  3643819UL, 1330373UL, 3929355UL, 3818604UL, 2701302UL, 2165240UL, 1453036UL, 2896111UL,
+    2920829UL, 2852418UL, 4073796UL, 2550842UL, 1629537UL, 802460UL,  1956296UL, 2462543UL, 3356024UL, 1252538UL,
+    1174874UL, 2696273UL, 1466949UL, 822978UL,  1939554UL, 2525245UL, 713767UL,  2582686UL, 274128UL,  1748611UL,
+    1616006UL, 3686552UL, 2822817UL, 2512599UL, 1923453UL, 1764347UL, 3846408UL, 1039695UL, 3294592UL, 3728242UL,
+    2395806UL, 239276UL,  2657779UL, 3356785UL, 2524830UL, 1626411UL, 2426574UL, 1796633UL, 2143475UL, 1866659UL,
+    917965UL,  284880UL,  2200740UL, 102349UL,  3278454UL, 1032542UL, 3443576UL, 325657UL,  2475397UL, 1095504UL,
+    2137233UL, 2098305UL, 2237207UL, 2703066UL, 2538256UL, 3459555UL, 2196187UL, 413278UL,  2788864UL, 1129646UL,
+    3380718UL, 3099086UL, 349948UL,  2988282UL, 3469342UL, 3098309UL, 2483084UL, 4006724UL, 2782902UL, 1188815UL,
+    1446712UL, 2506880UL, 4032642UL, 3783498UL, 3811498UL, 87092UL,   1884611UL, 3140271UL, 3392501UL, 2687100UL,
+    381361UL,  44447UL,   529511UL,  3353528UL, 3205851UL, 3178772UL, 2257518UL, 2468467UL, 441505UL,  1941105UL,
+    2204897UL, 61102UL,   262294UL,  630563UL,  904275UL,  225009UL,  2159676UL, 400429UL,  2200946UL, 3465048UL,
+    2565609UL, 1713695UL, 2276618UL, 1694021UL, 3964860UL, 4091192UL, 1108021UL, 2184673UL, 3354452UL, 2835965UL,
+    4137593UL, 2741458UL, 730977UL,  3946255UL, 2250442UL, 3084409UL, 2106153UL, 87176UL,   1692065UL, 955886UL,
+    2729329UL, 2056419UL, 1447894UL, 2784821UL, 1684414UL, 4011942UL, 3541300UL, 1596279UL, 2148426UL, 657195UL,
+    1568875UL, 3284589UL, 3243024UL, 2316791UL, 2245581UL, 2806669UL, 848016UL,  2475780UL, 1997413UL, 284387UL,
+    1911620UL, 735815UL,  2635871UL, 814451UL,  39444UL,   755433UL,  899738UL,  1858164UL, 215403UL,  1504331UL,
+    372907UL,  2524315UL, 3717932UL, 932234UL,  2098706UL, 1707394UL, 3541426UL, 782834UL,  3577010UL, 835003UL,
+    3210880UL, 2712593UL, 2869307UL, 1297369UL, 1433609UL, 4112483UL, 3167052UL, 2185537UL, 3575046UL, 2962057UL,
+    2375496UL, 3025741UL, 3690363UL, 1147503UL, 1500601UL, 444980UL,  2747903UL, 3434770UL, 4101670UL, 2347039UL,
+    4111507UL, 2780165UL, 1927818UL, 2225498UL, 398434UL,  2141806UL, 3124743UL, 3075213UL, 2480329UL, 657585UL,
+    1599127UL, 1261610UL, 2195889UL, 282915UL,  3255211UL, 2736155UL, 349863UL,  226106UL,  3391326UL, 3298815UL,
+    1748831UL, 1996033UL, 3627827UL, 3362999UL, 2362717UL, 3776216UL, 3947011UL, 181563UL,  3353188UL, 2580631UL,
+    3468826UL, 4080388UL, 2356002UL, 2689126UL, 528695UL,  1464645UL, 3416071UL, 108906UL,  1164839UL, 1244086UL,
+    2794102UL, 3233612UL, 3164887UL, 3619405UL, 2373277UL, 3792392UL, 1369142UL, 3655255UL, 392468UL,  1701518UL,
+    2634483UL, 3850951UL, 3901212UL, 724887UL,  826288UL,  2624876UL, 266949UL,  2133797UL, 3809286UL, 2578834UL,
+    636927UL,  3208807UL, 1093613UL, 3665227UL, 3566318UL, 2935393UL, 2375677UL, 3417702UL, 2751605UL, 3572328UL,
+    119998UL,  3959284UL, 653884UL,  3168575UL, 1739142UL, 1258462UL, 3530581UL, 2457973UL, 1892557UL, 4084604UL,
+    4117474UL, 3597320UL, 3608525UL, 190169UL,  3438205UL, 1581305UL, 3369578UL, 248212UL,  2130278UL, 3010609UL,
+    1414728UL, 2290043UL, 229205UL,  165024UL,  1741200UL, 4057686UL, 853288UL,  3384074UL, 1585147UL, 3779038UL,
+    2714490UL, 3870103UL, 3405069UL, 982085UL,  2027405UL, 1603217UL, 457048UL,  3462915UL, 2044436UL, 2881774UL,
+    1651588UL, 2334388UL, 3553215UL, 3093263UL, 2264072UL, 68778UL,   1950805UL, 1057323UL, 941216UL,  3682478UL,
+    735759UL,  2271518UL, 2857656UL, 3129864UL, 295104UL,  1031355UL, 4139345UL, 2967747UL, 3358601UL, 98229UL,
+    3572585UL, 879511UL,  1794402UL, 1211381UL, 2886926UL, 1096205UL, 232147UL,  604383UL,  10791UL,   3799358UL,
+    3689014UL, 917480UL,  3739806UL, 3789831UL, 868520UL,  330150UL,  2113900UL, 3631586UL, 2323123UL, 2424955UL,
+    1013762UL, 1644334UL, 2127392UL, 1537383UL, 1912663UL, 781913UL,  455631UL,  604079UL,  3651494UL, 25680UL,
+    2347331UL, 3499232UL, 1959731UL, 1303421UL, 130664UL,  3555377UL, 1783010UL, 1924342UL, 3618143UL, 1302043UL,
+    252529UL,  1146066UL, 3685952UL, 2338882UL, 2633473UL, 200434UL,  1116470UL, 3409462UL, 2393387UL, 1666172UL,
+    907502UL,  182000UL,  805586UL,  222944UL,  768825UL,  361453UL,  822432UL,  802542UL,  2803204UL, 1553367UL,
+    788806UL,  1786271UL, 348992UL,  3904167UL, 2728958UL, 591681UL,  1386326UL, 2536574UL, 3867282UL, 1019315UL,
+    2029397UL, 2711607UL, 4059183UL, 1483731UL, 1071058UL, 445675UL,  3049724UL, 1928019UL, 2155512UL, 2219695UL,
+    855151UL,  2997983UL, 3029297UL, 3214792UL, 4032513UL, 1251927UL, 2260714UL, 712942UL,  1123373UL, 3495585UL,
+    29613UL,   1715479UL, 3002935UL, 3452794UL, 1851733UL, 3199169UL, 1885815UL, 989547UL,  1203856UL, 3730711UL,
+    3625163UL, 2351081UL, 2975486UL, 234084UL,  2749876UL, 1817862UL, 1415194UL, 3298174UL, 437426UL,  1529431UL,
+    3699744UL, 368527UL,  2029560UL, 1574090UL, 2342520UL, 1009034UL, 452100UL,  640613UL,  365601UL,  1799823UL,
+    3203103UL, 1940762UL, 425288UL,  3211145UL, 1041667UL, 1241458UL, 563609UL,  314776UL,  1579728UL, 958709UL,
+    1275885UL, 1000787UL, 628051UL,  1551451UL, 514875UL,  3651962UL, 1714462UL, 495210UL,  1250619UL, 2396850UL,
+    1345461UL, 3343729UL, 110820UL,  3534807UL,
+};
+uint32_t rand_arr_23_b23_w32_arr[1024] = {
+    119144UL,  1760637UL, 7066929UL, 4904539UL, 7762071UL, 1391193UL, 3870813UL, 898189UL,  919926UL,  497622UL,
+    1878014UL, 6400305UL, 7554231UL, 3696779UL, 4089027UL, 6710927UL, 3872289UL, 8256523UL, 4412285UL, 7603880UL,
+    4266668UL, 7905989UL, 2491366UL, 1295964UL, 4185724UL, 2477329UL, 4549988UL, 945325UL,  3630512UL, 5547026UL,
+    4658634UL, 1375163UL, 5081032UL, 908460UL,  1851583UL, 889665UL,  6466124UL, 2732661UL, 7426651UL, 8027050UL,
+    1718812UL, 5051628UL, 4551526UL, 1306780UL, 1541617UL, 3182672UL, 3481929UL, 6928529UL, 5076343UL, 4966950UL,
+    2391244UL, 8037938UL, 1858463UL, 2131665UL, 4695722UL, 5349915UL, 1415685UL, 6123685UL, 4400428UL, 8070410UL,
+    2158145UL, 3424069UL, 6690986UL, 194420UL,  3737155UL, 4629547UL, 3602849UL, 8179265UL, 5091312UL, 8097355UL,
+    7641104UL, 5867499UL, 536476UL,  2335208UL, 5403220UL, 5995433UL, 5471023UL, 4224269UL, 4171579UL, 5226363UL,
+    3491084UL, 3119263UL, 5508106UL, 2877213UL, 7547726UL, 1848135UL, 2215573UL, 7792196UL, 2034769UL, 3421824UL,
+    7686459UL, 1952187UL, 4264516UL, 126190UL,  4666400UL, 1528344UL, 4331780UL, 276758UL,  3560360UL, 1124739UL,
+    4998279UL, 5591828UL, 3288852UL, 3362319UL, 8187278UL, 6806442UL, 2811800UL, 6209287UL, 5730813UL, 5215744UL,
+    2924229UL, 2101769UL, 27162UL,   7717723UL, 7400488UL, 7428332UL, 659441UL,  5171792UL, 7394641UL, 4832073UL,
+    6214840UL, 6592571UL, 3932962UL, 4930829UL, 5387368UL, 2296075UL, 7665695UL, 4985144UL, 551311UL,  4183025UL,
+    6884193UL, 1714381UL, 4498002UL, 2461868UL, 5349334UL, 1988543UL, 107411UL,  2912619UL, 151434UL,  8154505UL,
+    4657745UL, 2864582UL, 4098148UL, 7220936UL, 5814503UL, 1062740UL, 377356UL,  6076467UL, 291381UL,  2984291UL,
+    296371UL,  5326618UL, 7938223UL, 2262489UL, 7387206UL, 7094682UL, 8136774UL, 6373347UL, 3561778UL, 3407498UL,
+    6566276UL, 6789496UL, 6632759UL, 8208766UL, 6985410UL, 2741475UL, 7086015UL, 5286545UL, 6490394UL, 1263505UL,
+    4412838UL, 3841495UL, 2225407UL, 1093437UL, 3383227UL, 3919304UL, 2498032UL, 6774786UL, 2622802UL, 5334162UL,
+    3813655UL, 1473895UL, 2770305UL, 893271UL,  5148011UL, 7523463UL, 5144199UL, 3866236UL, 6165686UL, 1764540UL,
+    1077309UL, 7769211UL, 1893005UL, 6713188UL, 6373079UL, 216435UL,  2448999UL, 905029UL,  3903685UL, 2425612UL,
+    597789UL,  7805651UL, 1640074UL, 7897365UL, 750344UL,  3440997UL, 6063251UL, 7345122UL, 111278UL,  4594822UL,
+    2761716UL, 5742294UL, 7608885UL, 907924UL,  3146409UL, 6896451UL, 2504254UL, 1451486UL, 7026254UL, 2133015UL,
+    7338699UL, 2201853UL, 6434426UL, 2098272UL, 3704961UL, 1335796UL, 1323218UL, 8155211UL, 2983516UL, 5918863UL,
+    3615859UL, 693145UL,  443924UL,  4689025UL, 4951368UL, 1534219UL, 76578UL,   6046746UL, 7489510UL, 5034271UL,
+    8306637UL, 4304475UL, 6530596UL, 7727802UL, 7949686UL, 5153228UL, 6080858UL, 1876800UL, 4260239UL, 3317857UL,
+    8059793UL, 5219169UL, 8285669UL, 3838944UL, 3998307UL, 5800513UL, 4123700UL, 5109622UL, 1670254UL, 7176435UL,
+    5828005UL, 3601000UL, 1417808UL, 3692576UL, 2291154UL, 3736687UL, 2382123UL, 6694493UL, 2725958UL, 8079229UL,
+    4663746UL, 3120477UL, 5888169UL, 1037486UL, 4821314UL, 8369996UL, 4063284UL, 7068451UL, 2519866UL, 7040610UL,
+    3716149UL, 628930UL,  5467761UL, 7239849UL, 5970050UL, 1941552UL, 5909268UL, 366869UL,  6681738UL, 7376614UL,
+    2877238UL, 51856UL,   6783680UL, 1051686UL, 5875675UL, 7081059UL, 1131796UL, 5893059UL, 4624995UL, 7277969UL,
+    2740329UL, 64914UL,   1357975UL, 1112696UL, 3797430UL, 3870249UL, 3083706UL, 7699330UL, 183456UL,  6033139UL,
+    794777UL,  6550267UL, 5440496UL, 3937132UL, 3074775UL, 4789173UL, 2540923UL, 588329UL,  2268788UL, 5607569UL,
+    7044270UL, 315234UL,  120058UL,  2296984UL, 7354325UL, 2993921UL, 5402019UL, 2398976UL, 4533406UL, 5119255UL,
+    5679387UL, 3915046UL, 4750556UL, 2826550UL, 3103089UL, 2571829UL, 685390UL,  8232526UL, 1348726UL, 1040334UL,
+    4261494UL, 212291UL,  6505885UL, 4162638UL, 1175424UL, 2275270UL, 3859804UL, 1806184UL, 7843932UL, 4835645UL,
+    1477491UL, 4270311UL, 3846319UL, 5749928UL, 5866721UL, 2969860UL, 8032672UL, 5545368UL, 1258046UL, 2899579UL,
+    3017450UL, 4808899UL, 1390483UL, 6021430UL, 7375815UL, 8068426UL, 3368050UL, 4835326UL, 2804801UL, 6257373UL,
+    6860221UL, 6503303UL, 2861801UL, 5304811UL, 1026087UL, 3002375UL, 7137748UL, 6364006UL, 6464364UL, 5740757UL,
+    5536213UL, 6917520UL, 5736580UL, 2567509UL, 2259471UL, 930166UL,  7663329UL, 7425208UL, 2333559UL, 2006905UL,
+    4222845UL, 7317800UL, 7927339UL, 1011497UL, 5370673UL, 5270585UL, 6472083UL, 7560145UL, 1995318UL, 6389900UL,
+    276853UL,  7011069UL, 985724UL,  1075692UL, 1523916UL, 5937380UL, 1010440UL, 1909042UL, 2275979UL, 6746413UL,
+    4220041UL, 7397391UL, 706967UL,  6056316UL, 4057311UL, 7265270UL, 1291315UL, 2400623UL, 2247133UL, 5840112UL,
+    2556523UL, 4086744UL, 4864374UL, 5538169UL, 5097201UL, 3627814UL, 7580747UL, 6439215UL, 2474686UL, 7142454UL,
+    767698UL,  523069UL,  1932392UL, 2834536UL, 3216357UL, 3618447UL, 3009242UL, 6994838UL, 4515656UL, 3510488UL,
+    4122009UL, 4291024UL, 6015433UL, 999378UL,  5378152UL, 3774531UL, 4732479UL, 4808265UL, 936712UL,  7572602UL,
+    209033UL,  218008UL,  7518383UL, 6398171UL, 3745847UL, 2473025UL, 7091716UL, 4874795UL, 3265553UL, 8311256UL,
+    984549UL,  7128405UL, 3507019UL, 6353518UL, 1134819UL, 1840285UL, 76097UL,   5273397UL, 6668364UL, 808306UL,
+    6792012UL, 7756189UL, 7682989UL, 6758207UL, 1806001UL, 1203201UL, 6320079UL, 6295040UL, 6310801UL, 2181778UL,
+    7254595UL, 2458711UL, 2574945UL, 4571905UL, 4019193UL, 6911681UL, 2260137UL, 114672UL,  7195150UL, 8176858UL,
+    7553757UL, 3294338UL, 5729481UL, 1332926UL, 4682984UL, 5245642UL, 3401520UL, 8175885UL, 7093921UL, 6818330UL,
+    7003437UL, 6134283UL, 501484UL,  6809262UL, 4653669UL, 1503704UL, 6375047UL, 6243501UL, 7414140UL, 7182359UL,
+    383226UL,  2393849UL, 1612348UL, 5721069UL, 7722417UL, 159517UL,  3700207UL, 3825512UL, 81319UL,   5148159UL,
+    2178424UL, 7260620UL, 4495791UL, 6562402UL, 8155636UL, 1480919UL, 8195642UL, 4467988UL, 5784372UL, 4636248UL,
+    7480130UL, 2924564UL, 5410513UL, 6999620UL, 7789886UL, 3333351UL, 2304493UL, 875275UL,  2481102UL, 6340113UL,
+    2935725UL, 3855475UL, 6072541UL, 5283390UL, 8314959UL, 7710758UL, 5480421UL, 7332289UL, 347532UL,  6883998UL,
+    7459487UL, 3202009UL, 7466838UL, 5494737UL, 8317210UL, 3240774UL, 7318323UL, 6853838UL, 7165928UL, 4761574UL,
+    3574915UL, 7171037UL, 2713434UL, 4552297UL, 223322UL,  3873559UL, 2321707UL, 7112436UL, 7321038UL, 4283936UL,
+    2373085UL, 1108602UL, 548565UL,  6611679UL, 7172346UL, 2971696UL, 5192984UL, 2905647UL, 810313UL,  2766309UL,
+    4709037UL, 1993685UL, 5732684UL, 5262922UL, 4250961UL, 7136484UL, 3875156UL, 3838138UL, 5372105UL, 6811691UL,
+    3534463UL, 676464UL,  5310576UL, 5597699UL, 3322777UL, 3013911UL, 3946292UL, 7967009UL, 4108685UL, 4193255UL,
+    457341UL,  4114902UL, 689249UL,  3550952UL, 803109UL,  1454746UL, 3222363UL, 498195UL,  4569385UL, 1765532UL,
+    4245146UL, 7064026UL, 719297UL,  6663805UL, 4332959UL, 5148368UL, 7728074UL, 7120569UL, 2003967UL, 5508532UL,
+    66471UL,   1542490UL, 7603588UL, 8159834UL, 3738284UL, 7394970UL, 1645644UL, 7203400UL, 6755590UL, 3302660UL,
+    2765453UL, 2618008UL, 1165655UL, 4639476UL, 4796719UL, 6394990UL, 6361370UL, 7836338UL, 2208524UL, 595182UL,
+    7299500UL, 7475430UL, 3291263UL, 2587522UL, 1165969UL, 2107674UL, 7826773UL, 3916635UL, 8151018UL, 6349004UL,
+    2157383UL, 1892069UL, 7863649UL, 6390622UL, 7713411UL, 1852921UL, 1843651UL, 846289UL,  979988UL,  5335971UL,
+    553769UL,  4272970UL, 2216154UL, 2916163UL, 5202348UL, 5579611UL, 2209655UL, 4739790UL, 7241109UL, 5082720UL,
+    7454284UL, 95584UL,   1820869UL, 2223793UL, 8053517UL, 1880162UL, 1620214UL, 6533632UL, 6099748UL, 1389802UL,
+    5704887UL, 6877459UL, 1892289UL, 2219407UL, 7275324UL, 353473UL,  3368307UL, 1548125UL, 4931719UL, 1029651UL,
+    7642332UL, 2196724UL, 507251UL,  2162297UL, 1263433UL, 2323274UL, 2540474UL, 4227195UL, 1075203UL, 5403819UL,
+    4155017UL, 6426801UL, 3633578UL, 6376511UL, 7827415UL, 283799UL,  3165855UL, 2189775UL, 2684256UL, 6746451UL,
+    4336154UL, 1007336UL, 5041314UL, 1493247UL, 5208567UL, 3316567UL, 6417616UL, 481512UL,  3007033UL, 6446760UL,
+    1547583UL, 8294050UL, 7593861UL, 4742866UL, 1031221UL, 8013244UL, 5819969UL, 5494141UL, 4979717UL, 3689574UL,
+    4047622UL, 7572978UL, 6065731UL, 1413867UL, 1953563UL, 3086418UL, 606661UL,  3757245UL, 1995089UL, 557751UL,
+    7172392UL, 1630768UL, 7904202UL, 3312672UL, 4951399UL, 5300027UL, 6783914UL, 7567784UL, 1514325UL, 1630119UL,
+    988981UL,  5072832UL, 4023750UL, 5792032UL, 935469UL,  196451UL,  2113837UL, 5405561UL, 3187765UL, 836713UL,
+    1942779UL, 3082829UL, 1275986UL, 476440UL,  4707925UL, 3285882UL, 4609695UL, 5785120UL, 7644634UL, 5572383UL,
+    1465787UL, 484583UL,  3072348UL, 5435492UL, 2271567UL, 1097243UL, 1563961UL, 8386389UL, 3656428UL, 1672322UL,
+    2465319UL, 155994UL,  4081786UL, 6321758UL, 7394090UL, 1544807UL, 1048037UL, 4510583UL, 2353354UL, 3075177UL,
+    4260900UL, 6420919UL, 6312817UL, 4560633UL, 3680257UL, 2893147UL, 6454062UL, 7174270UL, 1204513UL, 6629394UL,
+    3967242UL, 2062106UL, 7408280UL, 7776350UL, 355286UL,  871666UL,  5067146UL, 5460152UL, 5373693UL, 6759034UL,
+    7345101UL, 4897885UL, 787635UL,  7585728UL, 7624610UL, 4801120UL, 3083437UL, 3502136UL, 4553608UL, 1266615UL,
+    8042349UL, 57336UL,   1474959UL, 5411566UL, 5915690UL, 1132546UL, 1277773UL, 1219405UL, 7357275UL, 920672UL,
+    3178868UL, 1111567UL, 7685763UL, 5489436UL, 1274207UL, 212614UL,  6423167UL, 1414929UL, 3311856UL, 650608UL,
+    4847241UL, 7321362UL, 6177605UL, 876536UL,  5267292UL, 336988UL,  5482413UL, 7657707UL, 7256272UL, 3820972UL,
+    4398453UL, 5719590UL, 7522512UL, 3122042UL, 2856524UL, 1980664UL, 3231894UL, 144558UL,  877253UL,  8367811UL,
+    3502932UL, 3345686UL, 3953305UL, 4461254UL, 2182972UL, 8117512UL, 6138224UL, 6146122UL, 7677698UL, 2572858UL,
+    7203771UL, 4973092UL, 4477085UL, 8283066UL, 4736718UL, 4614747UL, 8076151UL, 3894761UL, 6652130UL, 4464608UL,
+    3626507UL, 4845406UL, 8030640UL, 3823199UL, 3497659UL, 5235572UL, 5640531UL, 7915330UL, 7204483UL, 2933044UL,
+    3042119UL, 4076686UL, 5978393UL, 6098147UL, 4687500UL, 4896473UL, 1250373UL, 1537315UL, 5480859UL, 2552413UL,
+    4181284UL, 1441129UL, 3957940UL, 4752825UL, 3733596UL, 2873973UL, 7404994UL, 5259106UL, 6876504UL, 4839349UL,
+    4657114UL, 2590349UL, 4312283UL, 4597806UL, 4898806UL, 6918989UL, 2026998UL, 4299283UL, 8187141UL, 4471347UL,
+    5947261UL, 6719053UL, 5747600UL, 964015UL,  4108825UL, 7088739UL, 7980487UL, 707654UL,  2815326UL, 56357UL,
+    1193188UL, 4124737UL, 5484822UL, 2860852UL, 771030UL,  2656128UL, 7467443UL, 5107291UL, 5829450UL, 8217629UL,
+    1446262UL, 487991UL,  7792666UL, 4229232UL, 869215UL,  7185628UL, 7195201UL, 6273961UL, 268071UL,  631016UL,
+    2707816UL, 3042862UL, 3867186UL, 7813777UL, 4676094UL, 6387680UL, 4187269UL, 1659980UL, 6654UL,    4507465UL,
+    6695561UL, 8309858UL, 2918570UL, 1918927UL, 860841UL,  1678504UL, 3408200UL, 7902353UL, 3591926UL, 6608426UL,
+    3362655UL, 1698119UL, 2616961UL, 1335523UL, 2303295UL, 1265974UL, 8384438UL, 4165926UL, 8062168UL, 5255144UL,
+    7423608UL, 7440589UL, 845410UL,  380530UL,  1193031UL, 3312418UL, 8270089UL, 169751UL,  871903UL,  4161556UL,
+    7671836UL, 3168059UL, 2466350UL, 4463158UL, 7088228UL, 7033530UL, 7555005UL, 98316UL,   5284834UL, 762820UL,
+    6318877UL, 7733732UL, 7382839UL, 8237013UL, 1216767UL, 2562572UL, 241776UL,  3183273UL, 635492UL,  4322255UL,
+    98930UL,   7191126UL, 2493668UL, 4790090UL, 2279371UL, 2018689UL, 1498654UL, 746354UL,  6033876UL, 1351799UL,
+    2426855UL, 3735817UL, 7632927UL, 2452951UL,
+};
+uint32_t rand_arr_24_b24_w32_arr[1024] = {
+    1996915UL,  6430884UL,  10326265UL, 760105UL,   15279859UL, 13649391UL, 552029UL,   12850541UL, 161075UL,
+    16732488UL, 7258013UL,  5738042UL,  12772788UL, 15249973UL, 6000545UL,  11185701UL, 11346081UL, 2773212UL,
+    12663982UL, 12944610UL, 1772959UL,  14309749UL, 11603623UL, 7091127UL,  10285569UL, 15984698UL, 495603UL,
+    3721984UL,  14242579UL, 10017469UL, 16609948UL, 9750677UL,  7341244UL,  16281299UL, 8962950UL,  13807684UL,
+    11729644UL, 16168418UL, 4764334UL,  9832248UL,  11725561UL, 15261836UL, 7278930UL,  11631026UL, 6456430UL,
+    2526089UL,  3944263UL,  13449701UL, 4344860UL,  10055332UL, 9497789UL,  12275351UL, 9921162UL,  14309297UL,
+    12403048UL, 14131383UL, 7573015UL,  14508143UL, 14407058UL, 13782679UL, 12417809UL, 10409100UL, 8458826UL,
+    970811UL,   11415188UL, 8928559UL,  6468326UL,  16034452UL, 14966373UL, 5929620UL,  7374811UL,  2612454UL,
+    12810496UL, 6396986UL,  9716936UL,  1229606UL,  14855471UL, 15969709UL, 5018441UL,  15981783UL, 12780689UL,
+    8977497UL,  9722074UL,  16285101UL, 16169055UL, 12029208UL, 4643634UL,  2761747UL,  9297652UL,  9915764UL,
+    1175578UL,  16244372UL, 16613543UL, 5382644UL,  3265113UL,  1778209UL,  9305724UL,  11065378UL, 3827344UL,
+    3374037UL,  7186890UL,  16642769UL, 11292964UL, 13442252UL, 1533677UL,  5182319UL,  11909412UL, 10310879UL,
+    14743639UL, 105651UL,   8350691UL,  16521689UL, 7158248UL,  633840UL,   8808081UL,  14715858UL, 5358701UL,
+    14472525UL, 14937774UL, 9318273UL,  9020706UL,  443622UL,   9565651UL,  16437091UL, 2808865UL,  5416383UL,
+    15434790UL, 7672254UL,  14308354UL, 8751220UL,  16734904UL, 6863018UL,  6752544UL,  15689089UL, 13264968UL,
+    5118538UL,  2004347UL,  10150453UL, 10431202UL, 5563348UL,  871310UL,   6664447UL,  12247468UL, 6296699UL,
+    9561023UL,  3552346UL,  5631938UL,  7402481UL,  11222044UL, 2550146UL,  16023548UL, 5904504UL,  5072951UL,
+    8349307UL,  11845045UL, 9429246UL,  10760500UL, 16520466UL, 5889550UL,  15399268UL, 7660215UL,  13402602UL,
+    11646344UL, 8260636UL,  14141807UL, 11645040UL, 10732954UL, 531147UL,   10933757UL, 7436967UL,  10121894UL,
+    15397178UL, 8784340UL,  4990429UL,  11171174UL, 11146309UL, 153449UL,   2815855UL,  13092355UL, 581944UL,
+    15561275UL, 2769759UL,  10807327UL, 7549756UL,  15650358UL, 15929744UL, 8455938UL,  1220355UL,  2941441UL,
+    9044704UL,  11945008UL, 7477777UL,  13363183UL, 4185965UL,  12998374UL, 5439260UL,  13355151UL, 4456097UL,
+    14793380UL, 15778230UL, 3310189UL,  2463643UL,  4922155UL,  7868511UL,  16760039UL, 11430095UL, 2703920UL,
+    14629272UL, 12582110UL, 12264629UL, 2705605UL,  4532515UL,  9189028UL,  15163546UL, 7566504UL,  3268524UL,
+    9863503UL,  12438529UL, 11217547UL, 7215278UL,  1609884UL,  6277660UL,  5060143UL,  2840936UL,  1982746UL,
+    12371056UL, 10667611UL, 341163UL,   7361125UL,  11567484UL, 4829305UL,  10243820UL, 5582498UL,  7800207UL,
+    11347333UL, 13281375UL, 15806256UL, 8629183UL,  2972417UL,  11400617UL, 5968983UL,  5970955UL,  10177667UL,
+    3954316UL,  15559206UL, 14559037UL, 1110247UL,  10565336UL, 13604856UL, 1278061UL,  2585322UL,  16742357UL,
+    7770429UL,  3412861UL,  10765800UL, 11616032UL, 10841532UL, 14606790UL, 16299732UL, 14286548UL, 11265196UL,
+    7730374UL,  9032915UL,  2435454UL,  8155282UL,  16439269UL, 1002231UL,  9392508UL,  1859467UL,  16119620UL,
+    1501590UL,  8048131UL,  5072911UL,  290926UL,   3872836UL,  1169628UL,  1368307UL,  723900UL,   1781021UL,
+    1720090UL,  12884338UL, 5720569UL,  11503127UL, 11561852UL, 11195263UL, 8801203UL,  5305938UL,  5401039UL,
+    14619565UL, 1010005UL,  10597476UL, 9344051UL,  12796300UL, 9520886UL,  16297472UL, 12428068UL, 10219211UL,
+    7883363UL,  15629519UL, 2316451UL,  3576002UL,  9816618UL,  6448025UL,  12054841UL, 15371800UL, 9220542UL,
+    11014739UL, 13129117UL, 6649529UL,  6972055UL,  13508375UL, 15718000UL, 1782149UL,  6813226UL,  3808016UL,
+    11414301UL, 9012771UL,  12703185UL, 2848766UL,  7913896UL,  8276950UL,  10538173UL, 234051UL,   6690085UL,
+    4590770UL,  7428420UL,  11923760UL, 4469643UL,  1232779UL,  3133948UL,  14906900UL, 5962386UL,  14491684UL,
+    13657825UL, 8348854UL,  13837029UL, 10098651UL, 11222821UL, 15177054UL, 11773080UL, 8559952UL,  10331290UL,
+    1149511UL,  3509089UL,  1322957UL,  2014506UL,  9284709UL,  6520585UL,  9315777UL,  11987749UL, 3199506UL,
+    1207754UL,  5299233UL,  12929986UL, 755734UL,   6918959UL,  7922646UL,  7061461UL,  16049735UL, 12868892UL,
+    1006790UL,  6985155UL,  11022487UL, 3086418UL,  10786272UL, 4673122UL,  2842656UL,  16550864UL, 807602UL,
+    11360811UL, 15395078UL, 5022086UL,  125019UL,   6483080UL,  2450150UL,  13264716UL, 7939358UL,  16107608UL,
+    13844570UL, 1619052UL,  1154898UL,  15056828UL, 12325657UL, 1502135UL,  12186469UL, 11762490UL, 4088758UL,
+    9359834UL,  9348673UL,  15659658UL, 3613428UL,  4925045UL,  5954650UL,  2597856UL,  5237479UL,  5830485UL,
+    6822277UL,  15516883UL, 4605583UL,  14802269UL, 2413710UL,  8892024UL,  11193711UL, 6992873UL,  1056865UL,
+    6879308UL,  15513537UL, 10995193UL, 7679319UL,  3071240UL,  14427060UL, 10345178UL, 11708068UL, 14974008UL,
+    2969478UL,  3886714UL,  5172760UL,  8735097UL,  11426621UL, 13696238UL, 2840365UL,  12254595UL, 6724118UL,
+    10199683UL, 13183760UL, 10525561UL, 12424189UL, 5288378UL,  12494810UL, 15238343UL, 15930606UL, 4890981UL,
+    4342445UL,  14288259UL, 5000243UL,  14229281UL, 3735728UL,  13705606UL, 2330505UL,  10687212UL, 10776563UL,
+    816405UL,   15586000UL, 13539756UL, 2543230UL,  2436144UL,  3853116UL,  10925820UL, 14291650UL, 5662680UL,
+    5221810UL,  12720176UL, 12281280UL, 13870237UL, 2946479UL,  10750867UL, 4377506UL,  16039389UL, 13561705UL,
+    4817304UL,  289592UL,   11075711UL, 12050934UL, 12772916UL, 8743789UL,  16321615UL, 2035640UL,  6168244UL,
+    5687519UL,  15755025UL, 4809807UL,  6909358UL,  4132884UL,  13641610UL, 7588386UL,  12381717UL, 1234043UL,
+    1819158UL,  16077703UL, 9056921UL,  7526548UL,  12354688UL, 1819362UL,  5683879UL,  1490071UL,  12033809UL,
+    2147206UL,  7052488UL,  8655948UL,  8690289UL,  15376905UL, 1384550UL,  5831001UL,  816635UL,   1260824UL,
+    2721830UL,  14472089UL, 7433701UL,  5561596UL,  12235685UL, 9582126UL,  14415431UL, 14224104UL, 11315743UL,
+    1244085UL,  10549572UL, 11638539UL, 6816670UL,  2686848UL,  15786618UL, 9321582UL,  2014328UL,  998818UL,
+    2029216UL,  15852088UL, 176375UL,   16752167UL, 16188323UL, 12976542UL, 9458383UL,  16208943UL, 16703364UL,
+    2064636UL,  11178356UL, 13487039UL, 1685134UL,  1075263UL,  4286510UL,  15646401UL, 9605256UL,  16198198UL,
+    12345474UL, 1330782UL,  11481002UL, 10614957UL, 13318UL,    10958444UL, 14852533UL, 4981195UL,  6926522UL,
+    14152549UL, 15585634UL, 5222916UL,  5076583UL,  227959UL,   6790639UL,  14895752UL, 15416668UL, 9856042UL,
+    10755868UL, 1394948UL,  5738315UL,  11887033UL, 5728911UL,  14509972UL, 2889506UL,  443900UL,   3509486UL,
+    2882695UL,  14081146UL, 16214549UL, 4051027UL,  15255911UL, 10662843UL, 9375582UL,  723484UL,   9209245UL,
+    13314800UL, 3687064UL,  16598319UL, 11966261UL, 15520067UL, 8981825UL,  14873836UL, 10449120UL, 7090029UL,
+    12922020UL, 1666654UL,  3315043UL,  12636169UL, 2582931UL,  6170202UL,  4605207UL,  12750787UL, 391892UL,
+    4682876UL,  11677806UL, 6426077UL,  8944873UL,  15820253UL, 5564759UL,  11878799UL, 9877757UL,  5789518UL,
+    2829237UL,  3255564UL,  8211099UL,  15171167UL, 3294142UL,  1811105UL,  6058107UL,  10979880UL, 424149UL,
+    13658915UL, 15065392UL, 941721UL,   4852917UL,  9698477UL,  404651UL,   1494020UL,  94894UL,    5353607UL,
+    3354595UL,  2345614UL,  7145905UL,  9805809UL,  16537042UL, 6230477UL,  13959332UL, 13538129UL, 10678806UL,
+    1372038UL,  14916109UL, 12191910UL, 6644074UL,  11299358UL, 10186427UL, 12928850UL, 4157726UL,  5090621UL,
+    12949336UL, 8693829UL,  2030178UL,  4270137UL,  10247033UL, 4597681UL,  12967340UL, 14224687UL, 8354697UL,
+    567070UL,   12152944UL, 12821450UL, 8526720UL,  15225477UL, 10537540UL, 12978951UL, 7437865UL,  1252584UL,
+    16579393UL, 6245125UL,  878652UL,   6479165UL,  7403184UL,  14561589UL, 14190837UL, 1679526UL,  12321260UL,
+    5203666UL,  12594063UL, 4504560UL,  6812366UL,  11052653UL, 12307394UL, 1095869UL,  5425965UL,  12021088UL,
+    9965508UL,  15972141UL, 3620399UL,  1754513UL,  6838626UL,  14690728UL, 4778147UL,  1766180UL,  4503261UL,
+    16474324UL, 13685826UL, 13886386UL, 3902539UL,  15946931UL, 13272151UL, 1140546UL,  7576982UL,  1045558UL,
+    15013413UL, 3106484UL,  8948644UL,  11250208UL, 16353124UL, 16577878UL, 5655849UL,  3907453UL,  7605992UL,
+    12949800UL, 15565750UL, 9626282UL,  5359665UL,  13825711UL, 4215529UL,  7016138UL,  5567807UL,  4999353UL,
+    6093040UL,  16618680UL, 598171UL,   14994009UL, 4880669UL,  2366293UL,  11005088UL, 11779288UL, 2883521UL,
+    15307442UL, 15007280UL, 8846160UL,  6231869UL,  2412313UL,  6167619UL,  11485610UL, 1350356UL,  5731017UL,
+    5585905UL,  4427098UL,  1099652UL,  3013860UL,  2147438UL,  9788643UL,  13884489UL, 3708033UL,  4855565UL,
+    13066991UL, 10975865UL, 15750922UL, 4463225UL,  7221810UL,  8043648UL,  12843234UL, 15986330UL, 12195307UL,
+    3494566UL,  3886907UL,  8112648UL,  1091943UL,  12781063UL, 577395UL,   942215UL,   4126511UL,  13214266UL,
+    11837335UL, 9124033UL,  10575328UL, 14588688UL, 9707367UL,  6267156UL,  7705336UL,  16404829UL, 255714UL,
+    6007431UL,  337955UL,   14232895UL, 4343517UL,  3614976UL,  5045684UL,  8072825UL,  2882104UL,  15461921UL,
+    16304123UL, 10185986UL, 20001UL,    5373483UL,  1606022UL,  9828602UL,  5941366UL,  5959288UL,  10105595UL,
+    9050971UL,  15253774UL, 14804982UL, 11855417UL, 15746117UL, 10291788UL, 12808788UL, 15401868UL, 138642UL,
+    9043066UL,  5793925UL,  13870235UL, 10645975UL, 12959141UL, 9407629UL,  16544386UL, 14356916UL, 2459207UL,
+    8655158UL,  5189076UL,  10128809UL, 2134208UL,  2742069UL,  3725989UL,  8019652UL,  8918597UL,  12445900UL,
+    2639585UL,  16142555UL, 4023714UL,  4285563UL,  14029721UL, 9988991UL,  3531220UL,  15457664UL, 16681028UL,
+    2575405UL,  13985249UL, 14254127UL, 9525296UL,  2341810UL,  10870453UL, 12068418UL, 13790841UL, 8221660UL,
+    7779626UL,  5845094UL,  1181897UL,  1093585UL,  1480677UL,  1226571UL,  14530448UL, 14220264UL, 4851651UL,
+    11591271UL, 2420785UL,  2703959UL,  10662100UL, 12433032UL, 3840903UL,  6260706UL,  14851736UL, 9899021UL,
+    7255251UL,  5811820UL,  10085280UL, 10710713UL, 1126586UL,  2561082UL,  16393122UL, 14734670UL, 13951850UL,
+    10612790UL, 48613UL,    4946967UL,  5197942UL,  4533005UL,  547316UL,   11089571UL, 15758876UL, 6166713UL,
+    5680359UL,  14200097UL, 10187445UL, 13358592UL, 2405318UL,  4711550UL,  16489216UL, 8561768UL,  8940242UL,
+    6280309UL,  13682829UL, 1438685UL,  4081182UL,  4686263UL,  5735497UL,  14266888UL, 11708361UL, 10856514UL,
+    12675585UL, 7631112UL,  3035397UL,  16347260UL, 13083007UL, 11058596UL, 2471914UL,  4523423UL,  10065501UL,
+    7780093UL,  9047398UL,  4755174UL,  10102637UL, 9148884UL,  7849433UL,  5690398UL,  3195574UL,  772268UL,
+    3244039UL,  4023462UL,  10128528UL, 9682310UL,  2584512UL,  16507850UL, 3192440UL,  13712696UL, 6279668UL,
+    10946068UL, 7174971UL,  5071779UL,  13872295UL, 10411943UL, 1462931UL,  9111240UL,  1098583UL,  3373123UL,
+    8930222UL,  16348242UL, 4792558UL,  16380683UL, 6814049UL,  16025894UL, 8111723UL,  3472933UL,  8678347UL,
+    1037708UL,  5915332UL,  13921874UL, 14686987UL, 2752661UL,  7399974UL,  11394547UL, 7495073UL,  15379293UL,
+    7042605UL,  1764839UL,  1562719UL,  2218004UL,  15608770UL, 16022614UL, 4104067UL,  5933279UL,  10123555UL,
+    2210448UL,  1216397UL,  1748453UL,  9490528UL,  11525926UL, 6418185UL,  825073UL,   2387229UL,  5199800UL,
+    141656UL,   12897702UL, 8483273UL,  6888436UL,  8222812UL,  12227016UL, 3116551UL,  5906338UL,  2805089UL,
+    6058211UL,  5229401UL,  1806555UL,  15314787UL, 15752762UL, 6918901UL,  10040292UL, 912542UL,   337796UL,
+    7442458UL,  1506798UL,  15638813UL, 33371UL,    4429871UL,  14863524UL, 10702801UL, 16393038UL, 7245211UL,
+    2171044UL,  920601UL,   13375878UL, 4107382UL,  5429941UL,  14291735UL, 1456012UL,  2912249UL,  15648585UL,
+    14654820UL, 3695059UL,  7163008UL,  10052764UL, 3698465UL,  5590905UL,  15985244UL, 10516799UL, 14250672UL,
+    3037602UL,  3513798UL,  1595395UL,  1765453UL,  4602815UL,  8888214UL,  10789853UL, 10681296UL, 9560859UL,
+    9361212UL,  9566130UL,  8933256UL,  9181166UL,  5332876UL,  1334910UL,  11985355UL, 5079684UL,  5750632UL,
+    12077862UL, 7528990UL,  8473893UL,  2359619UL,  13967798UL, 15573024UL, 15216944UL, 990066UL,   7726801UL,
+    13599518UL, 1095290UL,  9128591UL,  2888100UL,  16543222UL, 3088144UL,  8749353UL,
+};
+uint32_t rand_arr_25_b25_w32_arr[1024] = {
+    4508223UL,  981019UL,   25526237UL, 5508351UL,  25012310UL, 16846853UL, 23021205UL, 32224132UL, 4362882UL,
+    25782209UL, 22997626UL, 27229243UL, 22479995UL, 14749365UL, 28886993UL, 32892684UL, 33196287UL, 14063164UL,
+    25542058UL, 27031915UL, 31911366UL, 18517198UL, 29625850UL, 10175502UL, 26656718UL, 26082347UL, 9860593UL,
+    3141154UL,  13542041UL, 32409871UL, 1070697UL,  5028504UL,  7957874UL,  33505111UL, 30764571UL, 30940116UL,
+    2774666UL,  22495917UL, 17340437UL, 19648962UL, 3697434UL,  23521055UL, 13122962UL, 5017198UL,  20774409UL,
+    10637045UL, 23877332UL, 28727894UL, 8560123UL,  20648584UL, 9876492UL,  26977160UL, 2603665UL,  27433885UL,
+    33161601UL, 3785932UL,  29965323UL, 13085190UL, 10582486UL, 9793618UL,  31836089UL, 30731112UL, 28455318UL,
+    30851076UL, 1805657UL,  60983UL,    21550697UL, 24579178UL, 5500208UL,  6545575UL,  14700069UL, 19653165UL,
+    20808930UL, 3735436UL,  9123850UL,  19619954UL, 19277402UL, 3043216UL,  2700396UL,  30010284UL, 30533821UL,
+    14645663UL, 2901375UL,  22360602UL, 23496745UL, 28784088UL, 2810784UL,  10361473UL, 24142681UL, 16830750UL,
+    31820993UL, 20282780UL, 20239123UL, 2005031UL,  26178017UL, 8180538UL,  32083335UL, 7376090UL,  10682585UL,
+    84584UL,    32872454UL, 14591650UL, 14750738UL, 478480UL,   20577939UL, 18974572UL, 33440484UL, 27922138UL,
+    12339933UL, 27914434UL, 6651392UL,  1040286UL,  31181570UL, 1241250UL,  17767959UL, 31230373UL, 2955014UL,
+    8450186UL,  14099823UL, 25446482UL, 2086610UL,  3614955UL,  30478101UL, 32672518UL, 1161587UL,  22903312UL,
+    14120885UL, 18368868UL, 21235164UL, 2891724UL,  8936003UL,  27518129UL, 5099431UL,  14643113UL, 3663417UL,
+    11564566UL, 21985317UL, 29478481UL, 28579622UL, 22462886UL, 23426377UL, 29057335UL, 3185002UL,  18732744UL,
+    33533041UL, 33279313UL, 20975853UL, 31333789UL, 9619681UL,  27633526UL, 5812434UL,  5059593UL,  7295967UL,
+    14656648UL, 2635233UL,  17009030UL, 25879780UL, 21859221UL, 22028772UL, 12063349UL, 2191142UL,  10701857UL,
+    26612906UL, 27779979UL, 17295125UL, 32615241UL, 21540174UL, 2674994UL,  3695904UL,  7959089UL,  5840152UL,
+    32076096UL, 12204394UL, 10304747UL, 30233559UL, 31587621UL, 18881068UL, 29306723UL, 18240484UL, 25319609UL,
+    29674357UL, 5282401UL,  1064298UL,  18818215UL, 8564848UL,  21009321UL, 23379473UL, 26823824UL, 26782939UL,
+    1012264UL,  30717286UL, 11256692UL, 12370842UL, 32614817UL, 6311591UL,  9504531UL,  13208721UL, 10685598UL,
+    13878613UL, 21517317UL, 32780532UL, 30698366UL, 31287398UL, 19825399UL, 21792869UL, 22619674UL, 17486474UL,
+    7973848UL,  11794216UL, 25389563UL, 22479223UL, 25573220UL, 23948936UL, 10625066UL, 11645093UL, 22265301UL,
+    17063911UL, 1895913UL,  33525717UL, 25769517UL, 24152950UL, 4170837UL,  43411UL,    21334198UL, 28672299UL,
+    1203717UL,  28811205UL, 21122654UL, 28767450UL, 14180858UL, 18317611UL, 30102828UL, 25562349UL, 10823295UL,
+    13091725UL, 2118733UL,  29101573UL, 15563785UL, 26011467UL, 27985880UL, 28779329UL, 29788590UL, 30062884UL,
+    27816791UL, 8750868UL,  16575739UL, 24048650UL, 29914995UL, 22462391UL, 22707896UL, 18721704UL, 314085UL,
+    31186750UL, 10018193UL, 31967539UL, 6264366UL,  16561504UL, 18416494UL, 7830509UL,  2642058UL,  977870UL,
+    9456767UL,  33378933UL, 15992045UL, 29990550UL, 9775025UL,  4336099UL,  30773904UL, 15193563UL, 1441997UL,
+    32979051UL, 19572332UL, 17893312UL, 32678303UL, 20863324UL, 20990381UL, 12779825UL, 28352355UL, 12710931UL,
+    27350079UL, 3335909UL,  10352690UL, 30415481UL, 12518432UL, 4474850UL,  11676285UL, 8168426UL,  6951200UL,
+    9284900UL,  5300953UL,  15329355UL, 1076007UL,  3871696UL,  29357304UL, 4011855UL,  22597022UL, 3786619UL,
+    425863UL,   20334859UL, 10809949UL, 11628285UL, 27101248UL, 29027931UL, 22131263UL, 17330184UL, 20906908UL,
+    19362122UL, 33283941UL, 1796243UL,  25788807UL, 15237453UL, 24602782UL, 18465722UL, 14958840UL, 32923738UL,
+    10114397UL, 11429365UL, 22225279UL, 31844727UL, 10082134UL, 7047947UL,  1185367UL,  15522994UL, 924712UL,
+    23136257UL, 13755840UL, 14755900UL, 28275748UL, 14417847UL, 16332332UL, 6451815UL,  29224816UL, 21877886UL,
+    29149706UL, 27455880UL, 12125177UL, 29151314UL, 7124962UL,  27932115UL, 9023745UL,  28161749UL, 16520468UL,
+    28021786UL, 22036607UL, 20180397UL, 14214390UL, 11746268UL, 4349093UL,  29152082UL, 7228660UL,  12493287UL,
+    3558107UL,  10786785UL, 7287834UL,  161179UL,   7484811UL,  9404414UL,  31999008UL, 31443209UL, 14536598UL,
+    32111243UL, 13004561UL, 13314491UL, 22614666UL, 9393901UL,  27214400UL, 23097489UL, 33413096UL, 4206354UL,
+    19321009UL, 28838211UL, 26094633UL, 19167608UL, 904500UL,   14530905UL, 14584621UL, 19034419UL, 24953440UL,
+    13149512UL, 32044139UL, 13689055UL, 32583141UL, 29002185UL, 23439740UL, 24382420UL, 14232339UL, 20197979UL,
+    32360529UL, 15839209UL, 2902174UL,  17410487UL, 7714558UL,  14578774UL, 6413216UL,  29134836UL, 21130442UL,
+    27714645UL, 20757936UL, 31539781UL, 13996354UL, 10066290UL, 10453270UL, 11555957UL, 2709671UL,  30806687UL,
+    32333313UL, 27557889UL, 27082971UL, 18563928UL, 22941367UL, 23614268UL, 23335730UL, 6005667UL,  30863814UL,
+    24531958UL, 21545787UL, 23035233UL, 363481UL,   27894912UL, 4967407UL,  32172130UL, 203219UL,   14892294UL,
+    21996182UL, 14722657UL, 14927710UL, 5208335UL,  14868410UL, 4131039UL,  25806487UL, 33008564UL, 3701821UL,
+    14501577UL, 27701215UL, 14938255UL, 16453769UL, 18912913UL, 1504005UL,  32373056UL, 17510608UL, 2223871UL,
+    10593232UL, 26488835UL, 16026301UL, 18819156UL, 10561124UL, 24344933UL, 21090199UL, 28343976UL, 387862UL,
+    30741287UL, 21741935UL, 25871866UL, 22917905UL, 25346190UL, 30232209UL, 15808083UL, 2633940UL,  13106618UL,
+    13405090UL, 8836819UL,  6140288UL,  30997314UL, 12453806UL, 26781599UL, 23820120UL, 10086022UL, 31339212UL,
+    20646301UL, 16789368UL, 5239899UL,  4635566UL,  7627787UL,  862334UL,   22850145UL, 15208465UL, 3398142UL,
+    18720877UL, 1559121UL,  16825588UL, 16008491UL, 14092397UL, 24457515UL, 3738517UL,  23769336UL, 15487548UL,
+    32868219UL, 27671327UL, 31481704UL, 22470337UL, 22736807UL, 9710795UL,  28385605UL, 27034070UL, 11076525UL,
+    19901972UL, 73349UL,    22097652UL, 20397426UL, 28467959UL, 28602565UL, 4050701UL,  25159587UL, 8762120UL,
+    23208538UL, 17344817UL, 25233487UL, 14230073UL, 29450242UL, 16686991UL, 4776610UL,  33347147UL, 20507279UL,
+    28281238UL, 12635573UL, 31840911UL, 17440956UL, 29109043UL, 24065008UL, 24716029UL, 21080230UL, 10542798UL,
+    19784074UL, 17229015UL, 4436858UL,  5164922UL,  1992343UL,  23413640UL, 30876666UL, 6586328UL,  31989049UL,
+    22623503UL, 15695333UL, 24816278UL, 12763452UL, 31648717UL, 19361541UL, 22256049UL, 16333000UL, 21336587UL,
+    2572482UL,  21988484UL, 24219042UL, 32104812UL, 30264947UL, 22136366UL, 7899711UL,  2315398UL,  33433949UL,
+    4482406UL,  22789081UL, 32479325UL, 30802322UL, 19468514UL, 28855962UL, 16252480UL, 28852284UL, 4329921UL,
+    26603884UL, 9607683UL,  3993534UL,  22824595UL, 24921023UL, 24265472UL, 16716592UL, 29991234UL, 30919290UL,
+    16069669UL, 1451631UL,  28132010UL, 21600795UL, 11696384UL, 12141439UL, 12415176UL, 21997959UL, 16283320UL,
+    25156914UL, 10118982UL, 29534413UL, 27004743UL, 3870239UL,  2862887UL,  3338313UL,  13851315UL, 12591119UL,
+    22210454UL, 6451321UL,  1962769UL,  31533676UL, 32287026UL, 7986219UL,  23576163UL, 31579294UL, 10956606UL,
+    28771047UL, 16788309UL, 509242UL,   21784312UL, 19310686UL, 8255786UL,  2282679UL,  25704482UL, 31003909UL,
+    8654433UL,  28143130UL, 31466775UL, 28655675UL, 550962UL,   33399372UL, 18097124UL, 4288060UL,  5056651UL,
+    23999886UL, 29540101UL, 23131747UL, 21162650UL, 6533274UL,  18269111UL, 13661953UL, 7456497UL,  32999558UL,
+    19509738UL, 22888892UL, 18293855UL, 6963738UL,  24227357UL, 24455734UL, 21379715UL, 13786311UL, 14704521UL,
+    9070563UL,  28272480UL, 9783535UL,  17667453UL, 13978310UL, 9231442UL,  2718655UL,  4103165UL,  25914536UL,
+    7809617UL,  28100782UL, 5969313UL,  31533654UL, 30483751UL, 7491641UL,  8815262UL,  31878722UL, 28129453UL,
+    10442843UL, 7788104UL,  19311139UL, 29982117UL, 12963903UL, 25890210UL, 5101508UL,  1992312UL,  9340091UL,
+    19895294UL, 28871431UL, 15064964UL, 3872902UL,  10577941UL, 636907UL,   12377382UL, 5664887UL,  3182559UL,
+    13745729UL, 13642009UL, 10372352UL, 65136UL,    13452143UL, 21520372UL, 3381827UL,  28688659UL, 21939336UL,
+    28153827UL, 14979796UL, 1116833UL,  17084894UL, 31139738UL, 17118574UL, 3580149UL,  23664744UL, 1578440UL,
+    11392170UL, 18590744UL, 3782077UL,  21241510UL, 6788583UL,  10864290UL, 880776UL,   17965949UL, 29486151UL,
+    20983717UL, 15250560UL, 14779941UL, 3050971UL,  24772067UL, 28621391UL, 33742UL,    19535554UL, 32286477UL,
+    29037819UL, 9103927UL,  20310394UL, 922772UL,   29680963UL, 23397703UL, 24942951UL, 10879050UL, 10101884UL,
+    32903500UL, 15725048UL, 24601333UL, 12420035UL, 8843293UL,  8699661UL,  10134103UL, 11562794UL, 8309186UL,
+    15982374UL, 31874525UL, 11109240UL, 22184334UL, 496592UL,   26621934UL, 156272UL,   9435591UL,  32762189UL,
+    28547069UL, 8402914UL,  12979074UL, 20698874UL, 25579809UL, 1375766UL,  50154UL,    20440428UL, 5296599UL,
+    21394463UL, 23133892UL, 30830310UL, 32779141UL, 29823061UL, 11314581UL, 23039466UL, 19576435UL, 15346572UL,
+    26861738UL, 33076026UL, 25601961UL, 4721310UL,  26319920UL, 8191469UL,  32119256UL, 31901724UL, 8301136UL,
+    15900368UL, 31259182UL, 20264737UL, 18021753UL, 14858321UL, 8539610UL,  31238265UL, 14794310UL, 12683464UL,
+    14848348UL, 28058540UL, 27602508UL, 26793783UL, 30360480UL, 24218925UL, 11331627UL, 12260539UL, 20301244UL,
+    12644457UL, 25849673UL, 3136633UL,  7804452UL,  19301493UL, 11269807UL, 11276353UL, 12488792UL, 23043039UL,
+    17476303UL, 19950978UL, 17924496UL, 27079529UL, 15850966UL, 22125298UL, 10012299UL, 31092100UL, 30670471UL,
+    383111UL,   27561048UL, 2916253UL,  13759221UL, 14607496UL, 25383401UL, 13175705UL, 11263592UL, 15021387UL,
+    30974528UL, 21369635UL, 30299943UL, 28898319UL, 126128UL,   31176918UL, 22954060UL, 21909355UL, 13670599UL,
+    23093373UL, 13635501UL, 11635816UL, 14368366UL, 11846663UL, 15137286UL, 13585492UL, 33273894UL, 29302524UL,
+    8767391UL,  26457584UL, 6015060UL,  10680367UL, 7318868UL,  12272754UL, 6350918UL,  9277840UL,  66535UL,
+    30715603UL, 13003894UL, 18060509UL, 4322691UL,  18485959UL, 14319729UL, 15098763UL, 7423556UL,  24422044UL,
+    13433313UL, 28375948UL, 7634297UL,  9245000UL,  13695235UL, 14891887UL, 19074486UL, 6554998UL,  5872307UL,
+    29051416UL, 30352081UL, 32659759UL, 16171962UL, 6361527UL,  4609250UL,  3767960UL,  7837163UL,  260951UL,
+    10641575UL, 15298681UL, 1474531UL,  26871669UL, 8703357UL,  13733896UL, 11717400UL, 12784818UL, 15508000UL,
+    3799971UL,  15461152UL, 994715UL,   25854692UL, 19980746UL, 4789782UL,  16358188UL, 23189647UL, 5543253UL,
+    24067766UL, 19128223UL, 18929690UL, 12458994UL, 19381831UL, 10157829UL, 14812178UL, 26822153UL, 22195722UL,
+    20642882UL, 21294725UL, 7456406UL,  26489529UL, 22795817UL, 4476336UL,  2302566UL,  26585199UL, 18045335UL,
+    6439522UL,  1945472UL,  9011535UL,  22660945UL, 3126476UL,  24291823UL, 4572415UL,  25064797UL, 18423115UL,
+    28196419UL, 24499308UL, 25611025UL, 10695728UL, 5615784UL,  30602967UL, 155834UL,   33292921UL, 30235608UL,
+    23820168UL, 17327129UL, 29274270UL, 16646318UL, 14484908UL, 21180022UL, 7034625UL,  18864644UL, 32351370UL,
+    13507188UL, 28806163UL, 30537011UL, 13782870UL, 23391918UL, 23688874UL, 26664664UL, 26189465UL, 10335703UL,
+    26716571UL, 5648290UL,  29538030UL, 10916096UL, 30606481UL, 32261757UL, 26584204UL, 31802949UL, 16159938UL,
+    10702817UL, 6734611UL,  28771017UL, 22517641UL, 4454221UL,  20026836UL, 17351231UL, 10610357UL, 24006082UL,
+    8346423UL,  33364734UL, 6896974UL,  6473406UL,  27398362UL, 18054395UL, 14280268UL, 14030222UL, 21342167UL,
+    6624536UL,  4155048UL,  10389268UL, 31294766UL, 19368195UL, 24509143UL, 31502623UL, 4747420UL,  7943757UL,
+    30477672UL, 28761691UL, 12955543UL, 19012690UL, 1541614UL,  23617088UL, 28854723UL, 26367977UL, 17393784UL,
+    2358558UL,  28813660UL, 7782762UL,  21509977UL, 26770878UL, 31537971UL, 1839009UL,  27942462UL, 19798724UL,
+    10385319UL, 14336032UL, 28579323UL, 29132206UL, 3743359UL,  25132966UL, 13150599UL, 22817231UL, 25817074UL,
+    17393301UL, 14503973UL, 31709050UL, 5783375UL,  5516613UL,  11196347UL, 568256UL,   32876866UL, 4779231UL,
+    4594379UL,  12085171UL, 29028808UL, 6768914UL,  13693737UL, 31949582UL, 17125455UL, 16391587UL, 32616961UL,
+    9146422UL,  16137111UL, 15561097UL, 24745056UL, 18310072UL, 21417597UL, 21742058UL, 17266894UL, 30657252UL,
+    7184037UL,  31773167UL, 9841006UL,  31321107UL, 18726751UL, 17081202UL, 10513453UL,
+};
+uint32_t rand_arr_26_b26_w32_arr[1024] = {
+    44834527UL, 23960486UL, 28851186UL, 63978911UL, 46626624UL, 58973535UL, 46128586UL, 33578249UL, 48085202UL,
+    62691450UL, 19699219UL, 22575637UL, 10984776UL, 2811647UL,  52402111UL, 56394541UL, 8798027UL,  6696199UL,
+    17131540UL, 30365506UL, 55337008UL, 2031017UL,  63878683UL, 412343UL,   40996204UL, 422307UL,   40640404UL,
+    14124856UL, 3402354UL,  6788069UL,  53655825UL, 43923088UL, 11116080UL, 49486842UL, 1210869UL,  28398608UL,
+    2956416UL,  7748800UL,  36802858UL, 31744397UL, 33855708UL, 7132911UL,  20739267UL, 773347UL,   3068936UL,
+    13386562UL, 52091546UL, 15014885UL, 54183810UL, 163373UL,   30190600UL, 4496202UL,  26143262UL, 36956447UL,
+    26195478UL, 39935726UL, 1259219UL,  64233968UL, 31467155UL, 47668103UL, 57089042UL, 56587782UL, 54780445UL,
+    57727787UL, 8737917UL,  47698977UL, 5096590UL,  18021548UL, 32252292UL, 14020633UL, 9892752UL,  56481244UL,
+    35709946UL, 45133324UL, 17828642UL, 18553462UL, 46776130UL, 16733799UL, 38949042UL, 48800866UL, 22754133UL,
+    49612303UL, 25834970UL, 13445624UL, 55750873UL, 45123011UL, 43719628UL, 2375403UL,  5826415UL,  27732774UL,
+    15393410UL, 26690036UL, 3703320UL,  15265517UL, 45779143UL, 51163416UL, 50288438UL, 7006970UL,  43227616UL,
+    3700630UL,  56402925UL, 11593206UL, 55894692UL, 42168225UL, 55716070UL, 25625628UL, 8709304UL,  13519003UL,
+    8561985UL,  29245515UL, 40618736UL, 17625841UL, 3475985UL,  9897364UL,  34653892UL, 17701931UL, 36162661UL,
+    43398544UL, 20405737UL, 46360050UL, 14452353UL, 51830409UL, 33656378UL, 17864737UL, 12858648UL, 6458384UL,
+    55421383UL, 61727916UL, 57414332UL, 13523030UL, 52694668UL, 42249202UL, 380116UL,   36440214UL, 16221605UL,
+    19355864UL, 50974839UL, 14334239UL, 25344325UL, 53162359UL, 11722862UL, 57297615UL, 20994122UL, 19218448UL,
+    48022781UL, 26395128UL, 45874204UL, 58276936UL, 7650042UL,  60264509UL, 19635440UL, 37799641UL, 48451296UL,
+    19629266UL, 43291116UL, 24258937UL, 53706001UL, 26841164UL, 21604040UL, 57064793UL, 28588141UL, 7311662UL,
+    27025133UL, 46809122UL, 20568114UL, 58411090UL, 54551127UL, 47038352UL, 4180436UL,  37504597UL, 60150334UL,
+    18449303UL, 55117355UL, 11138325UL, 50455412UL, 29710374UL, 53586459UL, 22730323UL, 14274215UL, 45294168UL,
+    37585445UL, 14161006UL, 50477053UL, 9107234UL,  18589314UL, 5065615UL,  23558851UL, 27807033UL, 9622895UL,
+    28412907UL, 65003381UL, 34709815UL, 22982068UL, 6339083UL,  51772579UL, 38495915UL, 59040265UL, 30168128UL,
+    1604181UL,  16022069UL, 38230719UL, 6396647UL,  49349855UL, 24724020UL, 54974368UL, 36020377UL, 42848892UL,
+    4793869UL,  65404082UL, 45765054UL, 2906972UL,  9770712UL,  18990094UL, 37329110UL, 17154935UL, 63464985UL,
+    63289759UL, 67019756UL, 31365283UL, 44090467UL, 19404678UL, 17602545UL, 61939648UL, 34570714UL, 7872735UL,
+    55498319UL, 31125354UL, 48553962UL, 37799934UL, 52505830UL, 41378920UL, 56911186UL, 37872951UL, 34078976UL,
+    61290754UL, 49933260UL, 32362894UL, 4819856UL,  52384972UL, 60870238UL, 31448785UL, 27572159UL, 24563542UL,
+    57073683UL, 54030938UL, 29540472UL, 38348263UL, 19230691UL, 58636145UL, 20465945UL, 62850462UL, 4786203UL,
+    48999656UL, 47952223UL, 63247347UL, 4790528UL,  17132591UL, 25387558UL, 13023848UL, 49735247UL, 49394849UL,
+    4837066UL,  51374358UL, 37578406UL, 6800800UL,  64197392UL, 23088292UL, 58971929UL, 19276969UL, 37478411UL,
+    28604012UL, 54957295UL, 24051892UL, 24435803UL, 21698705UL, 21559991UL, 12951320UL, 31084291UL, 50165804UL,
+    25021048UL, 64891486UL, 24893322UL, 62102052UL, 53955997UL, 36213987UL, 19582565UL, 50982119UL, 40128296UL,
+    46276238UL, 1137861UL,  29928396UL, 14361962UL, 4388883UL,  19868028UL, 3852648UL,  34491208UL, 50428550UL,
+    14289988UL, 66184108UL, 38835566UL, 40733679UL, 45463518UL, 24273802UL, 28275727UL, 57601938UL, 27997112UL,
+    33383416UL, 12073351UL, 43688752UL, 62016699UL, 23229444UL, 6840374UL,  58035450UL, 20651023UL, 30074572UL,
+    58959910UL, 52962707UL, 48303204UL, 10797062UL, 52220141UL, 58883127UL, 34650445UL, 30591269UL, 40018693UL,
+    54693066UL, 8293404UL,  22618383UL, 62538438UL, 57752310UL, 32837468UL, 46430613UL, 11312431UL, 46648924UL,
+    37459779UL, 40315480UL, 38084485UL, 3051532UL,  56459367UL, 27819988UL, 26470059UL, 63482300UL, 18931182UL,
+    14647079UL, 3807566UL,  6034670UL,  35252993UL, 7320870UL,  45611576UL, 58903953UL, 34390195UL, 48643226UL,
+    49098406UL, 55857243UL, 10912154UL, 55033641UL, 62430420UL, 46728521UL, 29401771UL, 25512504UL, 64918761UL,
+    12047293UL, 49699004UL, 17620988UL, 14417962UL, 40954932UL, 59767554UL, 32279478UL, 66266140UL, 33891523UL,
+    53145042UL, 31313067UL, 8545197UL,  35025791UL, 58829710UL, 288491UL,   7816232UL,  38031620UL, 62200747UL,
+    43099292UL, 16612UL,    19499440UL, 18453906UL, 49481782UL, 13948256UL, 41412341UL, 5321890UL,  386478UL,
+    39692244UL, 26343196UL, 9219758UL,  30623558UL, 60043758UL, 18576238UL, 44016741UL, 59168371UL, 35308705UL,
+    5856420UL,  62962821UL, 5253138UL,  48091455UL, 5133784UL,  15149390UL, 19462106UL, 45787822UL, 58935781UL,
+    63203829UL, 64655467UL, 4990274UL,  52587206UL, 48622642UL, 40236262UL, 49834751UL, 5641904UL,  13549748UL,
+    31927878UL, 58858635UL, 20852612UL, 64190519UL, 53722007UL, 30929926UL, 1321031UL,  15321695UL, 58557903UL,
+    34201223UL, 29268678UL, 43952612UL, 4466925UL,  51176952UL, 38552380UL, 21985134UL, 15387350UL, 6099440UL,
+    5236160UL,  42675226UL, 3881146UL,  50059924UL, 608834UL,   44897451UL, 33553334UL, 45607767UL, 62546690UL,
+    66174715UL, 63621572UL, 39033264UL, 1677012UL,  47222657UL, 24157319UL, 50992009UL, 52909406UL, 6119303UL,
+    36529937UL, 42948469UL, 58036583UL, 60490360UL, 91503UL,    44174710UL, 494973UL,   18758227UL, 62071933UL,
+    33123767UL, 50024986UL, 9657375UL,  40028978UL, 55509417UL, 41339141UL, 42593830UL, 43670887UL, 25079500UL,
+    53521052UL, 5504269UL,  50715197UL, 406368UL,   44772926UL, 57580700UL, 51872233UL, 37404621UL, 52936145UL,
+    9870261UL,  16858387UL, 26046297UL, 38745700UL, 54699165UL, 34839334UL, 37061742UL, 27550688UL, 17491239UL,
+    61471626UL, 28609678UL, 1840996UL,  23570025UL, 37307128UL, 60129037UL, 45698353UL, 41206084UL, 6747859UL,
+    26006529UL, 18230831UL, 5018112UL,  32413343UL, 33503964UL, 958487UL,   8749014UL,  62960378UL, 35125414UL,
+    33238110UL, 46145246UL, 49803884UL, 22360034UL, 15609272UL, 61479995UL, 17763859UL, 31591536UL, 61274479UL,
+    51379286UL, 12936495UL, 56635667UL, 45823225UL, 3678066UL,  39951176UL, 47142791UL, 5489477UL,  59832832UL,
+    35725638UL, 18507550UL, 64224060UL, 41989536UL, 40887727UL, 45194719UL, 33311892UL, 2603444UL,  55742548UL,
+    29538187UL, 21758972UL, 49833572UL, 41119593UL, 13153242UL, 50888206UL, 23013659UL, 51130145UL, 49502245UL,
+    34624873UL, 2458477UL,  8624524UL,  6119541UL,  10146774UL, 7675638UL,  59849359UL, 29196333UL, 43105196UL,
+    25769900UL, 26100514UL, 14789611UL, 11831819UL, 22152317UL, 47316650UL, 9878523UL,  65583475UL, 53358918UL,
+    23361060UL, 23925173UL, 11175531UL, 12899683UL, 57459074UL, 42563621UL, 14524678UL, 64431446UL, 38498884UL,
+    36408941UL, 21228524UL, 8938268UL,  54862842UL, 22433660UL, 41510287UL, 42491275UL, 49989507UL, 22559770UL,
+    2947603UL,  18151044UL, 22292687UL, 63417267UL, 12402110UL, 55862825UL, 30610543UL, 35303470UL, 23944117UL,
+    49090025UL, 54936939UL, 37846525UL, 40393485UL, 24190768UL, 55522013UL, 40693446UL, 37926778UL, 21853693UL,
+    31240357UL, 38942023UL, 11031258UL, 32145609UL, 18628833UL, 60107856UL, 42978202UL, 26361590UL, 21069997UL,
+    47572162UL, 18644603UL, 42441284UL, 63990366UL, 50671353UL, 2781046UL,  53608339UL, 38231214UL, 55881184UL,
+    22130267UL, 61658924UL, 34354670UL, 41906449UL, 22244627UL, 27683803UL, 13770033UL, 7070821UL,  64001802UL,
+    35346464UL, 64044161UL, 9211452UL,  49851457UL, 7008502UL,  1796059UL,  20251162UL, 31796882UL, 14504021UL,
+    34860006UL, 51358852UL, 9507896UL,  10634349UL, 26941557UL, 56018957UL, 10373571UL, 50251628UL, 40772519UL,
+    32321039UL, 8928457UL,  10152928UL, 61232280UL, 13689690UL, 9998438UL,  48774144UL, 2529441UL,  17737433UL,
+    2908655UL,  15065083UL, 35686202UL, 13524342UL, 17152460UL, 1258870UL,  38902932UL, 52152546UL, 66120690UL,
+    55501264UL, 61778368UL, 61715529UL, 14228550UL, 47648231UL, 49072108UL, 21114044UL, 62954953UL, 30201034UL,
+    34016937UL, 35829490UL, 1386671UL,  46137742UL, 9941322UL,  44187452UL, 43538050UL, 66921211UL, 16064876UL,
+    39837161UL, 65541271UL, 49028763UL, 37544305UL, 213235UL,   7494822UL,  61418020UL, 59500708UL, 45867893UL,
+    35352287UL, 47121559UL, 3740172UL,  7452951UL,  33871697UL, 43014321UL, 1191035UL,  32407352UL, 63186097UL,
+    14853632UL, 11071720UL, 63691901UL, 64907741UL, 52089391UL, 12657245UL, 65277315UL, 61009104UL, 65228885UL,
+    41606529UL, 48902391UL, 47001721UL, 4855020UL,  10570535UL, 55205715UL, 55079992UL, 32469387UL, 28821647UL,
+    51283907UL, 13127366UL, 48263733UL, 8009922UL,  12670811UL, 13943833UL, 10568013UL, 65596508UL, 30299897UL,
+    41839781UL, 28585906UL, 31539140UL, 29185850UL, 3146999UL,  26087594UL, 41954626UL, 32001221UL, 61850704UL,
+    6675405UL,  28154747UL, 3759517UL,  43742047UL, 53871864UL, 39391548UL, 19884389UL, 65594021UL, 51046836UL,
+    8933741UL,  56037889UL, 44624736UL, 31762065UL, 27663477UL, 20196087UL, 3982691UL,  10840913UL, 49370719UL,
+    10668798UL, 21079872UL, 65522664UL, 57431136UL, 58688528UL, 5104852UL,  39282631UL, 29469935UL, 3210108UL,
+    39958443UL, 17130140UL, 32530684UL, 66768271UL, 31602385UL, 22031074UL, 36334874UL, 49194421UL, 62621226UL,
+    48472948UL, 35167476UL, 30981215UL, 17675999UL, 66645802UL, 6175672UL,  6996861UL,  37522777UL, 2518030UL,
+    61954046UL, 61594701UL, 7439629UL,  27819007UL, 39620039UL, 4533960UL,  57769779UL, 44525210UL, 60919318UL,
+    30387768UL, 55741265UL, 54490672UL, 14555729UL, 15812453UL, 26149062UL, 1959242UL,  25549570UL, 8317639UL,
+    27599522UL, 58777533UL, 10734530UL, 62907499UL, 54293515UL, 27132004UL, 49622425UL, 60357310UL, 43094987UL,
+    3930984UL,  15278120UL, 33303783UL, 9168669UL,  13284657UL, 31326341UL, 48046039UL, 14185109UL, 5163461UL,
+    52066658UL, 1372463UL,  50003449UL, 20481395UL, 35471702UL, 31039988UL, 13654466UL, 61950484UL, 52553209UL,
+    38143310UL, 19180780UL, 55879343UL, 30507873UL, 32696652UL, 12541808UL, 54812179UL, 36282671UL, 44435245UL,
+    20087219UL, 60165265UL, 21500652UL, 29385869UL, 9692767UL,  43194674UL, 9382917UL,  29910861UL, 57894682UL,
+    33029912UL, 18017588UL, 43225309UL, 40625813UL, 39681172UL, 60065981UL, 63745168UL, 51907593UL, 24015884UL,
+    5243910UL,  15243220UL, 50369032UL, 50898397UL, 46445938UL, 38913630UL, 7928816UL,  18801524UL, 58901541UL,
+    48962055UL, 49647679UL, 41529518UL, 54342932UL, 66036073UL, 51551796UL, 47606163UL, 56674011UL, 54034590UL,
+    43903400UL, 50008146UL, 63762727UL, 27305193UL, 42243253UL, 14458968UL, 53754967UL, 7142906UL,  14674261UL,
+    53890411UL, 37844697UL, 62479459UL, 24539391UL, 58271119UL, 38763037UL, 35161631UL, 28634748UL, 16469684UL,
+    50456363UL, 59658230UL, 48757153UL, 12217372UL, 63343393UL, 18188671UL, 27199969UL, 33201346UL, 56960105UL,
+    59104473UL, 34362763UL, 16359746UL, 38609250UL, 53686594UL, 28102231UL, 53606692UL, 24806557UL, 53112526UL,
+    49195200UL, 60124429UL, 55251422UL, 17091999UL, 17783756UL, 26659932UL, 43774570UL, 62481747UL, 19479461UL,
+    727282UL,   46550097UL, 6174501UL,  59571792UL, 58089138UL, 8323577UL,  55189364UL, 48836929UL, 25215278UL,
+    16851287UL, 53078015UL, 63006833UL, 28552769UL, 35926429UL, 6710665UL,  13859543UL, 53954968UL, 65780374UL,
+    43606186UL, 56621313UL, 27477654UL, 18287611UL, 31342467UL, 59948182UL, 57633576UL, 48366250UL, 28512278UL,
+    51686336UL, 39648622UL, 42708405UL, 19147841UL, 58756690UL, 31043967UL, 517099UL,   2900191UL,  15956441UL,
+    23256064UL, 19649114UL, 8433200UL,  66637159UL, 23105451UL, 7711467UL,  60195130UL, 55271401UL, 47384467UL,
+    24138609UL, 16294227UL, 34777735UL, 64872052UL, 43271979UL, 57183829UL, 34264863UL, 35795787UL, 8321974UL,
+    39943299UL, 3943978UL,  34501943UL, 18635287UL, 7120552UL,  3101378UL,  36215678UL, 4599250UL,  7188702UL,
+    12469228UL, 32538734UL, 14001146UL, 34478819UL, 37887084UL, 66679689UL, 35597617UL, 58038930UL, 16784892UL,
+    10404503UL, 15535029UL, 51043004UL, 54623839UL, 32842468UL, 64669481UL, 33045526UL, 10301233UL, 49603147UL,
+    57352068UL, 22805495UL, 2205018UL,  51134298UL, 38819853UL, 54236989UL, 17844470UL, 14580637UL, 4614122UL,
+    20508626UL, 20173918UL, 45386149UL, 18002198UL, 60004633UL, 42239051UL, 448934UL,   52698632UL, 18284728UL,
+    63261292UL, 57050531UL, 6513350UL,  39950151UL, 49250402UL, 23324050UL, 66367654UL, 1197345UL,  23577539UL,
+    63277864UL, 26441051UL, 4983622UL,  52745119UL, 38305368UL, 51524595UL, 52337281UL,
+};
+uint32_t rand_arr_27_b27_w32_arr[1024] = {
+    37002011UL,  37307707UL,  59119128UL,  130958791UL, 97602425UL,  84081428UL,  11586349UL,  73396385UL,  54882228UL,
+    12234555UL,  81347315UL,  120675840UL, 101582827UL, 77590258UL,  125070951UL, 19597829UL,  126718655UL, 30355719UL,
+    65247456UL,  114683385UL, 59396671UL,  80885924UL,  78930100UL,  796102UL,    26483389UL,  57518731UL,  123615360UL,
+    65982915UL,  89788142UL,  97934228UL,  12233831UL,  44681059UL,  24863759UL,  8458276UL,   42685617UL,  105339373UL,
+    87805769UL,  120626274UL, 96722078UL,  76011118UL,  122339999UL, 118232721UL, 108143175UL, 75938508UL,  95249886UL,
+    19452714UL,  82567773UL,  88740114UL,  84451583UL,  41881157UL,  112071369UL, 61809798UL,  80407282UL,  105094654UL,
+    11558380UL,  64677911UL,  37131926UL,  33192742UL,  2322768UL,   12673488UL,  2049267UL,   26396316UL,  31436166UL,
+    123404879UL, 13678583UL,  80703508UL,  60943901UL,  29501366UL,  130137235UL, 43979871UL,  74986802UL,  5629390UL,
+    47114568UL,  72313674UL,  121887322UL, 57686467UL,  34965589UL,  109110789UL, 94705862UL,  94080030UL,  120388197UL,
+    7978722UL,   104831193UL, 128590092UL, 16397377UL,  38394824UL,  66843730UL,  125934943UL, 55841068UL,  46034774UL,
+    98827428UL,  34084252UL,  47646585UL,  38188852UL,  69112957UL,  28996917UL,  112954958UL, 132208757UL, 129304720UL,
+    41377489UL,  79570987UL,  126874208UL, 922762UL,    9454750UL,   41825856UL,  18761417UL,  17183052UL,  119735387UL,
+    91730591UL,  22109502UL,  75951742UL,  45180126UL,  110964593UL, 17098846UL,  100893025UL, 12780168UL,  56332370UL,
+    85895152UL,  88259433UL,  133335220UL, 101236024UL, 100141491UL, 68960245UL,  66591958UL,  112940828UL, 14248646UL,
+    121952392UL, 119740044UL, 55963979UL,  115205550UL, 80744255UL,  13353872UL,  110286185UL, 55618268UL,  108120862UL,
+    23735326UL,  125545395UL, 109869925UL, 15044215UL,  92475649UL,  4314478UL,   17040899UL,  68954487UL,  113027484UL,
+    4084637UL,   97684520UL,  131375487UL, 12342819UL,  88144591UL,  17838061UL,  45130014UL,  47948423UL,  99214107UL,
+    18761877UL,  48190141UL,  71225776UL,  115778358UL, 117260890UL, 76134388UL,  53650923UL,  89424174UL,  44869190UL,
+    77189279UL,  89966685UL,  12897562UL,  3281161UL,   63384814UL,  73796743UL,  127187603UL, 27199274UL,  42355940UL,
+    28940011UL,  71252552UL,  74699180UL,  75839114UL,  23025125UL,  16692831UL,  3174223UL,   124647284UL, 104295499UL,
+    99564035UL,  28741342UL,  92097442UL,  42193425UL,  44299894UL,  46484319UL,  117994234UL, 100027742UL, 73973409UL,
+    133699426UL, 80467323UL,  84070826UL,  57615071UL,  36032872UL,  114713474UL, 58498573UL,  23791218UL,  106916622UL,
+    126355444UL, 81963908UL,  85366537UL,  74415284UL,  68255497UL,  59931223UL,  128951350UL, 87816293UL,  22221343UL,
+    56763122UL,  46288741UL,  90845797UL,  24160469UL,  62240796UL,  28657756UL,  15531236UL,  90510433UL,  56315783UL,
+    51892614UL,  83334758UL,  46681057UL,  39258902UL,  116202732UL, 116759226UL, 98630511UL,  19855804UL,  29015668UL,
+    80761985UL,  60613180UL,  37485309UL,  85136149UL,  16085548UL,  106044867UL, 117867358UL, 87984698UL,  89535463UL,
+    8455910UL,   104799268UL, 6203479UL,   117723742UL, 79133977UL,  104458953UL, 79532286UL,  124703866UL, 117019635UL,
+    15946980UL,  97689018UL,  71821127UL,  50288035UL,  127415833UL, 115977956UL, 19587237UL,  18807367UL,  5504783UL,
+    68778434UL,  107960870UL, 91947598UL,  126750591UL, 82014804UL,  77425261UL,  92234290UL,  18235404UL,  50801527UL,
+    120944743UL, 61321471UL,  33892881UL,  116308234UL, 43496647UL,  33286031UL,  120762977UL, 116215007UL, 113251846UL,
+    103755862UL, 34513236UL,  22821497UL,  108431905UL, 88998371UL,  25699952UL,  109488241UL, 103806935UL, 128783831UL,
+    106768365UL, 121766800UL, 3020461UL,   68514839UL,  81819896UL,  24720042UL,  34663122UL,  39702964UL,  119285063UL,
+    127959561UL, 32245237UL,  118876701UL, 80407837UL,  132790347UL, 65017541UL,  49811415UL,  64797202UL,  7342087UL,
+    91252221UL,  25483680UL,  77523466UL,  130555494UL, 25211863UL,  82340556UL,  21432039UL,  119226877UL, 15364478UL,
+    95945879UL,  9911877UL,   67240388UL,  102326200UL, 1009633UL,   42587847UL,  74380672UL,  115191783UL, 26358622UL,
+    24908104UL,  10603472UL,  32287277UL,  48790720UL,  101645105UL, 87688254UL,  22947673UL,  24535505UL,  69743549UL,
+    61546222UL,  10748438UL,  63880744UL,  85515291UL,  110062262UL, 94801906UL,  104626438UL, 46577011UL,  55114115UL,
+    28088546UL,  93585375UL,  8989841UL,   44603418UL,  109789848UL, 68547749UL,  39529956UL,  110335517UL, 11785885UL,
+    80603069UL,  17529088UL,  5390931UL,   105753167UL, 12347803UL,  90837950UL,  4778592UL,   70643012UL,  75124217UL,
+    70856567UL,  31093628UL,  129669068UL, 28218741UL,  33463836UL,  60125763UL,  123367257UL, 133005002UL, 58399508UL,
+    31740807UL,  46098429UL,  58291326UL,  48085148UL,  85392237UL,  94811569UL,  98773810UL,  90963574UL,  50140353UL,
+    122962318UL, 95310295UL,  23401555UL,  1172381UL,   115867437UL, 3209281UL,   113066315UL, 51773009UL,  4375774UL,
+    104031374UL, 19148645UL,  130776981UL, 32291177UL,  107735338UL, 134067709UL, 83519206UL,  33882333UL,  38599843UL,
+    109947985UL, 66267314UL,  97186630UL,  55526241UL,  125372310UL, 7670913UL,   128456253UL, 97481959UL,  64654592UL,
+    46165780UL,  25148278UL,  36971544UL,  89013177UL,  102412182UL, 71669736UL,  87650272UL,  38925545UL,  8983276UL,
+    39858121UL,  82166715UL,  39571213UL,  21756633UL,  26945589UL,  55051693UL,  13172361UL,  133292869UL, 82511285UL,
+    20486923UL,  1149665UL,   58331958UL,  94931134UL,  72171148UL,  58136054UL,  36567512UL,  108956439UL, 64762259UL,
+    102923035UL, 100878897UL, 89983884UL,  39940136UL,  80313001UL,  14543526UL,  89647458UL,  42259105UL,  44704810UL,
+    107987522UL, 47803545UL,  811314UL,    41487341UL,  131749664UL, 48649282UL,  3803118UL,   44434815UL,  50078147UL,
+    15725879UL,  105536264UL, 74899963UL,  108052594UL, 63385630UL,  132099770UL, 6869069UL,   123374312UL, 51873759UL,
+    13232523UL,  58212846UL,  73462234UL,  19154099UL,  10743210UL,  126681652UL, 299616UL,    67272332UL,  69739335UL,
+    26188584UL,  24330028UL,  53067917UL,  82553697UL,  86387630UL,  63636993UL,  109129316UL, 92499522UL,  17901537UL,
+    129480747UL, 115267718UL, 103412093UL, 94946468UL,  101093725UL, 40731240UL,  59605115UL,  41955760UL,  230524UL,
+    1000329UL,   94652295UL,  37529906UL,  48807077UL,  113923563UL, 133032602UL, 103475928UL, 43169468UL,  73681348UL,
+    21106337UL,  123668619UL, 126050204UL, 66501555UL,  119844713UL, 74458345UL,  42135646UL,  107612370UL, 131121367UL,
+    59795230UL,  108199982UL, 68348756UL,  35908991UL,  75576657UL,  5988738UL,   48457186UL,  37174983UL,  3529210UL,
+    36803715UL,  85539640UL,  14072094UL,  64979954UL,  20275699UL,  125548262UL, 43923325UL,  5969309UL,   22820962UL,
+    102593700UL, 20070402UL,  133718822UL, 43078118UL,  2342563UL,   57468653UL,  88190542UL,  108698798UL, 11051999UL,
+    43262076UL,  55139652UL,  82671753UL,  32518549UL,  127336910UL, 23242074UL,  42858532UL,  64867665UL,  110247008UL,
+    70003778UL,  14538186UL,  132673087UL, 115419232UL, 118322958UL, 47829715UL,  110600020UL, 95681285UL,  119720577UL,
+    89946084UL,  89483759UL,  35271521UL,  11697977UL,  57024791UL,  9853165UL,   88556083UL,  129521227UL, 100839669UL,
+    92915606UL,  125374360UL, 98101099UL,  96444157UL,  25165355UL,  98455913UL,  61855483UL,  74969317UL,  100326495UL,
+    131483538UL, 14265406UL,  75126142UL,  123368738UL, 77985833UL,  132345059UL, 112423844UL, 116961734UL, 76886856UL,
+    132864644UL, 25065008UL,  76956322UL,  13894268UL,  53001828UL,  49421460UL,  4038196UL,   76402533UL,  11555396UL,
+    100928573UL, 107269474UL, 131336619UL, 120337459UL, 75600688UL,  10853821UL,  11331806UL,  71104774UL,  2543069UL,
+    114379900UL, 12048045UL,  82673363UL,  39135645UL,  94783991UL,  8357013UL,   128350139UL, 33258591UL,  108594217UL,
+    46321793UL,  87737550UL,  70417533UL,  133189395UL, 59257212UL,  119658279UL, 53199094UL,  83877858UL,  67767003UL,
+    57854854UL,  110573800UL, 81544664UL,  94469181UL,  7203055UL,   88431517UL,  90832952UL,  131603971UL, 45960708UL,
+    108772032UL, 12522314UL,  123206272UL, 72149895UL,  69351719UL,  115069469UL, 14217099UL,  7943678UL,   16275081UL,
+    92757070UL,  56653130UL,  1283401UL,   78196919UL,  14736821UL,  77836532UL,  34078325UL,  6495213UL,   125944454UL,
+    107605144UL, 78635671UL,  121180149UL, 81296158UL,  21718191UL,  109133215UL, 19138490UL,  57798906UL,  95215677UL,
+    112600626UL, 73501999UL,  24624186UL,  117034585UL, 83664479UL,  119363033UL, 24997224UL,  72329270UL,  57927649UL,
+    57812562UL,  126362467UL, 68415533UL,  66914204UL,  26908589UL,  60103705UL,  25744599UL,  87916054UL,  60317909UL,
+    86833280UL,  45021747UL,  67243058UL,  119625234UL, 64077200UL,  94655323UL,  6347398UL,   63391277UL,  101287243UL,
+    48536923UL,  42122220UL,  12447157UL,  20974623UL,  95675938UL,  117165376UL, 4668063UL,   11419227UL,  125003359UL,
+    23398059UL,  42233002UL,  18497945UL,  31134418UL,  103217540UL, 22142114UL,  77214120UL,  107691059UL, 69298461UL,
+    69007443UL,  101639542UL, 57282839UL,  95829046UL,  8892085UL,   51604613UL,  88590583UL,  2517417UL,   4822926UL,
+    32309481UL,  69355394UL,  56794919UL,  132358017UL, 123347146UL, 39036091UL,  118287453UL, 24475958UL,  82470815UL,
+    38335166UL,  58600562UL,  85323413UL,  23635249UL,  92850190UL,  3009377UL,   38972130UL,  52335348UL,  66197659UL,
+    93338434UL,  61649414UL,  53206399UL,  66003338UL,  87033564UL,  3383764UL,   10278558UL,  86641878UL,  59172847UL,
+    93059366UL,  63700487UL,  48442822UL,  105749931UL, 31011333UL,  107967214UL, 127415658UL, 111714715UL, 62156120UL,
+    64081232UL,  46255564UL,  57707062UL,  78933580UL,  84243759UL,  49070113UL,  131584256UL, 34785661UL,  18329511UL,
+    49735872UL,  116807910UL, 58988789UL,  125526118UL, 30626860UL,  118911990UL, 92154066UL,  5502801UL,   106741013UL,
+    42868936UL,  4458398UL,   94677498UL,  69625627UL,  52878526UL,  93167842UL,  31494674UL,  15940293UL,  83003795UL,
+    133634907UL, 8802212UL,   8586085UL,   50849260UL,  118316393UL, 34972278UL,  23774923UL,  21342987UL,  84474544UL,
+    40042599UL,  69123763UL,  43737590UL,  16022571UL,  78274572UL,  64526055UL,  119924910UL, 32353539UL,  36510652UL,
+    17537073UL,  108796052UL, 99916620UL,  76477026UL,  92033873UL,  113367845UL, 62675369UL,  67290237UL,  99777431UL,
+    96404300UL,  39881231UL,  20486929UL,  112458753UL, 120682993UL, 52860375UL,  99129873UL,  84213397UL,  23486030UL,
+    3397168UL,   65285336UL,  81741513UL,  36711426UL,  97981440UL,  51067838UL,  120099899UL, 125231457UL, 57722668UL,
+    41279330UL,  59505891UL,  5332376UL,   115796985UL, 6542928UL,   59220655UL,  62950573UL,  81677044UL,  16917645UL,
+    2906077UL,   106516179UL, 68257152UL,  90140595UL,  73282478UL,  89772144UL,  50605669UL,  63820930UL,  117669629UL,
+    40744096UL,  127569399UL, 122051169UL, 113293890UL, 117043819UL, 20409405UL,  10382553UL,  111576049UL, 6846250UL,
+    133597279UL, 67856030UL,  62107913UL,  82368603UL,  155932UL,    34163699UL,  28437934UL,  130023291UL, 113943923UL,
+    64088113UL,  55136697UL,  56364000UL,  31017598UL,  44165050UL,  11090233UL,  291013UL,    107145430UL, 27331988UL,
+    18304540UL,  111432729UL, 113723822UL, 43010069UL,  54702188UL,  97549825UL,  66630591UL,  101890353UL, 4586602UL,
+    103663502UL, 44739767UL,  95889672UL,  3133060UL,   27150972UL,  6287634UL,   115887581UL, 62691507UL,  80756961UL,
+    102855184UL, 98523747UL,  113027576UL, 98357669UL,  126146087UL, 52730124UL,  101514996UL, 122393063UL, 55406841UL,
+    81187862UL,  105853641UL, 36666567UL,  77380252UL,  120221665UL, 5649560UL,   103601624UL, 41652845UL,  82763346UL,
+    33053383UL,  112383460UL, 59271000UL,  115127588UL, 39208803UL,  15893236UL,  95510845UL,  117180309UL, 103063969UL,
+    62213675UL,  91573965UL,  125321492UL, 108053841UL, 55999186UL,  58870385UL,  129765331UL, 47454158UL,  104341600UL,
+    37060601UL,  49647754UL,  40145232UL,  82420633UL,  130245328UL, 133079386UL, 68820193UL,  32529656UL,  132718035UL,
+    21091752UL,  105530479UL, 17738145UL,  21163244UL,  50251824UL,  122681079UL, 25580367UL,  102349591UL, 67947479UL,
+    103379864UL, 15699686UL,  92150769UL,  111973035UL, 117358778UL, 7541165UL,   94370146UL,  133215756UL, 45824107UL,
+    83915740UL,  120991840UL, 1258514UL,   117339558UL, 118488695UL, 18530492UL,  94274386UL,  21303625UL,  61356859UL,
+    19868670UL,  103113685UL, 26835461UL,  71042332UL,  40545824UL,  29789375UL,  29303980UL,  28935338UL,  40237451UL,
+    115013600UL, 99973094UL,  51118369UL,  114957498UL, 29204244UL,  20777393UL,  27505009UL,  3352698UL,   60957256UL,
+    91144995UL,  58916627UL,  130389461UL, 21322138UL,  77286097UL,  1064117UL,   110938206UL, 11620938UL,  107558770UL,
+    64642585UL,  122233870UL, 69510937UL,  50963004UL,  12228215UL,  19836689UL,  41973664UL,  15906039UL,  25709904UL,
+    89846609UL,  134192200UL, 114276910UL, 105464754UL, 950823UL,    3782803UL,   106575284UL, 67801434UL,  126863146UL,
+    32531837UL,  72603358UL,  1198321UL,   70523393UL,  118552789UL, 46220512UL,  111168312UL, 78109718UL,  87547889UL,
+    102530659UL, 7779792UL,   111915710UL, 97260953UL,  10663537UL,  14700586UL,  44148836UL,  39655872UL,  18139100UL,
+    95390931UL,  23341963UL,  125295885UL, 25464058UL,  80729303UL,  41349073UL,  115961574UL, 90317164UL,  57968607UL,
+    25029028UL,  119870138UL, 83504202UL,  84835029UL,  91785779UL,  19692957UL,  83023499UL,  76288113UL,  63878679UL,
+    4952972UL,   79233356UL,  83703639UL,  3128052UL,   112405494UL, 130263683UL, 119348532UL,
+};
+uint32_t rand_arr_28_b28_w32_arr[1024] = {
+    48343420UL,  32067618UL,  133501561UL, 192829368UL, 148999938UL, 103004681UL, 255008650UL, 37660928UL,  144238380UL,
+    51820278UL,  185474587UL, 113512287UL, 267810425UL, 38975996UL,  12936839UL,  106450999UL, 223975939UL, 44621762UL,
+    19373994UL,  142345600UL, 202967137UL, 74525763UL,  98381583UL,  27660607UL,  64599497UL,  204745553UL, 30127811UL,
+    138305463UL, 57679255UL,  155986874UL, 146126224UL, 192116416UL, 159343757UL, 7847295UL,   23389493UL,  166676672UL,
+    226374807UL, 132005339UL, 242611566UL, 79585224UL,  30798498UL,  164385330UL, 104601155UL, 115454726UL, 6986525UL,
+    249615711UL, 166432804UL, 76402577UL,  101943931UL, 69443802UL,  113572857UL, 251395934UL, 110355233UL, 82649541UL,
+    5181182UL,   114907097UL, 3352547UL,   48348354UL,  74816733UL,  147391793UL, 39357571UL,  175456264UL, 74801424UL,
+    193169361UL, 58836410UL,  116952433UL, 64805669UL,  72565246UL,  118555417UL, 113988155UL, 180940505UL, 214525759UL,
+    235470390UL, 28338690UL,  146522892UL, 121732844UL, 6164871UL,   205683483UL, 36207146UL,  70897634UL,  263487283UL,
+    178610719UL, 107393845UL, 3085809UL,   23211910UL,  176249315UL, 12322286UL,  68311881UL,  49055879UL,  90382085UL,
+    6341301UL,   214104361UL, 73560148UL,  133779805UL, 1323582UL,   160980912UL, 181885288UL, 45270341UL,  100819826UL,
+    72403130UL,  18162494UL,  89437417UL,  246580510UL, 102465956UL, 171491231UL, 254189300UL, 208861383UL, 31830682UL,
+    240690097UL, 187516055UL, 12872574UL,  201882808UL, 145792311UL, 57683492UL,  177329652UL, 198530715UL, 143430914UL,
+    89262127UL,  209510152UL, 60992191UL,  200223039UL, 83305221UL,  60477910UL,  168415084UL, 227069279UL, 236136736UL,
+    150460168UL, 7883095UL,   222879832UL, 52964552UL,  12566154UL,  181448066UL, 77990720UL,  239340350UL, 9879624UL,
+    123342751UL, 227895591UL, 249040943UL, 85718637UL,  244966334UL, 209175903UL, 127866906UL, 222459618UL, 106559273UL,
+    134707021UL, 161760965UL, 139861237UL, 36954968UL,  141072986UL, 66039746UL,  36249223UL,  201418191UL, 192766552UL,
+    159356653UL, 75900703UL,  199067627UL, 123030369UL, 216703081UL, 37939303UL,  113221071UL, 169904815UL, 112410691UL,
+    56409122UL,  40818812UL,  161968272UL, 28694096UL,  15527070UL,  68507926UL,  160030236UL, 38739698UL,  77600313UL,
+    71105159UL,  226138540UL, 107244033UL, 11058802UL,  255147554UL, 52170910UL,  170927544UL, 157406637UL, 28690021UL,
+    178667779UL, 18833983UL,  40225977UL,  139824414UL, 70164445UL,  128091560UL, 77555240UL,  117798288UL, 19983785UL,
+    162686475UL, 25554194UL,  175339871UL, 223492630UL, 61291916UL,  65590205UL,  150940151UL, 181191169UL, 159848968UL,
+    201933400UL, 136254891UL, 57165860UL,  257875172UL, 25483850UL,  44157645UL,  29543087UL,  57005603UL,  254631406UL,
+    262316476UL, 163343532UL, 106713994UL, 67873636UL,  67330989UL,  121558153UL, 118280355UL, 97493975UL,  209234539UL,
+    248507513UL, 208676867UL, 31372606UL,  140531626UL, 121453589UL, 161745254UL, 190947201UL, 10518486UL,  168484710UL,
+    26886992UL,  221570138UL, 185172071UL, 226892887UL, 106303827UL, 140731027UL, 193424168UL, 100045964UL, 136072736UL,
+    216755286UL, 256294396UL, 234685486UL, 135918472UL, 124927805UL, 235351014UL, 1121175UL,   25175481UL,  186265163UL,
+    95967545UL,  224202652UL, 160151076UL, 191335355UL, 185800509UL, 250125556UL, 175348481UL, 213191044UL, 202937442UL,
+    190869288UL, 180559712UL, 83999285UL,  72924590UL,  242066533UL, 134699320UL, 229557849UL, 177777586UL, 95624306UL,
+    249891583UL, 215787782UL, 161224511UL, 194022557UL, 79354155UL,  176122928UL, 77767797UL,  192772433UL, 255261003UL,
+    225584968UL, 6003079UL,   93189860UL,  181281970UL, 203319371UL, 163422027UL, 210987437UL, 173353017UL, 177344721UL,
+    168534001UL, 218114856UL, 103258953UL, 139609018UL, 240926780UL, 187143554UL, 231812085UL, 106237533UL, 61841514UL,
+    74747721UL,  224766458UL, 29884847UL,  151065794UL, 205074763UL, 111195388UL, 2947730UL,   151061876UL, 119710989UL,
+    93467120UL,  259907592UL, 49024224UL,  68837149UL,  213447780UL, 38409675UL,  210624367UL, 186247218UL, 70468752UL,
+    212610729UL, 196991555UL, 178244798UL, 220858381UL, 45650286UL,  185337440UL, 182168000UL, 18840367UL,  81087728UL,
+    183967833UL, 112500121UL, 135738476UL, 70519562UL,  96712812UL,  182940467UL, 78064504UL,  240347506UL, 86664296UL,
+    157467672UL, 45503199UL,  221283603UL, 251747738UL, 10049330UL,  63106366UL,  225956297UL, 212999312UL, 102133966UL,
+    75165447UL,  28327342UL,  129686688UL, 227665114UL, 66848833UL,  34503858UL,  30599223UL,  203830549UL, 73201223UL,
+    62114421UL,  253832180UL, 191931190UL, 112865387UL, 111705678UL, 165604336UL, 107996664UL, 78797378UL,  28811360UL,
+    189612783UL, 127394167UL, 171776403UL, 236803577UL, 258025772UL, 242135756UL, 235086332UL, 216707321UL, 111803753UL,
+    107607826UL, 168212969UL, 18547289UL,  27615885UL,  104445804UL, 211651064UL, 108662742UL, 90909210UL,  16417981UL,
+    262234315UL, 208355146UL, 119390143UL, 71966813UL,  56219805UL,  240364691UL, 256769650UL, 199785986UL, 40298142UL,
+    222806231UL, 227432938UL, 131937532UL, 199867367UL, 112645454UL, 173685865UL, 71623109UL,  242765828UL, 153914834UL,
+    254527915UL, 203910673UL, 219729255UL, 223095083UL, 184299626UL, 170391581UL, 119934878UL, 40439387UL,  75639015UL,
+    222772568UL, 16764691UL,  159207219UL, 201609999UL, 38355749UL,  105648406UL, 8926237UL,   104314076UL, 196405385UL,
+    90259967UL,  114836075UL, 174684963UL, 226844224UL, 97093268UL,  178827324UL, 181614570UL, 207910569UL, 85060966UL,
+    143280409UL, 240206041UL, 163069940UL, 47364978UL,  20772135UL,  137824962UL, 205092815UL, 4213665UL,   84935809UL,
+    155525531UL, 74526687UL,  259181526UL, 33873677UL,  161419375UL, 150306002UL, 249137332UL, 48263544UL,  188972867UL,
+    52461791UL,  157773290UL, 221584845UL, 166512633UL, 52864897UL,  154865303UL, 9439090UL,   104785297UL, 80623169UL,
+    85855072UL,  116086019UL, 178233639UL, 100413076UL, 44732266UL,  27459378UL,  8756524UL,   148434432UL, 165955284UL,
+    240176863UL, 207245111UL, 266582715UL, 198901342UL, 219252520UL, 155023268UL, 261557210UL, 68442065UL,  76391805UL,
+    158790705UL, 191213833UL, 141071202UL, 105192617UL, 68252101UL,  2930817UL,   136951604UL, 259168301UL, 196947450UL,
+    149140396UL, 249994371UL, 248273225UL, 117475185UL, 138834144UL, 145596939UL, 143192203UL, 223965819UL, 134696235UL,
+    221418728UL, 247198469UL, 170828624UL, 12727889UL,  265578161UL, 98569672UL,  243439292UL, 239168954UL, 154472069UL,
+    134979022UL, 7540615UL,   197822622UL, 78155571UL,  182299249UL, 148552839UL, 153583050UL, 75948675UL,  95580862UL,
+    259481350UL, 17256423UL,  80254417UL,  153406830UL, 143427544UL, 105716009UL, 149147177UL, 9780770UL,   126110212UL,
+    76433890UL,  153670412UL, 156183784UL, 77816249UL,  101025789UL, 212349933UL, 170056145UL, 265847129UL, 106957150UL,
+    121295906UL, 154347187UL, 184837844UL, 17535589UL,  159408047UL, 50277492UL,  50147156UL,  92649253UL,  39938516UL,
+    81483127UL,  48652025UL,  260875915UL, 97020668UL,  106660877UL, 109008825UL, 193867255UL, 2574541UL,   236888695UL,
+    231181431UL, 217814607UL, 254880582UL, 103886653UL, 169260529UL, 221858909UL, 247024521UL, 5942822UL,   237457689UL,
+    242073889UL, 190063899UL, 232994880UL, 215590736UL, 14425692UL,  162972777UL, 44731208UL,  24696256UL,  94907604UL,
+    262514865UL, 119587687UL, 159862107UL, 21275351UL,  96701983UL,  253766881UL, 99364111UL,  123907393UL, 123143714UL,
+    117722372UL, 99070023UL,  128557445UL, 83045394UL,  133789579UL, 90797352UL,  219648755UL, 201921925UL, 120003355UL,
+    15942442UL,  217033835UL, 253994826UL, 19073798UL,  75034211UL,  88653496UL,  68271269UL,  244512859UL, 53998738UL,
+    63039367UL,  167925219UL, 58973068UL,  94842042UL,  123699018UL, 67220908UL,  258048285UL, 145091646UL, 38265956UL,
+    25455342UL,  87881556UL,  187388325UL, 60935228UL,  70721958UL,  210227092UL, 20502504UL,  69371689UL,  120267339UL,
+    89369430UL,  24644113UL,  78088333UL,  42500067UL,  42797695UL,  266786096UL, 225753046UL, 180947878UL, 210904165UL,
+    109033007UL, 39579525UL,  69394610UL,  137716126UL, 256493494UL, 96786635UL,  88405954UL,  39765758UL,  7742052UL,
+    171794323UL, 217467011UL, 3331891UL,   179321108UL, 241011056UL, 44840613UL,  115821640UL, 172738685UL, 143174695UL,
+    228296532UL, 119588640UL, 83380441UL,  217128808UL, 196310874UL, 178476177UL, 72913064UL,  95596133UL,  249735220UL,
+    263459751UL, 141550886UL, 101895406UL, 202042047UL, 2776896UL,   154086376UL, 173287067UL, 256628943UL, 229571745UL,
+    204369737UL, 255622925UL, 192055702UL, 151566108UL, 130447384UL, 102201648UL, 65145836UL,  147417754UL, 216176634UL,
+    125806104UL, 26560492UL,  37464476UL,  28074177UL,  131640660UL, 225833563UL, 117259437UL, 111372462UL, 114087564UL,
+    207678510UL, 225718087UL, 265049924UL, 82320607UL,  213029389UL, 262441168UL, 257476461UL, 11069087UL,  155308220UL,
+    100529406UL, 118572453UL, 10644976UL,  21502024UL,  233213032UL, 99916985UL,  247888404UL, 175706864UL, 180765632UL,
+    7377260UL,   181406898UL, 118631093UL, 13486497UL,  131352713UL, 15935021UL,  120093526UL, 54544682UL,  243591068UL,
+    31616510UL,  165685816UL, 204119286UL, 191539323UL, 66364466UL,  1655879UL,   185561343UL, 120266806UL, 44002829UL,
+    30626348UL,  42644405UL,  2975809UL,   12207343UL,  155797481UL, 24232743UL,  82944085UL,  141042083UL, 21495228UL,
+    259077775UL, 59554658UL,  64524953UL,  102430255UL, 23994337UL,  159869735UL, 41027761UL,  39256517UL,  29063440UL,
+    170840303UL, 162612422UL, 181428286UL, 237394450UL, 195939965UL, 78232318UL,  135030024UL, 64010018UL,  99932436UL,
+    168140487UL, 65407902UL,  150903219UL, 53647391UL,  229468250UL, 104940578UL, 58840474UL,  208540017UL, 125315921UL,
+    87372919UL,  244216311UL, 186309588UL, 199784346UL, 45655850UL,  205486787UL, 148672258UL, 198710109UL, 190007669UL,
+    265566453UL, 19557904UL,  249796141UL, 34824956UL,  146794177UL, 165061808UL, 114865205UL, 157733394UL, 185198686UL,
+    218138446UL, 220716123UL, 168098582UL, 70407404UL,  57588730UL,  157544271UL, 172960010UL, 8164402UL,   86800699UL,
+    62991791UL,  114721587UL, 157791591UL, 59388454UL,  203598662UL, 142022653UL, 93072727UL,  127057730UL, 262550116UL,
+    229880065UL, 186768599UL, 127199977UL, 172340765UL, 224364295UL, 70360260UL,  259436238UL, 68266380UL,  3357428UL,
+    143554626UL, 232714074UL, 245840240UL, 106809279UL, 264178003UL, 105925632UL, 113435886UL, 78294444UL,  110223844UL,
+    226018644UL, 100772360UL, 63510605UL,  90586588UL,  242749833UL, 125800631UL, 255914671UL, 194536201UL, 82449527UL,
+    242231095UL, 88746768UL,  98398433UL,  14013568UL,  49128937UL,  264950684UL, 94891647UL,  22613640UL,  104082110UL,
+    156661020UL, 102696140UL, 233605653UL, 202531667UL, 24854646UL,  267768891UL, 56300704UL,  224201347UL, 66592052UL,
+    112716934UL, 210607392UL, 142875485UL, 129255307UL, 247308285UL, 156418737UL, 181538977UL, 19362951UL,  105061518UL,
+    99396803UL,  140574893UL, 62517661UL,  138687132UL, 219870797UL, 207170130UL, 17189384UL,  218898104UL, 246457744UL,
+    255863421UL, 148780656UL, 208539585UL, 167171471UL, 109568277UL, 159537664UL, 265251545UL, 44806199UL,  59182302UL,
+    231814479UL, 119033109UL, 14916181UL,  106620219UL, 236696325UL, 29636508UL,  248423205UL, 172978906UL, 181025523UL,
+    206782868UL, 65506378UL,  146722118UL, 195485782UL, 222165166UL, 198998516UL, 177217953UL, 59577415UL,  29869415UL,
+    158897151UL, 103473097UL, 233584027UL, 154666621UL, 237771549UL, 233644433UL, 37597929UL,  266599844UL, 257106847UL,
+    108908918UL, 10193642UL,  59308571UL,  209914195UL, 92679896UL,  40930471UL,  128314348UL, 156378322UL, 19542208UL,
+    227537283UL, 130015135UL, 169466613UL, 264694932UL, 114372700UL, 100984367UL, 142456710UL, 66292286UL,  240802457UL,
+    90515909UL,  43043976UL,  43120805UL,  192529096UL, 261661869UL, 72212694UL,  103628264UL, 264429146UL, 6208526UL,
+    89848744UL,  136897381UL, 14541500UL,  89092030UL,  159488755UL, 53425861UL,  130295902UL, 162026040UL, 220610040UL,
+    83085888UL,  163553140UL, 171317636UL, 44066254UL,  233035129UL, 245315122UL, 139205222UL, 77999701UL,  229314546UL,
+    231697761UL, 234338312UL, 167690191UL, 44830050UL,  146113839UL, 32963649UL,  93351385UL,  143956422UL, 226879581UL,
+    77473353UL,  259666714UL, 92765295UL,  226544404UL, 174045736UL, 265180075UL, 264121701UL, 172461826UL, 256364446UL,
+    209165722UL, 173053439UL, 180678371UL, 178713714UL, 156705126UL, 252006235UL, 6813007UL,   1350602UL,   42809333UL,
+    92885190UL,  118894337UL, 39445128UL,  67735222UL,  214554508UL, 43511993UL,  153905159UL, 108809538UL, 239650989UL,
+    80879637UL,  241990139UL, 121761454UL, 85468564UL,  68226171UL,  230416212UL, 153626801UL, 185369102UL, 101353634UL,
+    506549UL,    262188114UL, 160797711UL, 257744374UL, 63621531UL,  211967303UL, 162956338UL, 54690112UL,  66274928UL,
+    243338757UL, 68798547UL,  103077573UL, 114987446UL, 241202280UL, 242218245UL, 248867765UL, 119266624UL, 226690590UL,
+    30901085UL,  252574225UL, 237052048UL, 34137105UL,  110304765UL, 129107947UL, 176491518UL, 104144053UL, 230445964UL,
+    153633282UL, 54948892UL,  140207637UL, 10760795UL,  246214742UL, 32300437UL,  48419083UL,  4797785UL,   108014216UL,
+    107075014UL, 45814182UL,  221458224UL, 163035402UL, 134893410UL, 242387251UL, 243225718UL, 33518764UL,  218046628UL,
+    5590961UL,   158898289UL, 81904706UL,  266969815UL, 132090915UL, 4579323UL,   187559302UL, 113892305UL, 149179733UL,
+    61693040UL,  235581186UL, 103510760UL, 187304414UL, 209857751UL, 85536511UL,  205192491UL, 48899792UL,  114663485UL,
+    149318066UL, 167833850UL, 247956742UL, 126726247UL, 54508455UL,  165446165UL, 162066743UL,
+};
+uint32_t rand_arr_29_b29_w32_arr[1024] = {
+    45699670UL,  382755887UL, 282402897UL, 441672207UL, 297767795UL, 350847712UL, 262308909UL, 218905204UL, 100557281UL,
+    505973513UL, 371977601UL, 33885865UL,  148829360UL, 92129066UL,  97451185UL,  210329968UL, 224752458UL, 251316834UL,
+    214087453UL, 339570636UL, 147960320UL, 359539387UL, 186727275UL, 142029501UL, 97571889UL,  392112073UL, 231997914UL,
+    95236117UL,  361839573UL, 134941756UL, 528942371UL, 12671228UL,  452728451UL, 465551226UL, 183183821UL, 94192487UL,
+    419866697UL, 511518525UL, 486387002UL, 108602250UL, 188882360UL, 468113910UL, 127477789UL, 499908299UL, 219777449UL,
+    393256005UL, 151627330UL, 190383458UL, 140978750UL, 377285740UL, 426963898UL, 291200230UL, 197042601UL, 293440344UL,
+    187342398UL, 119398467UL, 14011160UL,  280888684UL, 103418933UL, 535928362UL, 443296955UL, 95282820UL,  256538647UL,
+    345925600UL, 243009525UL, 234522767UL, 248912911UL, 430667781UL, 401104073UL, 331627617UL, 383223275UL, 190986570UL,
+    298731149UL, 321140842UL, 454117084UL, 435293538UL, 210581327UL, 216683617UL, 233833816UL, 213207862UL, 193397372UL,
+    121941508UL, 458748642UL, 531473372UL, 254467365UL, 443125375UL, 264414994UL, 472481644UL, 411598398UL, 517055221UL,
+    269314849UL, 192551579UL, 408633297UL, 258784147UL, 347118371UL, 379589968UL, 254982033UL, 208038109UL, 86604830UL,
+    61531092UL,  32257940UL,  458244063UL, 4008585UL,   276612135UL, 216405481UL, 224922858UL, 451452638UL, 476044760UL,
+    135044727UL, 360343141UL, 215619825UL, 341398521UL, 92302985UL,  280520475UL, 94998679UL,  212994329UL, 430092819UL,
+    82963111UL,  90163439UL,  193756104UL, 437951387UL, 395157486UL, 451929066UL, 513703138UL, 477797208UL, 107485390UL,
+    25424382UL,  3046418UL,   394531858UL, 24219539UL,  491565650UL, 381236873UL, 255019090UL, 470354060UL, 346159651UL,
+    349469362UL, 277609400UL, 418662675UL, 213210913UL, 522383778UL, 406831585UL, 167324643UL, 145253540UL, 181284900UL,
+    321452670UL, 161232933UL, 13443878UL,  200280204UL, 196855749UL, 18870696UL,  536634760UL, 110384934UL, 98776897UL,
+    225195199UL, 493915751UL, 365061347UL, 407963298UL, 241555589UL, 271137744UL, 412022689UL, 200847546UL, 505375136UL,
+    218569853UL, 382405946UL, 8704543UL,   523041072UL, 111185012UL, 434714505UL, 111906872UL, 373231096UL, 340438823UL,
+    530501280UL, 323390920UL, 513780453UL, 400905466UL, 377245465UL, 10746878UL,  177292145UL, 303167345UL, 69905805UL,
+    36822543UL,  317280840UL, 10181576UL,  210545841UL, 347460123UL, 72192770UL,  503431664UL, 441785884UL, 371383744UL,
+    220291120UL, 110957134UL, 389179899UL, 229207243UL, 393533667UL, 246392956UL, 178016106UL, 173124539UL, 513342867UL,
+    134488383UL, 67970418UL,  195245787UL, 296696349UL, 337025885UL, 121459459UL, 465436816UL, 264705304UL, 62717203UL,
+    296684333UL, 381993992UL, 196151923UL, 348722482UL, 90052045UL,  185828851UL, 523461439UL, 456550236UL, 183356083UL,
+    498833330UL, 179592119UL, 98394349UL,  140874541UL, 309616313UL, 183321751UL, 399005795UL, 214726254UL, 218993069UL,
+    319038469UL, 532078968UL, 108559591UL, 492328400UL, 395261149UL, 79703078UL,  135694314UL, 275339852UL, 301671256UL,
+    116904105UL, 61783514UL,  254476012UL, 254774163UL, 108938973UL, 281809383UL, 225398636UL, 113810615UL, 472261259UL,
+    203874575UL, 514658786UL, 370253073UL, 495936235UL, 120976782UL, 536127244UL, 446128549UL, 428750657UL, 137699026UL,
+    102771497UL, 119288690UL, 437485004UL, 313113637UL, 43313853UL,  338372290UL, 492256799UL, 317198714UL, 524652501UL,
+    534050218UL, 237062323UL, 407330485UL, 36334149UL,  512378423UL, 321053229UL, 521222516UL, 60174409UL,  263391678UL,
+    286335529UL, 6951408UL,   376213348UL, 314356695UL, 282682451UL, 520215616UL, 351424302UL, 413380045UL, 466434335UL,
+    459797951UL, 147823000UL, 152069371UL, 512593050UL, 179036953UL, 425856875UL, 435233950UL, 363655828UL, 488617089UL,
+    403110830UL, 522444464UL, 206179897UL, 6411633UL,   463853400UL, 253614667UL, 111463437UL, 362011678UL, 296839766UL,
+    134473869UL, 446612037UL, 110201923UL, 429393328UL, 90841856UL,  376605490UL, 93848797UL,  53232777UL,  188576377UL,
+    156564097UL, 491082728UL, 75586828UL,  232097060UL, 392787345UL, 393664648UL, 481711859UL, 495793026UL, 359859213UL,
+    130437808UL, 352871080UL, 92649133UL,  103105845UL, 190029901UL, 26602067UL,  320781805UL, 237353099UL, 316254707UL,
+    460823298UL, 315073927UL, 292687529UL, 64947334UL,  126418530UL, 411070682UL, 15986359UL,  503145679UL, 43528203UL,
+    6257958UL,   300932377UL, 113416046UL, 16650892UL,  142149361UL, 142619549UL, 189974657UL, 353831952UL, 503918985UL,
+    494356749UL, 367190328UL, 313082955UL, 10767661UL,  139666088UL, 487110372UL, 98924530UL,  337441214UL, 22526488UL,
+    232130325UL, 247859029UL, 94510314UL,  60106916UL,  27236211UL,  183931509UL, 219082973UL, 49854948UL,  413263600UL,
+    460161588UL, 211344913UL, 111827218UL, 112733457UL, 391273023UL, 470210587UL, 277167975UL, 385141350UL, 13009364UL,
+    204520538UL, 115133754UL, 446425659UL, 432378970UL, 476914774UL, 47095503UL,  532016646UL, 190771073UL, 188481180UL,
+    396914885UL, 106355194UL, 317978870UL, 463040397UL, 18720470UL,  43725065UL,  503248381UL, 470927051UL, 489067005UL,
+    470286366UL, 381882736UL, 499162643UL, 475940555UL, 154791882UL, 331075911UL, 350597731UL, 211225811UL, 272580055UL,
+    497623754UL, 488131949UL, 501117279UL, 92131404UL,  61186259UL,  320271677UL, 307937390UL, 285815923UL, 432932742UL,
+    497442906UL, 534932104UL, 9026901UL,   248050491UL, 71119434UL,  158185042UL, 166904374UL, 81792214UL,  213480484UL,
+    99991832UL,  403719897UL, 268471871UL, 89916584UL,  474320518UL, 153256420UL, 523000999UL, 389438479UL, 283698149UL,
+    400402364UL, 248854361UL, 473050832UL, 115509741UL, 528014544UL, 523326708UL, 21112171UL,  217059731UL, 490231076UL,
+    107573027UL, 82424028UL,  323201802UL, 85179875UL,  192845251UL, 99913766UL,  398207408UL, 343777285UL, 103468787UL,
+    417698509UL, 320481096UL, 408075392UL, 9167353UL,   123058262UL, 390910404UL, 385010400UL, 499009620UL, 153183416UL,
+    62813576UL,  16633333UL,  511422335UL, 216584448UL, 527052698UL, 271845704UL, 211851895UL, 460498537UL, 167703590UL,
+    208748200UL, 293451303UL, 359079961UL, 518011770UL, 293325097UL, 232744915UL, 449140650UL, 245058304UL, 403338117UL,
+    139639442UL, 18704760UL,  14449934UL,  313113080UL, 273034868UL, 136596312UL, 290038242UL, 416206655UL, 21621771UL,
+    226334427UL, 156220700UL, 485868150UL, 168290995UL, 490192912UL, 503492538UL, 271685269UL, 363533061UL, 346080304UL,
+    100472778UL, 512084223UL, 120493455UL, 147216803UL, 485962998UL, 105838346UL, 118186820UL, 199669448UL, 227448441UL,
+    362084982UL, 106372929UL, 475101243UL, 414368199UL, 325489091UL, 95135385UL,  473695810UL, 370382914UL, 305281087UL,
+    195515913UL, 72263698UL,  18469452UL,  435019096UL, 517135488UL, 39084224UL,  132257275UL, 32302121UL,  534010300UL,
+    81459796UL,  500643518UL, 53978881UL,  238305225UL, 219738258UL, 32595304UL,  95595219UL,  278970689UL, 510814110UL,
+    242976444UL, 358417647UL, 420632187UL, 380676785UL, 91762447UL,  181278598UL, 485112475UL, 83586642UL,  322744271UL,
+    316892504UL, 435900951UL, 164201169UL, 466643459UL, 140659080UL, 181725216UL, 215762171UL, 44229360UL,  226158878UL,
+    47041102UL,  216886822UL, 150997445UL, 168942456UL, 515886931UL, 201267211UL, 531677090UL, 41255777UL,  306453372UL,
+    252988869UL, 52139113UL,  165076479UL, 328790641UL, 275841276UL, 422035634UL, 496528268UL, 384903679UL, 22644508UL,
+    291355136UL, 499326514UL, 13069833UL,  90179111UL,  331774242UL, 153252622UL, 340848775UL, 103981089UL, 230362226UL,
+    306147021UL, 87721420UL,  21099966UL,  314445864UL, 47245133UL,  514655452UL, 28183511UL,  380199846UL, 417610466UL,
+    106737192UL, 380713278UL, 495129144UL, 100813462UL, 33522492UL,  341665186UL, 58085994UL,  181949365UL, 10892387UL,
+    126875854UL, 296748590UL, 57872592UL,  432801009UL, 43342298UL,  507984430UL, 137012484UL, 531088444UL, 488214465UL,
+    227893008UL, 228464099UL, 299107936UL, 526319993UL, 170443667UL, 442281275UL, 194122602UL, 517118541UL, 231550284UL,
+    385919205UL, 498551923UL, 259322768UL, 329411402UL, 279811027UL, 137846127UL, 103740744UL, 436529339UL, 273795192UL,
+    470877634UL, 327347937UL, 147193038UL, 393416253UL, 485962845UL, 238765953UL, 350575419UL, 285138083UL, 250153864UL,
+    524324188UL, 132268533UL, 300385957UL, 318129786UL, 401413712UL, 489699355UL, 216200469UL, 162452166UL, 469252685UL,
+    151897921UL, 226454516UL, 338386497UL, 359021188UL, 5304038UL,   102887787UL, 324860312UL, 486921144UL, 146344316UL,
+    6960485UL,   524519869UL, 28626UL,     184505259UL, 84968696UL,  391292266UL, 403251581UL, 22877099UL,  325234335UL,
+    94260778UL,  334129634UL, 129109537UL, 107961478UL, 515516157UL, 32025086UL,  344347001UL, 416591010UL, 238525252UL,
+    140161595UL, 359518729UL, 403185055UL, 98413656UL,  99974845UL,  115935543UL, 280154780UL, 444276730UL, 346458530UL,
+    501363846UL, 176126376UL, 41795207UL,  351662251UL, 256544053UL, 92406681UL,  446432085UL, 431135649UL, 403130677UL,
+    91824375UL,  258841993UL, 21698159UL,  300636226UL, 23077383UL,  19385555UL,  175761289UL, 274063018UL, 98476288UL,
+    387816725UL, 421352061UL, 79988862UL,  16069UL,     447082632UL, 443688338UL, 295360910UL, 388529118UL, 352163573UL,
+    156104675UL, 361998778UL, 225129747UL, 527038516UL, 22618586UL,  149267429UL, 266224157UL, 26771075UL,  45723420UL,
+    141701908UL, 531065062UL, 124946082UL, 62180401UL,  20735022UL,  513798619UL, 67251975UL,  190484140UL, 362830059UL,
+    37170684UL,  304247560UL, 403271590UL, 56477347UL,  189058038UL, 509347385UL, 350354020UL, 35574858UL,  441654548UL,
+    406592581UL, 66098421UL,  152323905UL, 284027560UL, 493706456UL, 2938667UL,   431891432UL, 9546919UL,   233270740UL,
+    139657321UL, 173640267UL, 127453357UL, 460223743UL, 15450306UL,  234880179UL, 93543693UL,  110377298UL, 390483109UL,
+    185854596UL, 500009375UL, 525848570UL, 2531078UL,   477595717UL, 503498995UL, 138667355UL, 312612285UL, 221391584UL,
+    129568556UL, 119876713UL, 90749751UL,  268057162UL, 345157954UL, 211913834UL, 522448264UL, 4692918UL,   172391775UL,
+    181733806UL, 348744094UL, 459062620UL, 473280688UL, 76723993UL,  220186294UL, 211240865UL, 260710273UL, 351759254UL,
+    472215998UL, 285168276UL, 321929946UL, 3142831UL,   27458383UL,  352194418UL, 97706397UL,  466997719UL, 481821659UL,
+    134359597UL, 378841980UL, 235347827UL, 387958244UL, 426323337UL, 111935582UL, 245075448UL, 75514943UL,  462148169UL,
+    287986691UL, 199728904UL, 216499009UL, 266778839UL, 402426176UL, 342283195UL, 5995579UL,   503765015UL, 126155827UL,
+    77100977UL,  286937920UL, 97242627UL,  316955686UL, 260890668UL, 369861517UL, 242203144UL, 20301280UL,  384945098UL,
+    235024343UL, 204026507UL, 323631645UL, 303513550UL, 7885585UL,   323213447UL, 65428224UL,  287332164UL, 203047984UL,
+    337508976UL, 555612UL,    380109627UL, 368239866UL, 232557768UL, 436468674UL, 309199450UL, 456593201UL, 230146437UL,
+    359136675UL, 186459339UL, 162094737UL, 310291459UL, 329585773UL, 8926231UL,   12684301UL,  445973351UL, 494560462UL,
+    353638856UL, 449330507UL, 510979227UL, 300630408UL, 63690584UL,  327731064UL, 306257829UL, 365013688UL, 235130290UL,
+    47736037UL,  81137525UL,  481702964UL, 531113926UL, 348608282UL, 303264862UL, 470470385UL, 227994468UL, 345183686UL,
+    113279162UL, 524306701UL, 22758697UL,  195309750UL, 328805391UL, 235269533UL, 378772361UL, 28062374UL,  328900917UL,
+    89269699UL,  326409436UL, 531993028UL, 215131038UL, 158871687UL, 239927507UL, 521139377UL, 8283535UL,   446156897UL,
+    14091727UL,  255917819UL, 30948139UL,  305792991UL, 73030951UL,  239182416UL, 147719055UL, 24380962UL,  455616652UL,
+    58180384UL,  387088980UL, 15495686UL,  340745313UL, 151967234UL, 385287039UL, 235886020UL, 394567952UL, 240178032UL,
+    97787811UL,  62291206UL,  464564935UL, 418464962UL, 179682418UL, 317253083UL, 40115867UL,  364415280UL, 305227100UL,
+    296546536UL, 464555990UL, 25398168UL,  498048521UL, 244283204UL, 247142701UL, 286595000UL, 130970707UL, 253975834UL,
+    341507728UL, 294100899UL, 349515927UL, 241474190UL, 350639474UL, 248655529UL, 256952155UL, 299499749UL, 46267478UL,
+    242895206UL, 425508435UL, 363539687UL, 332714563UL, 506151616UL, 454451741UL, 249409535UL, 82030016UL,  440413747UL,
+    150035989UL, 522409782UL, 111018577UL, 150912697UL, 513008663UL, 465574237UL, 317371035UL, 443454769UL, 425168874UL,
+    56490171UL,  192190644UL, 212037490UL, 2429395UL,   75357638UL,  25948479UL,  510043108UL, 453838212UL, 104537703UL,
+    362988027UL, 177161481UL, 25313465UL,  21458949UL,  310759899UL, 294360545UL, 90708793UL,  178356117UL, 393327108UL,
+    144183651UL, 178032261UL, 440388411UL, 59602752UL,  47945537UL,  82076050UL,  268322716UL, 49571448UL,  51728519UL,
+    315773431UL, 332168448UL, 16198209UL,  434721847UL, 424278367UL, 129685027UL, 37014922UL,  153375978UL, 137181760UL,
+    336780393UL, 36009040UL,  347564669UL, 244113231UL, 128031518UL, 50650212UL,  257112042UL, 433759975UL, 306567711UL,
+    348086374UL, 240636975UL, 318955187UL, 84289427UL,  51154502UL,  475695794UL, 227810737UL, 8373561UL,   262974813UL,
+    452461372UL, 313685927UL, 27281036UL,  40310183UL,  272646998UL, 156052529UL, 519474796UL, 341096312UL, 242818849UL,
+    194265418UL, 531544619UL, 66899011UL,  63518471UL,  350270064UL, 106514438UL, 461697505UL, 74028520UL,  308355261UL,
+    224674664UL, 35421899UL,  437387583UL, 368677521UL, 222389015UL, 399088596UL, 149487281UL, 10777259UL,  307918609UL,
+    66806414UL,  37262967UL,  408143946UL, 163780065UL, 62127628UL,  414506741UL, 497214100UL, 471773168UL, 129146461UL,
+    36719600UL,  329840989UL, 184373337UL, 368510791UL, 44770390UL,  14897302UL,  371933809UL,
+};
+uint32_t rand_arr_30_b30_w32_arr[1024] = {
+    244637473UL,  138957967UL,  178561475UL,  856080074UL,  19815316UL,   310063087UL,  970612111UL,  594247575UL,
+    273093450UL,  348502149UL,  595494693UL,  467890782UL,  812019838UL,  189780763UL,  542721797UL,  919444894UL,
+    687990401UL,  389486629UL,  102885041UL,  295617673UL,  168836333UL,  350166346UL,  714413130UL,  32861865UL,
+    839626667UL,  865310680UL,  446512097UL,  1072960909UL, 49110457UL,   1017166880UL, 641760393UL,  781761894UL,
+    1022384680UL, 563233169UL,  117151966UL,  844763117UL,  999385321UL,  912595085UL,  1071984643UL, 695365747UL,
+    571934342UL,  477495123UL,  249110595UL,  319937812UL,  875635915UL,  21986992UL,   813458311UL,  197019782UL,
+    523178158UL,  1051023097UL, 793288387UL,  183403959UL,  510923067UL,  676871125UL,  716116391UL,  493342751UL,
+    169044121UL,  1048428097UL, 52712305UL,   767612267UL,  432572273UL,  527025742UL,  853002755UL,  297959686UL,
+    743214286UL,  918869350UL,  346923878UL,  211923834UL,  244513439UL,  524982337UL,  819607255UL,  185958272UL,
+    977023357UL,  174011470UL,  909357852UL,  61078620UL,   747166524UL,  371536614UL,  335091196UL,  877116341UL,
+    987033419UL,  958512870UL,  977220777UL,  134021379UL,  1060102938UL, 360920152UL,  952284601UL,  941638921UL,
+    675749182UL,  364018443UL,  590966996UL,  944088768UL,  513129272UL,  696244692UL,  186589425UL,  201584851UL,
+    594526069UL,  446181113UL,  737016590UL,  35013409UL,   160883347UL,  816270524UL,  151154469UL,  655034124UL,
+    698804655UL,  15534406UL,   41469360UL,   254583305UL,  79397971UL,   461953726UL,  817358936UL,  807031489UL,
+    269745078UL,  1029948299UL, 166164670UL,  771987593UL,  265225696UL,  633149387UL,  1031003729UL, 782284322UL,
+    995180177UL,  815308887UL,  1064339083UL, 172818960UL,  699838238UL,  513140371UL,  650965389UL,  1033008005UL,
+    930311878UL,  800362128UL,  588102857UL,  334264432UL,  590066830UL,  301406319UL,  274808410UL,  927906542UL,
+    74549291UL,   37806198UL,   821989852UL,  389175350UL,  752693039UL,  236340162UL,  868215802UL,  176128981UL,
+    628632205UL,  304975618UL,  414966185UL,  1057905804UL, 389255205UL,  354315230UL,  104035677UL,  371660632UL,
+    551931509UL,  1012161992UL, 90588519UL,   609020411UL,  585280087UL,  491365792UL,  279042750UL,  933551739UL,
+    93225125UL,   870729297UL,  429502184UL,  382640975UL,  822944667UL,  137362320UL,  733584167UL,  939972354UL,
+    790840021UL,  927536798UL,  1070529732UL, 504140716UL,  804602638UL,  564683061UL,  767201177UL,  318617314UL,
+    380267071UL,  31707269UL,   234344111UL,  479651086UL,  630466809UL,  347945312UL,  512646637UL,  549820788UL,
+    664834211UL,  997480074UL,  495477188UL,  419899310UL,  655866996UL,  90111789UL,   21394364UL,   759405404UL,
+    1054410876UL, 795377603UL,  998151337UL,  57006564UL,   996771277UL,  237016338UL,  307959608UL,  248804138UL,
+    837998850UL,  357989376UL,  839580564UL,  88589460UL,   971700065UL,  160638539UL,  306022281UL,  693088301UL,
+    542264464UL,  590550723UL,  102941096UL,  452363259UL,  39159591UL,   4683074UL,    150322927UL,  494906541UL,
+    621022861UL,  128739411UL,  813856273UL,  590781374UL,  745820424UL,  38927798UL,   64299509UL,   123271527UL,
+    1008894011UL, 242685591UL,  948724184UL,  641066742UL,  1030962226UL, 94021230UL,   464092280UL,  957036244UL,
+    682388965UL,  49785990UL,   851584796UL,  231068487UL,  466732683UL,  789188218UL,  472460417UL,  1007687276UL,
+    958691603UL,  895822924UL,  821066479UL,  799665575UL,  144052896UL,  230580666UL,  684252857UL,  1068169422UL,
+    575256967UL,  942002555UL,  850196754UL,  597276302UL,  334492707UL,  187379620UL,  130019960UL,  440829754UL,
+    459676620UL,  815542108UL,  9841512UL,    793483691UL,  1019104360UL, 592631657UL,  471499453UL,  950444800UL,
+    150414208UL,  1019255860UL, 10186572UL,   983257055UL,  830748603UL,  437301382UL,  922255237UL,  734481046UL,
+    221720115UL,  848557640UL,  535184699UL,  662931927UL,  143388237UL,  748839683UL,  49569692UL,   832515593UL,
+    818437110UL,  836737684UL,  816209976UL,  385745682UL,  31737247UL,   252750657UL,  715040793UL,  653710269UL,
+    399255577UL,  459524550UL,  1063994298UL, 547038259UL,  606729277UL,  256963349UL,  629441386UL,  165073888UL,
+    938526881UL,  1059265237UL, 76109263UL,   567038836UL,  415953675UL,  313440801UL,  674567814UL,  969161587UL,
+    199115350UL,  319984853UL,  573314611UL,  403933885UL,  538170514UL,  377820790UL,  777191033UL,  250695837UL,
+    352243587UL,  7415713UL,    296017949UL,  670643581UL,  1045626349UL, 725095723UL,  955091032UL,  364031627UL,
+    624211885UL,  285190525UL,  619094139UL,  515861154UL,  1001381046UL, 988914828UL,  92450567UL,   640036080UL,
+    925636947UL,  468865094UL,  483784882UL,  810891973UL,  988470079UL,  792442636UL,  434115249UL,  997892196UL,
+    731248815UL,  1044530149UL, 716601748UL,  552691905UL,  388965445UL,  344472797UL,  712675204UL,  148558684UL,
+    1010317245UL, 512975157UL,  356057620UL,  739603378UL,  811333976UL,  65920842UL,   723996750UL,  741676334UL,
+    348147238UL,  343081207UL,  573555924UL,  1043180234UL, 1040057852UL, 723419206UL,  482127707UL,  715855587UL,
+    637665331UL,  292013501UL,  304084649UL,  583157550UL,  296256UL,     502059299UL,  471392920UL,  1027569889UL,
+    861875367UL,  622045804UL,  707576362UL,  84946117UL,   72995882UL,   282878111UL,  933693709UL,  589533411UL,
+    830801609UL,  180739203UL,  111312350UL,  707218875UL,  754570172UL,  476133279UL,  766069401UL,  46524399UL,
+    259421479UL,  795823217UL,  334543251UL,  354467438UL,  449277508UL,  463292413UL,  855351024UL,  433301881UL,
+    813538432UL,  247410757UL,  656699911UL,  659432449UL,  894943014UL,  995588163UL,  436106479UL,  214461413UL,
+    55884719UL,   196643482UL,  275548089UL,  764751849UL,  679605168UL,  312650527UL,  885139650UL,  677972984UL,
+    38709434UL,   82212720UL,   124577216UL,  762667493UL,  806536103UL,  1016672623UL, 714007068UL,  203310373UL,
+    338898154UL,  127116513UL,  1039175672UL, 266588524UL,  463500106UL,  248020760UL,  510210401UL,  557165903UL,
+    676248486UL,  177855930UL,  1021780260UL, 530281783UL,  1627834UL,    434098487UL,  561423448UL,  663286291UL,
+    136470669UL,  642627202UL,  965646277UL,  348036745UL,  35009465UL,   448869810UL,  836138570UL,  231213022UL,
+    836289388UL,  593902723UL,  478868499UL,  839196206UL,  897964057UL,  460770772UL,  61564534UL,   975649777UL,
+    238341088UL,  876586452UL,  638624827UL,  104679121UL,  103077180UL,  954139983UL,  178206521UL,  363467527UL,
+    227842918UL,  498780246UL,  329597638UL,  1007818472UL, 468334017UL,  549881469UL,  95028664UL,   954772477UL,
+    303209573UL,  720739451UL,  554905423UL,  230673734UL,  463490669UL,  1012355944UL, 765788457UL,  245271414UL,
+    824521569UL,  424992078UL,  355524149UL,  348160261UL,  655706758UL,  650705897UL,  989441473UL,  616755211UL,
+    842030596UL,  257904804UL,  469293740UL,  961540621UL,  911620960UL,  344287969UL,  12690280UL,   952439316UL,
+    581129291UL,  1002775258UL, 822534649UL,  970881936UL,  435699616UL,  71001606UL,   451504212UL,  5739870UL,
+    707955793UL,  1062000680UL, 910365237UL,  392277346UL,  406606643UL,  248336609UL,  101002915UL,  243579525UL,
+    760823074UL,  572795173UL,  757832117UL,  860178865UL,  18727036UL,   421582957UL,  1071651834UL, 861942202UL,
+    719970135UL,  615566202UL,  419003462UL,  339750160UL,  754236896UL,  886420606UL,  800644375UL,  461269774UL,
+    99042862UL,   401257762UL,  279832865UL,  177629641UL,  368569256UL,  431421757UL,  954970977UL,  741371996UL,
+    254841964UL,  1045811076UL, 917116179UL,  691355041UL,  793744525UL,  1025675703UL, 680898647UL,  302538231UL,
+    256409517UL,  369707460UL,  86482190UL,   83852987UL,   722501880UL,  572936844UL,  450454538UL,  311701606UL,
+    956906976UL,  117698547UL,  816035883UL,  128165065UL,  667180310UL,  893529339UL,  805814589UL,  975829106UL,
+    376194027UL,  889012043UL,  425708525UL,  223685537UL,  441889136UL,  896584598UL,  203856214UL,  270281367UL,
+    476363800UL,  848719191UL,  253843213UL,  475055575UL,  103552908UL,  862126143UL,  520613626UL,  446163374UL,
+    391798599UL,  314468676UL,  1055000444UL, 847652889UL,  224489267UL,  504269786UL,  549038617UL,  274036174UL,
+    461405143UL,  991344068UL,  334040255UL,  347585517UL,  156926751UL,  201559142UL,  615512206UL,  485309092UL,
+    369802807UL,  527620250UL,  716614199UL,  116330823UL,  852332168UL,  748542164UL,  835814216UL,  295939831UL,
+    453298302UL,  409376959UL,  260750796UL,  80143219UL,   171928597UL,  1018039533UL, 196985421UL,  107865275UL,
+    936637383UL,  834057572UL,  1049752808UL, 219647227UL,  1010146694UL, 910115355UL,  593969678UL,  640461689UL,
+    400424905UL,  186939158UL,  1023424463UL, 62494932UL,   79550989UL,   334283336UL,  354766328UL,  671537255UL,
+    762585625UL,  255246180UL,  295837649UL,  899708831UL,  872940596UL,  64979381UL,   952955146UL,  556588362UL,
+    429954240UL,  806008103UL,  276037566UL,  240142961UL,  615792083UL,  585069721UL,  539708528UL,  796043153UL,
+    967146004UL,  767950670UL,  733891852UL,  565759780UL,  473271168UL,  327320370UL,  953094831UL,  796549725UL,
+    929651237UL,  247125709UL,  222051866UL,  778956601UL,  288715318UL,  165028756UL,  481495696UL,  405837080UL,
+    203278401UL,  423409447UL,  1025560096UL, 2089585UL,    555812032UL,  392866377UL,  780685169UL,  207745598UL,
+    262819954UL,  216103821UL,  455006137UL,  1070784597UL, 438842929UL,  1043109872UL, 915564645UL,  627014954UL,
+    785571713UL,  204160447UL,  204666744UL,  451869008UL,  143741295UL,  509589544UL,  747695855UL,  205027782UL,
+    425752057UL,  487232084UL,  505805383UL,  85148490UL,   133829846UL,  634823292UL,  74396918UL,   665104146UL,
+    57225503UL,   952474739UL,  417025124UL,  782331400UL,  777837575UL,  915722403UL,  883539471UL,  143282405UL,
+    970998877UL,  641795348UL,  831939979UL,  797465788UL,  275779907UL,  403247435UL,  769646574UL,  975752646UL,
+    334894508UL,  377640072UL,  263745621UL,  263564968UL,  193435917UL,  862825073UL,  792780215UL,  268970142UL,
+    617405773UL,  503334334UL,  583177111UL,  460196665UL,  399034413UL,  777043216UL,  924392520UL,  480087111UL,
+    298419697UL,  795273214UL,  37642720UL,   597899351UL,  549319363UL,  158926479UL,  1049010845UL, 422473538UL,
+    86650141UL,   726217972UL,  245680962UL,  171906155UL,  201182540UL,  742486871UL,  531870465UL,  616740095UL,
+    475101930UL,  677884366UL,  801705813UL,  611951284UL,  234268317UL,  540330186UL,  30391088UL,   476101834UL,
+    553687342UL,  625506627UL,  273889922UL,  33122216UL,   7457832UL,    648269830UL,  466587476UL,  639927108UL,
+    1000226966UL, 502646879UL,  15421576UL,   620346834UL,  991380390UL,  1024287748UL, 441256943UL,  396698099UL,
+    97641466UL,   853709988UL,  220854621UL,  542789812UL,  673166701UL,  527558422UL,  283223850UL,  214570391UL,
+    300022756UL,  797091619UL,  894002426UL,  779807709UL,  759643748UL,  441560089UL,  860181666UL,  665077372UL,
+    909485406UL,  751740494UL,  1043412218UL, 819548770UL,  676730122UL,  434263139UL,  857930993UL,  1041441621UL,
+    20963306UL,   888137000UL,  79533459UL,   716752617UL,  999066764UL,  359443130UL,  503592942UL,  820046366UL,
+    880581091UL,  946124371UL,  715986215UL,  323963756UL,  1008609562UL, 1017918520UL, 419697953UL,  292352434UL,
+    113646560UL,  1013100474UL, 92202771UL,   907891277UL,  75559526UL,   1013048709UL, 298266247UL,  425356773UL,
+    178506941UL,  192305715UL,  775735569UL,  820286354UL,  74482184UL,   210518035UL,  153232714UL,  75406995UL,
+    679015427UL,  510545005UL,  503314791UL,  206623579UL,  561593909UL,  493336599UL,  615109328UL,  551656422UL,
+    220703990UL,  290673507UL,  813436238UL,  911723505UL,  133670724UL,  595246103UL,  794059754UL,  1034884174UL,
+    279812949UL,  1000392115UL, 741301641UL,  33110498UL,   612002112UL,  483344830UL,  267531071UL,  104482069UL,
+    726725733UL,  728751470UL,  128120969UL,  715934270UL,  85695056UL,   510722088UL,  368368782UL,  1056459882UL,
+    94958901UL,   857032782UL,  389106243UL,  176931756UL,  638566105UL,  1000697986UL, 228570346UL,  831943624UL,
+    896797866UL,  411963559UL,  618987703UL,  987535032UL,  698130340UL,  955237568UL,  115003672UL,  131883164UL,
+    303227095UL,  1050531115UL, 725437321UL,  567492116UL,  232321957UL,  644092399UL,  1050223386UL, 17617969UL,
+    625649192UL,  92934069UL,   1068544891UL, 210954411UL,  388904066UL,  757997115UL,  803922088UL,  407834816UL,
+    473716087UL,  695256590UL,  947465580UL,  623366433UL,  202099343UL,  321061961UL,  400889960UL,  672484679UL,
+    184915965UL,  1069248697UL, 300868129UL,  55008968UL,   378808572UL,  140255926UL,  606335597UL,  764863552UL,
+    991734604UL,  1024282375UL, 56527446UL,   86617056UL,   524033817UL,  1047132641UL, 50597449UL,   655217463UL,
+    144823249UL,  626305165UL,  969994743UL,  840358499UL,  598218459UL,  964762702UL,  195870624UL,  1031558599UL,
+    5746749UL,    204883112UL,  36936499UL,   264372726UL,  373520258UL,  238395280UL,  295661957UL,  358165345UL,
+    350309859UL,  894830672UL,  488279803UL,  197391166UL,  124964874UL,  1062409261UL, 692222100UL,  730652416UL,
+    1047406718UL, 928260202UL,  492773608UL,  204942992UL,  411009634UL,  370364232UL,  690503916UL,  559220427UL,
+    132472393UL,  279817842UL,  485335995UL,  91502168UL,   69397199UL,   415939747UL,  380214506UL,  586144328UL,
+    324672446UL,  256477236UL,  576366725UL,  1060625472UL, 185912578UL,  201819229UL,  576437386UL,  482423484UL,
+    1042990570UL, 683417328UL,  809277648UL,  928782334UL,  654403278UL,  868239976UL,  113005447UL,  125813248UL,
+    679944777UL,  330731139UL,  164974000UL,  344150876UL,  996380188UL,  25993602UL,   427064474UL,  795031679UL,
+    42775342UL,   851593247UL,  391991611UL,  453903434UL,  686345414UL,  740918339UL,  687611740UL,  976196262UL,
+    803382622UL,  789682126UL,  723291251UL,  356210634UL,  230420468UL,  573127691UL,  1000081448UL, 569437572UL,
+    1069202460UL, 752503399UL,  250107938UL,  589287978UL,  237954005UL,  181580185UL,  75135162UL,   293013244UL,
+    622388492UL,  738972183UL,  481081681UL,  115478918UL,  12035216UL,   52029907UL,   973155142UL,  9389164UL,
+    434700115UL,  560690185UL,  850402697UL,  585254357UL,  95720548UL,   931682484UL,  214249058UL,  700049255UL,
+    225981052UL,  654651313UL,  474874971UL,  734186566UL,  240267908UL,  315791811UL,  780424763UL,  202559812UL,
+    906333077UL,  649904501UL,  392350223UL,  201296491UL,  591142456UL,  976571174UL,  925674480UL,  413162088UL,
+    648493439UL,  170674814UL,  714210692UL,  741007728UL,  481649830UL,  979326387UL,  966799529UL,  143407816UL,
+};
+uint32_t rand_arr_31_b31_w32_arr[1024] = {
+    1612808007UL, 1691887037UL, 1455419493UL, 1966764212UL, 1883660927UL, 192442549UL,  1861484460UL, 1537787599UL,
+    105858394UL,  299639464UL,  1508816190UL, 649247381UL,  343672955UL,  670067030UL,  1808598698UL, 586121162UL,
+    379735248UL,  581621631UL,  1650718608UL, 1594148856UL, 1223083229UL, 135301852UL,  965070095UL,  1104111188UL,
+    220707783UL,  1112830038UL, 1912401956UL, 299571346UL,  1054305859UL, 152136480UL,  1241045171UL, 5058698UL,
+    1419952067UL, 1085336108UL, 164816960UL,  1686053860UL, 971536630UL,  1759241967UL, 1824612883UL, 1904929044UL,
+    1600125537UL, 277273654UL,  1915086218UL, 1208746981UL, 1323184045UL, 1274213662UL, 1520209314UL, 722919513UL,
+    609098618UL,  1591823260UL, 1744139581UL, 1790068503UL, 1152770489UL, 941673583UL,  1386251291UL, 946679103UL,
+    263214338UL,  1256757644UL, 1285357148UL, 2045671719UL, 1765152568UL, 659597194UL,  1807511849UL, 1267999206UL,
+    196181059UL,  148046338UL,  1794635985UL, 728786088UL,  1339157670UL, 1538174817UL, 1861745874UL, 1501323237UL,
+    39943587UL,   1283410150UL, 930075693UL,  1487907741UL, 190371016UL,  565164436UL,  287718342UL,  1993155301UL,
+    880867312UL,  636933677UL,  1851571592UL, 1576444176UL, 562910185UL,  187083712UL,  1737671127UL, 1190153609UL,
+    123594846UL,  795010638UL,  1175908972UL, 537968232UL,  1750532944UL, 1018738694UL, 2034526959UL, 2010054279UL,
+    1789749219UL, 1780445239UL, 272242045UL,  1181612372UL, 631429935UL,  940166758UL,  767761913UL,  1948663166UL,
+    1147660365UL, 1927459028UL, 304731386UL,  116014416UL,  2060986333UL, 1881882423UL, 1499991829UL, 1792440088UL,
+    664590731UL,  495796346UL,  1199677258UL, 764311629UL,  877326390UL,  907255330UL,  381760285UL,  65971300UL,
+    678910963UL,  2031806465UL, 391059933UL,  1327489693UL, 635929879UL,  959008942UL,  461435918UL,  1232435170UL,
+    1677471571UL, 1724258468UL, 850395165UL,  1730542866UL, 1934201562UL, 1457533999UL, 1455256309UL, 1990662328UL,
+    865736578UL,  389941641UL,  425207024UL,  1299687825UL, 1914094508UL, 409938592UL,  537711039UL,  818039512UL,
+    1250332718UL, 1127200437UL, 350571424UL,  910988689UL,  335377102UL,  669981148UL,  552924886UL,  1710733585UL,
+    1360362468UL, 928741501UL,  302289291UL,  311640368UL,  1249436873UL, 1504466906UL, 1307075542UL, 108407733UL,
+    67574577UL,   1698896539UL, 630519532UL,  1798764912UL, 167060836UL,  1509808701UL, 1547535187UL, 86138592UL,
+    1074146621UL, 1147136913UL, 199037492UL,  1372027698UL, 235856267UL,  1847903411UL, 1349813994UL, 523949033UL,
+    415061229UL,  1397994382UL, 1126909371UL, 778262933UL,  386646210UL,  429244944UL,  1318055907UL, 366157433UL,
+    532192435UL,  1406771584UL, 788318948UL,  1751525043UL, 1784140780UL, 1272966943UL, 2142768045UL, 1817442453UL,
+    642100633UL,  1573909712UL, 346120085UL,  1329363330UL, 823148480UL,  1283580360UL, 891979796UL,  1415006677UL,
+    1490194801UL, 772241134UL,  32809332UL,   84702476UL,   1282793708UL, 339151214UL,  769662183UL,  2127917902UL,
+    826011071UL,  781586079UL,  1334561245UL, 1496071140UL, 835240949UL,  204973622UL,  1334502124UL, 873872270UL,
+    1983692465UL, 527092884UL,  1789171930UL, 1840743186UL, 327737071UL,  1829672699UL, 1836474541UL, 588990841UL,
+    1570594926UL, 1828363583UL, 712409198UL,  993485628UL,  168121380UL,  1268817720UL, 1679191570UL, 1550693288UL,
+    1106712713UL, 928517765UL,  578108338UL,  1255650965UL, 189817213UL,  162247603UL,  611599175UL,  1810170918UL,
+    834275687UL,  696544009UL,  1707248259UL, 1528464741UL, 118732212UL,  1548676007UL, 1660899730UL, 612092984UL,
+    1999366910UL, 1073031643UL, 1752456872UL, 176337574UL,  1778431924UL, 1255412477UL, 1102513419UL, 1117504556UL,
+    1528657778UL, 307747810UL,  997871878UL,  1986130348UL, 805779550UL,  1165766218UL, 528676774UL,  1481387626UL,
+    2022751244UL, 1660048751UL, 560352448UL,  107158437UL,  1144453189UL, 1179236794UL, 1368218381UL, 142354336UL,
+    1496739513UL, 1539790458UL, 631678907UL,  2011690044UL, 145066752UL,  1932845824UL, 1847109314UL, 589402589UL,
+    1732739856UL, 301659807UL,  493371838UL,  2070310179UL, 412762742UL,  2035733195UL, 1768398178UL, 2137901643UL,
+    1492143138UL, 1837379337UL, 1494961211UL, 207869448UL,  148873704UL,  1569671154UL, 2127855145UL, 1413527927UL,
+    1829828486UL, 1359345329UL, 391162127UL,  1588709243UL, 1788348785UL, 173588423UL,  1806796230UL, 443165338UL,
+    1825770545UL, 899103501UL,  1316289421UL, 311074093UL,  1230600642UL, 994553696UL,  1752825271UL, 1833155421UL,
+    1424042098UL, 1946292892UL, 1079898685UL, 1692980987UL, 116697758UL,  819088577UL,  522869807UL,  1055887401UL,
+    1848736262UL, 137632504UL,  430779231UL,  533120814UL,  1176984378UL, 1464796877UL, 2087840924UL, 496397336UL,
+    937047585UL,  138187457UL,  148469583UL,  467495491UL,  343876048UL,  773050455UL,  585086895UL,  1260821676UL,
+    1714960550UL, 1216626616UL, 1438135573UL, 410057408UL,  555182614UL,  1345884643UL, 1030045722UL, 491068580UL,
+    1846757216UL, 1395817203UL, 1100139667UL, 89697563UL,   250336990UL,  1991516863UL, 1015993891UL, 1973849869UL,
+    1575101764UL, 1612542106UL, 970774958UL,  191714489UL,  2106339416UL, 432753525UL,  389783476UL,  735798530UL,
+    230043321UL,  745537998UL,  1013096494UL, 1681147704UL, 555493452UL,  90319337UL,   712795430UL,  54277393UL,
+    1904527811UL, 1706480007UL, 1880370873UL, 1627327343UL, 730831010UL,  2095634430UL, 226817430UL,  1572420634UL,
+    789807604UL,  1996264236UL, 1826766711UL, 317087671UL,  1574862333UL, 1554912430UL, 138195901UL,  24534198UL,
+    1668497379UL, 1785095129UL, 1067906302UL, 1454024519UL, 1318668570UL, 325198581UL,  1970623853UL, 884115652UL,
+    1933347531UL, 1591195937UL, 1169306894UL, 1620188922UL, 2051117163UL, 1119162584UL, 1771607606UL, 1743171396UL,
+    94849657UL,   305093282UL,  1959540923UL, 856348436UL,  234947950UL,  672426070UL,  787923068UL,  1676629986UL,
+    545736550UL,  1343035931UL, 157646843UL,  959446297UL,  219071903UL,  1503670409UL, 910466245UL,  1356338868UL,
+    676647150UL,  1408022793UL, 693308817UL,  430994169UL,  1340207154UL, 607936393UL,  1774650141UL, 254634933UL,
+    959148509UL,  1826824704UL, 1966892595UL, 1630978422UL, 1300513959UL, 496117262UL,  1287569243UL, 1288476307UL,
+    2047565780UL, 487213340UL,  1935653497UL, 1235785737UL, 1458576658UL, 1622123367UL, 1481855599UL, 922575459UL,
+    676169159UL,  1966613839UL, 12061551UL,   276944544UL,  1326311650UL, 973293370UL,  816475480UL,  1814397175UL,
+    1095001033UL, 821461639UL,  575061253UL,  983737862UL,  1904940700UL, 123620810UL,  1296542475UL, 82774829UL,
+    1364608329UL, 371261945UL,  93828054UL,   1169730413UL, 42349028UL,   129879UL,     1042503610UL, 393475407UL,
+    1078587256UL, 2064806569UL, 929268876UL,  1049141090UL, 1353861140UL, 963727069UL,  502282988UL,  2077568537UL,
+    1905651303UL, 772229630UL,  717736283UL,  1055045505UL, 305485934UL,  874110114UL,  1496498504UL, 1692086046UL,
+    82891939UL,   946027456UL,  1973088721UL, 1221990653UL, 2047046102UL, 374991151UL,  561214313UL,  511768656UL,
+    1775454053UL, 286916097UL,  310936113UL,  1947114909UL, 1786387958UL, 80699202UL,   1377693198UL, 70021086UL,
+    1550131849UL, 1828727597UL, 475147433UL,  1529604303UL, 2106761541UL, 317895653UL,  1226222872UL, 1028774033UL,
+    1553280493UL, 2023181498UL, 864780053UL,  2123493207UL, 1665798666UL, 886509171UL,  1375418775UL, 1063235484UL,
+    409839588UL,  386110854UL,  1387523190UL, 1799400248UL, 1758107907UL, 1760166115UL, 1265649675UL, 1381882089UL,
+    1230369275UL, 1730990344UL, 263120437UL,  517090127UL,  1513462001UL, 828635173UL,  1950600473UL, 1816678197UL,
+    349131871UL,  1957177024UL, 1034000045UL, 1676520477UL, 466348079UL,  104157341UL,  1202774544UL, 2027533422UL,
+    164698659UL,  983313273UL,  1817711826UL, 2130031924UL, 1816833522UL, 619987307UL,  1371007008UL, 1730627786UL,
+    1890520101UL, 1169398322UL, 457833741UL,  1407776474UL, 2103270746UL, 1991010007UL, 943282442UL,  805587642UL,
+    214779559UL,  732369964UL,  37878037UL,   1583320772UL, 334796146UL,  972380615UL,  339593372UL,  505677419UL,
+    1224798488UL, 1366504300UL, 1338438009UL, 1395641695UL, 851450496UL,  617909421UL,  614592313UL,  29286042UL,
+    424479700UL,  1888411572UL, 175196653UL,  900946870UL,  950260822UL,  510558695UL,  1388339525UL, 324776164UL,
+    1512123119UL, 1688501525UL, 1774646296UL, 102432228UL,  476291276UL,  249775425UL,  2066467561UL, 1495997407UL,
+    1594493264UL, 1687550881UL, 903906838UL,  344516687UL,  1723293898UL, 722884849UL,  702172232UL,  530078244UL,
+    55969331UL,   1910435947UL, 2143863265UL, 1985006047UL, 1320461329UL, 1386633474UL, 1203403607UL, 321932587UL,
+    1187301189UL, 469777435UL,  1526784322UL, 1148301916UL, 582703950UL,  1758097319UL, 844783388UL,  263255314UL,
+    513608284UL,  324418196UL,  575129294UL,  1664579937UL, 254562660UL,  1537486209UL, 1816473400UL, 1229170942UL,
+    1652350119UL, 915256019UL,  1660242799UL, 607680577UL,  2010386657UL, 1638380074UL, 584411300UL,  591050610UL,
+    1899579930UL, 2022228535UL, 523752361UL,  1040703782UL, 603066276UL,  585685818UL,  1416019645UL, 1968016281UL,
+    1372443535UL, 187795695UL,  269749472UL,  1911269163UL, 1961073109UL, 1866138756UL, 37045255UL,   1553076672UL,
+    1012163198UL, 1516865742UL, 1127190161UL, 551906068UL,  1291246723UL, 1175488466UL, 849763279UL,  1043366561UL,
+    1446695817UL, 617683473UL,  505272715UL,  1330350746UL, 316701590UL,  1131297921UL, 1631421948UL, 639650687UL,
+    2061128444UL, 1145816188UL, 100771741UL,  1407208680UL, 1543803251UL, 285502522UL,  1390967010UL, 2043520546UL,
+    1352383781UL, 1161115991UL, 1265622356UL, 1092949808UL, 751364194UL,  1263948886UL, 1024757387UL, 932771219UL,
+    23139700UL,   463073748UL,  5184496UL,    1250514900UL, 1997377550UL, 1346000163UL, 1545063581UL, 659849980UL,
+    1373993947UL, 1927919129UL, 1288754728UL, 1597748876UL, 433666348UL,  1221141986UL, 224272405UL,  354754971UL,
+    1550522962UL, 1516068631UL, 214091961UL,  1633979418UL, 1095209561UL, 1861741502UL, 1366025315UL, 1920823546UL,
+    228642901UL,  676109692UL,  990689008UL,  304483108UL,  954323196UL,  1442802578UL, 1624615719UL, 1551700660UL,
+    2090884992UL, 981264112UL,  500996749UL,  593809923UL,  588349820UL,  1000667139UL, 1100691700UL, 1032866866UL,
+    1815273338UL, 160553025UL,  819742837UL,  1859072228UL, 1321908321UL, 1216015094UL, 916589096UL,  1302352384UL,
+    236504239UL,  431777951UL,  1056450508UL, 681363028UL,  1069602703UL, 1608287598UL, 1884001384UL, 283464370UL,
+    622649864UL,  21646135UL,   1715174046UL, 2122598840UL, 1692714072UL, 461499248UL,  1447684567UL, 1162183451UL,
+    2079238225UL, 968074484UL,  1099301693UL, 1989812335UL, 51186514UL,   1073196181UL, 609990389UL,  872179517UL,
+    1333860446UL, 656166839UL,  806927989UL,  477210661UL,  1471848699UL, 1407197937UL, 273993333UL,  646139022UL,
+    2137200107UL, 754518955UL,  1500966003UL, 283605749UL,  1899985162UL, 2133897988UL, 1872286540UL, 1951468767UL,
+    1887187208UL, 932464586UL,  590630387UL,  1373993434UL, 46407154UL,   831241212UL,  1188583631UL, 2066697132UL,
+    179836775UL,  1094085900UL, 1732673239UL, 1156533481UL, 489463014UL,  312224493UL,  1329937351UL, 723699573UL,
+    1798675200UL, 183990040UL,  1251263220UL, 115129320UL,  638251387UL,  1999103546UL, 1738896650UL, 1869935054UL,
+    279081111UL,  960794214UL,  479021439UL,  772898064UL,  491538146UL,  2043464305UL, 384443262UL,  440406046UL,
+    635710374UL,  2118111519UL, 691860443UL,  697536099UL,  273472541UL,  1303402274UL, 1339841914UL, 485899814UL,
+    1792484192UL, 1882246945UL, 1448599621UL, 784220140UL,  423913107UL,  683674233UL,  902988884UL,  1799494766UL,
+    1180569941UL, 1883030725UL, 1666472721UL, 1535121450UL, 763025322UL,  1909122905UL, 1730076590UL, 1645566786UL,
+    1732962419UL, 1194802761UL, 329285196UL,  1477563300UL, 462930729UL,  982117946UL,  36850343UL,   1428622227UL,
+    1179023437UL, 2015481041UL, 1255278896UL, 668852287UL,  308949328UL,  299065081UL,  280974678UL,  130195112UL,
+    1741228766UL, 921255407UL,  906839914UL,  705112797UL,  1298500545UL, 177747710UL,  978861083UL,  1861713065UL,
+    1230775452UL, 2100033178UL, 1084293885UL, 1829417020UL, 726022105UL,  848143373UL,  1640195872UL, 1755989086UL,
+    1656490792UL, 1561425000UL, 1586219010UL, 402733538UL,  1385058554UL, 2127470136UL, 618436873UL,  1226201059UL,
+    791914369UL,  2029866088UL, 1075885686UL, 873999338UL,  320233546UL,  35699648UL,   791241616UL,  622693742UL,
+    1072900642UL, 1768209354UL, 2096890446UL, 1674266917UL, 1876674520UL, 522856294UL,  1632300008UL, 884880651UL,
+    1588042348UL, 1743698435UL, 138325285UL,  1295753203UL, 195892170UL,  1561840474UL, 605696559UL,  563538897UL,
+    61628362UL,   381709131UL,  67781483UL,   1193955695UL, 1050360710UL, 801658220UL,  2113426738UL, 222748215UL,
+    253432624UL,  1714129754UL, 1939224522UL, 1526913383UL, 173798552UL,  1860364544UL, 808761195UL,  101718058UL,
+    1258335298UL, 1457153969UL, 873148885UL,  891173915UL,  1114325162UL, 1830139008UL, 716598709UL,  945450715UL,
+    1025832400UL, 1185071458UL, 1667568781UL, 165847414UL,  60457006UL,   1459658113UL, 589274569UL,  406553339UL,
+    1138482582UL, 905344997UL,  17182879UL,   192837784UL,  489502791UL,  674420555UL,  779440429UL,  1179538810UL,
+    1333707776UL, 876078877UL,  1221403389UL, 626816865UL,  198565746UL,  2035292662UL, 505028139UL,  711282623UL,
+    1881151829UL, 1157612037UL, 1583154879UL, 1603484674UL, 959912634UL,  1633461949UL, 1642654306UL, 2063430022UL,
+    795424346UL,  770660476UL,  554104339UL,  1265114077UL, 1939812444UL, 1898362685UL, 788351905UL,  1787081495UL,
+    1608310565UL, 1081767808UL, 1499566527UL, 229648248UL,  928607273UL,  1809252835UL, 1597905638UL, 1880456561UL,
+    223898408UL,  885268919UL,  1595748786UL, 936957502UL,  1755781189UL, 1494465784UL, 1604718242UL, 230239807UL,
+    124046565UL,  233081669UL,  1450368920UL, 1267306881UL, 110674856UL,  9883324UL,    2123069813UL, 1305054865UL,
+    1909786306UL, 507472599UL,  1399274365UL, 171305889UL,  1003213707UL, 147840192UL,  129957860UL,  101337389UL,
+    1716995862UL, 1314029276UL, 2142733270UL, 788957327UL,  514247643UL,  208818187UL,  210242704UL,  1899671877UL,
+    106423856UL,  140765997UL,  226374094UL,  932923568UL,  1052940067UL, 502702427UL,  304295904UL,  1457266689UL,
+    1268613384UL, 2015074981UL, 818626169UL,  265037310UL,  1321876113UL, 866233429UL,  652729809UL,  1818577324UL,
+    1291854855UL, 1841846567UL, 1879872541UL, 1661203532UL, 2106088293UL, 479081227UL,  1208211022UL, 324854129UL,
+    1854983415UL, 1610704652UL, 521375457UL,  1534794683UL, 1746500067UL, 158331967UL,  2082274243UL, 1955654931UL,
+    1298539569UL, 439270110UL,  814919493UL,  1698382380UL, 999906316UL,  2003416678UL, 1452023358UL, 2034726903UL,
+};
+uint32_t rand_arr_32_b32_w32_arr[1024] = {
+    3684824285UL, 620044519UL,  3069168923UL, 621807873UL,  3358392258UL, 3717104267UL, 1358613032UL, 2103133469UL,
+    1142633421UL, 3346206526UL, 145563661UL,  4071397363UL, 2049190964UL, 2915317229UL, 2211544802UL, 3160663235UL,
+    2336366644UL, 3960677787UL, 2305780937UL, 159290079UL,  1794549562UL, 1478749805UL, 579589788UL,  3648028378UL,
+    1693657212UL, 609540670UL,  3724930767UL, 3314819346UL, 476799883UL,  2250810039UL, 2464872386UL, 140386709UL,
+    2914254120UL, 3509170270UL, 853245165UL,  2599979009UL, 1473374739UL, 681502223UL,  375744189UL,  2058566977UL,
+    348618825UL,  2993709639UL, 2714727642UL, 155550823UL,  3959029220UL, 2423172299UL, 2903141960UL, 3548087390UL,
+    3546628918UL, 3636598085UL, 4185696848UL, 1893308815UL, 3331154478UL, 2112689440UL, 2792649999UL, 2162753692UL,
+    2664549658UL, 2205687448UL, 2740637009UL, 1452899513UL, 3265631129UL, 2883258232UL, 1780824843UL, 2817141391UL,
+    4128576504UL, 2466684968UL, 3054764126UL, 2281561970UL, 3745535158UL, 219827253UL,  3656905087UL, 169655504UL,
+    3399363649UL, 3948461569UL, 2858232622UL, 1882652589UL, 3772975973UL, 275090761UL,  3961659418UL, 1430936071UL,
+    1056099866UL, 13493015UL,   4175411159UL, 3101218338UL, 3292494976UL, 1393043180UL, 2404503654UL, 3801141309UL,
+    22562540UL,   816612073UL,  1979817946UL, 4200729692UL, 3728897216UL, 2344995494UL, 3203687407UL, 1606906741UL,
+    3443394458UL, 2188321130UL, 2633100004UL, 1197348666UL, 3720322545UL, 1819210088UL, 4021808657UL, 209201406UL,
+    4141778986UL, 3199811232UL, 754213780UL,  3500176011UL, 2445046517UL, 4022819440UL, 400560998UL,  615896258UL,
+    1705220333UL, 1795269894UL, 3104555521UL, 3669148372UL, 318599902UL,  2753919971UL, 2796673365UL, 2956267954UL,
+    2845822115UL, 1616578025UL, 1009375139UL, 3429240474UL, 1222258346UL, 2495243571UL, 1543797634UL, 1768362354UL,
+    2989321508UL, 2498370093UL, 2131985703UL, 1548591092UL, 2851930194UL, 813782149UL,  1070336815UL, 484035143UL,
+    3766876195UL, 3878962993UL, 2848668233UL, 2461178188UL, 112997374UL,  4142124184UL, 3565789927UL, 3473694715UL,
+    1607209548UL, 3622434512UL, 1951295767UL, 3113159747UL, 660373910UL,  3596119389UL, 1798997UL,    2018116912UL,
+    4016875259UL, 3175774476UL, 521904309UL,  3398886174UL, 2572517611UL, 3722772353UL, 623065906UL,  1725237117UL,
+    2470558609UL, 2134474456UL, 854452172UL,  1100763494UL, 403439522UL,  2411092341UL, 3052383624UL, 1310723803UL,
+    3947223293UL, 3217002929UL, 2259091957UL, 2803568350UL, 398045291UL,  751021061UL,  1866248685UL, 1062049926UL,
+    2172784818UL, 2524116587UL, 2349694782UL, 2742516302UL, 68039843UL,   3928502871UL, 2288751770UL, 886249226UL,
+    1338054491UL, 3854932091UL, 1144285182UL, 2220250033UL, 2239122443UL, 1083636592UL, 1337828736UL, 2070392081UL,
+    2668997589UL, 1819954964UL, 3124248865UL, 3307079725UL, 2792476153UL, 3083840021UL, 3648960298UL, 2512727243UL,
+    3750868236UL, 2950960618UL, 1922879364UL, 2636868133UL, 2352003827UL, 3177589230UL, 3341268583UL, 2624773993UL,
+    572554776UL,  1671394213UL, 2477592387UL, 3723971516UL, 2307576595UL, 3349422922UL, 833555463UL,  2909287593UL,
+    1677226822UL, 2363106661UL, 308064310UL,  3515189996UL, 68940399UL,   1838137401UL, 1893055557UL, 906363720UL,
+    1622036111UL, 1261057244UL, 828522611UL,  1190800794UL, 78278201UL,   4094620494UL, 3547148893UL, 595558122UL,
+    854194104UL,  3271640058UL, 2505108686UL, 3742811427UL, 509662841UL,  993996131UL,  2844459735UL, 867210163UL,
+    25722339UL,   1994270130UL, 2439690348UL, 1142191190UL, 4097350855UL, 105249836UL,  3582445893UL, 698839145UL,
+    3283959334UL, 3433554514UL, 1896889539UL, 2359528783UL, 1996877808UL, 4068267243UL, 1919074961UL, 4138086514UL,
+    1870368132UL, 2217048236UL, 2273735219UL, 369859005UL,  1929320473UL, 1932332168UL, 3989644637UL, 345723458UL,
+    2904998459UL, 3496618040UL, 3027244360UL, 1083738951UL, 1599962703UL, 393308402UL,  3213707616UL, 110178971UL,
+    3629412535UL, 2720780970UL, 2271341870UL, 739336367UL,  738961555UL,  1983965966UL, 499719522UL,  72653107UL,
+    3854952521UL, 1411208640UL, 3036444882UL, 841482615UL,  1520206392UL, 758500552UL,  143192998UL,  1516289184UL,
+    1361457223UL, 3347292011UL, 1935834919UL, 1993782720UL, 1218425348UL, 2727594374UL, 2547864278UL, 2098423510UL,
+    3417188990UL, 4037696155UL, 1509099023UL, 3639981663UL, 226078116UL,  303677229UL,  1435281128UL, 418444416UL,
+    4049365224UL, 2051412080UL, 1689523371UL, 2313732973UL, 2323014163UL, 148761157UL,  3905322298UL, 2575909842UL,
+    3648114391UL, 1754696917UL, 2954957583UL, 719564305UL,  4082486262UL, 4037147122UL, 2044615131UL, 2711842264UL,
+    2841900223UL, 2795667841UL, 2885728887UL, 456393390UL,  1299724735UL, 1234222991UL, 3931910001UL, 2238812228UL,
+    3387127463UL, 1890285455UL, 118334963UL,  1844942408UL, 3499794680UL, 4101998963UL, 3376473081UL, 3033165067UL,
+    3414122639UL, 3066941752UL, 2227589278UL, 2221335437UL, 2370964474UL, 2168256358UL, 1564994027UL, 2357130953UL,
+    3659733522UL, 3503320958UL, 421363016UL,  897013260UL,  2254633959UL, 250803260UL,  1541254675UL, 1851790583UL,
+    1268464302UL, 2110802320UL, 102117410UL,  1783614384UL, 518311974UL,  950929762UL,  1391577168UL, 3848127204UL,
+    922746812UL,  48192597UL,   1581257675UL, 993718799UL,  364336198UL,  265335679UL,  1851712208UL, 661285506UL,
+    1871681346UL, 1692715194UL, 2775077360UL, 2335390999UL, 953496412UL,  3901910838UL, 700692845UL,  1839036011UL,
+    2245699985UL, 382185075UL,  1944825927UL, 280873023UL,  1126380012UL, 3311774473UL, 663427978UL,  1753257377UL,
+    1466347935UL, 758668136UL,  1919294597UL, 2632602691UL, 3074509965UL, 1874486036UL, 3477402647UL, 3464660319UL,
+    1034159720UL, 3379877491UL, 2791346444UL, 807678763UL,  1250367736UL, 4010095056UL, 3425695169UL, 3617517943UL,
+    889239697UL,  3521708936UL, 1963936412UL, 3202301671UL, 3973508171UL, 853526954UL,  1630155047UL, 1341559510UL,
+    3397808831UL, 581978273UL,  2590791535UL, 3350311411UL, 1165123325UL, 2180206959UL, 3943022443UL, 2848658188UL,
+    2788404573UL, 1060483910UL, 810286899UL,  4111732889UL, 1489509289UL, 3783206939UL, 3163964885UL, 1051474779UL,
+    3879338097UL, 3340967472UL, 2491711763UL, 2302879035UL, 383460776UL,  527453199UL,  2052068778UL, 1820165580UL,
+    3940602323UL, 3488000638UL, 2386952945UL, 3982380054UL, 606698000UL,  1453410317UL, 137749713UL,  3611190012UL,
+    2400170160UL, 3833181742UL, 174041946UL,  3602401118UL, 2177555343UL, 1100088988UL, 2191666456UL, 3816377593UL,
+    296049942UL,  1290017961UL, 3450630976UL, 3740925130UL, 2427045431UL, 2461263289UL, 618594805UL,  3235906223UL,
+    1093314000UL, 1979012556UL, 4182317707UL, 345046889UL,  1560673075UL, 754747391UL,  1297785936UL, 1789089502UL,
+    1981478521UL, 2517960833UL, 271939148UL,  1365996988UL, 2281196823UL, 1159537611UL, 1957326877UL, 3053106363UL,
+    445064420UL,  2477155047UL, 458927339UL,  2596800922UL, 2797665309UL, 1054471569UL, 2433036690UL, 566998498UL,
+    3422276081UL, 330004803UL,  1307848387UL, 2237838178UL, 97676047UL,   3144291140UL, 2967054694UL, 1908581481UL,
+    3224448918UL, 401536UL,     443357841UL,  2027561994UL, 1537508403UL, 2506987369UL, 1560950148UL, 2875104397UL,
+    1506937053UL, 3798272845UL, 3116048461UL, 160378033UL,  2438718522UL, 736210198UL,  3365439625UL, 2278351154UL,
+    561036801UL,  3330923183UL, 825550636UL,  3959216710UL, 2440948761UL, 2700818580UL, 3320963252UL, 131950600UL,
+    1978649638UL, 1739822582UL, 2304234410UL, 2191343525UL, 3732254872UL, 732908516UL,  147757615UL,  3970302229UL,
+    3506706227UL, 3064803076UL, 3396078719UL, 2517070938UL, 552147049UL,  3642924575UL, 2821979720UL, 2209378946UL,
+    1071427375UL, 3478488438UL, 1842516846UL, 696783850UL,  1119494316UL, 1232126338UL, 4209749012UL, 2391088815UL,
+    3243007983UL, 130781611UL,  2243456025UL, 3970328528UL, 60294498UL,   938388139UL,  1106358634UL, 537365433UL,
+    795021717UL,  1214022556UL, 1778479908UL, 1945108003UL, 3288107712UL, 1836094338UL, 2730055161UL, 255155218UL,
+    2304754298UL, 2267340755UL, 3467451707UL, 1467454055UL, 3610028242UL, 2841200548UL, 1907668169UL, 1339053590UL,
+    3658007451UL, 218249389UL,  421863898UL,  1951235396UL, 3902968307UL, 3809063731UL, 2052337762UL, 1707706338UL,
+    4111629948UL, 490566488UL,  233769921UL,  4080988588UL, 3494067488UL, 777223705UL,  631224848UL,  1221523215UL,
+    1152298979UL, 1244323548UL, 801404825UL,  3678325685UL, 735258311UL,  3707929884UL, 2712511690UL, 2581500804UL,
+    3157129655UL, 3360178598UL, 3151121432UL, 536596967UL,  2486881994UL, 298786624UL,  2582484556UL, 1875348725UL,
+    1595429422UL, 803680316UL,  2027978669UL, 1083920685UL, 3222194420UL, 50444354UL,   673303215UL,  1194446699UL,
+    1754656939UL, 642108059UL,  2134440462UL, 111548800UL,  1514650939UL, 2465242182UL, 2110408580UL, 2356087269UL,
+    3184193134UL, 844662619UL,  941202466UL,  380605128UL,  1296191579UL, 746098630UL,  2130937102UL, 1419682395UL,
+    3496021374UL, 2283905894UL, 1387216191UL, 3460286509UL, 2870406174UL, 4012323145UL, 4124719858UL, 3387760945UL,
+    546715408UL,  3402640430UL, 4153941341UL, 1083917850UL, 1998708103UL, 2659523317UL, 4063192439UL, 423317653UL,
+    1779150119UL, 3895823347UL, 1829850762UL, 3921243758UL, 3421670902UL, 1591463113UL, 1734224101UL, 3395962185UL,
+    3649706167UL, 29610842UL,   1760316165UL, 3112416690UL, 1116170584UL, 3479631662UL, 2991641026UL, 581310683UL,
+    3543999966UL, 1450186498UL, 765779755UL,  187455541UL,  429772808UL,  248661875UL,  1654419085UL, 3669224592UL,
+    2009174313UL, 552701935UL,  3947298461UL, 1885032782UL, 1015823865UL, 1425676677UL, 1671866142UL, 2893616113UL,
+    1441758067UL, 1331712784UL, 2020496122UL, 621865075UL,  2588871810UL, 721600776UL,  4086675029UL, 2795431204UL,
+    493256027UL,  2858765550UL, 2189823827UL, 383431759UL,  3520814914UL, 1885832114UL, 1391748634UL, 2065036614UL,
+    38063911UL,   4047295175UL, 2481178400UL, 2304510549UL, 3027519394UL, 2343838524UL, 4193302731UL, 3877142474UL,
+    3220699401UL, 3685291656UL, 4031155809UL, 3313264602UL, 2874563752UL, 3819519203UL, 2450665353UL, 3158603019UL,
+    40612084UL,   1027570605UL, 3707658851UL, 1600379251UL, 2712862410UL, 840579835UL,  2742386774UL, 259579680UL,
+    689544634UL,  4285481905UL, 4069587006UL, 4102091873UL, 829317477UL,  4028142817UL, 3299542613UL, 3591524386UL,
+    1030112550UL, 78991824UL,   3460998121UL, 1058779325UL, 4245949286UL, 189344881UL,  1251523544UL, 4002850389UL,
+    23182235UL,   334193049UL,  2059888532UL, 785573330UL,  2888533102UL, 444221962UL,  2485039980UL, 1043203360UL,
+    368864150UL,  2245044175UL, 703389995UL,  1364555916UL, 843956823UL,  3216632384UL, 778733917UL,  3542973711UL,
+    1713416001UL, 2325624206UL, 3174572173UL, 999906507UL,  3277781105UL, 421204869UL,  527900297UL,  3014442181UL,
+    4084051691UL, 3999240918UL, 2759281952UL, 2168581413UL, 1475004242UL, 3599493262UL, 2884715323UL, 1502015269UL,
+    3524735952UL, 888134023UL,  1641387944UL, 212257518UL,  2830757078UL, 4194402804UL, 844978862UL,  1736493411UL,
+    4257607377UL, 1004157061UL, 843454319UL,  3146889593UL, 4002795045UL, 2482369320UL, 715766022UL,  1955365617UL,
+    134077171UL,  215501931UL,  4114622456UL, 409517438UL,  3137504238UL, 643266577UL,  3154151625UL, 3261631541UL,
+    44000159UL,   4049025842UL, 212498664UL,  3449826714UL, 240087228UL,  468757969UL,  3518552569UL, 1529471354UL,
+    3935347293UL, 2956646808UL, 3800688841UL, 2106556074UL, 58139260UL,   847323843UL,  2047542460UL, 1772526829UL,
+    94803326UL,   3146417467UL, 2613080044UL, 1187498251UL, 3018450487UL, 1217172033UL, 397244002UL,  3135364648UL,
+    555785399UL,  1922444428UL, 3054940281UL, 3347458224UL, 3716562768UL, 1667866610UL, 2542819329UL, 3049545374UL,
+    1830284373UL, 1154202154UL, 3260323124UL, 474591877UL,  3735307511UL, 3773459816UL, 1087401929UL, 587854919UL,
+    2562048722UL, 2518499054UL, 1410667389UL, 3389635608UL, 500795072UL,  1810515605UL, 3583003003UL, 2320103000UL,
+    4130499470UL, 412619417UL,  4117183625UL, 1783968877UL, 1200359739UL, 2430975719UL, 4121932145UL, 4260126416UL,
+    1803194956UL, 2301003627UL, 4029593813UL, 4147916130UL, 3303717734UL, 675741955UL,  2843078151UL, 2014302621UL,
+    919191171UL,  3283758457UL, 3720907585UL, 2605927183UL, 4287615721UL, 3312845240UL, 4127261775UL, 3867093087UL,
+    2255961494UL, 356440407UL,  4112202014UL, 453864127UL,  1802785313UL, 425165869UL,  1554634572UL, 3274132871UL,
+    1079629919UL, 878084589UL,  1239863576UL, 1672759985UL, 4086079618UL, 2565619622UL, 404121706UL,  3701154789UL,
+    1202725528UL, 4195109930UL, 2780548509UL, 3566607188UL, 3119926985UL, 1678469735UL, 3444949198UL, 3337357871UL,
+    3981360784UL, 1455028603UL, 3875850048UL, 2298753857UL, 762929402UL,  1755337831UL, 1963901145UL, 190045791UL,
+    309096583UL,  4062495363UL, 2437488049UL, 1263482329UL, 1789456593UL, 2397790561UL, 3230633929UL, 1686034101UL,
+    788857323UL,  3901852921UL, 2802787548UL, 972056049UL,  2928383101UL, 1759044950UL, 3013274823UL, 1695365251UL,
+    2604322143UL, 3556094207UL, 544992449UL,  4159165043UL, 1275079608UL, 3681325089UL, 3230909518UL, 999953415UL,
+    3574602572UL, 2284648019UL, 649686234UL,  4249861994UL, 3267329189UL, 3308330527UL, 2017619726UL, 1412328499UL,
+    3458740375UL, 3475180553UL, 3059472297UL, 2832716121UL, 2953884791UL, 3500692874UL, 370379592UL,  2797175696UL,
+    3511160643UL, 541046779UL,  1975601810UL, 940998879UL,  1894131847UL, 3463479237UL, 3424037360UL, 2012625293UL,
+    3314751529UL, 2773804701UL, 3823235405UL, 4240681227UL, 4253002636UL, 2788404819UL, 2835343599UL, 126652504UL,
+    1633161856UL, 3084969156UL, 4068234852UL, 1772405575UL, 691411348UL,  1174719276UL, 3053029780UL, 880201613UL,
+    379737951UL,  209341010UL,  381908984UL,  1728032588UL, 2079760610UL, 2760909682UL, 2950270037UL, 3988322028UL,
+    4111161357UL, 3635707486UL, 49640581UL,   1560007537UL, 3554255922UL, 3335204671UL, 3434369960UL, 2801372955UL,
+    3113030984UL, 3933150210UL, 2257792042UL, 2430956142UL, 3334467611UL, 1992955109UL, 2188078556UL, 1957783367UL,
+    1596555052UL, 3937599423UL, 3204887374UL, 2150381453UL, 1403954794UL, 891115841UL,  2998749924UL, 3392319817UL,
+    3024282190UL, 1360735378UL, 188840901UL,  2477250810UL, 4240395582UL, 1782159212UL, 1112959051UL, 4263911600UL,
+    2668878600UL, 103375198UL,  84961966UL,   4181850860UL, 166574532UL,  522044435UL,  1931951025UL, 3007824392UL,
+    675815567UL,  3638576136UL, 1936493933UL, 1343945911UL, 1280127559UL, 2618589539UL, 3945440097UL, 4028943508UL,
+    3077647472UL, 2406637323UL, 3768562633UL, 2307889353UL, 2344877247UL, 1357905613UL, 2994593956UL, 2323826533UL,
+    3744841075UL, 2224626934UL, 4107833210UL, 3593824410UL, 2198459817UL, 2087378194UL, 1292055606UL, 968837134UL,
+    2799020296UL, 3669721235UL, 660671748UL,  728969127UL,  169216020UL,  159403472UL,  370363967UL,  1499845180UL,
+};
+} // namespace helper
diff --git a/fastlanes/generated/cuda/normal_t32_uf1/cuda_normal_t32_1024_uf1_unpack_src.cu b/fastlanes/generated/cuda/normal_t32_uf1/cuda_normal_t32_1024_uf1_unpack_src.cu
new file mode 100644
index 0000000..e69de29
diff --git a/fastlanes/generated/cuda/normal_t32_uf1/cuda_normal_t32_1024_uf1_unpack_test.cu b/fastlanes/generated/cuda/normal_t32_uf1/cuda_normal_t32_1024_uf1_unpack_test.cu
new file mode 100644
index 0000000..fd18a68
--- /dev/null
+++ b/fastlanes/generated/cuda/normal_t32_uf1/cuda_normal_t32_1024_uf1_unpack_test.cu
@@ -0,0 +1,366 @@
+// generated!
+#include "cuda_normal_t32_1024_uf1_unpack_helper.hpp"
+#include "fls_gen/pack/pack.hpp"
+#include "fls_gen/unpack/unpack.cuh"
+#include "gtest/gtest.h"
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <fastlanes.cuh>
+// Fixture for the generated 32-bit unpack tests; owns the host buffers and the device output buffer.
+class cuda_normal_t32_1024_uf1_unpack : public ::testing::Test {
+public:
+	uint64_t  warp_sz {};
+	uint64_t  n_vec {};
+	uint64_t  vec_sz {};
+	uint64_t  n_tup {1024}; // number of values in one test vector
+	uint64_t  v_blc_sz {};
+	uint64_t  n_blc {1};    // grid dimension the tests launch with
+	uint64_t  n_trd {32};   // block dimension the tests launch with
+	uint32_t* d_decoded_arr {nullptr}; // device buffer the kernel writes to
+	uint32_t* h_decoded_arr {nullptr}; // host copy of d_decoded_arr
+	uint32_t* packed32 {nullptr};      // host buffer the tests pack into
+	uint32_t* unpacked32 {nullptr};
+	uint32_t* d_encoded_arr {nullptr}; // NOTE(review): set per test from load_arr(); never freed here, confirm ownership
+	void SetUp() override {
+		packed32      = new uint32_t[n_tup]();
+		unpacked32    = new uint32_t[n_tup]();
+		h_decoded_arr = new uint32_t[n_tup]();
+		CUDA_SAFE_CALL(cudaMalloc((void**)&d_decoded_arr, sizeof(uint32_t) * n_tup));
+	}
+	void TearDown() override { // release what SetUp allocated; gtest runs this after every test
+		CUDA_SAFE_CALL(cudaFree(d_decoded_arr));
+		delete[] packed32;
+		delete[] unpacked32;
+		delete[] h_decoded_arr;
+	}
+};
+TEST_F(cuda_normal_t32_1024_uf1_unpack, test_0_bw_0_ow_32) { // bit width 0: pack on host, unpack on GPU, compare
+	generated::pack::fallback::scalar::pack(helper::rand_arr_0_b0_w32_arr, packed32, 0);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 0);
+	CUDA_SAFE_CALL(cudaGetLastError()); // a launch returns no status; surface launch failures here
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) {
+		ASSERT_EQ(helper::rand_arr_0_b0_w32_arr[i], h_decoded_arr[i]) << "mismatch at index " << i;
+	}
+}
+TEST_F(cuda_normal_t32_1024_uf1_unpack, test_1_bw_1_ow_32) { // bit width 1: pack on host, unpack on GPU, compare
+	generated::pack::fallback::scalar::pack(helper::rand_arr_1_b1_w32_arr, packed32, 1);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 1);
+	CUDA_SAFE_CALL(cudaGetLastError()); // a launch returns no status; surface launch failures here
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) {
+		ASSERT_EQ(helper::rand_arr_1_b1_w32_arr[i], h_decoded_arr[i]) << "mismatch at index " << i;
+	}
+}
+TEST_F(cuda_normal_t32_1024_uf1_unpack, test_2_bw_2_ow_32) { // bit width 2: pack on host, unpack on GPU, compare
+	generated::pack::fallback::scalar::pack(helper::rand_arr_2_b2_w32_arr, packed32, 2);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 2);
+	CUDA_SAFE_CALL(cudaGetLastError()); // a launch returns no status; surface launch failures here
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) {
+		ASSERT_EQ(helper::rand_arr_2_b2_w32_arr[i], h_decoded_arr[i]) << "mismatch at index " << i;
+	}
+}
+TEST_F(cuda_normal_t32_1024_uf1_unpack, test_3_bw_3_ow_32) { // bit width 3: pack on host, unpack on GPU, compare
+	generated::pack::fallback::scalar::pack(helper::rand_arr_3_b3_w32_arr, packed32, 3);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 3);
+	CUDA_SAFE_CALL(cudaGetLastError()); // a launch returns no status; surface launch failures here
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) {
+		ASSERT_EQ(helper::rand_arr_3_b3_w32_arr[i], h_decoded_arr[i]) << "mismatch at index " << i;
+	}
+}
+TEST_F(cuda_normal_t32_1024_uf1_unpack, test_4_bw_4_ow_32) { // bit width 4: pack on host, unpack on GPU, compare
+	generated::pack::fallback::scalar::pack(helper::rand_arr_4_b4_w32_arr, packed32, 4);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 4);
+	CUDA_SAFE_CALL(cudaGetLastError()); // a launch returns no status; surface launch failures here
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) {
+		ASSERT_EQ(helper::rand_arr_4_b4_w32_arr[i], h_decoded_arr[i]) << "mismatch at index " << i;
+	}
+}
+TEST_F(cuda_normal_t32_1024_uf1_unpack, test_5_bw_5_ow_32) { // bit width 5: pack on host, unpack on GPU, compare
+	generated::pack::fallback::scalar::pack(helper::rand_arr_5_b5_w32_arr, packed32, 5);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 5);
+	CUDA_SAFE_CALL(cudaGetLastError()); // a launch returns no status; surface launch failures here
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) {
+		ASSERT_EQ(helper::rand_arr_5_b5_w32_arr[i], h_decoded_arr[i]) << "mismatch at index " << i;
+	}
+}
+TEST_F(cuda_normal_t32_1024_uf1_unpack, test_6_bw_6_ow_32) { // bit width 6: pack on host, unpack on GPU, compare
+	generated::pack::fallback::scalar::pack(helper::rand_arr_6_b6_w32_arr, packed32, 6);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 6);
+	CUDA_SAFE_CALL(cudaGetLastError()); // a launch returns no status; surface launch failures here
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) {
+		ASSERT_EQ(helper::rand_arr_6_b6_w32_arr[i], h_decoded_arr[i]) << "mismatch at index " << i;
+	}
+}
+TEST_F(cuda_normal_t32_1024_uf1_unpack, test_7_bw_7_ow_32) { // bit width 7: pack on host, unpack on GPU, compare
+	generated::pack::fallback::scalar::pack(helper::rand_arr_7_b7_w32_arr, packed32, 7);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 7);
+	CUDA_SAFE_CALL(cudaGetLastError()); // a launch returns no status; surface launch failures here
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) {
+		ASSERT_EQ(helper::rand_arr_7_b7_w32_arr[i], h_decoded_arr[i]) << "mismatch at index " << i;
+	}
+}
+TEST_F(cuda_normal_t32_1024_uf1_unpack, test_8_bw_8_ow_32) { // bit width 8: pack on host, unpack on GPU, compare
+	generated::pack::fallback::scalar::pack(helper::rand_arr_8_b8_w32_arr, packed32, 8);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 8);
+	CUDA_SAFE_CALL(cudaGetLastError()); // a launch returns no status; surface launch failures here
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) {
+		ASSERT_EQ(helper::rand_arr_8_b8_w32_arr[i], h_decoded_arr[i]) << "mismatch at index " << i;
+	}
+}
+TEST_F(cuda_normal_t32_1024_uf1_unpack, test_9_bw_9_ow_32) { // bit width 9: pack on host, unpack on GPU, compare
+	generated::pack::fallback::scalar::pack(helper::rand_arr_9_b9_w32_arr, packed32, 9);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 9);
+	CUDA_SAFE_CALL(cudaGetLastError()); // a launch returns no status; surface launch failures here
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) {
+		ASSERT_EQ(helper::rand_arr_9_b9_w32_arr[i], h_decoded_arr[i]) << "mismatch at index " << i;
+	}
+}
+TEST_F(cuda_normal_t32_1024_uf1_unpack, test_10_bw_10_ow_32) { // bit width 10: pack on host, unpack on GPU, compare
+	generated::pack::fallback::scalar::pack(helper::rand_arr_10_b10_w32_arr, packed32, 10);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 10);
+	CUDA_SAFE_CALL(cudaGetLastError()); // a launch returns no status; surface launch failures here
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) {
+		ASSERT_EQ(helper::rand_arr_10_b10_w32_arr[i], h_decoded_arr[i]) << "mismatch at index " << i;
+	}
+}
+TEST_F(cuda_normal_t32_1024_uf1_unpack, test_11_bw_11_ow_32) { // bit width 11: pack on host, unpack on GPU, compare
+	generated::pack::fallback::scalar::pack(helper::rand_arr_11_b11_w32_arr, packed32, 11);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 11);
+	CUDA_SAFE_CALL(cudaGetLastError()); // a launch returns no status; surface launch failures here
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) {
+		ASSERT_EQ(helper::rand_arr_11_b11_w32_arr[i], h_decoded_arr[i]) << "mismatch at index " << i;
+	}
+}
+TEST_F(cuda_normal_t32_1024_uf1_unpack, test_12_bw_12_ow_32) { // bit width 12: pack on host, unpack on GPU, compare
+	generated::pack::fallback::scalar::pack(helper::rand_arr_12_b12_w32_arr, packed32, 12);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 12);
+	CUDA_SAFE_CALL(cudaGetLastError()); // a launch returns no status; surface launch failures here
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) {
+		ASSERT_EQ(helper::rand_arr_12_b12_w32_arr[i], h_decoded_arr[i]) << "mismatch at index " << i;
+	}
+}
+TEST_F(cuda_normal_t32_1024_uf1_unpack, test_13_bw_13_ow_32) { // bit width 13: pack on host, unpack on GPU, compare
+	generated::pack::fallback::scalar::pack(helper::rand_arr_13_b13_w32_arr, packed32, 13);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 13);
+	CUDA_SAFE_CALL(cudaGetLastError()); // a launch returns no status; surface launch failures here
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) {
+		ASSERT_EQ(helper::rand_arr_13_b13_w32_arr[i], h_decoded_arr[i]) << "mismatch at index " << i;
+	}
+}
+TEST_F(cuda_normal_t32_1024_uf1_unpack, test_14_bw_14_ow_32) { // bit width 14: pack on host, unpack on GPU, compare
+	generated::pack::fallback::scalar::pack(helper::rand_arr_14_b14_w32_arr, packed32, 14);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 14);
+	CUDA_SAFE_CALL(cudaGetLastError()); // a launch returns no status; surface launch failures here
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) {
+		ASSERT_EQ(helper::rand_arr_14_b14_w32_arr[i], h_decoded_arr[i]) << "mismatch at index " << i;
+	}
+}
+TEST_F(cuda_normal_t32_1024_uf1_unpack, test_15_bw_15_ow_32) { // bit width 15: pack on host, unpack on GPU, compare
+	generated::pack::fallback::scalar::pack(helper::rand_arr_15_b15_w32_arr, packed32, 15);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 15);
+	CUDA_SAFE_CALL(cudaGetLastError()); // a launch returns no status; surface launch failures here
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) {
+		ASSERT_EQ(helper::rand_arr_15_b15_w32_arr[i], h_decoded_arr[i]) << "mismatch at index " << i;
+	}
+}
+TEST_F(cuda_normal_t32_1024_uf1_unpack, test_16_bw_16_ow_32) { // bit width 16: pack on host, unpack on GPU, compare
+	generated::pack::fallback::scalar::pack(helper::rand_arr_16_b16_w32_arr, packed32, 16);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 16);
+	CUDA_SAFE_CALL(cudaGetLastError()); // a launch returns no status; surface launch failures here
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) {
+		ASSERT_EQ(helper::rand_arr_16_b16_w32_arr[i], h_decoded_arr[i]) << "mismatch at index " << i;
+	}
+}
+TEST_F(cuda_normal_t32_1024_uf1_unpack, test_17_bw_17_ow_32) { // bit width 17: pack on host, unpack on GPU, compare
+	generated::pack::fallback::scalar::pack(helper::rand_arr_17_b17_w32_arr, packed32, 17);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 17);
+	CUDA_SAFE_CALL(cudaGetLastError()); // a launch returns no status; surface launch failures here
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) {
+		ASSERT_EQ(helper::rand_arr_17_b17_w32_arr[i], h_decoded_arr[i]) << "mismatch at index " << i;
+	}
+}
+TEST_F(cuda_normal_t32_1024_uf1_unpack, test_18_bw_18_ow_32) { // bit width 18: pack on host, unpack on GPU, compare
+	generated::pack::fallback::scalar::pack(helper::rand_arr_18_b18_w32_arr, packed32, 18);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 18);
+	CUDA_SAFE_CALL(cudaGetLastError()); // a launch returns no status; surface launch failures here
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) {
+		ASSERT_EQ(helper::rand_arr_18_b18_w32_arr[i], h_decoded_arr[i]) << "mismatch at index " << i;
+	}
+}
+TEST_F(cuda_normal_t32_1024_uf1_unpack, test_19_bw_19_ow_32) { // bit width 19: pack on host, unpack on GPU, compare
+	generated::pack::fallback::scalar::pack(helper::rand_arr_19_b19_w32_arr, packed32, 19);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 19);
+	CUDA_SAFE_CALL(cudaGetLastError()); // a launch returns no status; surface launch failures here
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) {
+		ASSERT_EQ(helper::rand_arr_19_b19_w32_arr[i], h_decoded_arr[i]) << "mismatch at index " << i;
+	}
+}
+TEST_F(cuda_normal_t32_1024_uf1_unpack, test_20_bw_20_ow_32) { // bit width 20: pack on host, unpack on GPU, compare
+	generated::pack::fallback::scalar::pack(helper::rand_arr_20_b20_w32_arr, packed32, 20);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 20);
+	CUDA_SAFE_CALL(cudaGetLastError()); // a launch returns no status; surface launch failures here
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) {
+		ASSERT_EQ(helper::rand_arr_20_b20_w32_arr[i], h_decoded_arr[i]) << "mismatch at index " << i;
+	}
+}
+TEST_F(cuda_normal_t32_1024_uf1_unpack, test_21_bw_21_ow_32) { // bit width 21: pack on host, unpack on GPU, compare
+	generated::pack::fallback::scalar::pack(helper::rand_arr_21_b21_w32_arr, packed32, 21);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 21);
+	CUDA_SAFE_CALL(cudaGetLastError()); // a launch returns no status; surface launch failures here
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) {
+		ASSERT_EQ(helper::rand_arr_21_b21_w32_arr[i], h_decoded_arr[i]) << "mismatch at index " << i;
+	}
+}
+TEST_F(cuda_normal_t32_1024_uf1_unpack, test_22_bw_22_ow_32) { // bit width 22: pack on host, unpack on GPU, compare
+	generated::pack::fallback::scalar::pack(helper::rand_arr_22_b22_w32_arr, packed32, 22);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 22);
+	CUDA_SAFE_CALL(cudaGetLastError()); // a launch returns no status; surface launch failures here
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) {
+		ASSERT_EQ(helper::rand_arr_22_b22_w32_arr[i], h_decoded_arr[i]) << "mismatch at index " << i;
+	}
+}
+TEST_F(cuda_normal_t32_1024_uf1_unpack, test_23_bw_23_ow_32) { // bit width 23: pack on host, unpack on GPU, compare
+	generated::pack::fallback::scalar::pack(helper::rand_arr_23_b23_w32_arr, packed32, 23);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 23);
+	CUDA_SAFE_CALL(cudaGetLastError()); // a launch returns no status; surface launch failures here
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) {
+		ASSERT_EQ(helper::rand_arr_23_b23_w32_arr[i], h_decoded_arr[i]) << "mismatch at index " << i;
+	}
+}
+TEST_F(cuda_normal_t32_1024_uf1_unpack, test_24_bw_24_ow_32) { // bit width 24: pack on host, unpack on GPU, compare
+	generated::pack::fallback::scalar::pack(helper::rand_arr_24_b24_w32_arr, packed32, 24);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 24);
+	CUDA_SAFE_CALL(cudaGetLastError()); // a launch returns no status; surface launch failures here
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) {
+		ASSERT_EQ(helper::rand_arr_24_b24_w32_arr[i], h_decoded_arr[i]) << "mismatch at index " << i;
+	}
+}
+TEST_F(cuda_normal_t32_1024_uf1_unpack, test_25_bw_25_ow_32) { // bit width 25: pack on host, unpack on GPU, compare
+	generated::pack::fallback::scalar::pack(helper::rand_arr_25_b25_w32_arr, packed32, 25);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 25);
+	CUDA_SAFE_CALL(cudaGetLastError()); // a launch returns no status; surface launch failures here
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) {
+		ASSERT_EQ(helper::rand_arr_25_b25_w32_arr[i], h_decoded_arr[i]) << "mismatch at index " << i;
+	}
+}
+TEST_F(cuda_normal_t32_1024_uf1_unpack, test_26_bw_26_ow_32) { // bit width 26: pack on host, unpack on GPU, compare
+	generated::pack::fallback::scalar::pack(helper::rand_arr_26_b26_w32_arr, packed32, 26);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 26);
+	CUDA_SAFE_CALL(cudaGetLastError()); // a launch returns no status; surface launch failures here
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) {
+		ASSERT_EQ(helper::rand_arr_26_b26_w32_arr[i], h_decoded_arr[i]) << "mismatch at index " << i;
+	}
+}
+TEST_F(cuda_normal_t32_1024_uf1_unpack, test_27_bw_27_ow_32) { // bit width 27: pack on host, unpack on GPU, compare
+	generated::pack::fallback::scalar::pack(helper::rand_arr_27_b27_w32_arr, packed32, 27);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 27);
+	CUDA_SAFE_CALL(cudaGetLastError()); // a launch returns no status; surface launch failures here
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) {
+		ASSERT_EQ(helper::rand_arr_27_b27_w32_arr[i], h_decoded_arr[i]) << "mismatch at index " << i;
+	}
+}
+TEST_F(cuda_normal_t32_1024_uf1_unpack, test_28_bw_28_ow_32) { // bit width 28: pack on host, unpack on GPU, compare
+	generated::pack::fallback::scalar::pack(helper::rand_arr_28_b28_w32_arr, packed32, 28);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 28);
+	CUDA_SAFE_CALL(cudaGetLastError()); // a launch returns no status; surface launch failures here
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) {
+		ASSERT_EQ(helper::rand_arr_28_b28_w32_arr[i], h_decoded_arr[i]) << "mismatch at index " << i;
+	}
+}
+TEST_F(cuda_normal_t32_1024_uf1_unpack, test_29_bw_29_ow_32) { // bit width 29: pack on host, unpack on GPU, compare
+	generated::pack::fallback::scalar::pack(helper::rand_arr_29_b29_w32_arr, packed32, 29);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 29);
+	CUDA_SAFE_CALL(cudaGetLastError()); // a launch returns no status; surface launch failures here
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) {
+		ASSERT_EQ(helper::rand_arr_29_b29_w32_arr[i], h_decoded_arr[i]) << "mismatch at index " << i;
+	}
+}
+TEST_F(cuda_normal_t32_1024_uf1_unpack, test_30_bw_30_ow_32) { // bit width 30: pack on host, unpack on GPU, compare
+	generated::pack::fallback::scalar::pack(helper::rand_arr_30_b30_w32_arr, packed32, 30);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 30);
+	CUDA_SAFE_CALL(cudaGetLastError()); // a launch returns no status; surface launch failures here
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) {
+		ASSERT_EQ(helper::rand_arr_30_b30_w32_arr[i], h_decoded_arr[i]) << "mismatch at index " << i;
+	}
+}
+TEST_F(cuda_normal_t32_1024_uf1_unpack, test_31_bw_31_ow_32) { // bit width 31: pack on host, unpack on GPU, compare
+	generated::pack::fallback::scalar::pack(helper::rand_arr_31_b31_w32_arr, packed32, 31);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 31);
+	CUDA_SAFE_CALL(cudaGetLastError()); // a launch returns no status; surface launch failures here
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) {
+		ASSERT_EQ(helper::rand_arr_31_b31_w32_arr[i], h_decoded_arr[i]) << "mismatch at index " << i;
+	}
+}
+TEST_F(cuda_normal_t32_1024_uf1_unpack, test_32_bw_32_ow_32) { // bit width 32: pack on host, unpack on GPU, compare
+	generated::pack::fallback::scalar::pack(helper::rand_arr_32_b32_w32_arr, packed32, 32);
+	d_encoded_arr = fastlanes::gpu::load_arr(packed32, 32 * 1024 / 8);
+	unpack_global<<<n_blc, n_trd>>>(d_encoded_arr, d_decoded_arr, 32);
+	CUDA_SAFE_CALL(cudaGetLastError()); // a launch returns no status; surface launch failures here
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(uint32_t) * n_tup, cudaMemcpyDeviceToHost));
+	for (uint64_t i = 0; i < n_tup; i++) {
+		ASSERT_EQ(helper::rand_arr_32_b32_w32_arr[i], h_decoded_arr[i]) << "mismatch at index " << i;
+	}
+}
diff --git a/fastlanes/generated/cuda/normal_t32_uf1/unpack.cmake b/fastlanes/generated/cuda/normal_t32_uf1/unpack.cmake
new file mode 100644
index 0000000..5e44822
--- /dev/null
+++ b/fastlanes/generated/cuda/normal_t32_uf1/unpack.cmake
@@ -0,0 +1,19 @@
+add_library(cuda_normal_t32_1024_uf1_unpack OBJECT
+        cuda_normal_t32_1024_uf1_unpack_src.cu)
+target_compile_definitions(cuda_normal_t32_1024_uf1_unpack PRIVATE IS_SCALAR)
+# ${FLAG} is not set in this file; presumably the including scope defines it.
+target_compile_options(cuda_normal_t32_1024_uf1_unpack PUBLIC ${FLAG})
+cmake_print_properties(TARGETS cuda_normal_t32_1024_uf1_unpack
+        PROPERTIES COMPILE_DEFINITIONS
+        PROPERTIES COMPILE_OPTIONS)
+LIST(APPEND FLS_GENERATED_OBJECT_FILES
+        $<TARGET_OBJECTS:cuda_normal_t32_1024_uf1_unpack>)
+get_target_property(TARGET_NAME cuda_normal_t32_1024_uf1_unpack NAME) # not read again in this file
+get_target_property(TARGET_COMPILE_OPTIONS cuda_normal_t32_1024_uf1_unpack COMPILE_OPTIONS) # not read again in this file
+#---- Test executable (links gtest_main) ---------------------------------------------------------------
+add_executable(cuda_normal_t32_1024_uf1_unpack_test cuda_normal_t32_1024_uf1_unpack_test.cu)
+target_link_libraries(cuda_normal_t32_1024_uf1_unpack_test PRIVATE cuda_normal_t32_1024_uf1_unpack)
+target_link_libraries(cuda_normal_t32_1024_uf1_unpack_test PRIVATE gtest_main fastlanes_gpu)
+#---- Benchmark executable -----------------------------------------------------------------------------
+add_executable(cuda_normal_t32_1024_uf1_unpack_bench cuda_normal_t32_1024_uf1_unpack_bench.cu)
+target_link_libraries(cuda_normal_t32_1024_uf1_unpack_bench PRIVATE cuda_normal_t32_1024_uf1_unpack fastlanes_gpu)
diff --git a/fastlanes/generated/generated_files.txt b/fastlanes/generated/generated_files.txt
new file mode 100644
index 0000000..967fa56
--- /dev/null
+++ b/fastlanes/generated/generated_files.txt
@@ -0,0 +1,8 @@
+cuda/normal_t32_uf1/cuda_normal_t32_1024_uf1_unpack_src.cu
+cuda/normal_t32_uf1/cuda_normal_t32_1024_uf1_unpack_helper.hpp
+cuda/normal_t32_uf1/cuda_normal_t32_1024_uf1_unpack_bench.cu
+cuda/normal_t32_uf1/cuda_normal_t32_1024_uf1_unpack_test.cu
+cuda/fused_t32_uf1/cuda_fused_t32_1024_uf1_unpack_src.cu
+cuda/fused_t32_uf1/cuda_fused_t32_1024_uf1_unpack_helper.hpp
+cuda/fused_t32_uf1/cuda_fused_t32_1024_uf1_unpack_bench.cu
+cuda/fused_t32_uf1/cuda_fused_t32_1024_uf1_unpack_test.cu
diff --git a/fastlanes/generated_files.txt b/fastlanes/generated_files.txt
new file mode 100644
index 0000000..8a69db8
--- /dev/null
+++ b/fastlanes/generated_files.txt
@@ -0,0 +1,4 @@
+gpu/cuda_t32_uf1/gpu_cuda_t32_1024_uf1_unpack_src.cu
+gpu/cuda_t32_uf1/gpu_cuda_t32_1024_uf1_unpack_helper.hpp
+gpu/cuda_t32_uf1/gpu_cuda_t32_1024_uf1_unpack_bench.cu
+gpu/cuda_t32_uf1/gpu_cuda_t32_1024_uf1_unpack_test.cu
diff --git a/fastlanes/src/CMakeLists.txt b/fastlanes/src/CMakeLists.txt
new file mode 100644
index 0000000..4a1ff67
--- /dev/null
+++ b/fastlanes/src/CMakeLists.txt
@@ -0,0 +1,57 @@
+# Build rules for the FastLanes GPU sources in this directory:
+#   - the fastlanes_gpu shared library,
+#   - the SSB executables under ssb/,
+#   - the stand-alone bitpack/delta executables.
+
+# ---------------------------------------------------------------------------
+# Library that every executable below links against.
+# ---------------------------------------------------------------------------
+add_library(fastlanes_gpu SHARED
+        fastlanes_gpu.cpp
+        pack.cpp
+        transpose.cpp
+        unrsum.cpp)
+
+set_target_properties(fastlanes_gpu PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+
+# PUBLIC: the include roots also apply to every target that links fastlanes_gpu.
+target_include_directories(fastlanes_gpu PUBLIC
+        include
+        crystal
+        crystal-opt)
+
+# ---------------------------------------------------------------------------
+# SSB executables: target <name> is built from ssb/<name>.cu.
+# ---------------------------------------------------------------------------
+foreach (FLS_SSB_TARGET IN ITEMS
+        compress_ssb
+        compress_ssb_sorted
+        fls_q11
+        fls_q21
+        fls_q21_bitpacked_opt_v4
+        fls_q31
+        fls_q41
+        fls_q31_bitpacked_opt_v5
+        fls_q41_bitpacked_opt_v3
+        fls_q41_bitpacked_opt_v4
+        fls_q11_bitpacked_opt_v3
+        fls_q11_bitpacked_opt_v4)
+
+    add_executable(${FLS_SSB_TARGET} ssb/${FLS_SSB_TARGET}.cu)
+    target_link_libraries(${FLS_SSB_TARGET} PUBLIC fastlanes_gpu gtest_main)
+
+endforeach ()
+
+# ---------------------------------------------------------------------------
+# Stand-alone executables: target <name> is built from <name>.cu.
+# ---------------------------------------------------------------------------
+foreach (FLS_STANDALONE_TARGET IN ITEMS
+        bitpack_shared_memory
+        bitpack_register
+        delta_shared_memory
+        delta_global_memory)
+
+    add_executable(${FLS_STANDALONE_TARGET} ${FLS_STANDALONE_TARGET}.cu)
+    target_link_libraries(${FLS_STANDALONE_TARGET} PRIVATE fastlanes_gpu gtest_main)
+
+endforeach ()
\ No newline at end of file
diff --git a/fastlanes/src/bitpack_register.cu b/fastlanes/src/bitpack_register.cu
new file mode 100644
index 0000000..4bc24ce
--- /dev/null
+++ b/fastlanes/src/bitpack_register.cu
@@ -0,0 +1,144 @@
+#include "crystal/crystal.cuh"
+#include "cub/test/test_util.h"
+#include "fls_gen/unpack/unpack_fused.cuh"
+#include "gpu_utils.h"
+#include "ssb_utils.h"
+#include <fls_gen/pack/pack.hpp>
+#include <iostream>
+#include <stdio.h>
+
+using namespace std;
+using namespace fastlanes::gpu;
+using namespace fastlanes;
+
+struct QueryMtd { // run metadata, passed by value to the kernel and to query_aggregate
+	n_t      n_vec;  // number of vectors; used as the kernel grid size (one block per vector)
+	uint8_t  bw;     // bit width the column was packed with
+	n_t      n_tup;  // total number of tuples in the column
+	uint64_t result; // expected sum; query_aggregate compares the device result against it
+};
+
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void QueryKernel(const uint32_t* encoded_col, QueryMtd query_mtd, unsigned long long* revenue) {
+	// Each block decodes one packed vector into a thread-local array, sums the decoded
+	// values, and adds the block total to *revenue.
+	// encoded_col : packed column; block b starts at word offset b * bw * 32
+	// query_mtd   : run metadata; only bw is read here
+	// revenue     : accumulator; thread 0 of each block updates it with atomicAdd
+	static __shared__ long long reduce_scratch[32];
+
+	uint32_t decoded[ITEMS_PER_THREAD];
+
+	const int packed_offset = blockIdx.x * query_mtd.bw * 32;
+	unpack_device(encoded_col + packed_offset, decoded, query_mtd.bw);
+
+	long long thread_total = 0;
+#pragma unroll
+	for (int k = 0; k < ITEMS_PER_THREAD; ++k) {
+		thread_total += decoded[k];
+	}
+
+	__syncthreads();
+
+	const unsigned long long block_total =
+	    BlockSum<long long, BLOCK_THREADS, ITEMS_PER_THREAD>(thread_total, reduce_scratch);
+	__syncthreads();
+
+	if (threadIdx.x == 0) { atomicAdd(revenue, block_total); }
+}
+
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+float query_aggregate(const uint32_t* enc_arr, QueryMtd hardcoded, cub::CachingDeviceAllocator& g_allocator) {
+	// Launches QueryKernel with one block per vector and compares the device sum with
+	// hardcoded.result (throws std::runtime_error on mismatch). Returns the milliseconds
+	// between the start/stop CUDA events; start is recorded before the allocation and memset.
+	SETUP_TIMING();
+	float                                     time_query = 0.0f;
+	chrono::high_resolution_clock::time_point st, finish;
+	st = chrono::high_resolution_clock::now();
+	CubDebugExit(cudaEventRecord(start, 0));
+	unsigned long long* d_sum = NULL;
+	CubDebugExit(g_allocator.DeviceAllocate((void**)&d_sum, sizeof(unsigned long long)));
+	CubDebugExit(cudaMemset(d_sum, 0, sizeof(unsigned long long)));
+
+	// Run
+	QueryKernel<BLOCK_THREADS, ITEMS_PER_THREAD><<<hardcoded.n_vec, BLOCK_THREADS>>>(enc_arr, hardcoded, d_sum);
+	CubDebugExit(cudaGetLastError()); // a failed launch would otherwise go unnoticed
+
+	CubDebugExit(cudaEventRecord(stop, 0));
+	CubDebugExit(cudaEventSynchronize(stop));
+	CubDebugExit(cudaEventElapsedTime(&time_query, start, stop));
+
+	unsigned long long revenue = 0;
+	CubDebugExit(cudaMemcpy(&revenue, d_sum, sizeof(revenue), cudaMemcpyDeviceToHost));
+
+	finish                             = chrono::high_resolution_clock::now();
+	std::chrono::duration<double> diff = finish - st;
+
+	double total_time_taken {diff.count() * 1000};
+	FLS_SHOW(total_time_taken)
+
+	/*Check the result*/
+	FLS_SHOW(revenue)
+	CLEANUP(d_sum); // released before the check so a mismatch does not leak the device buffer
+	if (revenue != hardcoded.result) { throw std::runtime_error("RESULT INCOREECT!"); }
+
+	return time_query;
+}
+
+n_t bitpacked_vec_n_tup(uint8_t bitdwith) {
+	/* bitdwith * 32: the number of 32-bit words one 1024-value vector occupies at that bit width. */
+	return static_cast<n_t>(bitdwith) * 32;
+}
+
+void shared_memory_bitpacking_with_aggregation() {
+	// Sweeps bit widths 0..32: fills the column with the constant (5 & mask), packs it vector
+	// by vector on the host, uploads it, and runs the aggregation `repeat` times per width.
+	constexpr uint64_t n_vec          = 256 * 1024;
+	constexpr uint64_t vec_sz         = 1024;
+	constexpr uint64_t n_tup          = vec_sz * n_vec;
+	auto*              h_org_arr      = new uint32_t[n_tup];
+	auto*              h_encoded_data = new uint32_t[n_tup];
+	size_t             repeat         = 3;
+
+	for (uint8_t bitwidth {0}; bitwidth < 33; bitwidth++) {
+		uint32_t bw = bitwidth;
+		// `(1 << bitwidth) - 1` overflows int at 31 and shifts out of range at 32; stay unsigned.
+		uint32_t mask            = bitwidth >= 32 ? 0xFFFFFFFFu : (1u << bitwidth) - 1u;
+		uint64_t encoded_arr_bsz = n_tup * sizeof(uint32_t);
+
+		FLS_SHOW(bw)
+		uint64_t sum {0};
+		/* Fill with a constant that fits in `bitwidth` bits and accumulate the expected sum. */
+		for (uint64_t i = 0; i < n_tup; i++) {
+			h_org_arr[i] = 5 & mask;
+			sum += h_org_arr[i];
+		}
+		FLS_SHOW(sum)
+
+		auto in  = h_org_arr;
+		auto out = h_encoded_data;
+		for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+			generated::pack::fallback::scalar::pack(in, out, bitwidth);
+			in  = in + vec_sz;
+			out = out + (bitwidth * 32);
+		}
+
+		auto* d_encoded_arr = gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, g_allocator);
+		CUDA_SAFE_CALL(cudaDeviceSynchronize());
+
+		QueryMtd query_mtd {n_vec, bitwidth, n_tup, sum};
+		for (size_t i {0}; i < repeat; ++i) {
+			auto time = query_aggregate<32, 32>(d_encoded_arr, query_mtd, g_allocator);
+			FLS_SHOW(time)
+		}
+		CLEANUP(d_encoded_arr)
+	}
+	delete[] h_org_arr;
+	delete[] h_encoded_data;
+}
+
+int main() {
+	/* Entry point: runs the bit-width sweep once; exceptions thrown by the sweep are not caught here. */
+	shared_memory_bitpacking_with_aggregation();
+}
\ No newline at end of file
diff --git a/fastlanes/src/bitpack_shared_memory.cu b/fastlanes/src/bitpack_shared_memory.cu
new file mode 100644
index 0000000..a5d5452
--- /dev/null
+++ b/fastlanes/src/bitpack_shared_memory.cu
@@ -0,0 +1,148 @@
+#include "crystal/crystal.cuh"
+#include "cub/test/test_util.h"
+#include "fls_gen/unpack/unpack.cuh"
+#include "gpu_utils.h"
+#include "ssb_utils.h"
+#include <fls_gen/pack/pack.hpp>
+#include <iostream>
+#include <stdio.h>
+
+using namespace std;
+using namespace fastlanes::gpu;
+using namespace fastlanes;
+
+struct QueryMtd { // run metadata, passed by value to the kernel and to query_aggregate
+	n_t      n_vec;  // number of vectors; used as the kernel grid size (one block per vector)
+	uint8_t  bw;     // bit width the column was packed with
+	n_t      n_tup;  // total number of tuples; the kernel uses it to size the last tile
+	uint64_t result; // expected sum; query_aggregate compares the device result against it
+};
+
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void QueryKernel(const uint32_t* encoded_col, QueryMtd query_mtd, unsigned long long* revenue) {
+	// Each block unpacks one packed vector into shared memory, loads it into a thread-local
+	// array with BlockLoad, sums it, and adds the block total to *revenue.
+	// NOTE(review): no __syncthreads() between unpack_device and BlockLoad; presumably each
+	// thread only reads the shared-memory slots it wrote itself - confirm.
+	static __shared__ uint32_t  unpacked[1024];
+	static __shared__ long long reduce_scratch[32];
+
+	const int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
+	uint32_t  loaded[ITEMS_PER_THREAD];
+	// The last tile may be partial; every other tile holds TILE_SIZE items.
+	const int tile_begin = blockIdx.x * TILE_SIZE;
+	const int tile_count = (query_mtd.n_tup + TILE_SIZE - 1) / TILE_SIZE;
+	const int valid_items =
+	    (blockIdx.x == tile_count - 1) ? static_cast<int>(query_mtd.n_tup - tile_begin) : TILE_SIZE;
+
+	const int packed_offset = blockIdx.x * query_mtd.bw * 32;
+	unpack_device(encoded_col + packed_offset, unpacked, query_mtd.bw);
+	BlockLoad<uint32_t, BLOCK_THREADS, ITEMS_PER_THREAD>(unpacked, loaded, valid_items);
+
+	long long thread_total = 0;
+#pragma unroll
+	for (int k = 0; k < ITEMS_PER_THREAD; ++k) {
+		thread_total += loaded[k];
+	}
+	__syncthreads();
+
+	const unsigned long long block_total =
+	    BlockSum<long long, BLOCK_THREADS, ITEMS_PER_THREAD>(thread_total, reduce_scratch);
+	__syncthreads();
+	if (threadIdx.x == 0) { atomicAdd(revenue, block_total); }
+}
+
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+float query_aggregate(const uint32_t* enc_arr, QueryMtd hardcoded, cub::CachingDeviceAllocator& g_allocator) {
+	// Launches QueryKernel with one block per vector and compares the device sum with
+	// hardcoded.result (throws std::runtime_error on mismatch). Returns the milliseconds
+	// between the start/stop CUDA events; start is recorded before the allocation and memset.
+	SETUP_TIMING();
+	float                                     time_query = 0.0f;
+	chrono::high_resolution_clock::time_point st, finish;
+	st = chrono::high_resolution_clock::now();
+	CubDebugExit(cudaEventRecord(start, 0));
+	unsigned long long* d_sum = NULL;
+	CubDebugExit(g_allocator.DeviceAllocate((void**)&d_sum, sizeof(unsigned long long)));
+	CubDebugExit(cudaMemset(d_sum, 0, sizeof(unsigned long long)));
+
+	// Run
+	QueryKernel<BLOCK_THREADS, ITEMS_PER_THREAD><<<hardcoded.n_vec, BLOCK_THREADS>>>(enc_arr, hardcoded, d_sum);
+	CubDebugExit(cudaGetLastError()); // a failed launch would otherwise go unnoticed
+
+	CubDebugExit(cudaEventRecord(stop, 0));
+	CubDebugExit(cudaEventSynchronize(stop));
+	CubDebugExit(cudaEventElapsedTime(&time_query, start, stop));
+
+	unsigned long long revenue = 0;
+	CubDebugExit(cudaMemcpy(&revenue, d_sum, sizeof(revenue), cudaMemcpyDeviceToHost));
+
+	finish                             = chrono::high_resolution_clock::now();
+	std::chrono::duration<double> diff = finish - st;
+
+	double total_time_taken {diff.count() * 1000};
+	FLS_SHOW(total_time_taken)
+
+	/*Check the result*/
+	FLS_SHOW(revenue)
+	CLEANUP(d_sum); // released before the check so a mismatch does not leak the device buffer
+	if (revenue != hardcoded.result) { throw std::runtime_error("RESULT INCOREECT!"); }
+
+	return time_query;
+}
+
+n_t bitpacked_vec_n_tup(uint8_t bitdwith) {
+	/* bitdwith * 32: the number of 32-bit words one 1024-value vector occupies at that bit width. */
+	return static_cast<n_t>(bitdwith) * 32;
+}
+
+void shared_memory_bitpacking_with_aggregation() {
+	// Sweeps bit widths 0..32: fills the column with the constant (5 & mask), packs it vector
+	// by vector on the host, uploads it, and runs the aggregation `repeat` times per width.
+	constexpr uint64_t n_vec          = 256 * 1024;
+	constexpr uint64_t vec_sz         = 1024;
+	constexpr uint64_t n_tup          = vec_sz * n_vec;
+	auto*              h_org_arr      = new uint32_t[n_tup];
+	auto*              h_encoded_data = new uint32_t[n_tup];
+	size_t             repeat         = 3;
+
+	for (uint8_t bitwidth {0}; bitwidth < 33; bitwidth++) {
+		uint32_t bw = bitwidth;
+		// `(1 << bitwidth) - 1` overflows int at 31 and shifts out of range at 32; stay unsigned.
+		uint32_t mask            = bitwidth >= 32 ? 0xFFFFFFFFu : (1u << bitwidth) - 1u;
+		uint64_t encoded_arr_bsz = n_tup * sizeof(uint32_t);
+		FLS_SHOW(bw)
+		uint64_t sum {0};
+		/* Fill with a constant that fits in `bitwidth` bits and accumulate the expected sum. */
+		for (uint64_t i = 0; i < n_tup; i++) {
+			h_org_arr[i] = 5 & mask;
+			sum += h_org_arr[i];
+		}
+		FLS_SHOW(sum)
+
+		auto in  = h_org_arr;
+		auto out = h_encoded_data;
+		for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+			generated::pack::fallback::scalar::pack(in, out, bitwidth);
+			in  = in + vec_sz;
+			out = out + (bitwidth * 32);
+		}
+
+		auto* d_encoded_arr = gpu::load_to_gpu(h_encoded_data, encoded_arr_bsz, g_allocator);
+		CUDA_SAFE_CALL(cudaDeviceSynchronize());
+
+		QueryMtd query_mtd {n_vec, bitwidth, n_tup, sum};
+		for (size_t i {0}; i < repeat; ++i) {
+			auto time = query_aggregate<32, 32>(d_encoded_arr, query_mtd, g_allocator);
+			FLS_SHOW(time)
+		}
+		CLEANUP(d_encoded_arr)
+	}
+	delete[] h_org_arr;
+	delete[] h_encoded_data;
+}
+
+int main() {
+	/* Entry point: runs the bit-width sweep once; exceptions thrown by the sweep are not caught here. */
+	shared_memory_bitpacking_with_aggregation();
+}
\ No newline at end of file
diff --git a/fastlanes/src/delta_global_memory.cu b/fastlanes/src/delta_global_memory.cu
new file mode 100644
index 0000000..8bb19d9
--- /dev/null
+++ b/fastlanes/src/delta_global_memory.cu
@@ -0,0 +1,160 @@
+#include "crystal/crystal.cuh"
+#include "cub/test/test_util.h"
+#include "fls_gen/unpack/unpack.cuh"
+#include "gpu_utils.h"
+#include "ssb_utils.h"
+#include <fls_gen/pack/pack.hpp>
+#include <fls_gen/rsum/rsum.cuh>
+#include <fls_gen/transpose/transpose.hpp>
+#include <fls_gen/unrsum/unrsum.hpp>
+#include <iostream>
+#include <stdio.h>
+
+using namespace std;
+using namespace fastlanes;
+using namespace fastlanes::gpu;
+
+struct QueryMtd { // run metadata, passed by value to the kernel and to decode
+	n_t      n_vec;  // number of vectors; used as the kernel grid size (one block per vector)
+	uint8_t  bw;     // bit width the delta column was packed with
+	n_t      n_tup;  // total number of tuples in the column
+	uint64_t result; // filled by the driver but not read by the kernel or decode in this file
+};
+
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void
+QueryKernel(const uint32_t* base_col, const uint32_t* bitpacked_col, QueryMtd query_mtd, uint32_t* out) {
+	// Each block unpacks one packed vector into shared memory, then d_rsum_32 processes it
+	// together with that vector's 32 base values and writes to this block's slice of `out`.
+	// NOTE(review): no __syncthreads() separates the two steps; presumably each thread only
+	// reads the shared-memory slots it wrote itself - confirm against both helpers.
+	static __shared__ uint32_t unpacked[1024];
+
+	const int values_per_block = BLOCK_THREADS * ITEMS_PER_THREAD;
+	const int out_offset       = blockIdx.x * values_per_block;
+	const int packed_offset    = blockIdx.x * query_mtd.bw * 32;
+	const int base_offset      = blockIdx.x * 32;
+
+	unpack_device(bitpacked_col + packed_offset, unpacked, query_mtd.bw);
+	d_rsum_32(unpacked, out + out_offset, base_col + base_offset);
+}
+
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+float decode(const uint32_t*              org_col,
+             uint32_t*                    decoded_arr,
+             const uint32_t*              base_col,
+             const uint32_t*              bitpacked_col,
+             QueryMtd                     hardcoded,
+             cub::CachingDeviceAllocator& g_allocator) {
+	// Decodes the column on the device into a temporary buffer, copies it into decoded_arr,
+	// and compares it element-wise with org_col (throws std::runtime_error on mismatch).
+	// Returns the milliseconds between the CUDA events recorded around the kernel launch.
+	SETUP_TIMING();
+	float                                     time_query = 0.0f;
+	chrono::high_resolution_clock::time_point st, finish;
+	st = chrono::high_resolution_clock::now();
+	// Buffer and check sizes follow n_tup instead of a hard-coded 1 GiB / 256 Mi tuples.
+	const size_t decoded_bsz = hardcoded.n_tup * sizeof(uint32_t);
+	uint32_t*    d_result    = NULL;
+	CubDebugExit(g_allocator.DeviceAllocate((void**)&d_result, decoded_bsz));
+	CHECK_ERROR()
+
+	cudaEventRecord(start, 0);
+	// Run
+	QueryKernel<BLOCK_THREADS, ITEMS_PER_THREAD>
+	    <<<hardcoded.n_vec, BLOCK_THREADS>>>(base_col, bitpacked_col, hardcoded, d_result);
+	CHECK_ERROR()
+
+	cudaEventRecord(stop, 0);
+	cudaEventSynchronize(stop);
+	cudaEventElapsedTime(&time_query, start, stop);
+
+	CubDebugExit(cudaMemcpy(decoded_arr, d_result, decoded_bsz, cudaMemcpyDeviceToHost));
+
+	finish                             = chrono::high_resolution_clock::now();
+	std::chrono::duration<double> diff = finish - st;
+	double total_time_taken {diff.count() * 1000};
+	FLS_SHOW(total_time_taken)
+
+	// Release the device buffer before verifying so a mismatch does not leak it.
+	CLEANUP(d_result);
+	/*Check the result*/
+	for (size_t i {0}; i < hardcoded.n_tup; ++i) {
+		if (org_col[i] != decoded_arr[i]) { throw std::runtime_error("RESULT INCOREECT!"); }
+	}
+
+	return time_query;
+}
+
+n_t bitpacked_vec_n_tup(uint8_t bitdwith) {
+	/* bitdwith * 32: the number of 32-bit words one 1024-value vector occupies at that bit width. */
+	return static_cast<n_t>(bitdwith) * 32;
+}
+
+void shared_memory_delta_with_aggregation() {
+	// Sweeps bit widths 0..32: transposes, delta-encodes (unrsum) and packs the column on the
+	// host, uploads it, and checks that the device decodes it back to the original values.
+	size_t         repeat           = 1;
+	const uint64_t n_vec            = 256 * 1024;
+	const uint64_t vec_sz           = 1024;
+	const uint64_t n_tup            = vec_sz * n_vec;
+	const uint64_t n_base           = 32 * n_vec;
+	auto*          h_org_arr        = new uint32_t[n_tup];
+	auto*          h_decoded_arr    = new uint32_t[n_tup];
+	auto*          h_encoded_data   = new uint32_t[n_tup];
+	auto*          h_transposed_arr = new uint32_t[vec_sz];
+	auto*          h_unrsummed_arr  = new uint32_t[vec_sz];
+	auto*          h_base_arr       = new uint32_t[n_base];
+	uint64_t       encoded_arr_bsz  = n_tup * sizeof(uint32_t);
+	uint32_t*      d_base_arr       = nullptr;
+	uint32_t*      d_encoded_arr    = nullptr;
+
+	for (uint8_t bitwidth {0}; bitwidth < 33; bitwidth++) {
+		uint32_t bw = bitwidth;
+		uint64_t sum {0}; // stays 0: decode() verifies element-wise and never reads QueryMtd::result
+
+		/* Constant column equal to the bit width; from bitwidth 10 on it keeps the value 9. */
+		if (bitwidth < 10) {
+			for (uint64_t i = 0; i < n_tup; i++) { h_org_arr[i] = bitwidth; }
+		}
+		FLS_SHOW(bw)
+
+		auto in_als   = h_org_arr;
+		auto out_als  = h_encoded_data;
+		auto base_als = h_base_arr;
+		for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+			generated::transpose::fallback::scalar::transpose_i(in_als, h_transposed_arr);
+			generated::unrsum::fallback::scalar::unrsum(h_transposed_arr, h_unrsummed_arr);
+			// The first 32 transposed values are stored as this vector's bases.
+			std::memcpy(base_als, h_transposed_arr, sizeof(uint32_t) * 32);
+			generated::pack::fallback::scalar::pack(h_unrsummed_arr, out_als, bitwidth);
+			in_als   = in_als + vec_sz;
+			out_als  = out_als + (bitwidth * 32);
+			base_als = base_als + 32;
+		}
+
+		d_encoded_arr = load_to_gpu(h_encoded_data, encoded_arr_bsz, g_allocator);
+		d_base_arr    = load_to_gpu(h_base_arr, n_base * sizeof(uint32_t), g_allocator);
+		CUDA_SAFE_CALL(cudaDeviceSynchronize());
+
+		QueryMtd query_mtd {n_vec, bitwidth, n_tup, sum};
+		for (size_t i {0}; i < repeat; ++i) {
+			auto time = decode<32, 32>(h_org_arr, h_decoded_arr, d_base_arr, d_encoded_arr, query_mtd, g_allocator);
+			FLS_SHOW(time)
+		}
+
+		CLEANUP(d_encoded_arr)
+		CLEANUP(d_base_arr)
+	}
+	delete[] h_org_arr;
+	delete[] h_decoded_arr;
+	delete[] h_encoded_data;
+	delete[] h_transposed_arr;
+	delete[] h_unrsummed_arr;
+	delete[] h_base_arr;
+}
+
+int main() {
+	/* Entry point: runs the bit-width sweep once; exceptions thrown by the sweep are not caught here. */
+	shared_memory_delta_with_aggregation();
+}
\ No newline at end of file
diff --git a/fastlanes/src/delta_shared_memory.cu b/fastlanes/src/delta_shared_memory.cu
new file mode 100644
index 0000000..ae22a99
--- /dev/null
+++ b/fastlanes/src/delta_shared_memory.cu
@@ -0,0 +1,180 @@
+#include "crystal/crystal.cuh"
+#include "cub/test/test_util.h"
+#include "fls_gen/unpack/unpack.cuh"
+#include "gpu_utils.h"
+#include "ssb_utils.h"
+#include <fls_gen/pack/pack.hpp>
+#include <fls_gen/rsum/rsum.cuh>
+#include <fls_gen/transpose/transpose.hpp>
+#include <fls_gen/unrsum/unrsum.hpp>
+#include <iostream>
+#include <stdio.h>
+
+using namespace std;
+using namespace fastlanes;
+using namespace fastlanes::gpu;
+
+struct QueryMtd { // run metadata, passed by value to the kernel and to query_aggregate
+	n_t      n_vec;  // number of vectors; used as the kernel grid size (one block per vector)
+	uint8_t  bw;     // bit width the delta column was packed with
+	n_t      n_tup;  // total number of tuples; the kernel uses it to size the last tile
+	uint64_t result; // expected sum; query_aggregate compares the device result against it
+};
+
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void
+QueryKernel(const uint32_t* base_col, const uint32_t* bitpacked_col, QueryMtd query_mtd, unsigned long long* revenue) {
+	// Each block unpacks one packed vector into shared memory, runs d_rsum_32 over it with that
+	// vector's 32 base values, sums the result, and adds the block total to *revenue.
+	// NOTE(review): no __syncthreads() separates unpack_device, d_rsum_32 and BlockLoad;
+	// presumably each thread only touches its own shared-memory slots - confirm.
+	static __shared__ uint32_t  unpacked[1024];
+	static __shared__ uint32_t  rsumed[1024];
+	static __shared__ long long reduce_scratch[32];
+
+	const int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
+	uint32_t  loaded[ITEMS_PER_THREAD];
+	// The last tile may be partial; every other tile holds TILE_SIZE items.
+	const int tile_begin = blockIdx.x * TILE_SIZE;
+	const int tile_count = (query_mtd.n_tup + TILE_SIZE - 1) / TILE_SIZE;
+	const int valid_items =
+	    (blockIdx.x == tile_count - 1) ? static_cast<int>(query_mtd.n_tup - tile_begin) : TILE_SIZE;
+
+	const int packed_offset = blockIdx.x * query_mtd.bw * 32;
+	unpack_device(bitpacked_col + packed_offset, unpacked, query_mtd.bw);
+	const int base_offset = blockIdx.x * 32;
+	d_rsum_32(unpacked, rsumed, base_col + base_offset);
+
+	BlockLoad<uint32_t, BLOCK_THREADS, ITEMS_PER_THREAD>(rsumed, loaded, valid_items);
+
+	long long thread_total = 0;
+#pragma unroll
+	for (int k = 0; k < ITEMS_PER_THREAD; ++k) {
+		thread_total += loaded[k];
+	}
+	__syncthreads();
+
+	const unsigned long long block_total =
+	    BlockSum<long long, BLOCK_THREADS, ITEMS_PER_THREAD>(thread_total, reduce_scratch);
+	__syncthreads();
+	if (threadIdx.x == 0) { atomicAdd(revenue, block_total); }
+}
+
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+float query_aggregate(const uint32_t*              base_col,
+                      const uint32_t*              bitpacked_col,
+                      QueryMtd                     hardcoded,
+                      cub::CachingDeviceAllocator& g_allocator) {
+	// Launches QueryKernel with one block per vector and compares the device sum with
+	// hardcoded.result (throws std::runtime_error on mismatch). Returns the milliseconds
+	// between the start/stop CUDA events; start is recorded before the allocation and memset.
+	SETUP_TIMING();
+	float                                     time_query = 0.0f;
+	chrono::high_resolution_clock::time_point st, finish;
+	st = chrono::high_resolution_clock::now();
+	CubDebugExit(cudaEventRecord(start, 0));
+	unsigned long long* d_sum = NULL;
+	CubDebugExit(g_allocator.DeviceAllocate((void**)&d_sum, sizeof(unsigned long long)));
+	CubDebugExit(cudaMemset(d_sum, 0, sizeof(unsigned long long)));
+
+	// Run
+	QueryKernel<BLOCK_THREADS, ITEMS_PER_THREAD>
+	    <<<hardcoded.n_vec, BLOCK_THREADS>>>(base_col, bitpacked_col, hardcoded, d_sum);
+	CubDebugExit(cudaGetLastError()); // a failed launch would otherwise go unnoticed
+
+	CubDebugExit(cudaEventRecord(stop, 0));
+	CubDebugExit(cudaEventSynchronize(stop));
+	CubDebugExit(cudaEventElapsedTime(&time_query, start, stop));
+
+	unsigned long long revenue = 0;
+	CubDebugExit(cudaMemcpy(&revenue, d_sum, sizeof(revenue), cudaMemcpyDeviceToHost));
+
+	finish                             = chrono::high_resolution_clock::now();
+	std::chrono::duration<double> diff = finish - st;
+
+	double total_time_taken {diff.count() * 1000};
+	FLS_SHOW(total_time_taken)
+
+	/*Check the result*/
+	FLS_SHOW(revenue)
+	CLEANUP(d_sum); // released before the check so a mismatch does not leak the device buffer
+	if (revenue != hardcoded.result) { throw std::runtime_error("RESULT INCOREECT!"); }
+
+	return time_query;
+}
+
+n_t bitpacked_vec_n_tup(uint8_t bitdwith) {
+	/* bitdwith * 32: the number of 32-bit words one 1024-value vector occupies at that bit width. */
+	return static_cast<n_t>(bitdwith) * 32;
+}
+
+void shared_memory_delta_with_aggregation() {
+	// Sweeps bit widths 0..32: transposes, delta-encodes (unrsum) and packs the column on the
+	// host, uploads it, and checks the device-side sum of the decoded column.
+	size_t         repeat           = 1;
+	const uint64_t n_vec            = 256 * 1024;
+	const uint64_t vec_sz           = 1024;
+	const uint64_t n_tup            = vec_sz * n_vec;
+	const uint64_t n_base           = 32 * n_vec;
+	auto*          h_org_arr        = new uint32_t[n_tup];
+	auto*          h_encoded_data   = new uint32_t[n_tup];
+	auto*          h_transposed_arr = new uint32_t[vec_sz];
+	auto*          h_unrsummed_arr  = new uint32_t[vec_sz];
+	auto*          h_base_arr       = new uint32_t[n_base];
+	uint64_t       encoded_arr_bsz  = n_tup * sizeof(uint32_t);
+	uint32_t*      d_base_arr       = nullptr;
+	uint32_t*      d_encoded_arr    = nullptr;
+
+	for (uint8_t bitwidth {0}; bitwidth < 33; bitwidth++) {
+		uint32_t bw = bitwidth;
+		uint64_t sum {0};
+
+		/* Constant column (value = bitwidth) below 10 bits, otherwise the ramp 0..1023 per vector. */
+		for (uint64_t i = 0; i < n_tup; i++) {
+			h_org_arr[i] = bitwidth < 10 ? bitwidth : static_cast<uint32_t>(i % 1024);
+			sum += h_org_arr[i];
+		}
+
+		FLS_SHOW(sum)
+		FLS_SHOW(bw)
+
+		auto in_als   = h_org_arr;
+		auto out_als  = h_encoded_data;
+		auto base_als = h_base_arr;
+		for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+			generated::transpose::fallback::scalar::transpose_i(in_als, h_transposed_arr);
+			generated::unrsum::fallback::scalar::unrsum(h_transposed_arr, h_unrsummed_arr);
+			// The first 32 transposed values are stored as this vector's bases.
+			std::memcpy(base_als, h_transposed_arr, sizeof(uint32_t) * 32);
+			generated::pack::fallback::scalar::pack(h_unrsummed_arr, out_als, bitwidth);
+
+			in_als   = in_als + vec_sz;
+			out_als  = out_als + (bitwidth * 32);
+			base_als = base_als + 32;
+		}
+
+		d_encoded_arr = load_to_gpu(h_encoded_data, encoded_arr_bsz, g_allocator);
+		d_base_arr    = load_to_gpu(h_base_arr, n_base * sizeof(uint32_t), g_allocator);
+		CUDA_SAFE_CALL(cudaDeviceSynchronize());
+
+		QueryMtd query_mtd {n_vec, bitwidth, n_tup, sum};
+		for (size_t i {0}; i < repeat; ++i) {
+			auto time = query_aggregate<32, 32>(d_base_arr, d_encoded_arr, query_mtd, g_allocator);
+			FLS_SHOW(time)
+		}
+
+		CLEANUP(d_encoded_arr)
+		CLEANUP(d_base_arr)
+	}
+
+	delete[] h_org_arr;
+	delete[] h_encoded_data;
+	delete[] h_transposed_arr;
+	delete[] h_unrsummed_arr;
+	delete[] h_base_arr;
+}
+
+int main() {
+	/* Entry point: runs the bit-width sweep once; exceptions thrown by the sweep are not caught here. */
+	shared_memory_delta_with_aggregation();
+}
\ No newline at end of file
diff --git a/fastlanes/src/fastlanes_gpu.cpp b/fastlanes/src/fastlanes_gpu.cpp
new file mode 100644
index 0000000..0fc2aba
--- /dev/null
+++ b/fastlanes/src/fastlanes_gpu.cpp
@@ -0,0 +1,3 @@
+//
+// Created by Azim Afroozeh on 02/05/2024.
+//
diff --git a/fastlanes/src/include/common.cuh b/fastlanes/src/include/common.cuh
new file mode 100644
index 0000000..2a0244b
--- /dev/null
+++ b/fastlanes/src/include/common.cuh
@@ -0,0 +1,12 @@
+#ifndef FLS_GPU_COMMON_CUH
+#define FLS_GPU_COMMON_CUH
+
+#include "debug.hpp"
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+using idx_t = uint64_t; // presumably an element index, judging by the name - confirm at use sites
+using bsz_t = uint64_t; // presumably a size in bytes, judging by the name - confirm at use sites
+using n_t   = uint64_t; // a count; the benchmark QueryMtd structs use it for n_vec and n_tup
+
+#endif // FLS_GPU_COMMON_CUH
diff --git a/fastlanes/src/include/crystal-opt/crystal.cuh b/fastlanes/src/include/crystal-opt/crystal.cuh
new file mode 100644
index 0000000..8246b3b
--- /dev/null
+++ b/fastlanes/src/include/crystal-opt/crystal.cuh
@@ -0,0 +1,32 @@
+// MIT License
+
+// Copyright (c) 2023 Jiashen Cao
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#pragma once
+
+// Block-wide functions
+#include "load.cuh"
+#include "pred.cuh"
+#include "store.cuh"
+#include "reduce.cuh"
+#include "join.cuh"
+#include "term.cuh"
+
diff --git a/fastlanes/src/include/crystal-opt/join.cuh b/fastlanes/src/include/crystal-opt/join.cuh
new file mode 100644
index 0000000..18ceebb
--- /dev/null
+++ b/fastlanes/src/include/crystal-opt/join.cuh
@@ -0,0 +1,334 @@
+// MIT License
+
+// Copyright (c) 2023 Jiashen Cao
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+#include <stdint.h>
+
+#pragma once
+
+#define HASH(X,Y,Z) (((X)-(Z)) % (Y)) // slot index: (key X minus offset Z) modulo table length Y
+
+// Probe a key-only hash table. For each item whose flag is set, read slot
+// HASH(item, ht_len, keys_min) and set the flag to 1 if the slot is non-zero,
+// else 0. Cleared flags are left alone; `tid` is not used by this overload.
+template<typename K, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockProbeDirectAndPHT_1(
+    int tid,
+    K  (&items)[ITEMS_PER_THREAD],
+    int  (&selection_flags)[ITEMS_PER_THREAD],
+    K* ht,
+    int ht_len,
+    K keys_min
+    ) {
+  #pragma unroll
+  for (int i = 0; i < ITEMS_PER_THREAD; i++) {
+    if (!selection_flags[i]) {
+      continue; // already filtered out: no lookup
+    }
+    const int slot_idx = HASH(items[i], ht_len, keys_min);
+    const K found = ht[slot_idx];
+    // A zero slot counts as "no match".
+    selection_flags[i] = (found != 0) ? 1 : 0;
+  }
+}
+
+// Bounded variant of the key-only probe: only positions with
+// tid + i * BLOCK_THREADS < num_items are examined. For those, a set flag
+// becomes 1 if slot HASH(item, ht_len, keys_min) is non-zero, else 0.
+template<typename K, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockProbeDirectAndPHT_1(
+    int tid,
+    K  (&items)[ITEMS_PER_THREAD],
+    int  (&selection_flags)[ITEMS_PER_THREAD],
+    K* ht,
+    int ht_len,
+    K keys_min,
+    int num_items
+    ) {
+  #pragma unroll
+  for (int i = 0; i < ITEMS_PER_THREAD; i++) {
+    const bool in_range = tid + (i * BLOCK_THREADS) < num_items;
+    if (!in_range || !selection_flags[i]) {
+      continue; // past the end of the tile, or already filtered out
+    }
+
+    const int slot_idx = HASH(items[i], ht_len, keys_min);
+    const K found = ht[slot_idx];
+    // A zero slot counts as "no match".
+    selection_flags[i] = (found != 0) ? 1 : 0;
+  }
+}
+
+// Block-level entry point for the key-only probe. A full tile
+// (num_items == BLOCK_THREADS * ITEMS_PER_THREAD) takes the unbounded overload;
+// any other num_items takes the bounds-checked overload. Uses threadIdx.x as tid.
+template<typename K, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockProbeAndPHT_1(
+    K  (&items)[ITEMS_PER_THREAD],
+    int  (&selection_flags)[ITEMS_PER_THREAD],
+    K* ht, int ht_len, K keys_min, int num_items
+    ) {
+  const int tile_items = BLOCK_THREADS * ITEMS_PER_THREAD;
+  if (num_items != tile_items) {
+    BlockProbeDirectAndPHT_1<K, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, items, selection_flags, ht, ht_len, keys_min, num_items);
+    return;
+  }
+  BlockProbeDirectAndPHT_1<K, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, items, selection_flags, ht, ht_len, keys_min);
+}
+
+// Convenience overload: runs the key-only probe with a key offset (keys_min) of 0.
+template<typename K, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockProbeAndPHT_1(
+    K  (&items)[ITEMS_PER_THREAD],
+    int  (&selection_flags)[ITEMS_PER_THREAD],
+    K* ht, int ht_len, int num_items
+    ) {
+  const K zero_offset = 0;
+  BlockProbeAndPHT_1<K, BLOCK_THREADS, ITEMS_PER_THREAD>(items, selection_flags, ht, ht_len, zero_offset, num_items);
+}
+
+// Probe a key/value hash table. For each item whose flag is set, the 64-bit
+// word starting at ht[2 * HASH(key, ht_len, keys_min)] is read: if it is
+// non-zero, res gets its upper 32 bits; if it is zero, the flag is cleared.
+// NOTE(review): the 64-bit read presumes 4-byte K and 8-byte alignment -- confirm.
+template<typename K, typename V, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockProbeDirectAndPHT_2(
+    int tid,
+    K  (&keys)[ITEMS_PER_THREAD],
+    V  (&res)[ITEMS_PER_THREAD],
+    int  (&selection_flags)[ITEMS_PER_THREAD],
+    K* ht, int ht_len, K keys_min
+    ) {
+  #pragma unroll
+  for (int i = 0; i < ITEMS_PER_THREAD; i++) {
+    if (!selection_flags[i]) { continue; } // already filtered out: no lookup
+    const int slot_idx = HASH(keys[i], ht_len, keys_min);
+    const uint64_t entry = *reinterpret_cast<uint64_t*>(&ht[slot_idx << 1]);
+    if (entry == 0) {
+      selection_flags[i] = 0; // empty entry: no match
+    } else {
+      res[i] = (entry >> 32);
+    }
+  }
+}
+
+// Bounded variant of the key/value probe: only positions with
+// tid + i * BLOCK_THREADS < num_items are examined. For a set flag, the 64-bit
+// word at ht[2 * HASH(item, ht_len, keys_min)] is read: non-zero -> res gets its
+// upper 32 bits, zero -> the flag is cleared.
+template<typename K, typename V, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockProbeDirectAndPHT_2(
+    int tid,
+    K  (&items)[ITEMS_PER_THREAD],
+    V  (&res)[ITEMS_PER_THREAD],
+    int  (&selection_flags)[ITEMS_PER_THREAD],
+    K* ht, int ht_len, K keys_min, int num_items
+    ) {
+  #pragma unroll
+  for (int i = 0; i < ITEMS_PER_THREAD; i++) {
+    const bool in_range = tid + (i * BLOCK_THREADS) < num_items;
+    if (!in_range || !selection_flags[i]) {
+      continue; // past the end of the tile, or already filtered out
+    }
+    const int slot_idx = HASH(items[i], ht_len, keys_min);
+    const uint64_t entry = *reinterpret_cast<uint64_t*>(&ht[slot_idx << 1]);
+    if (entry == 0) {
+      selection_flags[i] = 0; // empty entry: no match
+    } else {
+      res[i] = (entry >> 32);
+    }
+  }
+}
+
+// Block-level entry point for the key/value probe. A full tile
+// (num_items == BLOCK_THREADS * ITEMS_PER_THREAD) takes the unbounded overload;
+// any other num_items takes the bounds-checked overload. Uses threadIdx.x as tid.
+template<typename K, typename V, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockProbeAndPHT_2(
+    K  (&keys)[ITEMS_PER_THREAD],
+    V  (&res)[ITEMS_PER_THREAD],
+    int  (&selection_flags)[ITEMS_PER_THREAD],
+    K* ht, int ht_len, K keys_min, int num_items
+    ) {
+  const int tile_items = BLOCK_THREADS * ITEMS_PER_THREAD;
+  if (num_items != tile_items) {
+    BlockProbeDirectAndPHT_2<K, V, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, keys, res, selection_flags, ht, ht_len, keys_min, num_items);
+    return;
+  }
+  BlockProbeDirectAndPHT_2<K, V, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, keys, res, selection_flags, ht, ht_len, keys_min);
+}
+
+// Convenience overload: runs the key/value probe with a key offset (keys_min) of 0.
+template<typename K, typename V, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockProbeAndPHT_2(
+    K  (&keys)[ITEMS_PER_THREAD],
+    V  (&res)[ITEMS_PER_THREAD],
+    int  (&selection_flags)[ITEMS_PER_THREAD],
+    K* ht, int ht_len, int num_items
+    ) {
+  const K zero_offset = 0;
+  BlockProbeAndPHT_2<K, V, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, res, selection_flags, ht, ht_len, zero_offset, num_items);
+}
+
+// Insert selected keys into a key-only hash table. For each item whose flag is
+// set, atomicCAS stores the key at slot HASH(key, ht_len, keys_min) only if the
+// slot currently holds 0; an occupied slot is left as is. `tid` is unused here.
+template<typename K, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockBuildDirectSelectivePHT_1(
+    int tid,
+    K  (&keys)[ITEMS_PER_THREAD],
+    int  (&selection_flags)[ITEMS_PER_THREAD],
+    K* ht, int ht_len, K keys_min
+    ) {
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+    if (selection_flags[ITEM]) {
+      const int hash = HASH(keys[ITEM], ht_len, keys_min);
+      atomicCAS(&ht[hash], 0, keys[ITEM]); // previous slot value is not needed
+    }
+  }
+}
+
+// Bounded variant of the key-only build: only positions with
+// tid + ITEM * BLOCK_THREADS < num_items are considered. For a set flag, atomicCAS
+// stores the key at slot HASH(item, ht_len, keys_min) only if that slot holds 0.
+template<typename K, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockBuildDirectSelectivePHT_1(
+    int tid,
+    K  (&items)[ITEMS_PER_THREAD],
+    int  (&selection_flags)[ITEMS_PER_THREAD],
+    K* ht, int ht_len,
+    K keys_min, int num_items
+    ) {
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+    if (tid + (ITEM * BLOCK_THREADS) < num_items) {
+      if (selection_flags[ITEM]) {
+        const int hash = HASH(items[ITEM], ht_len, keys_min);
+        atomicCAS(&ht[hash], 0, items[ITEM]); // previous slot value is not needed
+      }
+    }
+  }
+}
+
+// Block-level entry point for the key-only build. A full tile
+// (num_items == BLOCK_THREADS * ITEMS_PER_THREAD) takes the unbounded overload;
+// any other num_items takes the bounds-checked overload. Uses threadIdx.x as tid.
+template<typename K, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockBuildSelectivePHT_1(
+    K  (&keys)[ITEMS_PER_THREAD],
+    int  (&selection_flags)[ITEMS_PER_THREAD],
+    K* ht, int ht_len, K keys_min, int num_items
+    ) {
+  const int tile_items = BLOCK_THREADS * ITEMS_PER_THREAD;
+  if (num_items != tile_items) {
+    BlockBuildDirectSelectivePHT_1<K, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, keys, selection_flags, ht, ht_len, keys_min, num_items);
+    return;
+  }
+  BlockBuildDirectSelectivePHT_1<K, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, keys, selection_flags, ht, ht_len, keys_min);
+}
+
+// Convenience overload: runs the key-only build with a key offset (keys_min) of 0.
+template<typename K, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockBuildSelectivePHT_1(
+    K  (&keys)[ITEMS_PER_THREAD],
+    int  (&selection_flags)[ITEMS_PER_THREAD],
+    K* ht, int ht_len, int num_items
+    ) {
+  const K zero_offset = 0;
+  BlockBuildSelectivePHT_1<K, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, selection_flags, ht, ht_len, zero_offset, num_items);
+}
+
+// Insert selected (key, value) pairs. For a set flag, atomicCAS stores the key at
+// ht[2 * HASH(key, ...)] if that entry holds 0; res is then stored at the next entry.
+// NOTE(review): on a slot collision key and value may come from different items.
+template<typename K, typename V, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockBuildDirectSelectivePHT_2(
+    int tid,
+    K  (&keys)[ITEMS_PER_THREAD],
+    V  (&res)[ITEMS_PER_THREAD],
+    int  (&selection_flags)[ITEMS_PER_THREAD],
+    K* ht, int ht_len, K keys_min
+    ) {
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+    if (selection_flags[ITEM]) {
+      const int key_idx = HASH(keys[ITEM], ht_len, keys_min) << 1;
+      atomicCAS(&ht[key_idx], 0, keys[ITEM]); // previous slot value is not needed
+      ht[key_idx + 1] = res[ITEM]; // plain store, done whether or not the CAS stored the key
+    }
+  }
+}
+
+// Bounded variant of the key/value build: only positions with
+// tid + ITEM * BLOCK_THREADS < num_items are considered. For a set flag, atomicCAS
+// stores the key at ht[2 * HASH(key, ...)] if it holds 0; res goes to the next entry.
+template<typename K, typename V, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockBuildDirectSelectivePHT_2(
+    int tid,
+    K  (&keys)[ITEMS_PER_THREAD],
+    V  (&res)[ITEMS_PER_THREAD],
+    int  (&selection_flags)[ITEMS_PER_THREAD],
+    K* ht, int ht_len,
+    K keys_min, int num_items
+    ) {
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+    if (tid + (ITEM * BLOCK_THREADS) < num_items) {
+      if (selection_flags[ITEM]) {
+        const int key_idx = HASH(keys[ITEM], ht_len, keys_min) << 1;
+        atomicCAS(&ht[key_idx], 0, keys[ITEM]); // previous slot value is not needed
+        ht[key_idx + 1] = res[ITEM]; // plain store, done whether or not the CAS stored the key
+      }
+    }
+  }
+}
+
+// Block-level entry point for the key/value build. A full tile
+// (num_items == BLOCK_THREADS * ITEMS_PER_THREAD) takes the unbounded overload;
+// any other num_items takes the bounds-checked overload. Uses threadIdx.x as tid.
+template<typename K, typename V, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockBuildSelectivePHT_2(
+    K  (&keys)[ITEMS_PER_THREAD],
+    V  (&res)[ITEMS_PER_THREAD],
+    int  (&selection_flags)[ITEMS_PER_THREAD],
+    K* ht, int ht_len, K keys_min, int num_items
+    ) {
+  const int tile_items = BLOCK_THREADS * ITEMS_PER_THREAD;
+  if (num_items != tile_items) {
+    BlockBuildDirectSelectivePHT_2<K, V, BLOCK_THREADS, ITEMS_PER_THREAD>(
+        threadIdx.x, keys, res, selection_flags, ht, ht_len, keys_min, num_items);
+    return;
+  }
+  BlockBuildDirectSelectivePHT_2<K, V, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      threadIdx.x, keys, res, selection_flags, ht, ht_len, keys_min);
+}
+
+// Convenience overload: runs the key/value build with a key offset (keys_min) of 0.
+template<typename K, typename V, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockBuildSelectivePHT_2(
+    K  (&keys)[ITEMS_PER_THREAD],
+    V  (&res)[ITEMS_PER_THREAD],
+    int  (&selection_flags)[ITEMS_PER_THREAD],
+    K* ht, int ht_len, int num_items
+    ) {
+  const K zero_offset = 0;
+  BlockBuildSelectivePHT_2<K, V, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, res, selection_flags, ht, ht_len, zero_offset, num_items);
+}
diff --git a/fastlanes/src/include/crystal-opt/load.cuh b/fastlanes/src/include/crystal-opt/load.cuh
new file mode 100644
index 0000000..57e0dff
--- /dev/null
+++ b/fastlanes/src/include/crystal-opt/load.cuh
@@ -0,0 +1,147 @@
+// MIT License
+
+// Copyright (c) 2023 Jiashen Cao
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#pragma once
+
+// Predicated strided load: for each slot whose selection flag is non-zero, copies
+// block_itr[tid + slot * BLOCK_THREADS] into items[slot]; other slots are not written.
+template <typename T, typename ST, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredLoadDirect(
+    const unsigned int tid, T* block_itr, T (&items)[ITEMS_PER_THREAD], ST (&selection_flags)[ITEMS_PER_THREAD]) {
+	T* const lane_base = block_itr + tid;
+#pragma unroll
+	for (int slot = 0; slot < ITEMS_PER_THREAD; slot++) {
+		if (!selection_flags[slot]) { continue; }
+		items[slot] = lane_base[slot * BLOCK_THREADS];
+	}
+}
+
+// Bounds-checked predicated load: items[slot] is read from
+// block_itr[tid + slot * BLOCK_THREADS] only when the slot's flag is non-zero and
+// tid + slot * BLOCK_THREADS < num_items; every other slot is left unwritten.
+template <typename T, typename ST, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredLoadDirect(
+    const unsigned int tid, T* block_itr, T (&items)[ITEMS_PER_THREAD], int num_items,
+    ST (&selection_flags)[ITEMS_PER_THREAD]) {
+	T* const lane_base = block_itr + tid;
+#pragma unroll
+	for (int slot = 0; slot < ITEMS_PER_THREAD; slot++) {
+		if (selection_flags[slot] && tid + (slot * BLOCK_THREADS) < num_items) {
+			items[slot] = lane_base[slot * BLOCK_THREADS];
+		}
+	}
+}
+
+// Dispatches to BlockPredLoadDirect with threadIdx.x as tid: the unbounded overload for a full
+// tile (num_items == BLOCK_THREADS * ITEMS_PER_THREAD), the bounds-checked overload otherwise.
+template <typename T, typename ST, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void
+BlockPredLoad(T* inp, T (&items)[ITEMS_PER_THREAD], int num_items, ST (&selection_flags)[ITEMS_PER_THREAD]) {
+	if (num_items != (BLOCK_THREADS * ITEMS_PER_THREAD)) {
+		BlockPredLoadDirect<T, ST, BLOCK_THREADS, ITEMS_PER_THREAD>(
+		    threadIdx.x, inp, items, num_items, selection_flags);
+		return;
+	}
+	BlockPredLoadDirect<T, ST, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, inp, items, selection_flags);
+}
+
+// Strided load without bounds checks: for every slot,
+// items[slot] = block_itr[tid + slot * BLOCK_THREADS].
+// All ITEMS_PER_THREAD slots are read unconditionally.
+template <typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockLoadDirect(const unsigned int tid, T* block_itr, T (&items)[ITEMS_PER_THREAD]) {
+	T* const lane_base = block_itr + tid;
+#pragma unroll
+	for (int slot = 0; slot < ITEMS_PER_THREAD; slot++) { items[slot] = lane_base[slot * BLOCK_THREADS]; }
+}
+
+// Bounds-checked strided load: only slots with tid + slot * BLOCK_THREADS < num_items are read.
+template <typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void
+BlockLoadDirect(const unsigned int tid, T* block_itr, T (&items)[ITEMS_PER_THREAD], int num_items) {
+	T* const lane_base = block_itr + tid;
+#pragma unroll
+	for (int slot = 0; slot < ITEMS_PER_THREAD; slot++) {
+		if (tid + (slot * BLOCK_THREADS) < num_items) { items[slot] = lane_base[slot * BLOCK_THREADS]; }
+	}
+}
+
+// Dispatches to BlockLoadDirect with threadIdx.x as tid: the unbounded overload for a full tile
+// (num_items == BLOCK_THREADS * ITEMS_PER_THREAD), the bounds-checked overload otherwise.
+template <typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockLoad(T* inp, T (&items)[ITEMS_PER_THREAD], int num_items) {
+	if (num_items != (BLOCK_THREADS * ITEMS_PER_THREAD)) {
+		BlockLoadDirect<T, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, inp, items, num_items);
+		return;
+	}
+	BlockLoadDirect<T, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, inp, items);
+}
+
+#if 0 // Disabled: the preprocessor drops everything up to the matching #endif below.
+
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockLoadDirect(
+    int tid,
+    T* block_itr,
+    T  (&items)[ITEMS_PER_THREAD]
+    ) {
+  T* thread_itr = block_itr + tid;
+
+#pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+    items[ITEM] = thread_itr[ITEM * BLOCK_THREADS];
+  }
+}
+
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockLoadDirect(
+    int tid,
+    T* block_itr,
+    T  (&items)[ITEMS_PER_THREAD] // NOTE: no comma before the next parameter; would not compile if re-enabled
+    int num_items
+    ) {
+  T* thread_itr = block_itr + tid;
+
+#pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+    if (tid + (ITEM * BLOCK_THREADS) < num_items) {
+      items[ITEM] = thread_itr[ITEM * BLOCK_THREADS];
+    }
+  }
+}
+
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockLoad(
+    T* inp,
+    T  (&items)[ITEMS_PER_THREAD] // NOTE: no comma before the next parameter; would not compile if re-enabled
+    int num_items
+    ) {
+  T* block_itr = inp + blockIdx.x * blockDim.x; // unlike the active BlockLoad above, offsets inp per block
+
+  if ((BLOCK_THREADS * ITEMS_PER_THREAD) == num_items) {
+    BlockLoadDirect(threadIdx.x, block_itr, items); // NOTE(review): BLOCK_THREADS is not deducible from these arguments
+  } else {
+    BlockLoadDirect(threadIdx.x, block_itr, items, num_items);
+  }
+}
+
+#endif // end of the disabled #if 0 region
diff --git a/fastlanes/src/include/crystal-opt/pred.cuh b/fastlanes/src/include/crystal-opt/pred.cuh
new file mode 100644
index 0000000..9c30a49
--- /dev/null
+++ b/fastlanes/src/include/crystal-opt/pred.cuh
@@ -0,0 +1,459 @@
+#pragma once
+
+#include <cstdint>
+#include <thrust/detail/cstdint.h>
+
+// Marks every per-thread slot as selected by setting all ITEMS_PER_THREAD flags to 1.
+// BLOCK_THREADS is not used in the body.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void InitFlags(int (&selection_flags)[ITEMS_PER_THREAD]) {
+#pragma unroll
+	for (int slot = 0; slot < ITEMS_PER_THREAD; slot++) { selection_flags[slot] = 1; }
+}
+
+// Overwrites every flag with select_op(item) for the matching slot; previous flag values are
+// discarded. `tid` is unused in this unbounded overload.
+template <typename T, typename ST, typename SelectOp, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void
+BlockPredDirect(int tid, T (&items)[ITEMS_PER_THREAD], SelectOp select_op, ST (&selection_flags)[ITEMS_PER_THREAD]) {
+#pragma unroll
+	for (int slot = 0; slot < ITEMS_PER_THREAD; slot++) { selection_flags[slot] = select_op(items[slot]); }
+}
+
+template <typename T, typename ST, typename SelectOp, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredDirect( // bounded overload: slots at or past num_items keep their flag
+    int tid, T (&items)[ITEMS_PER_THREAD], SelectOp select_op, ST (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+#pragma unroll
+	for (int slot = 0; slot < ITEMS_PER_THREAD; slot++) { // slot's position in the tile: tid + slot * BLOCK_THREADS
+		if (tid + (slot * BLOCK_THREADS) < num_items) { selection_flags[slot] = select_op(items[slot]); }
+	}
+}
+
+// Dispatches to BlockPredDirect with threadIdx.x as tid: the unbounded overload when num_items
+// equals BLOCK_THREADS * ITEMS_PER_THREAD, the bounds-checked overload otherwise.
+template <typename T, typename ST, typename SelectOp, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void
+BlockPred(T (&items)[ITEMS_PER_THREAD], SelectOp select_op, ST (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+	if (num_items != (BLOCK_THREADS * ITEMS_PER_THREAD)) {
+		BlockPredDirect<T, ST, SelectOp, BLOCK_THREADS, ITEMS_PER_THREAD>(
+		    threadIdx.x, items, select_op, selection_flags, num_items);
+		return;
+	}
+	BlockPredDirect<T, ST, SelectOp, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, items, select_op, selection_flags);
+}
+
+// ANDs every flag with select_op(item): flag = flag && select_op(item). Because of &&,
+// select_op is not evaluated for a slot whose flag is already zero. `tid` is unused here.
+template <typename T, typename ST, typename SelectOp, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void
+BlockPredAndDirect(int tid, T (&items)[ITEMS_PER_THREAD], SelectOp select_op, ST (&selection_flags)[ITEMS_PER_THREAD]) {
+#pragma unroll
+	for (int i = 0; i < ITEMS_PER_THREAD; i++) { selection_flags[i] = selection_flags[i] && select_op(items[i]); }
+}
+
+// Bounds-checked AND: for slots with tid + i * BLOCK_THREADS < num_items,
+// flag = flag && select_op(item). Slots outside that range keep their flag unchanged.
+template <typename T, typename ST, typename SelectOp, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredAndDirect(
+    int tid, T (&items)[ITEMS_PER_THREAD], SelectOp select_op, ST (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+#pragma unroll
+	for (int i = 0; i < ITEMS_PER_THREAD; i++) {
+		if (tid + (i * BLOCK_THREADS) < num_items) { selection_flags[i] = selection_flags[i] && select_op(items[i]); }
+	}
+}
+
+// Dispatches to BlockPredAndDirect (tid = threadIdx.x): unbounded for a full tile, else bounds-checked.
+template <typename T, typename ST, typename SelectOp, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void
+BlockPredAnd(T (&items)[ITEMS_PER_THREAD], SelectOp select_op, ST (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+	if (num_items != (BLOCK_THREADS * ITEMS_PER_THREAD)) {
+		BlockPredAndDirect<T, ST, SelectOp, BLOCK_THREADS, ITEMS_PER_THREAD>(
+		    threadIdx.x, items, select_op, selection_flags, num_items);
+		return;
+	}
+	BlockPredAndDirect<T, ST, SelectOp, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    threadIdx.x, items, select_op, selection_flags);
+}
+
+// ORs every flag with select_op(item): flag = flag || select_op(item). Because of ||,
+// select_op is not evaluated for a slot whose flag is already non-zero. `tid` is unused here.
+template <typename T, typename SelectOp, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void
+BlockPredOrDirect(int tid, T (&items)[ITEMS_PER_THREAD], SelectOp select_op, int (&selection_flags)[ITEMS_PER_THREAD]) {
+#pragma unroll
+	for (int i = 0; i < ITEMS_PER_THREAD; i++) { selection_flags[i] = selection_flags[i] || select_op(items[i]); }
+}
+
+// Bounds-checked OR: for slots with tid + i * BLOCK_THREADS < num_items,
+// flag = flag || select_op(item). Slots outside that range keep their flag unchanged.
+template <typename T, typename SelectOp, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredOrDirect(
+    int tid, T (&items)[ITEMS_PER_THREAD], SelectOp select_op,
+    int (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+#pragma unroll
+	for (int i = 0; i < ITEMS_PER_THREAD; i++) {
+		if (tid + (i * BLOCK_THREADS) < num_items) {
+			selection_flags[i] = selection_flags[i] || select_op(items[i]);
+		}
+	}
+}
+
+// Dispatches to BlockPredOrDirect (tid = threadIdx.x): unbounded for a full tile, else bounds-checked.
+template <typename T, typename SelectOp, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void
+BlockPredOr(T (&items)[ITEMS_PER_THREAD], SelectOp select_op, int (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+	if (num_items != (BLOCK_THREADS * ITEMS_PER_THREAD)) {
+		BlockPredOrDirect<T, SelectOp, BLOCK_THREADS, ITEMS_PER_THREAD>(
+		    threadIdx.x, items, select_op, selection_flags, num_items);
+		return;
+	}
+	BlockPredOrDirect<T, SelectOp, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, items, select_op, selection_flags);
+}
+
+// Unary predicate functor: operator()(a) yields (a < compare).
+template <typename T>
+struct LessThan {
+	T compare; // right-hand operand, fixed at construction
+
+	__device__ __forceinline__ LessThan(T compare) : compare(compare) {}
+
+	__device__ __forceinline__ bool operator()(const T& a) const { return a < compare; }
+};
+
+// Unary predicate functor: operator()(a) yields (a > compare).
+template <typename T>
+struct GreaterThan {
+	T compare; // right-hand operand, fixed at construction
+
+	__device__ __forceinline__ GreaterThan(T compare) : compare(compare) {}
+
+	__device__ __forceinline__ bool operator()(const T& a) const { return a > compare; }
+};
+
+// Unary predicate functor: operator()(a) yields (a <= compare).
+template <typename T>
+struct LessThanEq {
+	T compare; // right-hand operand, fixed at construction
+
+	__device__ __forceinline__ LessThanEq(T compare) : compare(compare) {}
+
+	__device__ __forceinline__ bool operator()(const T& a) const { return a <= compare; }
+};
+
+// Unary predicate functor: operator()(a) yields (a >= compare).
+template <typename T>
+struct GreaterThanEq {
+	T compare; // right-hand operand, fixed at construction
+
+	__device__ __forceinline__ GreaterThanEq(T compare) : compare(compare) {}
+
+	__device__ __forceinline__ bool operator()(const T& a) const { return a >= compare; }
+};
+
+// Unary predicate functor: operator()(a) yields (a == compare).
+template <typename T>
+struct Eq {
+	T compare; // right-hand operand, fixed at construction
+
+	__device__ __forceinline__ Eq(T compare) : compare(compare) {}
+
+	__device__ __forceinline__ bool operator()(const T& a) const { return a == compare; }
+};
+
+template <typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void // overwrites each in-range flag with (item < compare)
+BlockPredLT(T (&items)[ITEMS_PER_THREAD], T compare, int (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+	LessThan<T> select_op(compare); // BlockPred is <T, ST, SelectOp, ...>: ST (int here) must be explicit
+	BlockPred<T, int, LessThan<T>, BLOCK_THREADS, ITEMS_PER_THREAD>(items, select_op, selection_flags, num_items);
+}
+
+template <typename T, typename ST, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void // ANDs each in-range flag with (item < compare)
+BlockPredAndLT(T (&items)[ITEMS_PER_THREAD], T compare, ST (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+	BlockPredAnd<T, ST, LessThan<T>, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    items, LessThan<T>(compare), selection_flags, num_items);
+}
+
+template <typename T, typename ST, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void // overwrites each in-range flag with (item > compare)
+BlockPredGT(T (&items)[ITEMS_PER_THREAD], T compare, ST (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+	BlockPred<T, ST, GreaterThan<T>, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    items, GreaterThan<T>(compare), selection_flags, num_items);
+}
+
+template <typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void // ANDs each in-range flag with (item > compare)
+BlockPredAndGT(T (&items)[ITEMS_PER_THREAD], T compare, int (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+	GreaterThan<T> select_op(compare); // BlockPredAnd is <T, ST, SelectOp, ...>: ST (int here) must be explicit
+	BlockPredAnd<T, int, GreaterThan<T>, BLOCK_THREADS, ITEMS_PER_THREAD>(items, select_op, selection_flags, num_items);
+}
+
+template <typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void // overwrites each in-range flag with (item <= compare)
+BlockPredLTE(T (&items)[ITEMS_PER_THREAD], T compare, int (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+	LessThanEq<T> select_op(compare); // BlockPred is <T, ST, SelectOp, ...>: ST (int here) must be explicit
+	BlockPred<T, int, LessThanEq<T>, BLOCK_THREADS, ITEMS_PER_THREAD>(items, select_op, selection_flags, num_items);
+}
+
+template <typename T, typename ST, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void // ANDs each in-range flag with (item <= compare)
+BlockPredAndLTE(T (&items)[ITEMS_PER_THREAD], T compare, ST (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+	BlockPredAnd<T, ST, LessThanEq<T>, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    items, LessThanEq<T>(compare), selection_flags, num_items);
+}
+
+template <typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void // overwrites each in-range flag with (item >= compare)
+BlockPredGTE(T (&items)[ITEMS_PER_THREAD], T compare, int (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+	GreaterThanEq<T> select_op(compare); // BlockPred is <T, ST, SelectOp, ...>: ST (int here) must be explicit
+	BlockPred<T, int, GreaterThanEq<T>, BLOCK_THREADS, ITEMS_PER_THREAD>(items, select_op, selection_flags, num_items);
+}
+
+// ANDs each in-range flag with (item >= compare) by handing a GreaterThanEq<T> to BlockPredAnd.
+template <typename T, typename ST, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void
+BlockPredAndGTE(T (&items)[ITEMS_PER_THREAD], T compare, ST (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+	BlockPredAnd<T, ST, GreaterThanEq<T>, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    items, GreaterThanEq<T>(compare), selection_flags, num_items);
+}
+
+template <typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void // overwrites each in-range flag with (item == compare)
+BlockPredEQ(T (&items)[ITEMS_PER_THREAD], T compare, int (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+	Eq<T> select_op(compare); // BlockPred is <T, ST, SelectOp, ...>: ST (int here) must be explicit
+	BlockPred<T, int, Eq<T>, BLOCK_THREADS, ITEMS_PER_THREAD>(items, select_op, selection_flags, num_items);
+}
+
+template <typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void // ANDs each in-range flag with (item == compare); flags are typed T here
+BlockPredAndEQ(T (&items)[ITEMS_PER_THREAD], T compare, T (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+	Eq<T> select_op(compare); // BlockPredAnd is <T, ST, SelectOp, ...>: ST is T for this signature
+	BlockPredAnd<T, T, Eq<T>, BLOCK_THREADS, ITEMS_PER_THREAD>(items, select_op, selection_flags, num_items);
+}
+
+template <typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void // ORs each in-range flag with (item == compare)
+BlockPredOrEQ(T (&items)[ITEMS_PER_THREAD], T compare, int (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+	BlockPredOr<T, Eq<T>, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    items, Eq<T>(compare), selection_flags, num_items);
+}
+
+/* SIMD */
+
+// Replicates a 16-bit value into both halves of a 32-bit word:
+// result = (a_compare << 16) | a_compare.
+constexpr uint32_t make_simd_const(uint16_t a_compare) {
+	const uint32_t half = a_compare;
+	// Upper and lower 16 bits carry the same pattern.
+	return (half << 16) | half;
+}
+
+/*
+ * GreaterThan_int_16_2
+ */
+
+// Packed 16-bit "greater than" predicate. operator() runs __vcmpgts2(a, compare) and masks the
+// result with 0x00010001, so each 16-bit half of the return value is either 0 or 1.
+struct GreaterThan_int_16_2 {
+	uint32_t compare; // packed right-hand operand handed to __vcmpgts2
+
+	__device__ __forceinline__ GreaterThan_int_16_2(uint32_t a_compare) : compare(a_compare) {}
+
+	__device__ __forceinline__ uint32_t operator()(const uint32_t& a) const {
+		// Bit 0 of each 16-bit half; identical to 0b00000000000000010000000000000001.
+		const uint32_t one_bit_per_half = 0x00010001u;
+		const uint32_t lane_mask        = __vcmpgts2(a, compare);
+		// todo (left by the original author on this masking step)
+		return lane_mask & one_bit_per_half;
+	}
+};
+
+// Packed 16-bit "less than" predicate built on __vcmplts2(a, compare).
+// Unlike GreaterThan_int_16_2, the intrinsic's result is returned without any masking.
+struct LessThan_int_16_2 {
+	uint32_t compare; // packed right-hand operand handed to __vcmplts2
+
+	__device__ __forceinline__ LessThan_int_16_2(uint32_t a_compare) : compare(a_compare) {}
+
+	__device__ __forceinline__ uint32_t operator()(const uint32_t& a) const {
+		return __vcmplts2(a, compare); // raw comparison mask
+	}
+};
+
+// Bounded packed predicate: visits the first ITEMS_PER_THREAD / 2 words and overwrites flag word
+// `w` with select_op(items[w]) when tid + w * BLOCK_THREADS < num_items / 2 (integer division).
+template <typename SelectOp, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredDirect_int_16_2(
+    int tid, uint32_t (&items)[ITEMS_PER_THREAD], SelectOp select_op,
+    uint32_t (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+#pragma unroll
+	for (int w = 0; w < ITEMS_PER_THREAD / 2; w++) {
+		if (tid + (w * BLOCK_THREADS) < num_items / 2) { selection_flags[w] = select_op(items[w]); }
+	}
+}
+
+// Unbounded packed predicate: overwrites the first ITEMS_PER_THREAD / 2 flag words with
+// select_op(items[w]); the remaining words are not touched. `tid` is unused here.
+template <typename SelectOp, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredDirect_int_16_2(
+    int tid, uint32_t (&items)[ITEMS_PER_THREAD], SelectOp select_op, uint32_t (&selection_flags)[ITEMS_PER_THREAD]) {
+#pragma unroll
+	for (int w = 0; w < ITEMS_PER_THREAD / 2; w++) {
+		selection_flags[w] = select_op(items[w]);
+	}
+}
+
+// tid-less overload: forwards to the tid-taking overloads with threadIdx.x. A full tile
+// (num_items == BLOCK_THREADS * ITEMS_PER_THREAD) goes unbounded, anything else bounds-checked.
+template <typename SelectOp, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredDirect_int_16_2(
+    uint32_t (&items)[ITEMS_PER_THREAD], SelectOp select_op,
+    uint32_t (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+	if (num_items != (BLOCK_THREADS * ITEMS_PER_THREAD)) {
+		BlockPredDirect_int_16_2<SelectOp, BLOCK_THREADS, ITEMS_PER_THREAD>(
+		    threadIdx.x, items, select_op, selection_flags, num_items);
+		return;
+	}
+	BlockPredDirect_int_16_2<SelectOp, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    threadIdx.x, items, select_op, selection_flags);
+}
+
+// Packed "greater than" over a tile: builds a GreaterThan_int_16_2 from `compare` and hands it
+// to the tid-less BlockPredDirect_int_16_2 overload.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredGT_int_16_2(
+    uint32_t (&items)[ITEMS_PER_THREAD], uint32_t compare,
+    uint32_t (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+	BlockPredDirect_int_16_2<GreaterThan_int_16_2, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    items, GreaterThan_int_16_2(compare), selection_flags, num_items);
+}
+
+// Packed "less than" over a tile: builds a LessThan_int_16_2 from `compare` and hands it
+// to the tid-less BlockPredDirect_int_16_2 overload.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredLT_int_16_2(
+    uint32_t (&items)[ITEMS_PER_THREAD], uint32_t compare,
+    uint32_t (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+	BlockPredDirect_int_16_2<LessThan_int_16_2, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    items, LessThan_int_16_2(compare), selection_flags, num_items);
+}
+
+// Packed-predicate dispatcher: calls BlockPredDirect_int_16_2 with threadIdx.x as tid, unbounded
+// when num_items == BLOCK_THREADS * ITEMS_PER_THREAD and bounds-checked otherwise.
+template <typename SelectOp, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPred_int_16_2(
+    uint32_t (&items)[ITEMS_PER_THREAD], SelectOp select_op,
+    uint32_t (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+	if (num_items != (BLOCK_THREADS * ITEMS_PER_THREAD)) {
+		BlockPredDirect_int_16_2<SelectOp, BLOCK_THREADS, ITEMS_PER_THREAD>(
+		    threadIdx.x, items, select_op, selection_flags, num_items);
+		return;
+	}
+	BlockPredDirect_int_16_2<SelectOp, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    threadIdx.x, items, select_op, selection_flags);
+}
+
+// Packed "less than" predicate: operator() returns __vcmplts2(a, compare) unmodified.
+// NOTE(review): this uses the signed intrinsic while LessThanEqX/GreaterThanEqX use unsigned ones.
+struct LessThanX {
+	uint32_t compare; // packed right-hand operand handed to __vcmplts2
+
+	__device__ __forceinline__ LessThanX(uint32_t a_compare) : compare(a_compare) {}
+
+	__device__ __forceinline__ uint32_t operator()(const uint32_t& a) const {
+		return __vcmplts2(a, compare); // raw comparison mask
+	}
+};
+
+// Packed 16-bit "less than or equal": unsigned per-halfword compare against `compare` via __vcmpleu2.
+struct LessThanEqX {
+	uint32_t compare; // right-hand operand, two 16-bit values in one word
+	__device__ __forceinline__ LessThanEqX(uint32_t rhs)
+	    : compare(rhs) {}
+
+	// Returns the per-halfword mask computed by __vcmpleu2(packed, compare).
+	__device__ __forceinline__ uint32_t operator()(const uint32_t& packed) const {
+		return __vcmpleu2(packed, compare);
+	}
+};
+
+// Packed 16-bit "greater than or equal": unsigned per-halfword compare against `compare` via __vcmpgeu2.
+struct GreaterThanEqX {
+	uint32_t compare; // right-hand operand, two 16-bit values in one word
+	__device__ __forceinline__ GreaterThanEqX(uint32_t rhs)
+	    : compare(rhs) {}
+
+	// Returns the per-halfword mask computed by __vcmpgeu2(packed, compare).
+	__device__ __forceinline__ uint32_t operator()(const uint32_t& packed) const {
+		return __vcmpgeu2(packed, compare);
+	}
+};
+
+// ANDs select_op(items[i]) into selection_flags[i] for the first ITEMS_PER_THREAD / 2 slots, unguarded.
+// `tid` is not used by this overload.
+template <typename SelectOp, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredAndDirectX(int      tid,
+                                                    uint32_t (&items)[ITEMS_PER_THREAD],
+                                                    SelectOp select_op,
+                                                    uint32_t (&selection_flags)[ITEMS_PER_THREAD]) {
+#pragma unroll
+	for (int slot = 0; slot < ITEMS_PER_THREAD / 2; ++slot) {
+		selection_flags[slot] &= select_op(items[slot]);
+	}
+}
+
+// Guarded variant: slot i is AND-updated only while tid + i * BLOCK_THREADS < num_items / 2 + 1.
+// NOTE(review): the "/ 2 + 1" bound presumably reflects two values per uint32_t - confirm with callers.
+template <typename SelectOp, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredAndDirectX(int      tid,
+                                                    uint32_t (&items)[ITEMS_PER_THREAD],
+                                                    SelectOp select_op,
+                                                    uint32_t (&selection_flags)[ITEMS_PER_THREAD],
+                                                    int      num_items) {
+#pragma unroll
+	for (int slot = 0; slot < ITEMS_PER_THREAD / 2; ++slot) {
+		if (tid + (slot * BLOCK_THREADS) >= num_items / 2 + 1) { continue; }
+		selection_flags[slot] &= select_op(items[slot]);
+	}
+}
+
+// Block-level AND predicate: a full tile (num_items == BLOCK_THREADS * ITEMS_PER_THREAD) uses the
+// unguarded BlockPredAndDirectX overload, any other size the overload that checks num_items.
+template <typename SelectOp, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredAndX(uint32_t (&items)[ITEMS_PER_THREAD], SelectOp select_op,
+                                              uint32_t (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+	const int tid = threadIdx.x;
+	if (num_items != (BLOCK_THREADS * ITEMS_PER_THREAD)) {
+		BlockPredAndDirectX<SelectOp, BLOCK_THREADS, ITEMS_PER_THREAD>(
+		    tid, items, select_op, selection_flags, num_items);
+		return;
+	}
+	BlockPredAndDirectX<SelectOp, BLOCK_THREADS, ITEMS_PER_THREAD>(tid, items, select_op, selection_flags);
+}
+
+// AND-combines a LessThanX(compare) predicate into selection_flags by delegating to BlockPredAndX,
+// which chooses between the unguarded and the num_items-guarded BlockPredAndDirectX overloads.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredAndLTX(uint32_t (&items)[ITEMS_PER_THREAD], uint32_t compare,
+                                                uint32_t (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+	const LessThanX less_than(compare);
+	BlockPredAndX<LessThanX, BLOCK_THREADS, ITEMS_PER_THREAD>(items, less_than, selection_flags, num_items);
+}
+
+// AND-combines a LessThanEqX(compare) predicate into selection_flags by delegating to BlockPredAndX,
+// which chooses between the unguarded and the num_items-guarded BlockPredAndDirectX overloads.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredAndLTEX(uint32_t (&items)[ITEMS_PER_THREAD], uint32_t compare,
+                                                 uint32_t (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+	const LessThanEqX less_equal(compare);
+	BlockPredAndX<LessThanEqX, BLOCK_THREADS, ITEMS_PER_THREAD>(items, less_equal, selection_flags, num_items);
+}
+
+// AND-combines a GreaterThanEqX(compare) predicate into selection_flags by delegating to BlockPredAndX,
+// which chooses between the unguarded and the num_items-guarded BlockPredAndDirectX overloads.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredAndGTEX(uint32_t (&items)[ITEMS_PER_THREAD], uint32_t compare,
+                                                 uint32_t (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+	const GreaterThanEqX greater_equal(compare);
+	BlockPredAndX<GreaterThanEqX, BLOCK_THREADS, ITEMS_PER_THREAD>(items, greater_equal, selection_flags, num_items);
+}
diff --git a/fastlanes/src/include/crystal-opt/reduce.cuh b/fastlanes/src/include/crystal-opt/reduce.cuh
new file mode 100644
index 0000000..1f08282
--- /dev/null
+++ b/fastlanes/src/include/crystal-opt/reduce.cuh
@@ -0,0 +1,75 @@
+// MIT License
+
+// Copyright (c) 2023 Jiashen Cao
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#pragma once
+
+// Block-wide sum of one value per thread, using warp shuffles plus a `shared` scratch buffer
+// (one T per warp). The returned value is the block total on thread 0; other threads hold intermediates.
+// NOTE(review): the full 0xffffffff mask and blockDim.x / 32 appear to assume blockDim.x is a multiple of 32.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ T BlockSum(
+    T  item,
+    T* shared
+    ) {
+  const int warp_size = 32;
+  const int lane = threadIdx.x % warp_size;
+  const int wid = threadIdx.x / warp_size;
+
+  // Barrier before touching `shared` (presumably so the buffer can be reused across consecutive calls).
+  __syncthreads();
+
+  // Stage 1: reduce within each warp; lane 0 ends up with the warp total and publishes it.
+  T val = item;
+  for (int offset = warp_size / 2; offset > 0; offset >>= 1) {
+    val += __shfl_down_sync(0xffffffff, val, offset);
+  }
+  if (lane == 0) {
+    shared[wid] = val;
+  }
+
+  __syncthreads();
+
+  // Stage 2: the first blockDim.x / warp_size threads pick up one warp total each, the rest use 0.
+  val = (threadIdx.x < blockDim.x / warp_size) ? shared[lane] : 0;
+  if (wid == 0) {
+    for (int offset = warp_size / 2; offset > 0; offset >>= 1) {
+      val += __shfl_down_sync(0xffffffff, val, offset);
+    }
+  }
+
+  return val;
+}
+
+// Block-wide sum of ITEMS_PER_THREAD values per thread: sums this thread's items, then calls the scalar BlockSum.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ T BlockSum(
+    T (&items)[ITEMS_PER_THREAD],
+    T* shared
+    ) {
+  T thread_sum = 0;
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+    thread_sum += items[ITEM];
+  }
+  // BLOCK_THREADS / ITEMS_PER_THREAD cannot be deduced from a scalar argument, so pass them explicitly.
+  return BlockSum<T, BLOCK_THREADS, ITEMS_PER_THREAD>(thread_sum, shared);
+}
diff --git a/fastlanes/src/include/crystal-opt/store.cuh b/fastlanes/src/include/crystal-opt/store.cuh
new file mode 100644
index 0000000..a5de94f
--- /dev/null
+++ b/fastlanes/src/include/crystal-opt/store.cuh
@@ -0,0 +1,120 @@
+// MIT License
+
+// Copyright (c) 2023 Jiashen Cao
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#pragma once
+
+// Unguarded striped store: thread `tid` writes items[i] to block_itr[tid + i * BLOCK_THREADS] for every i.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockStoreDirect(
+    int tid,
+    T* block_itr,
+    T  (&items)[ITEMS_PER_THREAD]
+    ) {
+  T* dst = block_itr + tid;
+  #pragma unroll
+  for (int i = 0; i < ITEMS_PER_THREAD; ++i) {
+    dst[i * BLOCK_THREADS] = items[i];
+  }
+}
+
+// Bounds-checked striped store: slot i is written only if tid + i * BLOCK_THREADS < num_items.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockStoreDirect(
+    int tid,
+    T* block_itr,
+    T  (&items)[ITEMS_PER_THREAD],
+    int num_items
+    ) {
+  T* dst = block_itr + tid;
+  #pragma unroll
+  for (int i = 0; i < ITEMS_PER_THREAD; ++i) {
+    const int offset = i * BLOCK_THREADS;
+    if (tid + offset >= num_items) { continue; }
+    dst[offset] = items[i];
+  }
+}
+
+// Stores this thread's items into the tile starting at `out`: the unguarded BlockStoreDirect is used when
+// num_items == BLOCK_THREADS * ITEMS_PER_THREAD, the bounds-checked overload otherwise.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockStore(
+    T* out,
+    T  (&items)[ITEMS_PER_THREAD],
+    int num_items
+    ) {
+  if (num_items == (BLOCK_THREADS * ITEMS_PER_THREAD)) {
+    BlockStoreDirect<T, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, out, items);
+    return;
+  }
+  BlockStoreDirect<T, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, out, items, num_items);
+}
+
+#if 0
+// Disabled, never compiled: these "store" overloads copy memory into `items`, and two parameter lists lack a comma.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockStoreDirect(
+    int tid,
+    T* block_itr,
+    T  (&items)[ITEMS_PER_THREAD]
+    ) {
+  T* thread_itr = block_itr + tid;
+
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+    items[ITEM] = thread_itr[ITEM * BLOCK_THREADS];
+  }
+}
+
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockStoreDirect(
+    int tid,
+    T* block_itr,
+    T  (&items)[ITEMS_PER_THREAD]
+    int num_items
+    ) {
+  T* thread_itr = block_itr + tid;
+
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+    if (tid + (ITEM * BLOCK_THREADS) < num_items) {
+      items[ITEM] = thread_itr[ITEM * BLOCK_THREADS];
+    }
+  }
+}
+
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockStore(
+    T* inp,
+    T  (&items)[ITEMS_PER_THREAD]
+    int num_items
+    ) {
+  T* block_itr = inp + blockIdx.x * blockDim.x;
+
+  if ((BLOCK_THREADS * ITEMS_PER_THREAD) == num_items) {
+    BlockStoreDirect(threadIdx.x, block_itr, items);
+  } else {
+    BlockStoreDirect(threadIdx.x, block_itr, items, num_items);
+  }
+}
+
+#endif
+
diff --git a/fastlanes/src/include/crystal-opt/term.cuh b/fastlanes/src/include/crystal-opt/term.cuh
new file mode 100644
index 0000000..1e3a5fc
--- /dev/null
+++ b/fastlanes/src/include/crystal-opt/term.cuh
@@ -0,0 +1,33 @@
+// MIT License
+
+// Copyright (c) 2023 Jiashen Cao
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#pragma once
+
+// Returns true when the sum of this thread's selection flags is zero (T and BLOCK_THREADS are unused).
+template <typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ bool IsTerm(int (&selection_flags)[ITEMS_PER_THREAD]) {
+    int flag_sum = 0;
+    for (int i = 0; i < ITEMS_PER_THREAD; ++i) {
+        flag_sum += selection_flags[i];
+    }
+    return flag_sum == 0;
+}
diff --git a/fastlanes/src/include/crystal/crystal.cuh b/fastlanes/src/include/crystal/crystal.cuh
new file mode 100644
index 0000000..111628a
--- /dev/null
+++ b/fastlanes/src/include/crystal/crystal.cuh
@@ -0,0 +1,9 @@
+#pragma once
+
+// Block-wide functions
+#include "join.cuh"
+#include "load.cuh"
+#include "pred.cuh"
+#include "reduce.cuh"
+#include "store.cuh"
+#include "term.cuh"
diff --git a/fastlanes/src/include/crystal/join.cuh b/fastlanes/src/include/crystal/join.cuh
new file mode 100644
index 0000000..b6110fa
--- /dev/null
+++ b/fastlanes/src/include/crystal/join.cuh
@@ -0,0 +1,275 @@
+#pragma once
+
+#include <cstdint>
+#include <cuda_runtime.h>
+
+#define HASH(X, Y, Z) (((X) - (Z)) % (Y)) // slot index: (key X - minimum Z) modulo table length Y
+
+// Probe of a key-only hash table `ht` (one K per slot), without bounds checks: each item whose flag is set
+// looks up slot HASH(item, ht_len, keys_min); its flag becomes 1 if that slot is non-zero and 0 otherwise.
+// `tid` is not used by this overload.
+template <typename K, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockProbeDirectAndPHT_1(
+    int tid, K (&items)[ITEMS_PER_THREAD], int (&selection_flags)[ITEMS_PER_THREAD], K* ht, int ht_len, K keys_min) {
+#pragma unroll
+	for (int i = 0; i < ITEMS_PER_THREAD; ++i) {
+		if (!selection_flags[i]) { continue; }
+
+		const int bucket = HASH(items[i], ht_len, keys_min);
+		const K   slot   = ht[bucket];
+
+		// Any non-zero slot counts as a hit; the stored key itself is not compared.
+		selection_flags[i] = (slot != 0) ? 1 : 0;
+	}
+}
+
+// Bounds-checked probe of a key-only hash table `ht` (one K per slot). Only slots whose tile offset
+// tid + i * BLOCK_THREADS is below num_items are visited; among those, an item whose flag is set gets
+// flag 1 if ht[HASH(item, ht_len, keys_min)] is non-zero and flag 0 otherwise.
+template <typename K, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockProbeDirectAndPHT_1(int tid,
+                                                         K   (&items)[ITEMS_PER_THREAD],
+                                                         int (&selection_flags)[ITEMS_PER_THREAD],
+                                                         K*  ht,
+                                                         int ht_len,
+                                                         K   keys_min,
+                                                         int num_items) {
+#pragma unroll
+	for (int i = 0; i < ITEMS_PER_THREAD; ++i) {
+		// Skip slots beyond the tile and items that were already filtered out.
+		if (tid + (i * BLOCK_THREADS) >= num_items) { continue; }
+		if (!selection_flags[i]) { continue; }
+
+		const int bucket = HASH(items[i], ht_len, keys_min);
+		const K   slot   = ht[bucket];
+
+		// Any non-zero slot counts as a hit; the stored key itself is not compared.
+		selection_flags[i] = (slot != 0) ? 1 : 0;
+	}
+}
+
+// Key-only hash-table probe for one tile. A full tile (num_items == BLOCK_THREADS * ITEMS_PER_THREAD)
+// takes the unguarded BlockProbeDirectAndPHT_1 overload, anything else the bounds-checked one.
+template <typename K, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockProbeAndPHT_1(K   (&items)[ITEMS_PER_THREAD],
+                                                   int (&selection_flags)[ITEMS_PER_THREAD],
+                                                   K* ht, int ht_len, K keys_min, int num_items) {
+	const int tid = threadIdx.x;
+	if (num_items != (BLOCK_THREADS * ITEMS_PER_THREAD)) {
+		BlockProbeDirectAndPHT_1<K, BLOCK_THREADS, ITEMS_PER_THREAD>(
+		    tid, items, selection_flags, ht, ht_len, keys_min, num_items);
+		return;
+	}
+
+	BlockProbeDirectAndPHT_1<K, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    tid, items, selection_flags, ht, ht_len, keys_min);
+}
+
+template <typename K, int BLOCK_THREADS, int ITEMS_PER_THREAD> // overload that passes 0 as keys_min
+__device__ __forceinline__ void BlockProbeAndPHT_1(
+    K (&items)[ITEMS_PER_THREAD], int (&selection_flags)[ITEMS_PER_THREAD], K* ht, int ht_len, int num_items) {
+	BlockProbeAndPHT_1<K, BLOCK_THREADS, ITEMS_PER_THREAD>(items, selection_flags, ht, ht_len, 0, num_items);
+}
+
+// Probe of a key/value hash table stored as pairs: ht[2 * h] and ht[2 * h + 1] are read together as one
+// 64-bit word, with h = HASH(key, ht_len, keys_min). For every selected item, a non-zero word yields
+// res[i] = word >> 32, while a zero word clears the item's flag. No bounds checks; `tid` is unused.
+// NOTE(review): the 64-bit read presumably relies on sizeof(K) == 4 and an 8-byte aligned `ht` - confirm.
+template <typename K, typename V, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockProbeDirectAndPHT_2(
+    int tid, K (&keys)[ITEMS_PER_THREAD], V (&res)[ITEMS_PER_THREAD], int (&selection_flags)[ITEMS_PER_THREAD],
+    K* ht, int ht_len, K keys_min) {
+#pragma unroll
+	for (int i = 0; i < ITEMS_PER_THREAD; ++i) {
+		if (!selection_flags[i]) { continue; }
+
+		const int      bucket = HASH(keys[i], ht_len, keys_min);
+		const uint64_t pair   = *reinterpret_cast<uint64_t*>(&ht[bucket << 1]);
+
+		if (pair == 0) {
+			selection_flags[i] = 0;
+		} else {
+			res[i] = (pair >> 32);
+		}
+	}
+}
+
+// Bounds-checked probe of a key/value hash table stored as pairs: ht[2 * h] and ht[2 * h + 1] are read
+// together as one 64-bit word, with h = HASH(item, ht_len, keys_min). Only slots with
+// tid + i * BLOCK_THREADS < num_items are visited; a non-zero word yields res[i] = word >> 32, a zero word
+// clears the flag. NOTE(review): presumably relies on sizeof(K) == 4 and an 8-byte aligned `ht` - confirm.
+template <typename K, typename V, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockProbeDirectAndPHT_2(
+    int tid, K (&items)[ITEMS_PER_THREAD], V (&res)[ITEMS_PER_THREAD], int (&selection_flags)[ITEMS_PER_THREAD],
+    K* ht, int ht_len, K keys_min,
+    int num_items) {
+#pragma unroll
+	for (int i = 0; i < ITEMS_PER_THREAD; ++i) {
+		// Skip slots beyond the tile and items that were already filtered out.
+		if (tid + (i * BLOCK_THREADS) >= num_items) { continue; }
+		if (!selection_flags[i]) { continue; }
+
+		const int      bucket = HASH(items[i], ht_len, keys_min);
+		const uint64_t pair   = *reinterpret_cast<uint64_t*>(&ht[bucket << 1]);
+
+		if (pair == 0) {
+			selection_flags[i] = 0;
+		} else {
+			res[i] = (pair >> 32);
+		}
+	}
+}
+
+// Key/value hash-table probe for one tile. A full tile (num_items == BLOCK_THREADS * ITEMS_PER_THREAD)
+// takes the unguarded BlockProbeDirectAndPHT_2 overload, anything else the bounds-checked one.
+template <typename K, typename V, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockProbeAndPHT_2(
+    K (&keys)[ITEMS_PER_THREAD], V (&res)[ITEMS_PER_THREAD], int (&selection_flags)[ITEMS_PER_THREAD],
+    K* ht, int ht_len, K keys_min, int num_items) {
+	const int tid = threadIdx.x;
+
+	if (num_items != (BLOCK_THREADS * ITEMS_PER_THREAD)) {
+		BlockProbeDirectAndPHT_2<K, V, BLOCK_THREADS, ITEMS_PER_THREAD>(
+		    tid, keys, res, selection_flags, ht, ht_len, keys_min, num_items);
+		return;
+	}
+
+	BlockProbeDirectAndPHT_2<K, V, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    tid, keys, res, selection_flags, ht, ht_len, keys_min);
+}
+
+// Overload of BlockProbeAndPHT_2 that passes 0 as keys_min.
+template <typename K, typename V, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockProbeAndPHT_2(
+    K (&keys)[ITEMS_PER_THREAD], V (&res)[ITEMS_PER_THREAD], int (&selection_flags)[ITEMS_PER_THREAD],
+    K* ht, int ht_len, int num_items) {
+	const K zero_min = 0;
+	BlockProbeAndPHT_2<K, V, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    keys, res, selection_flags, ht, ht_len, zero_min, num_items);
+}
+
+// Inserts every selected key into the key-only hash table `ht`, without bounds checks: atomicCAS stores
+// the key in slot HASH(key, ht_len, keys_min) only if that slot still holds 0. `tid` is unused.
+template <typename K, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockBuildDirectSelectivePHT_1(
+    int tid, K (&keys)[ITEMS_PER_THREAD], int (&selection_flags)[ITEMS_PER_THREAD], K* ht, int ht_len, K keys_min) {
+#pragma unroll
+	for (int i = 0; i < ITEMS_PER_THREAD; ++i) {
+		if (!selection_flags[i]) { continue; }
+		const int bucket = HASH(keys[i], ht_len, keys_min);
+		atomicCAS(&ht[bucket], 0, keys[i]);
+	}
+}
+
+// Bounds-checked insert into the key-only hash table `ht`: for slots with tid + i * BLOCK_THREADS < num_items
+// and a set flag, atomicCAS stores the key in slot HASH(key, ht_len, keys_min) if that slot still holds 0.
+template <typename K, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockBuildDirectSelectivePHT_1(int tid,
+                                                               K   (&items)[ITEMS_PER_THREAD],
+                                                               int (&selection_flags)[ITEMS_PER_THREAD],
+                                                               K*  ht,
+                                                               int ht_len,
+                                                               K   keys_min,
+                                                               int num_items) {
+#pragma unroll
+	for (int i = 0; i < ITEMS_PER_THREAD; ++i) {
+		if (tid + (i * BLOCK_THREADS) >= num_items) { continue; }
+		if (!selection_flags[i]) { continue; }
+
+		const int bucket = HASH(items[i], ht_len, keys_min);
+		atomicCAS(&ht[bucket], 0, items[i]);
+	}
+}
+
+// Key-only hash-table build for one tile. A full tile (num_items == BLOCK_THREADS * ITEMS_PER_THREAD)
+// takes the unguarded BlockBuildDirectSelectivePHT_1 overload, anything else the bounds-checked one.
+template <typename K, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockBuildSelectivePHT_1(K   (&keys)[ITEMS_PER_THREAD],
+                                                         int (&selection_flags)[ITEMS_PER_THREAD],
+                                                         K* ht, int ht_len, K keys_min, int num_items) {
+	const int tid = threadIdx.x;
+	if (num_items != (BLOCK_THREADS * ITEMS_PER_THREAD)) {
+		BlockBuildDirectSelectivePHT_1<K, BLOCK_THREADS, ITEMS_PER_THREAD>(
+		    tid, keys, selection_flags, ht, ht_len, keys_min, num_items);
+		return;
+	}
+
+	BlockBuildDirectSelectivePHT_1<K, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    tid, keys, selection_flags, ht, ht_len, keys_min);
+}
+
+template <typename K, int BLOCK_THREADS, int ITEMS_PER_THREAD> // overload that passes 0 as keys_min
+__device__ __forceinline__ void BlockBuildSelectivePHT_1(
+    K (&keys)[ITEMS_PER_THREAD], int (&selection_flags)[ITEMS_PER_THREAD], K* ht, int ht_len, int num_items) {
+	BlockBuildSelectivePHT_1<K, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, selection_flags, ht, ht_len, 0, num_items);
+}
+
+// Inserts every selected key/value pair into a hash table laid out as pairs, without bounds checks.
+// With h = HASH(key, ht_len, keys_min), atomicCAS stores the key in ht[2 * h] if that entry still holds 0,
+// and res[i] is then written to ht[2 * h + 1]. `tid` is not used by this overload.
+template <typename K, typename V, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockBuildDirectSelectivePHT_2(
+    int tid, K (&keys)[ITEMS_PER_THREAD], V (&res)[ITEMS_PER_THREAD],
+    int (&selection_flags)[ITEMS_PER_THREAD],
+    K* ht, int ht_len, K keys_min) {
+#pragma unroll
+	for (int i = 0; i < ITEMS_PER_THREAD; ++i) {
+		if (!selection_flags[i]) { continue; }
+
+		const int bucket = HASH(keys[i], ht_len, keys_min);
+		atomicCAS(&ht[bucket << 1], 0, keys[i]);
+		// The value is stored unconditionally, even when the CAS did not install this key.
+		ht[(bucket << 1) + 1] = res[i];
+	}
+}
+
+// Bounds-checked key/value insert: for slots with tid + i * BLOCK_THREADS < num_items and a set flag,
+// atomicCAS stores the key in ht[2 * h] (h = HASH(key, ht_len, keys_min)) and res[i] goes to ht[2 * h + 1].
+template <typename K, typename V, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockBuildDirectSelectivePHT_2(int tid,
+                                                               K   (&keys)[ITEMS_PER_THREAD],
+                                                               V   (&res)[ITEMS_PER_THREAD],
+                                                               int (&selection_flags)[ITEMS_PER_THREAD],
+                                                               K*  ht,
+                                                               int ht_len,
+                                                               K   keys_min,
+                                                               int num_items) {
+#pragma unroll
+	for (int i = 0; i < ITEMS_PER_THREAD; ++i) {
+		if (tid + (i * BLOCK_THREADS) >= num_items) { continue; }
+		if (!selection_flags[i]) { continue; }
+		const int bucket = HASH(keys[i], ht_len, keys_min);
+		atomicCAS(&ht[bucket << 1], 0, keys[i]);
+		// The value is stored unconditionally, even when the CAS did not install this key.
+		ht[(bucket << 1) + 1] = res[i];
+	}
+}
+
+// Key/value hash-table build for one tile. A full tile (num_items == BLOCK_THREADS * ITEMS_PER_THREAD)
+// takes the unguarded BlockBuildDirectSelectivePHT_2 overload, anything else the bounds-checked one.
+template <typename K, typename V, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockBuildSelectivePHT_2(
+    K (&keys)[ITEMS_PER_THREAD], V (&res)[ITEMS_PER_THREAD], int (&selection_flags)[ITEMS_PER_THREAD],
+    K* ht, int ht_len, K keys_min, int num_items) {
+	const int tid = threadIdx.x;
+
+	if (num_items != (BLOCK_THREADS * ITEMS_PER_THREAD)) {
+		BlockBuildDirectSelectivePHT_2<K, V, BLOCK_THREADS, ITEMS_PER_THREAD>(
+		    tid, keys, res, selection_flags, ht, ht_len, keys_min, num_items);
+		return;
+	}
+
+	BlockBuildDirectSelectivePHT_2<K, V, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    tid, keys, res, selection_flags, ht, ht_len, keys_min);
+}
+
+// Overload of BlockBuildSelectivePHT_2 that passes 0 as keys_min and otherwise forwards its
+// arguments unchanged to the keys_min-taking overload.
+template <typename K, typename V, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockBuildSelectivePHT_2(
+    K (&keys)[ITEMS_PER_THREAD], V (&res)[ITEMS_PER_THREAD], int (&selection_flags)[ITEMS_PER_THREAD],
+    K* ht, int ht_len, int num_items) {
+	const K zero_min = 0;
+	BlockBuildSelectivePHT_2<K, V, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    keys, res, selection_flags, ht, ht_len, zero_min, num_items);
+}
diff --git a/fastlanes/src/include/crystal/load.cuh b/fastlanes/src/include/crystal/load.cuh
new file mode 100644
index 0000000..eb4c0e2
--- /dev/null
+++ b/fastlanes/src/include/crystal/load.cuh
@@ -0,0 +1,210 @@
+// MIT License
+
+// Copyright (c) 2023 Jiashen Cao
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#pragma once
+
+#include <cuda_runtime.h>
+
+// Predicated striped load without bounds checks: slot i is read from block_itr[tid + i * BLOCK_THREADS]
+// only if selection_flags[i] is non-zero; other slots of `items` are left untouched.
+template <typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void
+BlockPredLoadDirect(const unsigned int tid, T *block_itr,
+                    T (&items)[ITEMS_PER_THREAD],
+                    int (&selection_flags)[ITEMS_PER_THREAD]) {
+  T *src = block_itr + tid;
+
+#pragma unroll
+  for (int i = 0; i < ITEMS_PER_THREAD; ++i) {
+    if (selection_flags[i]) { items[i] = src[i * BLOCK_THREADS]; }
+  }
+}
+
+// Predicated, bounds-checked striped load: slot i is read from block_itr[tid + i * BLOCK_THREADS] only
+// if selection_flags[i] is non-zero and tid + i * BLOCK_THREADS < num_items; other slots of `items`
+// are left untouched.
+template <typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void
+BlockPredLoadDirect(const unsigned int tid, T *block_itr,
+                    T (&items)[ITEMS_PER_THREAD], int num_items,
+                    int (&selection_flags)[ITEMS_PER_THREAD]) {
+  T *src = block_itr + tid;
+
+#pragma unroll
+  for (int i = 0; i < ITEMS_PER_THREAD; ++i) {
+    const bool wanted = selection_flags[i] && (tid + (i * BLOCK_THREADS) < num_items);
+    if (wanted) { items[i] = src[i * BLOCK_THREADS]; }
+  }
+}
+
+// Predicated tile load from `inp`: a full tile (num_items == BLOCK_THREADS * ITEMS_PER_THREAD) uses the
+// unguarded BlockPredLoadDirect overload, any other size the overload that also checks num_items.
+template <typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void
+BlockPredLoad(T *inp, T (&items)[ITEMS_PER_THREAD], int num_items,
+              int (&selection_flags)[ITEMS_PER_THREAD]) {
+  if (num_items != (BLOCK_THREADS * ITEMS_PER_THREAD)) {
+    BlockPredLoadDirect<T, BLOCK_THREADS, ITEMS_PER_THREAD>(
+        threadIdx.x, inp, items, num_items, selection_flags);
+    return;
+  }
+  BlockPredLoadDirect<T, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      threadIdx.x, inp, items, selection_flags);
+}
+
+// Unguarded striped load: thread `tid` reads items[i] from block_itr[tid + i * BLOCK_THREADS] for every i.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockLoadDirect(
+    const unsigned int tid,
+    T* block_itr,
+    T  (&items)[ITEMS_PER_THREAD]
+    ) {
+  T* src = block_itr + tid;
+  #pragma unroll
+  for (int i = 0; i < ITEMS_PER_THREAD; ++i) {
+    items[i] = src[i * BLOCK_THREADS];
+  }
+}
+
+// Bounds-checked striped load: slot i is read only if tid + i * BLOCK_THREADS < num_items.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockLoadDirect(
+    const unsigned int tid,
+    T* block_itr,
+    T  (&items)[ITEMS_PER_THREAD],
+    int num_items
+    ) {
+  T* src = block_itr + tid;
+  #pragma unroll
+  for (int i = 0; i < ITEMS_PER_THREAD; ++i) {
+    const int offset = i * BLOCK_THREADS;
+    if (tid + offset >= num_items) { continue; }
+    items[i] = src[offset];
+  }
+}
+
+// Loads this thread's items from the tile starting at `inp`: the unguarded BlockLoadDirect is used when
+// num_items == BLOCK_THREADS * ITEMS_PER_THREAD, the bounds-checked overload otherwise.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockLoad(
+    T* inp,
+    T  (&items)[ITEMS_PER_THREAD],
+    int num_items
+    ) {
+  if (num_items == (BLOCK_THREADS * ITEMS_PER_THREAD)) {
+    BlockLoadDirect<T, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, inp, items);
+    return;
+  }
+  BlockLoadDirect<T, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, inp, items, num_items);
+}
+
+#if 0
+// Disabled, never compiled: older BlockLoad* variants taking `int tid`; two parameter lists lack a comma.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockLoadDirect(
+    int tid,
+    T* block_itr,
+    T  (&items)[ITEMS_PER_THREAD]
+    ) {
+  T* thread_itr = block_itr + tid;
+
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+    items[ITEM] = thread_itr[ITEM * BLOCK_THREADS];
+  }
+}
+
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockLoadDirect(
+    int tid,
+    T* block_itr,
+    T  (&items)[ITEMS_PER_THREAD]
+    int num_items
+    ) {
+  T* thread_itr = block_itr + tid;
+
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+    if (tid + (ITEM * BLOCK_THREADS) < num_items) {
+      items[ITEM] = thread_itr[ITEM * BLOCK_THREADS];
+    }
+  }
+}
+
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockLoad(
+    T* inp,
+    T  (&items)[ITEMS_PER_THREAD]
+    int num_items
+    ) {
+  T* block_itr = inp + blockIdx.x * blockDim.x;
+
+  if ((BLOCK_THREADS * ITEMS_PER_THREAD) == num_items) {
+    BlockLoadDirect(threadIdx.x, block_itr, items);
+  } else {
+    BlockLoadDirect(threadIdx.x, block_itr, items, num_items);
+  }
+}
+
+#endif
+
+// Predicated striped load without bounds checks, for flags of arbitrary type ST: slot i is read from
+// block_itr[tid + i * BLOCK_THREADS] only if selection_flags[i] converts to true.
+template <typename T, typename ST, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredLoadDirect(
+    const unsigned int tid, T* block_itr, T (&items)[ITEMS_PER_THREAD], ST (&selection_flags)[ITEMS_PER_THREAD]) {
+  T* src = block_itr + tid;
+#pragma unroll
+  for (int i = 0; i < ITEMS_PER_THREAD; ++i) {
+    if (!selection_flags[i]) { continue; }
+    items[i] = src[i * BLOCK_THREADS];
+  }
+}
+
+// Predicated, bounds-checked striped load for flags of arbitrary type ST: slot i is read only if
+// selection_flags[i] converts to true and tid + i * BLOCK_THREADS < num_items.
+template <typename T, typename ST, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredLoadDirect(
+    const unsigned int tid, T* block_itr, T (&items)[ITEMS_PER_THREAD], int num_items,
+    ST (&selection_flags)[ITEMS_PER_THREAD]) {
+  T* src = block_itr + tid;
+
+#pragma unroll
+  for (int i = 0; i < ITEMS_PER_THREAD; ++i) {
+    if (!selection_flags[i]) { continue; }
+    if (tid + (i * BLOCK_THREADS) >= num_items) { continue; }
+    items[i] = src[i * BLOCK_THREADS];
+  }
+}
+
+// Predicated tile load from `inp` with flags of type ST: a full tile (num_items == BLOCK_THREADS *
+// ITEMS_PER_THREAD) uses the unguarded BlockPredLoadDirect overload, any other size the guarded one.
+template <typename T, typename ST, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void
+BlockPredLoad(T* inp, T (&items)[ITEMS_PER_THREAD], int num_items, ST (&selection_flags)[ITEMS_PER_THREAD]) {
+  if (num_items != (BLOCK_THREADS * ITEMS_PER_THREAD)) {
+    BlockPredLoadDirect<T, ST, BLOCK_THREADS, ITEMS_PER_THREAD>(
+        threadIdx.x, inp, items, num_items, selection_flags);
+    return;
+  }
+  BlockPredLoadDirect<T, ST, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, inp, items, selection_flags);
+}
diff --git a/fastlanes/src/include/crystal/pred.cuh b/fastlanes/src/include/crystal/pred.cuh
new file mode 100644
index 0000000..2aa7386
--- /dev/null
+++ b/fastlanes/src/include/crystal/pred.cuh
@@ -0,0 +1,246 @@
+#pragma once
+
+#include <cuda_runtime.h>
+
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void InitFlags(int (&selection_flags)[ITEMS_PER_THREAD]) {
+	// Start with every slot selected: each of the ITEMS_PER_THREAD flags becomes 1.
+	// BLOCK_THREADS is not used by this helper.
+#pragma unroll
+	for (int slot = 0; slot != ITEMS_PER_THREAD; ++slot) { selection_flags[slot] = 1; }
+}
+
+template <typename T, typename SelectOp, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void
+BlockPredDirect(int tid, T (&items)[ITEMS_PER_THREAD], SelectOp select_op, int (&selection_flags)[ITEMS_PER_THREAD]) {
+	// Unchecked variant: every flag is overwritten with select_op(item); tid is not consulted.
+	// Whatever the flags held before the call is discarded.
+#pragma unroll
+	for (int slot = 0; slot != ITEMS_PER_THREAD; ++slot) { selection_flags[slot] = select_op(items[slot]); }
+}
+
+template <typename T, typename SelectOp, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredDirect(int      tid,
+                                                T        (&items)[ITEMS_PER_THREAD],
+                                                SelectOp select_op,
+                                                int      (&selection_flags)[ITEMS_PER_THREAD],
+                                                int      num_items) {
+#pragma unroll
+	for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { // bounds-checked: out-of-range slots keep their old flag
+		if (tid + (ITEM * BLOCK_THREADS) < num_items) { selection_flags[ITEM] = select_op(items[ITEM]); }
+	}
+}
+
+template <typename T, typename SelectOp, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void
+BlockPred(T (&items)[ITEMS_PER_THREAD], SelectOp select_op, int (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+	const bool full_tile = num_items == (BLOCK_THREADS * ITEMS_PER_THREAD); // only full tiles skip bounds checks
+	if (!full_tile) {
+		BlockPredDirect<T, SelectOp, BLOCK_THREADS, ITEMS_PER_THREAD>(
+		    threadIdx.x, items, select_op, selection_flags, num_items);
+		return;
+	}
+	BlockPredDirect<T, SelectOp, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, items, select_op, selection_flags);
+}
+
+template <typename T, typename SelectOp, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredAndDirect(int      tid,
+                                                   T        (&items)[ITEMS_PER_THREAD],
+                                                   SelectOp select_op,
+                                                   int      (&selection_flags)[ITEMS_PER_THREAD]) {
+#pragma unroll
+	for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { // unchecked AND-combine; tid is not consulted
+		selection_flags[ITEM] = selection_flags[ITEM] && select_op(items[ITEM]); // && skips select_op once a flag is 0
+	}
+}
+
+template <typename T, typename SelectOp, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredAndDirect(int      tid,
+                                                   T        (&items)[ITEMS_PER_THREAD],
+                                                   SelectOp select_op,
+                                                   int      (&selection_flags)[ITEMS_PER_THREAD],
+                                                   int      num_items) {
+#pragma unroll
+	for (int slot = 0; slot != ITEMS_PER_THREAD; ++slot) {
+		// Slots at or beyond num_items are skipped, so their flags keep whatever value they had.
+		if (tid + (slot * BLOCK_THREADS) >= num_items) { continue; }
+		selection_flags[slot] = selection_flags[slot] && select_op(items[slot]);
+	}
+}
+
+template <typename T, typename SelectOp, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredAnd(T        (&items)[ITEMS_PER_THREAD],
+                                             SelectOp select_op,
+                                             int      (&selection_flags)[ITEMS_PER_THREAD],
+                                             int      num_items) {
+	// Full tiles use the overload without num_items; any other count uses the bounds-checked one.
+	const bool full_tile = num_items == (BLOCK_THREADS * ITEMS_PER_THREAD);
+	if (!full_tile) {
+		BlockPredAndDirect<T, SelectOp, BLOCK_THREADS, ITEMS_PER_THREAD>(
+		    threadIdx.x, items, select_op, selection_flags, num_items);
+		return;
+	}
+	BlockPredAndDirect<T, SelectOp, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, items, select_op, selection_flags);
+}
+
+template <typename T, typename SelectOp, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void
+BlockPredOrDirect(int tid, T (&items)[ITEMS_PER_THREAD], SelectOp select_op, int (&selection_flags)[ITEMS_PER_THREAD]) {
+#pragma unroll
+	for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { // unchecked OR-combine; tid is not consulted
+		selection_flags[ITEM] = selection_flags[ITEM] || select_op(items[ITEM]); // || skips select_op once a flag is set
+	}
+}
+
+template <typename T, typename SelectOp, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredOrDirect(int      tid,
+                                                  T        (&items)[ITEMS_PER_THREAD],
+                                                  SelectOp select_op,
+                                                  int      (&selection_flags)[ITEMS_PER_THREAD],
+                                                  int      num_items) {
+#pragma unroll
+	for (int slot = 0; slot != ITEMS_PER_THREAD; ++slot) {
+		// Slots at or beyond num_items are skipped, so their flags keep whatever value they had.
+		if (tid + (slot * BLOCK_THREADS) >= num_items) { continue; }
+		selection_flags[slot] = selection_flags[slot] || select_op(items[slot]);
+	}
+}
+
+template <typename T, typename SelectOp, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void
+BlockPredOr(T (&items)[ITEMS_PER_THREAD], SelectOp select_op, int (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+	const bool full_tile = num_items == (BLOCK_THREADS * ITEMS_PER_THREAD); // only full tiles skip bounds checks
+	if (!full_tile) {
+		BlockPredOrDirect<T, SelectOp, BLOCK_THREADS, ITEMS_PER_THREAD>(
+		    threadIdx.x, items, select_op, selection_flags, num_items);
+		return;
+	}
+	BlockPredOrDirect<T, SelectOp, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, items, select_op, selection_flags);
+}
+
+template <typename T>
+struct LessThan {
+	T compare; // bound captured at construction
+
+	__device__ __forceinline__ LessThan(T bound)
+	    : compare(bound) {}
+	// Predicate: true exactly when a < compare.
+	__device__ __forceinline__ bool operator()(const T& a) const { return a < compare; }
+};
+
+template <typename T>
+struct GreaterThan {
+	T compare; // bound captured at construction
+
+	__device__ __forceinline__ GreaterThan(T bound)
+	    : compare(bound) {}
+	// Predicate: true exactly when a > compare.
+	__device__ __forceinline__ bool operator()(const T& a) const { return a > compare; }
+};
+
+template <typename T>
+struct LessThanEq {
+	T compare; // bound captured at construction
+
+	__device__ __forceinline__ LessThanEq(T bound)
+	    : compare(bound) {}
+	// Predicate: true exactly when a <= compare.
+	__device__ __forceinline__ bool operator()(const T& a) const { return a <= compare; }
+};
+
+template <typename T>
+struct GreaterThanEq {
+	T compare; // bound captured at construction
+
+	__device__ __forceinline__ GreaterThanEq(T bound)
+	    : compare(bound) {}
+	// Predicate: true exactly when a >= compare.
+	__device__ __forceinline__ bool operator()(const T& a) const { return a >= compare; }
+};
+
+template <typename T>
+struct Eq {
+	T compare; // reference value captured at construction
+
+	__device__ __forceinline__ Eq(T value)
+	    : compare(value) {}
+	// Predicate: true exactly when a == compare.
+	__device__ __forceinline__ bool operator()(const T& a) const { return a == compare; }
+};
+
+template <typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void
+BlockPredLT(T (&items)[ITEMS_PER_THREAD], T compare, int (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+	const LessThan<T> below(compare); // predicate: item < compare
+	BlockPred<T, LessThan<T>, BLOCK_THREADS, ITEMS_PER_THREAD>(items, below, selection_flags, num_items);
+}
+
+template <typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void
+BlockPredAndLT(T (&items)[ITEMS_PER_THREAD], T compare, int (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+	const LessThan<T> below(compare); // predicate: item < compare, AND-ed into the existing flags
+	BlockPredAnd<T, LessThan<T>, BLOCK_THREADS, ITEMS_PER_THREAD>(items, below, selection_flags, num_items);
+}
+
+template <typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void
+BlockPredGT(T (&items)[ITEMS_PER_THREAD], T compare, int (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+	const GreaterThan<T> above(compare); // predicate: item > compare
+	BlockPred<T, GreaterThan<T>, BLOCK_THREADS, ITEMS_PER_THREAD>(items, above, selection_flags, num_items);
+}
+
+template <typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void
+BlockPredAndGT(T (&items)[ITEMS_PER_THREAD], T compare, int (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+	const GreaterThan<T> above(compare); // predicate: item > compare, AND-ed into the existing flags
+	BlockPredAnd<T, GreaterThan<T>, BLOCK_THREADS, ITEMS_PER_THREAD>(items, above, selection_flags, num_items);
+}
+
+template <typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void
+BlockPredLTE(T (&items)[ITEMS_PER_THREAD], T compare, int (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+	const LessThanEq<T> at_most(compare); // predicate: item <= compare
+	BlockPred<T, LessThanEq<T>, BLOCK_THREADS, ITEMS_PER_THREAD>(items, at_most, selection_flags, num_items);
+}
+
+template <typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void
+BlockPredAndLTE(T (&items)[ITEMS_PER_THREAD], T compare, int (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+	const LessThanEq<T> at_most(compare); // predicate: item <= compare, AND-ed into the existing flags
+	BlockPredAnd<T, LessThanEq<T>, BLOCK_THREADS, ITEMS_PER_THREAD>(items, at_most, selection_flags, num_items);
+}
+
+template <typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void
+BlockPredGTE(T (&items)[ITEMS_PER_THREAD], T compare, int (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+	const GreaterThanEq<T> at_least(compare); // predicate: item >= compare
+	BlockPred<T, GreaterThanEq<T>, BLOCK_THREADS, ITEMS_PER_THREAD>(items, at_least, selection_flags, num_items);
+}
+
+template <typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void
+BlockPredAndGTE(T (&items)[ITEMS_PER_THREAD], T compare, int (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+	const GreaterThanEq<T> at_least(compare); // predicate: item >= compare, AND-ed into the existing flags
+	BlockPredAnd<T, GreaterThanEq<T>, BLOCK_THREADS, ITEMS_PER_THREAD>(items, at_least, selection_flags, num_items);
+}
+
+template <typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void
+BlockPredEQ(T (&items)[ITEMS_PER_THREAD], T compare, int (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+	const Eq<T> equals(compare); // predicate: item == compare
+	BlockPred<T, Eq<T>, BLOCK_THREADS, ITEMS_PER_THREAD>(items, equals, selection_flags, num_items);
+}
+
+template <typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void
+BlockPredAndEQ(T (&items)[ITEMS_PER_THREAD], T compare, int (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+	const Eq<T> equals(compare); // predicate: item == compare, AND-ed into the existing flags
+	BlockPredAnd<T, Eq<T>, BLOCK_THREADS, ITEMS_PER_THREAD>(items, equals, selection_flags, num_items);
+}
+
+template <typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void
+BlockPredOrEQ(T (&items)[ITEMS_PER_THREAD], T compare, int (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+	const Eq<T> equals(compare); // predicate: item == compare, OR-ed into the existing flags
+	BlockPredOr<T, Eq<T>, BLOCK_THREADS, ITEMS_PER_THREAD>(items, equals, selection_flags, num_items);
+}
diff --git a/fastlanes/src/include/crystal/reduce.cuh b/fastlanes/src/include/crystal/reduce.cuh
new file mode 100644
index 0000000..66de689
--- /dev/null
+++ b/fastlanes/src/include/crystal/reduce.cuh
@@ -0,0 +1,45 @@
+#pragma once
+
+template <typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ T BlockSum(T item, T* shared) {
+	__syncthreads();
+	// Two-stage sum of `item` over the block: a shuffle reduction per warp, then warp 0 reduces the warp sums.
+	T         val       = item;
+	const int warp_size = 32;
+	int       lane      = threadIdx.x % warp_size;
+	int       wid       = threadIdx.x / warp_size;
+	// NOTE(review): the full-mask shuffles and blockDim.x / warp_size look like they assume blockDim.x % 32 == 0 - confirm.
+	// Stage 1: shuffle-down tree inside each warp; lane 0 accumulates that warp's sum.
+	for (int offset = 16; offset > 0; offset /= 2) {
+		val += __shfl_down_sync(0xffffffff, val, offset);
+	}
+
+	// Lane 0 of every warp publishes its warp sum; `shared` is written at index wid (one T per warp).
+	if (lane == 0) { shared[wid] = val; }
+
+	__syncthreads();
+
+	// Threads whose index is below the warp count pick up one warp sum each; all other threads contribute 0.
+	val = (threadIdx.x < blockDim.x / warp_size) ? shared[lane] : 0;
+
+	// Stage 2: warp 0 reduces the warp sums with the same shuffle-down tree.
+	if (wid == 0) {
+		for (int offset = 16; offset > 0; offset /= 2) {
+			val += __shfl_down_sync(0xffffffff, val, offset);
+		}
+	}
+	// Shuffle-down leaves the complete total in lane 0 of warp 0 only; other threads return partial values.
+	return val;
+}
+
+template <typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ T BlockSum(T (&items)[ITEMS_PER_THREAD], T* shared) {
+	T thread_sum = 0;
+	// Fold this thread's ITEMS_PER_THREAD values first, then reduce one value per thread across the block.
+#pragma unroll
+	for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+		thread_sum += items[ITEM];
+	}
+	// BLOCK_THREADS / ITEMS_PER_THREAD cannot be deduced from (T, T*), so the scalar overload is named explicitly.
+	return BlockSum<T, BLOCK_THREADS, ITEMS_PER_THREAD>(thread_sum, shared);
+}
diff --git a/fastlanes/src/include/crystal/store.cuh b/fastlanes/src/include/crystal/store.cuh
new file mode 100644
index 0000000..01231e4
--- /dev/null
+++ b/fastlanes/src/include/crystal/store.cuh
@@ -0,0 +1,82 @@
+#pragma once
+
+template <typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockStoreDirect(int tid, T* block_itr, T (&items)[ITEMS_PER_THREAD]) {
+	// Unchecked strided store: slot i is written to block_itr[tid + i * BLOCK_THREADS].
+	T* const lane_base = block_itr + tid;
+#pragma unroll
+	for (int slot = 0; slot != ITEMS_PER_THREAD; ++slot) {
+		lane_base[slot * BLOCK_THREADS] = items[slot];
+	}
+}
+
+template <typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockStoreDirect(int tid, T* block_itr, T (&items)[ITEMS_PER_THREAD], int num_items) {
+	// Bounds-checked strided store: offsets at or beyond num_items are not written.
+	T* const lane_base = block_itr + tid;
+#pragma unroll
+	for (int slot = 0; slot != ITEMS_PER_THREAD; ++slot) {
+		if (tid + (slot * BLOCK_THREADS) < num_items) { lane_base[slot * BLOCK_THREADS] = items[slot]; }
+	}
+}
+
+template <typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockStore(T* out, T (&items)[ITEMS_PER_THREAD], int num_items) {
+	// Full tiles use the overload without num_items; any other count uses the bounds-checked one.
+	const bool full_tile = num_items == (BLOCK_THREADS * ITEMS_PER_THREAD);
+	if (!full_tile) {
+		BlockStoreDirect<T, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, out, items, num_items);
+		return;
+	}
+	BlockStoreDirect<T, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, out, items);
+}
+
+#if 0
+
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockStoreDirect(
+    int tid,
+    T* block_itr,
+    T  (&items)[ITEMS_PER_THREAD]
+    ) {
+  T* thread_itr = block_itr + tid;
+
+#pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+    items[ITEM] = thread_itr[ITEM * BLOCK_THREADS];
+  }
+}
+
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockStoreDirect(
+    int tid,
+    T* block_itr,
+    T  (&items)[ITEMS_PER_THREAD]
+    int num_items
+    ) {
+  T* thread_itr = block_itr + tid;
+
+#pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+    if (tid + (ITEM * BLOCK_THREADS) < num_items) {
+      items[ITEM] = thread_itr[ITEM * BLOCK_THREADS];
+    }
+  }
+}
+
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockStore(
+    T* inp,
+    T  (&items)[ITEMS_PER_THREAD]
+    int num_items
+    ) {
+  T* block_itr = inp + blockIdx.x * blockDim.x;
+
+  if ((BLOCK_THREADS * ITEMS_PER_THREAD) == num_items) {
+    BlockStoreDirect(threadIdx.x, block_itr, items);
+  } else {
+    BlockStoreDirect(threadIdx.x, block_itr, items, num_items);
+  }
+}
+
+#endif
diff --git a/fastlanes/src/include/crystal/term.cuh b/fastlanes/src/include/crystal/term.cuh
new file mode 100644
index 0000000..1e3a5fc
--- /dev/null
+++ b/fastlanes/src/include/crystal/term.cuh
@@ -0,0 +1,33 @@
+// MIT License
+
+// Copyright (c) 2023 Jiashen Cao
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#pragma once
+
+template <typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ bool
+IsTerm(int (&selection_flags)[ITEMS_PER_THREAD]) {
+    // Reports true when this thread's flags sum to zero; T and BLOCK_THREADS are not used.
+    int total = 0;
+    for (int slot = 0; slot != ITEMS_PER_THREAD; ++slot) { total += selection_flags[slot]; }
+    const bool none_selected = (total == 0);
+    return none_selected;
+}
diff --git a/fastlanes/src/include/crystal_ssb_utils.h b/fastlanes/src/include/crystal_ssb_utils.h
new file mode 100644
index 0000000..05cf024
--- /dev/null
+++ b/fastlanes/src/include/crystal_ssb_utils.h
@@ -0,0 +1,136 @@
+#include <fstream>
+#include <iostream>
+#include <string>
+
+/*#include <cuda.h>*/
+/*#include <cub/util_allocator.cuh>*/
+
+using namespace std;
+
+#define SF        10
+#define BASE_PATH ""
+
+#if SF == 1
+#define DATA_DIR BASE_PATH "/home/ubuntu/fff/gpu/data/ssb/data/s1_columnar/"
+#define LO_LEN   6001171
+#define P_LEN    200000
+#define S_LEN    2000
+#define C_LEN    30000
+#define D_LEN    2556
+#elif SF == 10
+#if defined SORTED
+#define DATA_DIR BASE_PATH "/home/ubuntu/fff/gpu/data/ssb/data/s10_columnar_sorted/"
+#else
+#define DATA_DIR BASE_PATH "/home/ubuntu/fff/gpu/data/ssb/data/s10_columnar/"
+#endif
+
+#define LO_LEN 59986214
+#define P_LEN  800000
+#define S_LEN  20000
+#define C_LEN  300000
+#define D_LEN  2556
+#else // 20
+#define DATA_DIR BASE_PATH "s20_columnar/"
+#define LO_LEN   119994746
+#define P_LEN    1000000
+#define S_LEN    40000
+#define C_LEN    600000
+#define D_LEN    2556
+#endif
+
+inline int index_of(string* arr, int len, string val) { // inline: defined in a header, must be ODR-safe
+	for (int i = 0; i < len; i++) { // linear scan over the first len entries of arr
+		if (arr[i] == val) { return i; }
+	}
+	return -1; // val does not occur in arr[0..len)
+}
+
+inline string lookup(string col_name) { // inline: defined in a header, must be ODR-safe
+	string lineorder[] = {"lo_orderkey",
+	                      "lo_linenumber",
+	                      "lo_custkey",
+	                      "lo_partkey",
+	                      "lo_suppkey",
+	                      "lo_orderdate",
+	                      "lo_orderpriority",
+	                      "lo_shippriority",
+	                      "lo_quantity",
+	                      "lo_extendedprice",
+	                      "lo_ordtotalprice",
+	                      "lo_discount",
+	                      "lo_revenue",
+	                      "lo_supplycost",
+	                      "lo_tax",
+	                      "lo_commitdate",
+	                      "lo_shipmode"};
+	string part[]      = {
+        "p_partkey", "p_name", "p_mfgr", "p_category", "p_brand1", "p_color", "p_type", "p_size", "p_container"};
+	string supplier[] = {"s_suppkey", "s_name", "s_address", "s_city", "s_nation", "s_region", "s_phone"};
+	string customer[] = {
+	    "c_custkey", "c_name", "c_address", "c_city", "c_nation", "c_region", "c_phone", "c_mktsegment"};
+	string date[] = {"d_datekey",
+	                 "d_date",
+	                 "d_dayofweek",
+	                 "d_month",
+	                 "d_year",
+	                 "d_yearmonthnum",
+	                 "d_yearmonth",
+	                 "d_daynuminweek",
+	                 "d_daynuminmonth",
+	                 "d_daynuminyear",
+	                 "d_sellingseason",
+	                 "d_lastdayinweekfl",
+	                 "d_lastdayinmonthfl",
+	                 "d_holidayfl",
+	                 "d_weekdayfl"};
+	// Dispatch on the first character of col_name; the result is <TABLE> followed by the column's position in its list.
+	if (col_name[0] == 'l') {
+		int index = index_of(lineorder, 17, col_name);
+		return "LINEORDER" + to_string(index);
+	} else if (col_name[0] == 's') {
+		int index = index_of(supplier, 7, col_name);
+		return "SUPPLIER" + to_string(index);
+	} else if (col_name[0] == 'c') {
+		int index = index_of(customer, 8, col_name);
+		return "CUSTOMER" + to_string(index);
+	} else if (col_name[0] == 'p') {
+		int index = index_of(part, 9, col_name);
+		return "PART" + to_string(index);
+	} else if (col_name[0] == 'd') {
+		int index = index_of(date, 15, col_name);
+		return "DDATE" + to_string(index);
+	}
+	// NOTE: a known prefix with no matching column yields index -1 (e.g. "LINEORDER-1"); any other prefix yields "".
+	return "";
+}
+
+template <typename T>
+T* loadColumn(string col_name, int num_entries) {
+	string   filename = DATA_DIR + lookup(col_name);
+	ifstream colData(filename.c_str(), ios::in | ios::binary);
+	if (!colData) { throw std::runtime_error(filename.c_str()); }
+	// Allocate only once the file is open, so the throw above cannot leak the buffer; the caller owns it (delete[]).
+	T* h_col = new T[num_entries];
+	colData.read((char*)h_col, num_entries * sizeof(T)); // NOTE: a short read is not detected here
+	return h_col;
+}
+
+template <typename T>
+int storeColumn(string col_name, int num_entries, int* h_col) { // NOTE(review): h_col is int*, byte count uses sizeof(T)
+	string   filename = DATA_DIR + lookup(col_name);
+	ofstream colData(filename.c_str(), ios::out | ios::binary);
+	if (!colData) { return -1; }
+	// Write num_entries * sizeof(T) bytes and flush, so a failed write is reported as -1 instead of being lost.
+	colData.write((char*)h_col, num_entries * sizeof(T)).flush();
+	return colData.fail() ? -1 : 0;
+}
+
+/*int main() {*/
+// int *h_col = new int[10];
+// for (int i=0; i<10; i++) h_col[i] = i;
+// storeColumn<int>("test", 10, h_col);
+// int *l_col = loadColumn<int>("test", 10);
+// for (int i=0; i<10; i++) cout << l_col[i] << " ";
+// cout << endl;
+// return 0;
+/*}*/
diff --git a/fastlanes/src/include/debug.cuh b/fastlanes/src/include/debug.cuh
new file mode 100644
index 0000000..b3a4806
--- /dev/null
+++ b/fastlanes/src/include/debug.cuh
@@ -0,0 +1,26 @@
+#ifndef DEBUG_CUH
+#define DEBUG_CUH
+
+#include <cuda_runtime.h>
+#define PRINT_GPU(...) fastlanes::gpu::debug::print_gpu(__VA_ARGS__)
+
+namespace fastlanes::gpu::debug {
+template <typename T>
+__device__ void print_gpu(T* arr, const char* str) {
+	// Thread 0 prints a banner naming `str`, then arr[0..1023] in rows of 128; barriers bracket the dump.
+	__syncthreads();
+	if (threadIdx.x == 0) {
+		printf("\n ==================   %s   ================= \n ", str);
+		// Widen before printing: "%d" with a T other than int is a format/argument mismatch.
+		for (int ITEM = 0; ITEM < 1024; ++ITEM) {
+			if (ITEM % 128 == 0) { printf("\n"); }
+			printf(" %lld | ", static_cast<long long>(arr[ITEM]));
+		}
+		printf("\n");
+	}
+	__syncthreads();
+}
+
+} // namespace fastlanes::gpu::debug
+
+#endif // DEBUG_CUH
diff --git a/fastlanes/src/include/debug.hpp b/fastlanes/src/include/debug.hpp
new file mode 100644
index 0000000..bb9124d
--- /dev/null
+++ b/fastlanes/src/include/debug.hpp
@@ -0,0 +1,94 @@
+#ifndef DEBUG_HPP
+#define DEBUG_HPP
+
+#ifndef FLS_DEBUG_COLOR_HPP
+#define FLS_DEBUG_COLOR_HPP
+
+#include <cstdint>
+#include <ostream>
+
+namespace fastlanes::debug {
+enum Code : uint32_t { // numeric parameters spliced into the "\033[<n>m" sequences built in this namespace
+	FG_BLACK   = 30,
+	FG_RED     = 31,
+	FG_GREEN   = 32,
+	FG_YELLOW  = 33,
+	FG_BLUE    = 34,
+	FG_MAGENTA = 35,
+	FG_CYAN    = 36,
+	FG_WHITE   = 37,
+	FG_DEFAULT = 39,
+	BG_RED     = 41,
+	BG_GREEN   = 42,
+	BG_BLUE    = 44,
+	BG_DEFAULT = 49
+	// FG_* are foreground codes, BG_* background codes (naming per the enumerators above).
+};
+
+template <class CHAR_T, class TRAITS>
+constexpr std::basic_ostream<CHAR_T, TRAITS>& reset(std::basic_ostream<CHAR_T, TRAITS>& os) {
+	return os << "\033[0m"; // SGR 0 clears colour and bold; this used to emit FG_BLACK, duplicating black()
+}
+
+template <class CHAR_T, class TRAITS>
+constexpr std::basic_ostream<CHAR_T, TRAITS>& black(std::basic_ostream<CHAR_T, TRAITS>& os) {
+	return os << "\033[" << FG_BLACK << "m"; // writes ESC[30m (FG_BLACK) and returns the stream
+}
+
+template <class CHAR_T, class TRAITS>
+constexpr std::basic_ostream<CHAR_T, TRAITS>& bold_black(std::basic_ostream<CHAR_T, TRAITS>& os) {
+	return os << "\033[1m\033[" << FG_BLACK << "m"; // writes ESC[1m followed by ESC[30m (FG_BLACK)
+}
+
+template <class CHAR_T, class TRAITS>
+constexpr std::basic_ostream<CHAR_T, TRAITS>& bold_blue(std::basic_ostream<CHAR_T, TRAITS>& os) {
+	return os << "\033[1m\033[" << FG_BLUE << "m"; // writes ESC[1m followed by ESC[34m (FG_BLUE)
+}
+
+template <class CHAR_T, class TRAITS>
+constexpr std::basic_ostream<CHAR_T, TRAITS>& red(std::basic_ostream<CHAR_T, TRAITS>& os) {
+	return os << "\033[" << FG_RED << "m"; // writes ESC[31m (FG_RED) and returns the stream
+}
+
+template <class CHAR_T, class TRAITS>
+constexpr std::basic_ostream<CHAR_T, TRAITS>& magenta(std::basic_ostream<CHAR_T, TRAITS>& os) {
+	return os << "\033[" << FG_MAGENTA << "m"; // writes ESC[35m (FG_MAGENTA) and returns the stream
+}
+
+template <class CHAR_T, class TRAITS>
+constexpr std::basic_ostream<CHAR_T, TRAITS>& yellow(std::basic_ostream<CHAR_T, TRAITS>& os) {
+	return os << "\033[" << FG_YELLOW << "m"; // writes ESC[33m (FG_YELLOW) and returns the stream
+}
+
+template <class CHAR_T, class TRAITS>
+constexpr std::basic_ostream<CHAR_T, TRAITS>& def(std::basic_ostream<CHAR_T, TRAITS>& os) {
+	return os << "\033[" << FG_DEFAULT << "m"; // writes ESC[39m (FG_DEFAULT) and returns the stream
+}
+
+template <class CHAR_T, class TRAITS>
+constexpr std::basic_ostream<CHAR_T, TRAITS>& green(std::basic_ostream<CHAR_T, TRAITS>& os) {
+	return os << "\033[" << FG_GREEN << "m"; // writes ESC[32m (FG_GREEN) and returns the stream
+}
+} // namespace fastlanes::debug
+#endif // FLS_DEBUG_COLOR_HPP
+
+#define FLS_SHOW(a)                                                                                                    \
+	std::cout << fastlanes::debug::yellow << "-- " << #a << ": " << (a) << fastlanes::debug::def << '\n';
+#define FLS_LOG(m)     std::cout << fastlanes::debug::yellow << "-- " << m << fastlanes::debug::def << '\n';
+#define FLS_CERR(a)    std::cout << fastlanes::debug::red << "-- " << #a << ": " << (a) << fastlanes::debug::def << '\n';
+#define FLS_SUCCESS(m) std::cout << fastlanes::debug::green << "-- " << m << fastlanes::debug::def << '\n';
+#define FLS_RESULT(m)  std::cout << fastlanes::debug::bold_blue << "-- " << m << fastlanes::debug::def << '\n';
+
+template <typename T>
+void PRINT(T* arr, const char* str) {
+	printf("\n ==================   %s   ================= \n ", str);
+	// Dumps arr[0..1023] in rows of 128. Values are widened so the "%lld" conversion matches for any integer T.
+	for (int ITEM = 0; ITEM < 1024; ++ITEM) {
+		if (ITEM % 128 == 0) { printf("\n"); }
+		printf(" %lld | ", static_cast<long long>(arr[ITEM]));
+	}
+	// Terminate the last row.
+	printf("\n");
+}
+
+#endif // DEBUG_HPP
diff --git a/fastlanes/src/include/error.cuh b/fastlanes/src/include/error.cuh
new file mode 100644
index 0000000..da14716
--- /dev/null
+++ b/fastlanes/src/include/error.cuh
@@ -0,0 +1,26 @@
+#ifndef FLS_GPU_ERROR_CUH
+#define FLS_GPU_ERROR_CUH
+
+#include <iostream>
+#include <stdio.h>
+
+#define CHECK_CUDA_ERROR(val) check((val), #val, __FILE__, __LINE__)
+inline void check(const cudaError_t err, const char* const func, const char* const file, const int line) {
+	// Nothing to report on success; otherwise print location, error text and expression, then exit.
+	if (err == cudaSuccess) { return; }
+	std::cerr << "CUDA Runtime Error at: " << file << ":" << line << std::endl;
+	std::cerr << cudaGetErrorString(err) << " " << func << std::endl;
+	std::exit(EXIT_FAILURE);
+}
+
+#define CHECK_LAST_CUDA_ERROR() checkLast(__FILE__, __LINE__)
+inline void checkLast(const char* const file, const int line) {
+	// Query the runtime's last recorded error once; report it and exit unless it is cudaSuccess.
+	const cudaError_t err = cudaGetLastError();
+	if (err == cudaSuccess) { return; }
+	std::cerr << "CUDA Runtime Error at: " << file << ":" << line << std::endl;
+	std::cerr << cudaGetErrorString(err) << std::endl;
+	std::exit(EXIT_FAILURE);
+}
+
+#endif // FLS_GPU_ERROR_CUH
diff --git a/fastlanes/src/include/fastlanes.cuh b/fastlanes/src/include/fastlanes.cuh
new file mode 100644
index 0000000..ea7b781
--- /dev/null
+++ b/fastlanes/src/include/fastlanes.cuh
@@ -0,0 +1,12 @@
+#ifndef FLS_GPU_READER_GPU_CUH
+#define FLS_GPU_READER_GPU_CUH
+
+#include "common.cuh"
+#include "error.cuh"
+#include "gpu_utils.h"
+
+namespace fastlanes::gpu {
+
+} // namespace fastlanes::gpu
+
+#endif // FLS_GPU_READER_GPU_CUH
diff --git a/fastlanes/src/include/fastlanes/join.cuh b/fastlanes/src/include/fastlanes/join.cuh
new file mode 100644
index 0000000..a27b388
--- /dev/null
+++ b/fastlanes/src/include/fastlanes/join.cuh
@@ -0,0 +1,82 @@
+#pragma once
+
+template <typename K, typename V, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockProbeDirectAndPHT_2_R_S(int tid,
+                                                             K   (&keys)[ITEMS_PER_THREAD],
+                                                             V   (&res)[BLOCK_THREADS * ITEMS_PER_THREAD],
+                                                             int (&selection_flags)[ITEMS_PER_THREAD],
+                                                             K*  ht,
+                                                             int ht_len,
+                                                             K   keys_min) {
+#pragma unroll
+	for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+		auto shared_idx = BLOCK_THREADS * ITEM + threadIdx.x; // uses threadIdx.x directly; the tid parameter is unused
+		// Unchecked probe: only slots whose flag is still set look up the table.
+		if (selection_flags[ITEM]) {
+			int hash = HASH(keys[ITEM], ht_len, keys_min);
+			// One 64-bit read starting at ht[2 * hash] (presumably K is 32 bits wide, so this spans two entries - confirm).
+			uint64_t slot = *reinterpret_cast<uint64_t*>(&ht[hash << 1]);
+			if (slot != 0) {
+				res[shared_idx] = (slot >> 32); // keep the upper 32 bits of the slot
+			} else {
+				selection_flags[ITEM] = 0; // an all-zero slot deselects the item
+			}
+		}
+	}
+}
+
+template <typename K, typename V, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockProbeDirectAndPHT_2_R_S(int tid,
+                                                             K   (&items)[ITEMS_PER_THREAD],
+                                                             V   (&res)[BLOCK_THREADS * ITEMS_PER_THREAD],
+                                                             int (&selection_flags)[ITEMS_PER_THREAD],
+                                                             K*  ht,
+                                                             int ht_len,
+                                                             K   keys_min,
+                                                             int num_items) {
+#pragma unroll
+	for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+		auto shared_idx = BLOCK_THREADS * ITEM + threadIdx.x; // res index uses threadIdx.x; the bound check uses tid
+		// Bounds-checked probe: offsets at or beyond num_items are skipped and keep their flag.
+		if (tid + (ITEM * BLOCK_THREADS) < num_items) {
+			if (selection_flags[ITEM]) {
+				int hash = HASH(items[ITEM], ht_len, keys_min);
+				// One 64-bit read starting at ht[2 * hash] (presumably K is 32 bits wide, so this spans two entries - confirm).
+				uint64_t slot = *reinterpret_cast<uint64_t*>(&ht[hash << 1]);
+				if (slot != 0) {
+					res[shared_idx] = (slot >> 32); // keep the upper 32 bits of the slot
+				} else {
+					selection_flags[ITEM] = 0; // an all-zero slot deselects the item
+				}
+			}
+		}
+	}
+}
+
+template <typename K, typename V, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockProbeAndPHT_2_R_S(K   (&keys)[ITEMS_PER_THREAD],
+                                                       V   (&res)[BLOCK_THREADS * ITEMS_PER_THREAD],
+                                                       int (&selection_flags)[ITEMS_PER_THREAD],
+                                                       K*  ht,
+                                                       int ht_len,
+                                                       K   keys_min,
+                                                       int num_items) {
+	const bool full_tile = num_items == (BLOCK_THREADS * ITEMS_PER_THREAD); // only full tiles skip bounds checks
+	if (!full_tile) {
+		BlockProbeDirectAndPHT_2_R_S<K, V, BLOCK_THREADS, ITEMS_PER_THREAD>(
+		    threadIdx.x, keys, res, selection_flags, ht, ht_len, keys_min, num_items);
+		return;
+	}
+	BlockProbeDirectAndPHT_2_R_S<K, V, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    threadIdx.x, keys, res, selection_flags, ht, ht_len, keys_min);
+}
+
+template <typename K, typename V, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockProbeAndPHT_2_R_S(K   (&keys)[ITEMS_PER_THREAD],
+                                                       V   (&res)[BLOCK_THREADS * ITEMS_PER_THREAD],
+                                                       int (&selection_flags)[ITEMS_PER_THREAD],
+                                                       K*  ht,
+                                                       int ht_len,
+                                                       int num_items) { // convenience overload: keys_min is 0
+	BlockProbeAndPHT_2_R_S<K, V, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, res, selection_flags, ht, ht_len, 0, num_items);
+}
\ No newline at end of file
diff --git a/fastlanes/src/include/fastlanes/pred.cuh b/fastlanes/src/include/fastlanes/pred.cuh
new file mode 100644
index 0000000..e69de29
diff --git a/fastlanes/src/include/fls_gen/macros.hpp b/fastlanes/src/include/fls_gen/macros.hpp
new file mode 100644
index 0000000..a8408ba
--- /dev/null
+++ b/fastlanes/src/include/fls_gen/macros.hpp
@@ -0,0 +1,4 @@
+#ifndef MACROS_HPP
+#define MACROS_HPP
+
+#endif //MACROS_HPP
diff --git a/fastlanes/src/include/fls_gen/pack/pack.hpp b/fastlanes/src/include/fls_gen/pack/pack.hpp
new file mode 100644
index 0000000..48bc939
--- /dev/null
+++ b/fastlanes/src/include/fls_gen/pack/pack.hpp
@@ -0,0 +1,27 @@
+#ifndef FLS_PACK_HPP
+#define FLS_PACK_HPP
+
+#include <cstdint>
+
+namespace generated { namespace pack { namespace fallback { namespace scalar {
+void pack(const uint64_t* __restrict in, uint64_t* __restrict out, uint8_t bw); // unsigned overloads are declared only;
+void pack(const uint32_t* __restrict in, uint32_t* __restrict out, uint8_t bw); // their definitions are not in this header
+void pack(const uint16_t* __restrict in, uint16_t* __restrict out, uint8_t bw);
+void pack(const uint8_t* __restrict in, uint8_t* __restrict out, uint8_t bw); // NOTE(review): bw is presumably the packed bit width - confirm
+// Signed overloads: reinterpret both buffers as the same-width unsigned type and forward, passing bw through untouched.
+inline void pack(const int64_t* __restrict in, int64_t* __restrict out, uint8_t bw) {
+	pack(reinterpret_cast<const uint64_t*>(in), reinterpret_cast<uint64_t*>(out), bw);
+}
+inline void pack(const int32_t* __restrict in, int32_t* __restrict out, uint8_t bw) {
+	pack(reinterpret_cast<const uint32_t*>(in), reinterpret_cast<uint32_t*>(out), bw);
+}
+inline void pack(const int16_t* __restrict in, int16_t* __restrict out, uint8_t bw) {
+	pack(reinterpret_cast<const uint16_t*>(in), reinterpret_cast<uint16_t*>(out), bw);
+}
+inline void pack(const int8_t* __restrict in, int8_t* __restrict out, uint8_t bw) {
+	pack(reinterpret_cast<const uint8_t*>(in), reinterpret_cast<uint8_t*>(out), bw);
+}
+
+}}}} // namespace generated::pack::fallback::scalar
+
+#endif // FLS_PACK_HPP
diff --git a/fastlanes/src/include/fls_gen/rle/rle.hpp b/fastlanes/src/include/fls_gen/rle/rle.hpp
new file mode 100644
index 0000000..ba519ba
--- /dev/null
+++ b/fastlanes/src/include/fls_gen/rle/rle.hpp
@@ -0,0 +1,45 @@
+#ifndef RLE_HPP
+#define RLE_HPP
+
+#include "debug.hpp"
+
+namespace fastlanes {
+
+// Run-length encodes vec_arr[0 .. MINI_VEC_SIZE): a new run starts wherever a value differs from its
+// predecessor. dict_rle_val[r] receives the value of run r and idx_arr[i] the run number of element i.
+// Returns the number of runs. MINI_VEC_N is not used by the body.
+template <int MINI_VEC_SIZE = 256, int MINI_VEC_N = (1024 / MINI_VEC_SIZE)>
+uint32_t RLE(const int32_t* const vec_arr, uint8_t* const idx_arr, int32_t* const dict_rle_val) {
+	int  run_no  = 0;          // run currently open; narrowed to uint8_t when stored in idx_arr
+	auto run_val = vec_arr[0]; // value shared by every element of the open run
+
+	idx_arr[0]      = 0;
+	dict_rle_val[0] = run_val;
+	for (size_t pos = 1; pos < MINI_VEC_SIZE; pos++) {
+		if (vec_arr[pos] != run_val) { // value changed: open the next run and record its value
+			run_val                = vec_arr[pos];
+			dict_rle_val[++run_no] = run_val;
+		}
+		idx_arr[pos] = run_no;
+	}
+	return run_no + 1;
+}
+
+// Encodes a 1024-value vector as four 256-value mini-vectors via RLE() with its default template arguments
+// (MINI_VEC_SIZE / MINI_VEC_N are not forwarded). dict_4_p[k] is where mini-vector k's run values start in
+// dict_rle_val. Returns the run count of the last mini-vector only, narrowed to uint16_t.
+template <int MINI_VEC_SIZE = 256, int MINI_VEC_N = (1024 / MINI_VEC_SIZE)>
+uint16_t VECTOR_RLE(const int32_t* const vec_p, uint8_t* const idx_p, int32_t* const dict_rle_val, uint16_t* dict_4_p) {
+	dict_4_p[0] = 0;
+	for (int k = 0; k < 3; ++k) { // mini-vectors 0..2: each one's run count yields the next start offset
+		dict_4_p[k + 1] = dict_4_p[k] + RLE(vec_p + k * 256, idx_p + k * 256, dict_rle_val + dict_4_p[k]);
+	}
+	const auto last_run_count = RLE(vec_p + 768, idx_p + 768, dict_rle_val + dict_4_p[3]);
+	PRINT(dict_rle_val, "dict_rle_val");
+	PRINT(dict_4_p, "dict_4_p");
+	return last_run_count;
+}
+
+} // namespace fastlanes
+
+#endif // RLE_HPP
diff --git a/fastlanes/src/include/fls_gen/rsum/rsum.cuh b/fastlanes/src/include/fls_gen/rsum/rsum.cuh
new file mode 100644
index 0000000..435ce7a
--- /dev/null
+++ b/fastlanes/src/include/fls_gen/rsum/rsum.cuh
@@ -0,0 +1,107 @@
+
+# pragma once
+
+__device__ __forceinline__ void d_rsum_32(const uint32_t* in, uint32_t* out, const uint32_t* base) {
+	uint32_t trd_idx = threadIdx.x;
+	trd_idx          = trd_idx % 32; // lane 0..31: every access below is at lane + a multiple of 32
+	uint32_t r_0;
+	uint32_t r_1;
+	// Running sum per lane: r_1 starts at base[lane]; each step adds in[lane + off] and stores r_1 to out[lane + off].
+	r_0                                 = *(in + (0 * 32) + (trd_idx * 1) + 0); // offsets visited first: 0, 128, ..., 896
+	r_1                                 = base[trd_idx]; // TODO
+	r_1                                 = r_1 + r_0;
+	out[(trd_idx * 1) + (0 * 32) + 0]   = r_1;
+	r_0                                 = *(in + (0 * 32) + (trd_idx * 1) + 128);
+	r_1                                 = r_1 + r_0;
+	out[(trd_idx * 1) + (0 * 32) + 128] = r_1;
+	r_0                                 = *(in + (0 * 32) + (trd_idx * 1) + 256);
+	r_1                                 = r_1 + r_0;
+	out[(trd_idx * 1) + (0 * 32) + 256] = r_1;
+	r_0                                 = *(in + (0 * 32) + (trd_idx * 1) + 384);
+	r_1                                 = r_1 + r_0;
+	out[(trd_idx * 1) + (0 * 32) + 384] = r_1;
+	r_0                                 = *(in + (0 * 32) + (trd_idx * 1) + 512);
+	r_1                                 = r_1 + r_0;
+	out[(trd_idx * 1) + (0 * 32) + 512] = r_1;
+	r_0                                 = *(in + (0 * 32) + (trd_idx * 1) + 640);
+	r_1                                 = r_1 + r_0;
+	out[(trd_idx * 1) + (0 * 32) + 640] = r_1;
+	r_0                                 = *(in + (0 * 32) + (trd_idx * 1) + 768);
+	r_1                                 = r_1 + r_0;
+	out[(trd_idx * 1) + (0 * 32) + 768] = r_1;
+	r_0                                 = *(in + (0 * 32) + (trd_idx * 1) + 896);
+	r_1                                 = r_1 + r_0;
+	out[(trd_idx * 1) + (0 * 32) + 896] = r_1;
+	r_0                                 = *(in + (0 * 32) + (trd_idx * 1) + 64); // then 64, 192, ..., 960
+	r_1                                 = r_1 + r_0;
+	out[(trd_idx * 1) + (0 * 32) + 64]  = r_1;
+	r_0                                 = *(in + (0 * 32) + (trd_idx * 1) + 192);
+	r_1                                 = r_1 + r_0;
+	out[(trd_idx * 1) + (0 * 32) + 192] = r_1;
+	r_0                                 = *(in + (0 * 32) + (trd_idx * 1) + 320);
+	r_1                                 = r_1 + r_0;
+	out[(trd_idx * 1) + (0 * 32) + 320] = r_1;
+	r_0                                 = *(in + (0 * 32) + (trd_idx * 1) + 448);
+	r_1                                 = r_1 + r_0;
+	out[(trd_idx * 1) + (0 * 32) + 448] = r_1;
+	r_0                                 = *(in + (0 * 32) + (trd_idx * 1) + 576);
+	r_1                                 = r_1 + r_0;
+	out[(trd_idx * 1) + (0 * 32) + 576] = r_1;
+	r_0                                 = *(in + (0 * 32) + (trd_idx * 1) + 704);
+	r_1                                 = r_1 + r_0;
+	out[(trd_idx * 1) + (0 * 32) + 704] = r_1;
+	r_0                                 = *(in + (0 * 32) + (trd_idx * 1) + 832);
+	r_1                                 = r_1 + r_0;
+	out[(trd_idx * 1) + (0 * 32) + 832] = r_1;
+	r_0                                 = *(in + (0 * 32) + (trd_idx * 1) + 960);
+	r_1                                 = r_1 + r_0;
+	out[(trd_idx * 1) + (0 * 32) + 960] = r_1;
+	r_0                                 = *(in + (0 * 32) + (trd_idx * 1) + 32); // then 32, 160, ..., 928
+	r_1                                 = r_1 + r_0;
+	out[(trd_idx * 1) + (0 * 32) + 32]  = r_1;
+	r_0                                 = *(in + (0 * 32) + (trd_idx * 1) + 160);
+	r_1                                 = r_1 + r_0;
+	out[(trd_idx * 1) + (0 * 32) + 160] = r_1;
+	r_0                                 = *(in + (0 * 32) + (trd_idx * 1) + 288);
+	r_1                                 = r_1 + r_0;
+	out[(trd_idx * 1) + (0 * 32) + 288] = r_1;
+	r_0                                 = *(in + (0 * 32) + (trd_idx * 1) + 416);
+	r_1                                 = r_1 + r_0;
+	out[(trd_idx * 1) + (0 * 32) + 416] = r_1;
+	r_0                                 = *(in + (0 * 32) + (trd_idx * 1) + 544);
+	r_1                                 = r_1 + r_0;
+	out[(trd_idx * 1) + (0 * 32) + 544] = r_1;
+	r_0                                 = *(in + (0 * 32) + (trd_idx * 1) + 672);
+	r_1                                 = r_1 + r_0;
+	out[(trd_idx * 1) + (0 * 32) + 672] = r_1;
+	r_0                                 = *(in + (0 * 32) + (trd_idx * 1) + 800);
+	r_1                                 = r_1 + r_0;
+	out[(trd_idx * 1) + (0 * 32) + 800] = r_1;
+	r_0                                 = *(in + (0 * 32) + (trd_idx * 1) + 928);
+	r_1                                 = r_1 + r_0;
+	out[(trd_idx * 1) + (0 * 32) + 928] = r_1;
+	r_0                                 = *(in + (0 * 32) + (trd_idx * 1) + 96); // finally 96, 224, ..., 992
+	r_1                                 = r_1 + r_0;
+	out[(trd_idx * 1) + (0 * 32) + 96]  = r_1;
+	r_0                                 = *(in + (0 * 32) + (trd_idx * 1) + 224);
+	r_1                                 = r_1 + r_0;
+	out[(trd_idx * 1) + (0 * 32) + 224] = r_1;
+	r_0                                 = *(in + (0 * 32) + (trd_idx * 1) + 352);
+	r_1                                 = r_1 + r_0;
+	out[(trd_idx * 1) + (0 * 32) + 352] = r_1;
+	r_0                                 = *(in + (0 * 32) + (trd_idx * 1) + 480);
+	r_1                                 = r_1 + r_0;
+	out[(trd_idx * 1) + (0 * 32) + 480] = r_1;
+	r_0                                 = *(in + (0 * 32) + (trd_idx * 1) + 608);
+	r_1                                 = r_1 + r_0;
+	out[(trd_idx * 1) + (0 * 32) + 608] = r_1;
+	r_0                                 = *(in + (0 * 32) + (trd_idx * 1) + 736);
+	r_1                                 = r_1 + r_0;
+	out[(trd_idx * 1) + (0 * 32) + 736] = r_1;
+	r_0                                 = *(in + (0 * 32) + (trd_idx * 1) + 864);
+	r_1                                 = r_1 + r_0;
+	out[(trd_idx * 1) + (0 * 32) + 864] = r_1;
+	r_0                                 = *(in + (0 * 32) + (trd_idx * 1) + 992);
+	r_1                                 = r_1 + r_0;
+	out[(trd_idx * 1) + (0 * 32) + 992] = r_1;
+}
diff --git a/fastlanes/src/include/fls_gen/transpose/transpose.hpp b/fastlanes/src/include/fls_gen/transpose/transpose.hpp
new file mode 100644
index 0000000..a203a66
--- /dev/null
+++ b/fastlanes/src/include/fls_gen/transpose/transpose.hpp
@@ -0,0 +1,24 @@
+#ifndef TRANSPOSE_HPP
+#define TRANSPOSE_HPP
+
+#include <cstdint>
+#include <cstring>
+
+namespace generated::transpose::fallback::scalar {
+void        transpose_i(const double* __restrict in, double* __restrict out); // declarations only: no definitions in this header
+void        transpose_i(const uint64_t* __restrict in, uint64_t* __restrict out);
+void        transpose_i(const uint32_t* __restrict in, uint32_t* __restrict out);
+void        transpose_i(const uint16_t* __restrict in, uint16_t* __restrict out);
+void        transpose_i(const uint8_t* __restrict in, uint8_t* __restrict out);
+void        transpose_o(const double* __restrict in, double* __restrict out); // NOTE(review): _i / _o are presumably the two transpose directions - confirm
+void        transpose_o(const uint64_t* __restrict in, uint64_t* __restrict out);
+void        transpose_o(const uint32_t* __restrict in, uint32_t* __restrict out);
+void        transpose_o(const uint16_t* __restrict in, uint16_t* __restrict out);
+void        transpose_o(const uint8_t* __restrict in, uint8_t* __restrict out);
+inline void transpose_i(const int32_t* __restrict in, int32_t* __restrict out) { // signed input: reuse the uint32_t overload
+	transpose_i(reinterpret_cast<const uint32_t*>(in), (reinterpret_cast<uint32_t*>(out)));
+}
+// NOTE(review): int32_t is the only signed overload and only transpose_i has it; there is no int32_t transpose_o here.
+} // namespace generated::transpose::fallback::scalar
+
+#endif
diff --git a/fastlanes/src/include/fls_gen/unpack/hardcoded_16.cuh b/fastlanes/src/include/fls_gen/unpack/hardcoded_16.cuh
new file mode 100644
index 0000000..39cc6fe
--- /dev/null
+++ b/fastlanes/src/include/fls_gen/unpack/hardcoded_16.cuh
@@ -0,0 +1,1276 @@
+// generated!
+#pragma once
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+// static constexpr uint8_t lo_orderdate_bw     = 16;
+// static constexpr uint8_t lo_extendedprice_bw = 24;
+// static constexpr uint8_t lo_quantity_bw      = 6;
+// static constexpr uint8_t lo_discount_bw      = 4;
+// static constexpr uint8_t lo_partkey_bw       = 20;
+// static constexpr uint8_t lo_suppkey_bw       = 15;
+// static constexpr uint8_t lo_revenue_bw       = 24;
+// static constexpr uint8_t lo_custkey_bw       = 19;
+// static constexpr uint8_t lo_supplycost_bw    = 17;
+
+namespace hardcoded_16 {
+
+__device__ __forceinline__ void unpack_16bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p,
+                                                           uint16_t* __restrict a_out_p) {
+	[[maybe_unused]] auto     out = reinterpret_cast<uint16_t*>(a_out_p);
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p);
+	[[maybe_unused]] uint32_t register_0;
+	[[maybe_unused]] uint32_t tmp_0;
+	[[maybe_unused]] uint32_t base_0 = 0ULL;
+	// Each thread reads 16 words, in[threadIdx.x + 32 * k] for k = 0..15, and splits word k into two 16-bit fields.
+	int i = threadIdx.x; // lane: selects this thread's word in every 32-word row (not reduced modulo 32 here)
+	// Low half of word k goes to out[2 * k], high half to out[2 * k + 1]; out[0..31] is written, base_0 is never used.
+	register_0 = *(in + (0 * 32) + (i * 1) + 0);
+	tmp_0      = (register_0) & ((1ULL << 16) - 1);
+	out[0]     = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	out[1]     = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 32);
+	tmp_0      = (register_0) & ((1ULL << 16) - 1);
+	out[2]     = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	out[3]     = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 64);
+	tmp_0      = (register_0) & ((1ULL << 16) - 1);
+	out[4]     = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	out[5]     = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 96);
+	tmp_0      = (register_0) & ((1ULL << 16) - 1);
+	out[6]     = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	out[7]     = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 128);
+	tmp_0      = (register_0) & ((1ULL << 16) - 1);
+	out[8]     = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	out[9]     = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 160);
+	tmp_0      = (register_0) & ((1ULL << 16) - 1);
+	out[10]    = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	out[11]    = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 192);
+	tmp_0      = (register_0) & ((1ULL << 16) - 1);
+	out[12]    = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	out[13]    = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 224);
+	tmp_0      = (register_0) & ((1ULL << 16) - 1);
+	out[14]    = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	out[15]    = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 256);
+	tmp_0      = (register_0) & ((1ULL << 16) - 1);
+	out[16]    = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	out[17]    = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 288);
+	tmp_0      = (register_0) & ((1ULL << 16) - 1);
+	out[18]    = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	out[19]    = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 320);
+	tmp_0      = (register_0) & ((1ULL << 16) - 1);
+	out[20]    = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	out[21]    = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 352);
+	tmp_0      = (register_0) & ((1ULL << 16) - 1);
+	out[22]    = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	out[23]    = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 384);
+	tmp_0      = (register_0) & ((1ULL << 16) - 1);
+	out[24]    = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	out[25]    = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 416);
+	tmp_0      = (register_0) & ((1ULL << 16) - 1);
+	out[26]    = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	out[27]    = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 448);
+	tmp_0      = (register_0) & ((1ULL << 16) - 1);
+	out[28]    = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	out[29]    = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 480);
+	tmp_0      = (register_0) & ((1ULL << 16) - 1);
+	out[30]    = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	out[31]    = tmp_0;
+}
+__device__ __forceinline__ void unpack_4bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p,
+                                                          uint16_t* __restrict a_out_p) {
+	[[maybe_unused]] auto     out = reinterpret_cast<uint16_t*>(a_out_p);
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p);
+	[[maybe_unused]] uint32_t register_0;
+	[[maybe_unused]] uint32_t tmp_0;
+	[[maybe_unused]] uint32_t base_0 = 0ULL;
+	// Each thread reads 4 words, in[threadIdx.x + 32 * k] for k = 0..3; every word holds eight 4-bit fields.
+	int i = threadIdx.x; // lane: selects this thread's word in every 32-word row (not reduced modulo 32 here)
+	// Fields come out low bits first: out[8 * k + f] = bits [4f, 4f + 4) of word k; base_0 is never used.
+	register_0 = *(in + (0 * 32) + (i * 1) + 0);
+	tmp_0      = (register_0) & ((1ULL << 4) - 1);
+	out[0]     = tmp_0;
+	tmp_0      = (register_0 >> 4) & ((1ULL << 4) - 1);
+	out[1]     = tmp_0;
+	tmp_0      = (register_0 >> 8) & ((1ULL << 4) - 1);
+	out[2]     = tmp_0;
+	tmp_0      = (register_0 >> 12) & ((1ULL << 4) - 1);
+	out[3]     = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 4) - 1);
+	out[4]     = tmp_0;
+	tmp_0      = (register_0 >> 20) & ((1ULL << 4) - 1);
+	out[5]     = tmp_0;
+	tmp_0      = (register_0 >> 24) & ((1ULL << 4) - 1);
+	out[6]     = tmp_0;
+	tmp_0      = (register_0 >> 28) & ((1ULL << 4) - 1);
+	out[7]     = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 32);
+	tmp_0      = (register_0) & ((1ULL << 4) - 1);
+	out[8]     = tmp_0;
+	tmp_0      = (register_0 >> 4) & ((1ULL << 4) - 1);
+	out[9]     = tmp_0;
+	tmp_0      = (register_0 >> 8) & ((1ULL << 4) - 1);
+	out[10]    = tmp_0;
+	tmp_0      = (register_0 >> 12) & ((1ULL << 4) - 1);
+	out[11]    = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 4) - 1);
+	out[12]    = tmp_0;
+	tmp_0      = (register_0 >> 20) & ((1ULL << 4) - 1);
+	out[13]    = tmp_0;
+	tmp_0      = (register_0 >> 24) & ((1ULL << 4) - 1);
+	out[14]    = tmp_0;
+	tmp_0      = (register_0 >> 28) & ((1ULL << 4) - 1);
+	out[15]    = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 64);
+	tmp_0      = (register_0) & ((1ULL << 4) - 1);
+	out[16]    = tmp_0;
+	tmp_0      = (register_0 >> 4) & ((1ULL << 4) - 1);
+	out[17]    = tmp_0;
+	tmp_0      = (register_0 >> 8) & ((1ULL << 4) - 1);
+	out[18]    = tmp_0;
+	tmp_0      = (register_0 >> 12) & ((1ULL << 4) - 1);
+	out[19]    = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 4) - 1);
+	out[20]    = tmp_0;
+	tmp_0      = (register_0 >> 20) & ((1ULL << 4) - 1);
+	out[21]    = tmp_0;
+	tmp_0      = (register_0 >> 24) & ((1ULL << 4) - 1);
+	out[22]    = tmp_0;
+	tmp_0      = (register_0 >> 28) & ((1ULL << 4) - 1);
+	out[23]    = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 96);
+	tmp_0      = (register_0) & ((1ULL << 4) - 1);
+	out[24]    = tmp_0;
+	tmp_0      = (register_0 >> 4) & ((1ULL << 4) - 1);
+	out[25]    = tmp_0;
+	tmp_0      = (register_0 >> 8) & ((1ULL << 4) - 1);
+	out[26]    = tmp_0;
+	tmp_0      = (register_0 >> 12) & ((1ULL << 4) - 1);
+	out[27]    = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 4) - 1);
+	out[28]    = tmp_0;
+	tmp_0      = (register_0 >> 20) & ((1ULL << 4) - 1);
+	out[29]    = tmp_0;
+	tmp_0      = (register_0 >> 24) & ((1ULL << 4) - 1);
+	out[30]    = tmp_0;
+	tmp_0      = (register_0 >> 28) & ((1ULL << 4) - 1);
+	out[31]    = tmp_0;
+}
+__device__ __forceinline__ void unpack_24bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p,
+                                                           uint16_t* __restrict a_out_p) {
+	[[maybe_unused]] auto     out = reinterpret_cast<uint16_t*>(a_out_p);
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p);
+	[[maybe_unused]] uint32_t register_0;
+	[[maybe_unused]] uint32_t tmp_0;
+	[[maybe_unused]] uint32_t base_0 = 0ULL;
+	// Each thread reads 24 words, in[threadIdx.x + 32 * k] for k = 0..23; every 3 words hold four 24-bit fields.
+	int i = threadIdx.x; // lane: selects this thread's word in every 32-word row (not reduced modulo 32 here)
+	// NOTE(review): tmp_0 carries up to 24 bits but out is uint16_t*, so every store keeps only the low 16 bits - intended?
+	register_0 = *(in + (0 * 32) + (i * 1) + 0);
+	tmp_0      = (register_0) & ((1ULL << 24) - 1);
+	out[0]     = tmp_0;
+	tmp_0      = (register_0 >> 24) & ((1ULL << 8) - 1); // field straddles two words: low 8 bits here, high 16 from the next word
+	register_0 = *(in + (0 * 32) + (i * 1) + 32);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8;
+	out[1]     = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 64);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16;
+	out[2]     = tmp_0;
+	tmp_0      = (register_0 >> 8) & ((1ULL << 24) - 1);
+	out[3]     = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 96);
+	tmp_0      = (register_0) & ((1ULL << 24) - 1);
+	out[4]     = tmp_0;
+	tmp_0      = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 128);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8;
+	out[5]     = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 160);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16;
+	out[6]     = tmp_0;
+	tmp_0      = (register_0 >> 8) & ((1ULL << 24) - 1);
+	out[7]     = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 192);
+	tmp_0      = (register_0) & ((1ULL << 24) - 1);
+	out[8]     = tmp_0;
+	tmp_0      = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 224);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8;
+	out[9]     = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 256);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16;
+	out[10]    = tmp_0;
+	tmp_0      = (register_0 >> 8) & ((1ULL << 24) - 1);
+	out[11]    = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 288);
+	tmp_0      = (register_0) & ((1ULL << 24) - 1);
+	out[12]    = tmp_0;
+	tmp_0      = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 320);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8;
+	out[13]    = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 352);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16;
+	out[14]    = tmp_0;
+	tmp_0      = (register_0 >> 8) & ((1ULL << 24) - 1);
+	out[15]    = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 384);
+	tmp_0      = (register_0) & ((1ULL << 24) - 1);
+	out[16]    = tmp_0;
+	tmp_0      = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 416);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8;
+	out[17]    = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 448);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16;
+	out[18]    = tmp_0;
+	tmp_0      = (register_0 >> 8) & ((1ULL << 24) - 1);
+	out[19]    = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 480);
+	tmp_0      = (register_0) & ((1ULL << 24) - 1);
+	out[20]    = tmp_0;
+	tmp_0      = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 512);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8;
+	out[21]    = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 544);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16;
+	out[22]    = tmp_0;
+	tmp_0      = (register_0 >> 8) & ((1ULL << 24) - 1);
+	out[23]    = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 576);
+	tmp_0      = (register_0) & ((1ULL << 24) - 1);
+	out[24]    = tmp_0;
+	tmp_0      = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 608);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8;
+	out[25]    = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 640);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16;
+	out[26]    = tmp_0;
+	tmp_0      = (register_0 >> 8) & ((1ULL << 24) - 1);
+	out[27]    = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 672);
+	tmp_0      = (register_0) & ((1ULL << 24) - 1);
+	out[28]    = tmp_0;
+	tmp_0      = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 704);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8;
+	out[29]    = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 736);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16;
+	out[30] = tmp_0;
+	tmp_0   = (register_0 >> 8) & ((1ULL << 24) - 1);
+	out[31] = tmp_0;
+}
+
+__device__ __forceinline__ void unpack_6bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p,
+                                                          uint16_t* __restrict a_out_p) {
+	[[maybe_unused]] auto     out = reinterpret_cast<uint16_t*>(a_out_p);
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p);
+	[[maybe_unused]] uint32_t register_0;
+	[[maybe_unused]] uint32_t tmp_0;
+	[[maybe_unused]] uint32_t base_0 = 0ULL;
+	// Each thread reads 6 words, in[threadIdx.x + 32 * k] for k = 0..5, holding 32 six-bit fields back to back.
+	int i = threadIdx.x; // lane: selects this thread's word in every 32-word row (not reduced modulo 32 here)
+	// A field that straddles two words is built with |= from the top bits of one word and the low bits of the next.
+	register_0 = *(in + (0 * 32) + (i * 1) + 0);
+	tmp_0      = (register_0) & ((1ULL << 6) - 1);
+	out[0]     = tmp_0;
+	tmp_0      = (register_0 >> 6) & ((1ULL << 6) - 1);
+	out[1]     = tmp_0;
+	tmp_0      = (register_0 >> 12) & ((1ULL << 6) - 1);
+	out[2]     = tmp_0;
+	tmp_0      = (register_0 >> 18) & ((1ULL << 6) - 1);
+	out[3]     = tmp_0;
+	tmp_0      = (register_0 >> 24) & ((1ULL << 6) - 1);
+	out[4]     = tmp_0;
+	tmp_0      = (register_0 >> 30) & ((1ULL << 2) - 1); // 2 low bits of out[5]; the other 4 come from the next word
+	register_0 = *(in + (0 * 32) + (i * 1) + 32);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 2;
+	out[5]     = tmp_0;
+	tmp_0      = (register_0 >> 4) & ((1ULL << 6) - 1);
+	out[6]     = tmp_0;
+	tmp_0      = (register_0 >> 10) & ((1ULL << 6) - 1);
+	out[7]     = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 6) - 1);
+	out[8]     = tmp_0;
+	tmp_0      = (register_0 >> 22) & ((1ULL << 6) - 1);
+	out[9]     = tmp_0;
+	tmp_0      = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 64);
+	tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 4;
+	out[10]    = tmp_0;
+	tmp_0      = (register_0 >> 2) & ((1ULL << 6) - 1);
+	out[11]    = tmp_0;
+	tmp_0      = (register_0 >> 8) & ((1ULL << 6) - 1);
+	out[12]    = tmp_0;
+	tmp_0      = (register_0 >> 14) & ((1ULL << 6) - 1);
+	out[13]    = tmp_0;
+	tmp_0      = (register_0 >> 20) & ((1ULL << 6) - 1);
+	out[14]    = tmp_0;
+	tmp_0      = (register_0 >> 26) & ((1ULL << 6) - 1);
+	out[15]    = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 96);
+	tmp_0      = (register_0) & ((1ULL << 6) - 1);
+	out[16]    = tmp_0;
+	tmp_0      = (register_0 >> 6) & ((1ULL << 6) - 1);
+	out[17]    = tmp_0;
+	tmp_0      = (register_0 >> 12) & ((1ULL << 6) - 1);
+	out[18]    = tmp_0;
+	tmp_0      = (register_0 >> 18) & ((1ULL << 6) - 1);
+	out[19]    = tmp_0;
+	tmp_0      = (register_0 >> 24) & ((1ULL << 6) - 1);
+	out[20]    = tmp_0;
+	tmp_0      = (register_0 >> 30) & ((1ULL << 2) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 128);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 2;
+	out[21]    = tmp_0;
+	tmp_0      = (register_0 >> 4) & ((1ULL << 6) - 1);
+	out[22]    = tmp_0;
+	tmp_0      = (register_0 >> 10) & ((1ULL << 6) - 1);
+	out[23]    = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 6) - 1);
+	out[24]    = tmp_0;
+	tmp_0      = (register_0 >> 22) & ((1ULL << 6) - 1);
+	out[25]    = tmp_0;
+	tmp_0      = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 160);
+	tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 4;
+	out[26] = tmp_0;
+	tmp_0   = (register_0 >> 2) & ((1ULL << 6) - 1);
+	out[27] = tmp_0;
+	tmp_0   = (register_0 >> 8) & ((1ULL << 6) - 1);
+	out[28] = tmp_0;
+	tmp_0   = (register_0 >> 14) & ((1ULL << 6) - 1);
+	out[29] = tmp_0;
+	tmp_0   = (register_0 >> 20) & ((1ULL << 6) - 1);
+	out[30] = tmp_0;
+	tmp_0   = (register_0 >> 26) & ((1ULL << 6) - 1);
+	out[31] = tmp_0;
+}
+
+// Dispatches to the unpacker generated for bit width `bw`.
+// Only 4, 6, 16 and 24 have a branch; each of those unpackers fills a_out_p[0..31]
+// with the 32 values belonging to the calling thread.
+// For any other `bw` no branch is taken and a_out_p is left unwritten.
+__device__ __forceinline__ void unpack(const uint32_t* __restrict a_in_p, uint16_t* __restrict a_out_p, uint8_t bw) {
+	if (bw == 24) {
+		unpack_24bw_32ow_32crw_1uf(a_in_p, a_out_p);
+	} else if (bw == 4) {
+		unpack_4bw_32ow_32crw_1uf(a_in_p, a_out_p);
+	} else if (bw == 16) {
+		unpack_16bw_32ow_32crw_1uf(a_in_p, a_out_p);
+	} else if (bw == 6) {
+		unpack_6bw_32ow_32crw_1uf(a_in_p, a_out_p);
+	}
+	// deliberately no final else: an unsupported bw is ignored without any diagnostic
+}
+
+// Despite the name this only stores: registers[k] is written to out[i + k * BLOCK_THREADS] for k < ITEMS_PER_THREAD.
+template <typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void load_registers(int i, T* out, T* registers) {
+#pragma unroll
+	for (int item = 0; item < ITEMS_PER_THREAD; ++item) {
+		out[i + item * BLOCK_THREADS] = registers[item]; // same item of consecutive i lands in adjacent slots
+	}
+}
+
+// Kernel: block b decodes tile b. The input advances by 32 * bw words per block, the output by 1024 values.
+// Each thread unpacks 32 values and stores value k at out[k * 32 + threadIdx.x] of its block's tile.
+// NOTE(review): the <uint16_t, 32, 32> store layout presumably expects 32 threads per block - confirm at the launch site.
+__global__ void unpack_global(const uint32_t* __restrict in, uint16_t* __restrict out, uint8_t bw) {
+	const int trd_idx = threadIdx.x;
+	const int blc_idx = blockIdx.x;
+	uint16_t  registers[32] = {}; // zero-filled: unpack() writes nothing for a bw it has no case for
+	unpack(in + ((blc_idx * bw) << 5), registers, bw);
+	load_registers<uint16_t, 32, 32>(trd_idx, out + (blc_idx << 10), registers);
+}
+
+__device__ __forceinline__ void unpack_device(const uint32_t* __restrict in, uint16_t* __restrict out, uint8_t bw) {
+	unpack(in, out, bw); // plain alias: all three arguments are forwarded unchanged to unpack()
+}
+
+__device__ __forceinline__ void unpack_device(const int32_t* __restrict in, uint16_t* __restrict out, uint8_t bw) {
+	unpack(reinterpret_cast<const uint32_t*>(in), reinterpret_cast<uint16_t*>(out), bw); // signed input viewed as raw 32-bit words; the cast on out is a no-op (already uint16_t*)
+}
+} // namespace hardcoded_16
+
+namespace unpack_8_at_a_time {
+// Decodes this thread's eight 20-bit fields from 5 words, in[threadIdx.x + 32 * k] for k = 0..4, into out[0..7].
+// Marked __forceinline__ like the other unpackers here: a plain __device__ definition in a header is not inline.
+__device__ __forceinline__ void unpack_20bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p,
+                                                           uint32_t* __restrict a_out_p) {
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p);
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p);
+	[[maybe_unused]] uint32_t register_0;
+	[[maybe_unused]] uint32_t tmp_0;
+	int i = threadIdx.x; // lane: selects this thread's word in every 32-word row (not reduced modulo 32 here)
+	register_0 = *(in + (0 * 32) + (i * 1) + 0);
+	tmp_0      = (register_0) & ((1ULL << 20) - 1);
+	out[0]     = tmp_0;
+	tmp_0      = (register_0 >> 20) & ((1ULL << 12) - 1); // field straddles two words: 12 low bits here, 8 from the next
+	register_0 = *(in + (0 * 32) + (i * 1) + 32);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 12;
+	out[1]     = tmp_0;
+	tmp_0      = (register_0 >> 8) & ((1ULL << 20) - 1);
+	out[2]     = tmp_0;
+	tmp_0      = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 64);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 4;
+	out[3]     = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 96);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 16;
+	out[4]     = tmp_0;
+	tmp_0      = (register_0 >> 4) & ((1ULL << 20) - 1);
+	out[5]     = tmp_0;
+	tmp_0      = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 128);
+	tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 8;
+	out[6] = tmp_0;
+	tmp_0  = (register_0 >> 12) & ((1ULL << 20) - 1);
+	out[7] = tmp_0;
+}
+
+__device__ __forceinline__ void unpack_16bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p,
+                                                           uint32_t* __restrict a_out_p) {
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p);
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p);
+	[[maybe_unused]] uint32_t register_0;
+	[[maybe_unused]] uint32_t tmp_0;
+	[[maybe_unused]] uint32_t base_0 = 0ULL;
+	// Each thread reads 4 words, in[threadIdx.x + 32 * k] for k = 0..3, and writes 8 values to out[0..7] as uint32_t.
+	int i = threadIdx.x; // lane: selects this thread's word in every 32-word row (not reduced modulo 32 here)
+	// Low half of word k goes to out[2 * k], high half to out[2 * k + 1]; base_0 is never used.
+	register_0 = *(in + (0 * 32) + (i * 1) + 0);
+	tmp_0      = (register_0) & ((1ULL << 16) - 1);
+	out[0]     = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	out[1]     = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 32);
+	tmp_0      = (register_0) & ((1ULL << 16) - 1);
+	out[2]     = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	out[3]     = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 64);
+	tmp_0      = (register_0) & ((1ULL << 16) - 1);
+	out[4]     = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	out[5]     = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 96);
+	tmp_0      = (register_0) & ((1ULL << 16) - 1);
+	out[6]     = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	out[7]     = tmp_0;
+}
+
+__device__ __forceinline__ void unpack_24bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, // packed input words
+                                                           uint32_t* __restrict a_out_p) { // receives out[0..31]
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p);
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p);
+	[[maybe_unused]] uint32_t register_0; // most recently loaded packed word
+	[[maybe_unused]] uint32_t tmp_0; // value being assembled for the next out[] slot
+	[[maybe_unused]] uint32_t base_0 = 0ULL; // not referenced again in this function
+	// Decodes 32 values of 24 bits each: reads the 24 words in[i + 32 * k], k = 0..23, and writes out[0..31].
+	int i = threadIdx.x; // THREAD INDEX
+	// Every 3 input words yield 4 outputs; a value that straddles two words is stitched from both of them.
+	register_0 = *(in + (0 * 32) + (i * 1) + 0); // word 0
+	tmp_0      = (register_0) & ((1ULL << 24) - 1);
+	out[0]     = tmp_0;
+	tmp_0      = (register_0 >> 24) & ((1ULL << 8) - 1); // top 8 bits of word 0 start out[1]
+	register_0 = *(in + (0 * 32) + (i * 1) + 32);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8; // low 16 bits of word 1 complete out[1]
+	out[1]     = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1); // top 16 bits of word 1 start out[2]
+	register_0 = *(in + (0 * 32) + (i * 1) + 64);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16; // low 8 bits of word 2 complete out[2]
+	out[2]     = tmp_0;
+	tmp_0      = (register_0 >> 8) & ((1ULL << 24) - 1); // remaining 24 bits of word 2 are out[3]
+	out[3]     = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 96); // the same 3-word / 4-value layout repeats from here
+	tmp_0      = (register_0) & ((1ULL << 24) - 1);
+	out[4]     = tmp_0;
+	tmp_0      = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 128);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8;
+	out[5]     = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 160);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16;
+	out[6]     = tmp_0;
+	tmp_0      = (register_0 >> 8) & ((1ULL << 24) - 1);
+	out[7]     = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 192);
+	tmp_0      = (register_0) & ((1ULL << 24) - 1);
+	out[8]     = tmp_0; // NOTE(review): the 0/4/8/12-bit decoders next to this one stop at out[7] - confirm 32 outputs is intended
+	tmp_0      = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 224);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8;
+	out[9]     = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 256);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16;
+	out[10]    = tmp_0;
+	tmp_0      = (register_0 >> 8) & ((1ULL << 24) - 1);
+	out[11]    = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 288);
+	tmp_0      = (register_0) & ((1ULL << 24) - 1);
+	out[12]    = tmp_0;
+	tmp_0      = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 320);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8;
+	out[13]    = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 352);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16;
+	out[14]    = tmp_0;
+	tmp_0      = (register_0 >> 8) & ((1ULL << 24) - 1);
+	out[15]    = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 384);
+	tmp_0      = (register_0) & ((1ULL << 24) - 1);
+	out[16]    = tmp_0;
+	tmp_0      = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 416);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8;
+	out[17]    = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 448);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16;
+	out[18]    = tmp_0;
+	tmp_0      = (register_0 >> 8) & ((1ULL << 24) - 1);
+	out[19]    = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 480);
+	tmp_0      = (register_0) & ((1ULL << 24) - 1);
+	out[20]    = tmp_0;
+	tmp_0      = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 512);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8;
+	out[21]    = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 544);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16;
+	out[22]    = tmp_0;
+	tmp_0      = (register_0 >> 8) & ((1ULL << 24) - 1);
+	out[23]    = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 576);
+	tmp_0      = (register_0) & ((1ULL << 24) - 1);
+	out[24]    = tmp_0;
+	tmp_0      = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 608);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8;
+	out[25]    = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 640);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16;
+	out[26]    = tmp_0;
+	tmp_0      = (register_0 >> 8) & ((1ULL << 24) - 1);
+	out[27]    = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 672);
+	tmp_0      = (register_0) & ((1ULL << 24) - 1);
+	out[28]    = tmp_0;
+	tmp_0      = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 704);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8;
+	out[29]    = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 736);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16;
+	out[30] = tmp_0;
+	tmp_0   = (register_0 >> 8) & ((1ULL << 24) - 1);
+	out[31] = tmp_0;
+}
+
+__device__ __forceinline__ void unpack_0bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p,
+                                                          uint32_t* __restrict a_out_p) {
+	// Bit width 0 carries no payload: the packed input is never read and the
+	// first eight output slots are all set to the base value, which is 0.
+	//
+	//   a_in_p  - accepted so the signature matches the other decoders; unused.
+	//   a_out_p - destination; a_out_p[0..7] are overwritten.
+	//
+	// No thread index is involved: every caller writes the same eight slots.
+	(void)a_in_p;
+	constexpr uint32_t kBase = 0;
+
+#pragma unroll
+	for (int slot = 0; slot < 8; ++slot) {
+		a_out_p[slot] = kBase;
+	}
+}
+
+__device__ __forceinline__ void unpack_8bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p,
+                                                          uint32_t* __restrict a_out_p) {
+	// Decodes eight 8-bit values for the calling thread.
+	//
+	// Two packed words are read, at a_in_p[threadIdx.x] and at
+	// a_in_p[threadIdx.x + 32]. Each word holds four values, lowest byte
+	// first, and every value is zero-extended to 32 bits on output.
+	//
+	//   a_in_p  - packed input words.
+	//   a_out_p - destination; a_out_p[0..7] are overwritten.
+	//
+	// The second word is loaded only after the first four values are stored.
+	constexpr uint32_t kByteMask = 0xFFu;
+	const int          lane      = threadIdx.x;
+
+	// First word -> values 0..3.
+	const uint32_t first_word = a_in_p[lane];
+	a_out_p[0]                = first_word & kByteMask;
+	a_out_p[1]                = (first_word >> 8) & kByteMask;
+	a_out_p[2]                = (first_word >> 16) & kByteMask;
+	a_out_p[3]                = (first_word >> 24) & kByteMask;
+
+	// Second word (32 words further on) -> values 4..7.
+	const uint32_t second_word = a_in_p[lane + 32];
+	a_out_p[4]                 = second_word & kByteMask;
+	a_out_p[5]                 = (second_word >> 8) & kByteMask;
+	a_out_p[6]                 = (second_word >> 16) & kByteMask;
+	a_out_p[7]                 = (second_word >> 24) & kByteMask;
+}
+
+__device__ __forceinline__ void unpack_4bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p,
+                                                          uint32_t* __restrict a_out_p) {
+	// Decodes eight 4-bit values for the calling thread.
+	//
+	// A single packed word, a_in_p[threadIdx.x], is read. Value k occupies
+	// bits [4k, 4k + 4) of that word and is zero-extended to 32 bits.
+	//
+	//   a_in_p  - packed input words.
+	//   a_out_p - destination; a_out_p[0..7] are overwritten.
+	constexpr uint32_t kNibbleMask = 0xFu;
+	const int          lane        = threadIdx.x;
+	const uint32_t     word        = a_in_p[lane];
+
+	// Lower half of the word.
+	a_out_p[0] = word & kNibbleMask;
+	a_out_p[1] = (word >> 4) & kNibbleMask;
+	a_out_p[2] = (word >> 8) & kNibbleMask;
+	a_out_p[3] = (word >> 12) & kNibbleMask;
+
+	// Upper half of the word.
+	a_out_p[4] = (word >> 16) & kNibbleMask;
+	a_out_p[5] = (word >> 20) & kNibbleMask;
+	a_out_p[6] = (word >> 24) & kNibbleMask;
+	a_out_p[7] = (word >> 28) & kNibbleMask;
+	// Eight nibbles exhaust the 32-bit word, so no second load is needed.
+	// The 12-bit and 24-bit decoders next to this one show the stitching
+	// that becomes necessary when a value spans two words.
+}
+
+inline __device__ void unpack_12bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) {
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p);
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p);
+	[[maybe_unused]] uint32_t register_0; // most recently loaded packed word
+	[[maybe_unused]] uint32_t tmp_0; // value being assembled for the next out[] slot
+	[[maybe_unused]] uint32_t base_0 = 0ULL; // not referenced again in this function
+	// Decodes 8 values of 12 bits each: reads the 3 words in[i], in[i + 32], in[i + 64] and writes out[0..7].
+	int i = threadIdx.x; // THREAD INDEX
+	// out[2] and out[5] straddle a word boundary and are stitched from two consecutive loads.
+	register_0 = *(in + (0 * 32) + (i * 1) + 0);
+	tmp_0      = (register_0) & ((1ULL << 12) - 1);
+	out[0]     = tmp_0;
+	tmp_0      = (register_0 >> 12) & ((1ULL << 12) - 1);
+	out[1]     = tmp_0;
+	tmp_0      = (register_0 >> 24) & ((1ULL << 8) - 1); // top 8 bits of word 0 start out[2]
+	register_0 = *(in + (0 * 32) + (i * 1) + 32);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 8; // low 4 bits of word 1 complete out[2]
+	out[2]     = tmp_0;
+	tmp_0      = (register_0 >> 4) & ((1ULL << 12) - 1);
+	out[3]     = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 12) - 1);
+	out[4]     = tmp_0;
+	tmp_0      = (register_0 >> 28) & ((1ULL << 4) - 1); // top 4 bits of word 1 start out[5]
+	register_0 = *(in + (0 * 32) + (i * 1) + 64);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 4; // low 8 bits of word 2 complete out[5]
+	out[5] = tmp_0;
+	tmp_0  = (register_0 >> 8) & ((1ULL << 12) - 1);
+	out[6] = tmp_0;
+	tmp_0  = (register_0 >> 20) & ((1ULL << 12) - 1);
+	out[7] = tmp_0;
+}
+
+// Selects the decoder that matches `bw` and runs it on (a_in_p, a_out_p).
+//
+// Widths 0, 4, 8, 12, 16, 20 and 24 each have a dedicated decoder. Any other
+// width prints a diagnostic from the device and then executes the PTX `trap`
+// instruction.
+//
+__device__ __forceinline__ void unpack(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p, uint8_t bw) {
+	switch (bw) {
+	case 0:
+		return unpack_0bw_32ow_32crw_1uf(a_in_p, a_out_p);
+	case 4:
+		return unpack_4bw_32ow_32crw_1uf(a_in_p, a_out_p);
+	case 8:
+		return unpack_8bw_32ow_32crw_1uf(a_in_p, a_out_p);
+	case 12:
+		return unpack_12bw_32ow_32crw_1uf(a_in_p, a_out_p);
+	case 16:
+		return unpack_16bw_32ow_32crw_1uf(a_in_p, a_out_p);
+	case 20:
+		return unpack_20bw_32ow_32crw_1uf(a_in_p, a_out_p);
+	case 24:
+		return unpack_24bw_32ow_32crw_1uf(a_in_p, a_out_p);
+	default:
+		// No decoder exists for this width.
+		printf("implement this bw! %u\n", bw);
+		asm("trap;");
+	}
+}
+
+template <typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void load_registers(int i, T* out, T* registers) {
+	// Copies registers[0..ITEMS_PER_THREAD) to out[i], out[i + BLOCK_THREADS], ...
+	// i.e. item k of the caller lands at out[k * BLOCK_THREADS + i].
+	T* const first_slot = out + i;
+#pragma unroll
+	for (int item = 0; item < ITEMS_PER_THREAD; ++item) { first_slot[item * BLOCK_THREADS] = registers[item]; }
+}
+
+// Kernel: block b starts reading at in + b * bw * 32 words and stores 32 words per thread starting at out + b * 1024.
+__global__ void unpack_global(const uint32_t* __restrict in, uint32_t* __restrict out, uint8_t bw) {
+	// Offsets are computed in 64 bits: as 32-bit int, (blc_idx * bw) << 5 and blc_idx << 10 overflow on large grids.
+	const uint64_t blc_idx = blockIdx.x;
+	in += (blc_idx * bw) << 5;
+	out += blc_idx << 10;
+	uint32_t registers[32] = {}; // zero-filled: several decoders here write only slots 0..7, yet all 32 are copied out
+	unpack(in, registers, bw);   // NOTE(review): confirm whether 32 items per thread is intended for those widths
+	load_registers<uint32_t, 32, 32>(static_cast<int>(threadIdx.x), out, registers);
+}
+
+__device__ __forceinline__ void unpack_device(const uint32_t* __restrict in, uint32_t* __restrict out, uint8_t bw) {
+	unpack(in, out, bw); // device-side entry point: forwards unchanged to the bit-width dispatcher
+}
+// Signed overload: reinterprets both buffers as uint32_t and forwards to the same dispatcher.
+__device__ __forceinline__ void unpack_device(const int32_t* __restrict in, int32_t* __restrict out, uint8_t bw) {
+	unpack(reinterpret_cast<const uint32_t*>(in), reinterpret_cast<uint32_t*>(out), bw);
+}
+} // namespace unpack_8_at_a_time
+
+namespace hardcoded {
+
+__device__ __forceinline__ void unpack_16bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, // packed input words
+                                                           uint32_t* __restrict a_out_p) { // receives out[0..31]
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p);
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p);
+	[[maybe_unused]] uint32_t register_0; // most recently loaded packed word
+	[[maybe_unused]] uint32_t tmp_0; // value about to be stored to out[]
+	[[maybe_unused]] uint32_t base_0 = 0ULL; // not referenced again in this function
+	// Decodes 32 values of 16 bits each: reads the 16 words in[i + 32 * k], k = 0..15, and writes out[0..31].
+	int i = threadIdx.x; // THREAD INDEX
+	// Word k yields out[2k] from its low half and out[2k + 1] from its high half; no value spans two words.
+	register_0 = *(in + (0 * 32) + (i * 1) + 0);
+	tmp_0      = (register_0) & ((1ULL << 16) - 1);
+	out[0]     = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	out[1]     = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 32);
+	tmp_0      = (register_0) & ((1ULL << 16) - 1);
+	out[2]     = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	out[3]     = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 64);
+	tmp_0      = (register_0) & ((1ULL << 16) - 1);
+	out[4]     = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	out[5]     = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 96);
+	tmp_0      = (register_0) & ((1ULL << 16) - 1);
+	out[6]     = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	out[7]     = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 128);
+	tmp_0      = (register_0) & ((1ULL << 16) - 1);
+	out[8]     = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	out[9]     = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 160);
+	tmp_0      = (register_0) & ((1ULL << 16) - 1);
+	out[10]    = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	out[11]    = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 192);
+	tmp_0      = (register_0) & ((1ULL << 16) - 1);
+	out[12]    = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	out[13]    = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 224);
+	tmp_0      = (register_0) & ((1ULL << 16) - 1);
+	out[14]    = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	out[15]    = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 256);
+	tmp_0      = (register_0) & ((1ULL << 16) - 1);
+	out[16]    = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	out[17]    = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 288);
+	tmp_0      = (register_0) & ((1ULL << 16) - 1);
+	out[18]    = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	out[19]    = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 320);
+	tmp_0      = (register_0) & ((1ULL << 16) - 1);
+	out[20]    = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	out[21]    = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 352);
+	tmp_0      = (register_0) & ((1ULL << 16) - 1);
+	out[22]    = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	out[23]    = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 384);
+	tmp_0      = (register_0) & ((1ULL << 16) - 1);
+	out[24]    = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	out[25]    = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 416);
+	tmp_0      = (register_0) & ((1ULL << 16) - 1);
+	out[26]    = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	out[27]    = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 448);
+	tmp_0      = (register_0) & ((1ULL << 16) - 1);
+	out[28]    = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	out[29]    = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 480);
+	tmp_0      = (register_0) & ((1ULL << 16) - 1);
+	out[30]    = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	out[31]    = tmp_0;
+}
+__device__ __forceinline__ void unpack_4bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, // packed input words
+                                                          uint32_t* __restrict a_out_p) { // receives out[0..31]
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p);
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p);
+	[[maybe_unused]] uint32_t register_0; // most recently loaded packed word
+	[[maybe_unused]] uint32_t tmp_0; // value about to be stored to out[]
+	[[maybe_unused]] uint32_t base_0 = 0ULL; // not referenced again in this function
+	// Decodes 32 values of 4 bits each: reads the 4 words in[i + 32 * k], k = 0..3, and writes out[0..31].
+	int i = threadIdx.x; // THREAD INDEX
+	// Word k yields out[8k .. 8k + 7], lowest nibble first; no value spans two words.
+	register_0 = *(in + (0 * 32) + (i * 1) + 0);
+	tmp_0      = (register_0) & ((1ULL << 4) - 1);
+	out[0]     = tmp_0;
+	tmp_0      = (register_0 >> 4) & ((1ULL << 4) - 1);
+	out[1]     = tmp_0;
+	tmp_0      = (register_0 >> 8) & ((1ULL << 4) - 1);
+	out[2]     = tmp_0;
+	tmp_0      = (register_0 >> 12) & ((1ULL << 4) - 1);
+	out[3]     = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 4) - 1);
+	out[4]     = tmp_0;
+	tmp_0      = (register_0 >> 20) & ((1ULL << 4) - 1);
+	out[5]     = tmp_0;
+	tmp_0      = (register_0 >> 24) & ((1ULL << 4) - 1);
+	out[6]     = tmp_0;
+	tmp_0      = (register_0 >> 28) & ((1ULL << 4) - 1);
+	out[7]     = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 32);
+	tmp_0      = (register_0) & ((1ULL << 4) - 1);
+	out[8]     = tmp_0;
+	tmp_0      = (register_0 >> 4) & ((1ULL << 4) - 1);
+	out[9]     = tmp_0;
+	tmp_0      = (register_0 >> 8) & ((1ULL << 4) - 1);
+	out[10]    = tmp_0;
+	tmp_0      = (register_0 >> 12) & ((1ULL << 4) - 1);
+	out[11]    = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 4) - 1);
+	out[12]    = tmp_0;
+	tmp_0      = (register_0 >> 20) & ((1ULL << 4) - 1);
+	out[13]    = tmp_0;
+	tmp_0      = (register_0 >> 24) & ((1ULL << 4) - 1);
+	out[14]    = tmp_0;
+	tmp_0      = (register_0 >> 28) & ((1ULL << 4) - 1);
+	out[15]    = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 64);
+	tmp_0      = (register_0) & ((1ULL << 4) - 1);
+	out[16]    = tmp_0;
+	tmp_0      = (register_0 >> 4) & ((1ULL << 4) - 1);
+	out[17]    = tmp_0;
+	tmp_0      = (register_0 >> 8) & ((1ULL << 4) - 1);
+	out[18]    = tmp_0;
+	tmp_0      = (register_0 >> 12) & ((1ULL << 4) - 1);
+	out[19]    = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 4) - 1);
+	out[20]    = tmp_0;
+	tmp_0      = (register_0 >> 20) & ((1ULL << 4) - 1);
+	out[21]    = tmp_0;
+	tmp_0      = (register_0 >> 24) & ((1ULL << 4) - 1);
+	out[22]    = tmp_0;
+	tmp_0      = (register_0 >> 28) & ((1ULL << 4) - 1);
+	out[23]    = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 96);
+	tmp_0      = (register_0) & ((1ULL << 4) - 1);
+	out[24]    = tmp_0;
+	tmp_0      = (register_0 >> 4) & ((1ULL << 4) - 1);
+	out[25]    = tmp_0;
+	tmp_0      = (register_0 >> 8) & ((1ULL << 4) - 1);
+	out[26]    = tmp_0;
+	tmp_0      = (register_0 >> 12) & ((1ULL << 4) - 1);
+	out[27]    = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 4) - 1);
+	out[28]    = tmp_0;
+	tmp_0      = (register_0 >> 20) & ((1ULL << 4) - 1);
+	out[29]    = tmp_0;
+	tmp_0      = (register_0 >> 24) & ((1ULL << 4) - 1);
+	out[30]    = tmp_0;
+	tmp_0      = (register_0 >> 28) & ((1ULL << 4) - 1);
+	out[31]    = tmp_0;
+}
+__device__ __forceinline__ void unpack_24bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, // packed input words
+                                                           uint32_t* __restrict a_out_p) { // receives out[0..31]
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p);
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p);
+	[[maybe_unused]] uint32_t register_0; // most recently loaded packed word
+	[[maybe_unused]] uint32_t tmp_0; // value being assembled for the next out[] slot
+	[[maybe_unused]] uint32_t base_0 = 0ULL; // not referenced again in this function
+	// Decodes 32 values of 24 bits each: reads the 24 words in[i + 32 * k], k = 0..23, and writes out[0..31].
+	int i = threadIdx.x; // THREAD INDEX
+	// Every 3 input words yield 4 outputs; a value that straddles two words is stitched from both of them.
+	register_0 = *(in + (0 * 32) + (i * 1) + 0); // word 0
+	tmp_0      = (register_0) & ((1ULL << 24) - 1);
+	out[0]     = tmp_0;
+	tmp_0      = (register_0 >> 24) & ((1ULL << 8) - 1); // top 8 bits of word 0 start out[1]
+	register_0 = *(in + (0 * 32) + (i * 1) + 32);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8; // low 16 bits of word 1 complete out[1]
+	out[1]     = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1); // top 16 bits of word 1 start out[2]
+	register_0 = *(in + (0 * 32) + (i * 1) + 64);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16; // low 8 bits of word 2 complete out[2]
+	out[2]     = tmp_0;
+	tmp_0      = (register_0 >> 8) & ((1ULL << 24) - 1); // remaining 24 bits of word 2 are out[3]
+	out[3]     = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 96); // the same 3-word / 4-value layout repeats from here
+	tmp_0      = (register_0) & ((1ULL << 24) - 1);
+	out[4]     = tmp_0;
+	tmp_0      = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 128);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8;
+	out[5]     = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 160);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16;
+	out[6]     = tmp_0;
+	tmp_0      = (register_0 >> 8) & ((1ULL << 24) - 1);
+	out[7]     = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 192);
+	tmp_0      = (register_0) & ((1ULL << 24) - 1);
+	out[8]     = tmp_0;
+	tmp_0      = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 224);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8;
+	out[9]     = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 256);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16;
+	out[10]    = tmp_0;
+	tmp_0      = (register_0 >> 8) & ((1ULL << 24) - 1);
+	out[11]    = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 288);
+	tmp_0      = (register_0) & ((1ULL << 24) - 1);
+	out[12]    = tmp_0;
+	tmp_0      = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 320);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8;
+	out[13]    = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 352);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16;
+	out[14]    = tmp_0;
+	tmp_0      = (register_0 >> 8) & ((1ULL << 24) - 1);
+	out[15]    = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 384);
+	tmp_0      = (register_0) & ((1ULL << 24) - 1);
+	out[16]    = tmp_0;
+	tmp_0      = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 416);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8;
+	out[17]    = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 448);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16;
+	out[18]    = tmp_0;
+	tmp_0      = (register_0 >> 8) & ((1ULL << 24) - 1);
+	out[19]    = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 480);
+	tmp_0      = (register_0) & ((1ULL << 24) - 1);
+	out[20]    = tmp_0;
+	tmp_0      = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 512);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8;
+	out[21]    = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 544);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16;
+	out[22]    = tmp_0;
+	tmp_0      = (register_0 >> 8) & ((1ULL << 24) - 1);
+	out[23]    = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 576);
+	tmp_0      = (register_0) & ((1ULL << 24) - 1);
+	out[24]    = tmp_0;
+	tmp_0      = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 608);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8;
+	out[25]    = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 640);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16;
+	out[26]    = tmp_0;
+	tmp_0      = (register_0 >> 8) & ((1ULL << 24) - 1);
+	out[27]    = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 672);
+	tmp_0      = (register_0) & ((1ULL << 24) - 1);
+	out[28]    = tmp_0;
+	tmp_0      = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 704);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8;
+	out[29]    = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 736);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16;
+	out[30] = tmp_0;
+	tmp_0   = (register_0 >> 8) & ((1ULL << 24) - 1);
+	out[31] = tmp_0;
+}
+
+__device__ __forceinline__ void unpack_6bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, // packed input words
+                                                          uint32_t* __restrict a_out_p) { // receives out[0..31]
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p);
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p);
+	[[maybe_unused]] uint32_t register_0; // most recently loaded packed word
+	[[maybe_unused]] uint32_t tmp_0; // value being assembled for the next out[] slot
+	[[maybe_unused]] uint32_t base_0 = 0ULL; // not referenced again in this function
+	// Decodes 32 values of 6 bits each: reads the 6 words in[i + 32 * k], k = 0..5, and writes out[0..31].
+	int i = threadIdx.x; // THREAD INDEX
+	// Every 3 input words yield 16 outputs; out[5], out[10], out[21] and out[26] are stitched from two words.
+	register_0 = *(in + (0 * 32) + (i * 1) + 0);
+	tmp_0      = (register_0) & ((1ULL << 6) - 1);
+	out[0]     = tmp_0;
+	tmp_0      = (register_0 >> 6) & ((1ULL << 6) - 1);
+	out[1]     = tmp_0;
+	tmp_0      = (register_0 >> 12) & ((1ULL << 6) - 1);
+	out[2]     = tmp_0;
+	tmp_0      = (register_0 >> 18) & ((1ULL << 6) - 1);
+	out[3]     = tmp_0;
+	tmp_0      = (register_0 >> 24) & ((1ULL << 6) - 1);
+	out[4]     = tmp_0;
+	tmp_0      = (register_0 >> 30) & ((1ULL << 2) - 1); // top 2 bits of word 0 start out[5]
+	register_0 = *(in + (0 * 32) + (i * 1) + 32);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 2; // low 4 bits of word 1 complete out[5]
+	out[5]     = tmp_0;
+	tmp_0      = (register_0 >> 4) & ((1ULL << 6) - 1);
+	out[6]     = tmp_0;
+	tmp_0      = (register_0 >> 10) & ((1ULL << 6) - 1);
+	out[7]     = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 6) - 1);
+	out[8]     = tmp_0;
+	tmp_0      = (register_0 >> 22) & ((1ULL << 6) - 1);
+	out[9]     = tmp_0;
+	tmp_0      = (register_0 >> 28) & ((1ULL << 4) - 1); // top 4 bits of word 1 start out[10]
+	register_0 = *(in + (0 * 32) + (i * 1) + 64);
+	tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 4; // low 2 bits of word 2 complete out[10]
+	out[10]    = tmp_0;
+	tmp_0      = (register_0 >> 2) & ((1ULL << 6) - 1);
+	out[11]    = tmp_0;
+	tmp_0      = (register_0 >> 8) & ((1ULL << 6) - 1);
+	out[12]    = tmp_0;
+	tmp_0      = (register_0 >> 14) & ((1ULL << 6) - 1);
+	out[13]    = tmp_0;
+	tmp_0      = (register_0 >> 20) & ((1ULL << 6) - 1);
+	out[14]    = tmp_0;
+	tmp_0      = (register_0 >> 26) & ((1ULL << 6) - 1);
+	out[15]    = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 96); // words 3..5 repeat the same layout for out[16..31]
+	tmp_0      = (register_0) & ((1ULL << 6) - 1);
+	out[16]    = tmp_0;
+	tmp_0      = (register_0 >> 6) & ((1ULL << 6) - 1);
+	out[17]    = tmp_0;
+	tmp_0      = (register_0 >> 12) & ((1ULL << 6) - 1);
+	out[18]    = tmp_0;
+	tmp_0      = (register_0 >> 18) & ((1ULL << 6) - 1);
+	out[19]    = tmp_0;
+	tmp_0      = (register_0 >> 24) & ((1ULL << 6) - 1);
+	out[20]    = tmp_0;
+	tmp_0      = (register_0 >> 30) & ((1ULL << 2) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 128);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 2;
+	out[21]    = tmp_0;
+	tmp_0      = (register_0 >> 4) & ((1ULL << 6) - 1);
+	out[22]    = tmp_0;
+	tmp_0      = (register_0 >> 10) & ((1ULL << 6) - 1);
+	out[23]    = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 6) - 1);
+	out[24]    = tmp_0;
+	tmp_0      = (register_0 >> 22) & ((1ULL << 6) - 1);
+	out[25]    = tmp_0;
+	tmp_0      = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 160);
+	tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 4;
+	out[26] = tmp_0;
+	tmp_0   = (register_0 >> 2) & ((1ULL << 6) - 1);
+	out[27] = tmp_0;
+	tmp_0   = (register_0 >> 8) & ((1ULL << 6) - 1);
+	out[28] = tmp_0;
+	tmp_0   = (register_0 >> 14) & ((1ULL << 6) - 1);
+	out[29] = tmp_0;
+	tmp_0   = (register_0 >> 20) & ((1ULL << 6) - 1);
+	out[30] = tmp_0;
+	tmp_0   = (register_0 >> 26) & ((1ULL << 6) - 1);
+	out[31] = tmp_0;
+}
+
+// Selects the decoder that matches `bw` and runs it on (a_in_p, a_out_p).
+// Only widths 4, 6, 16 and 24 have a decoder in this namespace. Any other
+// width is reported from the device and then trapped, rather than returning
+// with a_out_p unwritten (same policy as unpack_8_at_a_time::unpack).
+__device__ __forceinline__ void unpack(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p, uint8_t bw) {
+	switch (bw) {
+	case 4: unpack_4bw_32ow_32crw_1uf(a_in_p, a_out_p); break;
+	case 6: unpack_6bw_32ow_32crw_1uf(a_in_p, a_out_p); break;
+	case 16: unpack_16bw_32ow_32crw_1uf(a_in_p, a_out_p); break;
+	case 24: unpack_24bw_32ow_32crw_1uf(a_in_p, a_out_p); break;
+	default:
+		// Unsupported width: fail loudly instead of producing garbage.
+		printf("implement this bw! %u\n", bw);
+		asm("trap;");
+	}
+}
+
+template <typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void load_registers(int i, T* out, T* registers) {
+	// Copies registers[0..ITEMS_PER_THREAD) to out[i], out[i + BLOCK_THREADS], ...
+	// i.e. item k of the caller lands at out[k * BLOCK_THREADS + i].
+	T* const first_slot = out + i;
+#pragma unroll
+	for (int item = 0; item < ITEMS_PER_THREAD; ++item) { first_slot[item * BLOCK_THREADS] = registers[item]; }
+}
+
+// Kernel: block b starts reading at in + b * bw * 32 words and stores 32 words per thread starting at out + b * 1024.
+__global__ void unpack_global(const uint32_t* __restrict in, uint32_t* __restrict out, uint8_t bw) {
+	// Offsets are computed in 64 bits: as 32-bit int, (blc_idx * bw) << 5 and blc_idx << 10 overflow on large grids.
+	const uint64_t blc_idx = blockIdx.x;
+	in += (blc_idx * bw) << 5;
+	out += blc_idx << 10;
+	uint32_t registers[32] = {}; // zero-filled so a slot the dispatcher leaves unwritten is never read indeterminate
+	unpack(in, registers, bw);
+	load_registers<uint32_t, 32, 32>(static_cast<int>(threadIdx.x), out, registers);
+}
+
+__device__ __forceinline__ void unpack_device(const uint32_t* __restrict in, uint32_t* __restrict out, uint8_t bw) {
+	unpack(in, out, bw); // device-side entry point: forwards unchanged to the bit-width dispatcher
+}
+// Signed overload: reinterprets both buffers as uint32_t and forwards to the same dispatcher.
+__device__ __forceinline__ void unpack_device(const int32_t* __restrict in, int32_t* __restrict out, uint8_t bw) {
+	unpack(reinterpret_cast<const uint32_t*>(in), reinterpret_cast<uint32_t*>(out), bw);
+}
+} // namespace hardcoded
+
+namespace NEW_IDEA {
+
+__device__ __forceinline__ void RLE_UNPACK(const uint8_t* __restrict a_in_p, uint8_t* __restrict a_out_p) {
+	// Splits each byte of one 32-bit word into its eight individual bits.
+	//
+	// Both byte buffers are accessed as arrays of uint32_t. The calling thread
+	// loads the word at index threadIdx.x and, for bit = 0..7, stores
+	//     (word >> bit) & 0x01010101
+	// to out[threadIdx.x + 32 * bit]; every byte of a stored word is thus 0 or 1.
+	//
+	//   a_in_p  - source bytes, read four at a time.
+	//   a_out_p - destination; eight words per thread, 32 words apart.
+	//
+	// NOTE(review): the reinterpret_casts assume both pointers are suitably
+	// aligned for uint32_t access - confirm at the call sites.
+	const auto*        words_in        = reinterpret_cast<const uint32_t*>(a_in_p);
+	auto*              words_out       = reinterpret_cast<uint32_t*>(a_out_p);
+	constexpr uint32_t kBit0OfEachByte = 0x01010101u;
+	const int          lane            = threadIdx.x;
+	const uint32_t     packed          = words_in[lane];
+
+	words_out[lane]       = packed & kBit0OfEachByte;
+	words_out[lane + 32]  = (packed >> 1) & kBit0OfEachByte;
+	words_out[lane + 64]  = (packed >> 2) & kBit0OfEachByte;
+	words_out[lane + 96]  = (packed >> 3) & kBit0OfEachByte;
+	words_out[lane + 128] = (packed >> 4) & kBit0OfEachByte;
+	words_out[lane + 160] = (packed >> 5) & kBit0OfEachByte;
+	words_out[lane + 192] = (packed >> 6) & kBit0OfEachByte;
+	words_out[lane + 224] = (packed >> 7) & kBit0OfEachByte;
+}
+
+__device__ __forceinline__ void SIMDIZED_RSUM_8(const uint8_t* a_in_p, uint8_t* a_out_p, const uint8_t* a_base_p) {
+	// Running sum over eight 32-bit words per lane.
+	//
+	// All three byte buffers are accessed as arrays of uint32_t. With
+	// lane = threadIdx.x % 32, the accumulator starts at base[lane] and word
+	// in[lane + 32 * k] is added for k = 0..7 in turn; after each addition the
+	// accumulator is stored to out[lane + 32 * k].
+	//
+	//   a_in_p   - values to accumulate.
+	//   a_out_p  - receives the eight partial sums of this lane.
+	//   a_base_p - starting value per lane.
+	//
+	// The additions are plain 32-bit unsigned adds, so a carry out of one byte
+	// propagates into the next byte of the same word.
+	const auto*    src   = reinterpret_cast<const uint32_t*>(a_in_p);
+	const auto*    start = reinterpret_cast<const uint32_t*>(a_base_p);
+	auto*          dst   = reinterpret_cast<uint32_t*>(a_out_p);
+	const uint32_t lane  = threadIdx.x % 32;
+
+	uint32_t running = start[lane] + src[lane];
+	dst[lane]        = running;
+	running += src[lane + 32];
+	dst[lane + 32] = running;
+	running += src[lane + 64];
+	dst[lane + 64] = running;
+	running += src[lane + 96];
+	dst[lane + 96] = running;
+	running += src[lane + 128];
+	dst[lane + 128] = running;
+	running += src[lane + 160];
+	dst[lane + 160] = running;
+	running += src[lane + 192];
+	dst[lane + 192] = running;
+	running += src[lane + 224];
+	dst[lane + 224] = running;
+}
+
+} // namespace NEW_IDEA
\ No newline at end of file
diff --git a/fastlanes/src/include/fls_gen/unpack/unpack.cuh b/fastlanes/src/include/fls_gen/unpack/unpack.cuh
new file mode 100644
index 0000000..8032106
--- /dev/null
+++ b/fastlanes/src/include/fls_gen/unpack/unpack.cuh
@@ -0,0 +1,3451 @@
+// generated!
+#pragma once
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+namespace generated { namespace unpack::cuda { namespace normal {
+inline __device__ void unpack_0bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) { // Bit width 0: the input is never read; thread i stores base_0 (zero) to out[i + 32*j] for j = 0..31.
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p);
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p); // never dereferenced in this variant
+	[[maybe_unused]] uint32_t register_0; // unused in this variant
+	[[maybe_unused]] uint32_t tmp_0; // unused in this variant
+	[[maybe_unused]] uint32_t base_0 = 0ULL; // the constant written to every output word
+
+	int i = threadIdx.x; // thread index; selects the output column this thread fills
+
+	out[(i * 1) + (0 * 32) + 0]   = base_0; // row 0; each following store is the next row, 32 words further on
+	out[(i * 1) + (0 * 32) + 32]  = base_0;
+	out[(i * 1) + (0 * 32) + 64]  = base_0;
+	out[(i * 1) + (0 * 32) + 96]  = base_0;
+	out[(i * 1) + (0 * 32) + 128] = base_0;
+	out[(i * 1) + (0 * 32) + 160] = base_0;
+	out[(i * 1) + (0 * 32) + 192] = base_0;
+	out[(i * 1) + (0 * 32) + 224] = base_0;
+	out[(i * 1) + (0 * 32) + 256] = base_0;
+	out[(i * 1) + (0 * 32) + 288] = base_0;
+	out[(i * 1) + (0 * 32) + 320] = base_0;
+	out[(i * 1) + (0 * 32) + 352] = base_0;
+	out[(i * 1) + (0 * 32) + 384] = base_0;
+	out[(i * 1) + (0 * 32) + 416] = base_0;
+	out[(i * 1) + (0 * 32) + 448] = base_0;
+	out[(i * 1) + (0 * 32) + 480] = base_0;
+	out[(i * 1) + (0 * 32) + 512] = base_0;
+	out[(i * 1) + (0 * 32) + 544] = base_0;
+	out[(i * 1) + (0 * 32) + 576] = base_0;
+	out[(i * 1) + (0 * 32) + 608] = base_0;
+	out[(i * 1) + (0 * 32) + 640] = base_0;
+	out[(i * 1) + (0 * 32) + 672] = base_0;
+	out[(i * 1) + (0 * 32) + 704] = base_0;
+	out[(i * 1) + (0 * 32) + 736] = base_0;
+	out[(i * 1) + (0 * 32) + 768] = base_0;
+	out[(i * 1) + (0 * 32) + 800] = base_0;
+	out[(i * 1) + (0 * 32) + 832] = base_0;
+	out[(i * 1) + (0 * 32) + 864] = base_0;
+	out[(i * 1) + (0 * 32) + 896] = base_0;
+	out[(i * 1) + (0 * 32) + 928] = base_0;
+	out[(i * 1) + (0 * 32) + 960] = base_0;
+	out[(i * 1) + (0 * 32) + 992] = base_0; // row 31, the last one
+}
+inline __device__ void unpack_1bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) { // Bit width 1: thread i reads the single word in[i] and stores bit j of it (as 0 or 1) to out[i + 32*j], j = 0..31.
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p);
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p);
+	[[maybe_unused]] uint32_t register_0; // the packed input word currently being decoded
+	[[maybe_unused]] uint32_t tmp_0; // one extracted value, zero-extended
+	[[maybe_unused]] uint32_t base_0 = 0ULL; // unused in this variant
+
+	int i = threadIdx.x; // thread index; selects the input/output column this thread handles
+
+	register_0                    = *(in + (0 * 32) + (i * 1) + 0); // the only input word this thread needs
+	tmp_0                         = (register_0) & ((1ULL << 1) - 1); // keep the low bit; the 64-bit mask result is narrowed to uint32_t on assignment
+	out[(i * 1) + (0 * 32) + 0]   = tmp_0;
+	tmp_0                         = (register_0 >> 1) & ((1ULL << 1) - 1);
+	out[(i * 1) + (0 * 32) + 32]  = tmp_0;
+	tmp_0                         = (register_0 >> 2) & ((1ULL << 1) - 1);
+	out[(i * 1) + (0 * 32) + 64]  = tmp_0;
+	tmp_0                         = (register_0 >> 3) & ((1ULL << 1) - 1);
+	out[(i * 1) + (0 * 32) + 96]  = tmp_0;
+	tmp_0                         = (register_0 >> 4) & ((1ULL << 1) - 1);
+	out[(i * 1) + (0 * 32) + 128] = tmp_0;
+	tmp_0                         = (register_0 >> 5) & ((1ULL << 1) - 1);
+	out[(i * 1) + (0 * 32) + 160] = tmp_0;
+	tmp_0                         = (register_0 >> 6) & ((1ULL << 1) - 1);
+	out[(i * 1) + (0 * 32) + 192] = tmp_0;
+	tmp_0                         = (register_0 >> 7) & ((1ULL << 1) - 1);
+	out[(i * 1) + (0 * 32) + 224] = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 1) - 1);
+	out[(i * 1) + (0 * 32) + 256] = tmp_0;
+	tmp_0                         = (register_0 >> 9) & ((1ULL << 1) - 1);
+	out[(i * 1) + (0 * 32) + 288] = tmp_0;
+	tmp_0                         = (register_0 >> 10) & ((1ULL << 1) - 1);
+	out[(i * 1) + (0 * 32) + 320] = tmp_0;
+	tmp_0                         = (register_0 >> 11) & ((1ULL << 1) - 1);
+	out[(i * 1) + (0 * 32) + 352] = tmp_0;
+	tmp_0                         = (register_0 >> 12) & ((1ULL << 1) - 1);
+	out[(i * 1) + (0 * 32) + 384] = tmp_0;
+	tmp_0                         = (register_0 >> 13) & ((1ULL << 1) - 1);
+	out[(i * 1) + (0 * 32) + 416] = tmp_0;
+	tmp_0                         = (register_0 >> 14) & ((1ULL << 1) - 1);
+	out[(i * 1) + (0 * 32) + 448] = tmp_0;
+	tmp_0                         = (register_0 >> 15) & ((1ULL << 1) - 1);
+	out[(i * 1) + (0 * 32) + 480] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 1) - 1);
+	out[(i * 1) + (0 * 32) + 512] = tmp_0;
+	tmp_0                         = (register_0 >> 17) & ((1ULL << 1) - 1);
+	out[(i * 1) + (0 * 32) + 544] = tmp_0;
+	tmp_0                         = (register_0 >> 18) & ((1ULL << 1) - 1);
+	out[(i * 1) + (0 * 32) + 576] = tmp_0;
+	tmp_0                         = (register_0 >> 19) & ((1ULL << 1) - 1);
+	out[(i * 1) + (0 * 32) + 608] = tmp_0;
+	tmp_0                         = (register_0 >> 20) & ((1ULL << 1) - 1);
+	out[(i * 1) + (0 * 32) + 640] = tmp_0;
+	tmp_0                         = (register_0 >> 21) & ((1ULL << 1) - 1);
+	out[(i * 1) + (0 * 32) + 672] = tmp_0;
+	tmp_0                         = (register_0 >> 22) & ((1ULL << 1) - 1);
+	out[(i * 1) + (0 * 32) + 704] = tmp_0;
+	tmp_0                         = (register_0 >> 23) & ((1ULL << 1) - 1);
+	out[(i * 1) + (0 * 32) + 736] = tmp_0;
+	tmp_0                         = (register_0 >> 24) & ((1ULL << 1) - 1);
+	out[(i * 1) + (0 * 32) + 768] = tmp_0;
+	tmp_0                         = (register_0 >> 25) & ((1ULL << 1) - 1);
+	out[(i * 1) + (0 * 32) + 800] = tmp_0;
+	tmp_0                         = (register_0 >> 26) & ((1ULL << 1) - 1);
+	out[(i * 1) + (0 * 32) + 832] = tmp_0;
+	tmp_0                         = (register_0 >> 27) & ((1ULL << 1) - 1);
+	out[(i * 1) + (0 * 32) + 864] = tmp_0;
+	tmp_0                         = (register_0 >> 28) & ((1ULL << 1) - 1);
+	out[(i * 1) + (0 * 32) + 896] = tmp_0;
+	tmp_0                         = (register_0 >> 29) & ((1ULL << 1) - 1);
+	out[(i * 1) + (0 * 32) + 928] = tmp_0;
+	tmp_0                         = (register_0 >> 30) & ((1ULL << 1) - 1);
+	out[(i * 1) + (0 * 32) + 960] = tmp_0;
+	tmp_0                         = (register_0 >> 31) & ((1ULL << 1) - 1);
+	out[(i * 1) + (0 * 32) + 992] = tmp_0; // row 31 holds the top bit of the word
+}
+inline __device__ void unpack_2bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) { // Bit width 2: thread i reads in[i] and in[i + 32], splits each into sixteen 2-bit fields (low bits first) and stores them zero-extended to out[i + 32*j], j = 0..31.
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p);
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p);
+	[[maybe_unused]] uint32_t register_0; // the packed input word currently being decoded
+	[[maybe_unused]] uint32_t tmp_0; // one extracted value, zero-extended
+	[[maybe_unused]] uint32_t base_0 = 0ULL; // unused in this variant
+
+	int i = threadIdx.x; // thread index; selects the input/output column this thread handles
+
+	register_0                    = *(in + (0 * 32) + (i * 1) + 0); // first input word: rows 0..15
+	tmp_0                         = (register_0) & ((1ULL << 2) - 1); // keep the low 2 bits; the 64-bit mask result is narrowed to uint32_t on assignment
+	out[(i * 1) + (0 * 32) + 0]   = tmp_0;
+	tmp_0                         = (register_0 >> 2) & ((1ULL << 2) - 1);
+	out[(i * 1) + (0 * 32) + 32]  = tmp_0;
+	tmp_0                         = (register_0 >> 4) & ((1ULL << 2) - 1);
+	out[(i * 1) + (0 * 32) + 64]  = tmp_0;
+	tmp_0                         = (register_0 >> 6) & ((1ULL << 2) - 1);
+	out[(i * 1) + (0 * 32) + 96]  = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 2) - 1);
+	out[(i * 1) + (0 * 32) + 128] = tmp_0;
+	tmp_0                         = (register_0 >> 10) & ((1ULL << 2) - 1);
+	out[(i * 1) + (0 * 32) + 160] = tmp_0;
+	tmp_0                         = (register_0 >> 12) & ((1ULL << 2) - 1);
+	out[(i * 1) + (0 * 32) + 192] = tmp_0;
+	tmp_0                         = (register_0 >> 14) & ((1ULL << 2) - 1);
+	out[(i * 1) + (0 * 32) + 224] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 2) - 1);
+	out[(i * 1) + (0 * 32) + 256] = tmp_0;
+	tmp_0                         = (register_0 >> 18) & ((1ULL << 2) - 1);
+	out[(i * 1) + (0 * 32) + 288] = tmp_0;
+	tmp_0                         = (register_0 >> 20) & ((1ULL << 2) - 1);
+	out[(i * 1) + (0 * 32) + 320] = tmp_0;
+	tmp_0                         = (register_0 >> 22) & ((1ULL << 2) - 1);
+	out[(i * 1) + (0 * 32) + 352] = tmp_0;
+	tmp_0                         = (register_0 >> 24) & ((1ULL << 2) - 1);
+	out[(i * 1) + (0 * 32) + 384] = tmp_0;
+	tmp_0                         = (register_0 >> 26) & ((1ULL << 2) - 1);
+	out[(i * 1) + (0 * 32) + 416] = tmp_0;
+	tmp_0                         = (register_0 >> 28) & ((1ULL << 2) - 1);
+	out[(i * 1) + (0 * 32) + 448] = tmp_0;
+	tmp_0                         = (register_0 >> 30) & ((1ULL << 2) - 1);
+	out[(i * 1) + (0 * 32) + 480] = tmp_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 32); // second input word: rows 16..31
+	tmp_0                         = (register_0) & ((1ULL << 2) - 1);
+	out[(i * 1) + (0 * 32) + 512] = tmp_0;
+	tmp_0                         = (register_0 >> 2) & ((1ULL << 2) - 1);
+	out[(i * 1) + (0 * 32) + 544] = tmp_0;
+	tmp_0                         = (register_0 >> 4) & ((1ULL << 2) - 1);
+	out[(i * 1) + (0 * 32) + 576] = tmp_0;
+	tmp_0                         = (register_0 >> 6) & ((1ULL << 2) - 1);
+	out[(i * 1) + (0 * 32) + 608] = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 2) - 1);
+	out[(i * 1) + (0 * 32) + 640] = tmp_0;
+	tmp_0                         = (register_0 >> 10) & ((1ULL << 2) - 1);
+	out[(i * 1) + (0 * 32) + 672] = tmp_0;
+	tmp_0                         = (register_0 >> 12) & ((1ULL << 2) - 1);
+	out[(i * 1) + (0 * 32) + 704] = tmp_0;
+	tmp_0                         = (register_0 >> 14) & ((1ULL << 2) - 1);
+	out[(i * 1) + (0 * 32) + 736] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 2) - 1);
+	out[(i * 1) + (0 * 32) + 768] = tmp_0;
+	tmp_0                         = (register_0 >> 18) & ((1ULL << 2) - 1);
+	out[(i * 1) + (0 * 32) + 800] = tmp_0;
+	tmp_0                         = (register_0 >> 20) & ((1ULL << 2) - 1);
+	out[(i * 1) + (0 * 32) + 832] = tmp_0;
+	tmp_0                         = (register_0 >> 22) & ((1ULL << 2) - 1);
+	out[(i * 1) + (0 * 32) + 864] = tmp_0;
+	tmp_0                         = (register_0 >> 24) & ((1ULL << 2) - 1);
+	out[(i * 1) + (0 * 32) + 896] = tmp_0;
+	tmp_0                         = (register_0 >> 26) & ((1ULL << 2) - 1);
+	out[(i * 1) + (0 * 32) + 928] = tmp_0;
+	tmp_0                         = (register_0 >> 28) & ((1ULL << 2) - 1);
+	out[(i * 1) + (0 * 32) + 960] = tmp_0;
+	tmp_0                         = (register_0 >> 30) & ((1ULL << 2) - 1);
+	out[(i * 1) + (0 * 32) + 992] = tmp_0; // row 31, the last one
+}
+inline __device__ void unpack_3bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) { // Bit width 3: thread i reads in[i], in[i + 32], in[i + 64] as one 96-bit stream (low bits first) and stores its 32 consecutive 3-bit fields zero-extended to out[i + 32*j], j = 0..31.
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p);
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p);
+	[[maybe_unused]] uint32_t register_0; // the packed input word currently being decoded
+	[[maybe_unused]] uint32_t tmp_0; // one extracted value, zero-extended
+	[[maybe_unused]] uint32_t base_0 = 0ULL; // unused in this variant
+
+	int i = threadIdx.x; // thread index; selects the input/output column this thread handles
+
+	register_0                    = *(in + (0 * 32) + (i * 1) + 0); // first input word
+	tmp_0                         = (register_0) & ((1ULL << 3) - 1); // keep the low 3 bits; the 64-bit mask result is narrowed to uint32_t on assignment
+	out[(i * 1) + (0 * 32) + 0]   = tmp_0;
+	tmp_0                         = (register_0 >> 3) & ((1ULL << 3) - 1);
+	out[(i * 1) + (0 * 32) + 32]  = tmp_0;
+	tmp_0                         = (register_0 >> 6) & ((1ULL << 3) - 1);
+	out[(i * 1) + (0 * 32) + 64]  = tmp_0;
+	tmp_0                         = (register_0 >> 9) & ((1ULL << 3) - 1);
+	out[(i * 1) + (0 * 32) + 96]  = tmp_0;
+	tmp_0                         = (register_0 >> 12) & ((1ULL << 3) - 1);
+	out[(i * 1) + (0 * 32) + 128] = tmp_0;
+	tmp_0                         = (register_0 >> 15) & ((1ULL << 3) - 1);
+	out[(i * 1) + (0 * 32) + 160] = tmp_0;
+	tmp_0                         = (register_0 >> 18) & ((1ULL << 3) - 1);
+	out[(i * 1) + (0 * 32) + 192] = tmp_0;
+	tmp_0                         = (register_0 >> 21) & ((1ULL << 3) - 1);
+	out[(i * 1) + (0 * 32) + 224] = tmp_0;
+	tmp_0                         = (register_0 >> 24) & ((1ULL << 3) - 1);
+	out[(i * 1) + (0 * 32) + 256] = tmp_0;
+	tmp_0                         = (register_0 >> 27) & ((1ULL << 3) - 1);
+	out[(i * 1) + (0 * 32) + 288] = tmp_0;
+	tmp_0                         = (register_0 >> 30) & ((1ULL << 2) - 1); // row 10 straddles two words: its low 2 bits are the top 2 bits of this word
+	register_0                    = *(in + (0 * 32) + (i * 1) + 32); // second input word
+	tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 2; // ...and its high bit is the lowest bit of the next word
+	out[(i * 1) + (0 * 32) + 320] = tmp_0;
+	tmp_0                         = (register_0 >> 1) & ((1ULL << 3) - 1);
+	out[(i * 1) + (0 * 32) + 352] = tmp_0;
+	tmp_0                         = (register_0 >> 4) & ((1ULL << 3) - 1);
+	out[(i * 1) + (0 * 32) + 384] = tmp_0;
+	tmp_0                         = (register_0 >> 7) & ((1ULL << 3) - 1);
+	out[(i * 1) + (0 * 32) + 416] = tmp_0;
+	tmp_0                         = (register_0 >> 10) & ((1ULL << 3) - 1);
+	out[(i * 1) + (0 * 32) + 448] = tmp_0;
+	tmp_0                         = (register_0 >> 13) & ((1ULL << 3) - 1);
+	out[(i * 1) + (0 * 32) + 480] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 3) - 1);
+	out[(i * 1) + (0 * 32) + 512] = tmp_0;
+	tmp_0                         = (register_0 >> 19) & ((1ULL << 3) - 1);
+	out[(i * 1) + (0 * 32) + 544] = tmp_0;
+	tmp_0                         = (register_0 >> 22) & ((1ULL << 3) - 1);
+	out[(i * 1) + (0 * 32) + 576] = tmp_0;
+	tmp_0                         = (register_0 >> 25) & ((1ULL << 3) - 1);
+	out[(i * 1) + (0 * 32) + 608] = tmp_0;
+	tmp_0                         = (register_0 >> 28) & ((1ULL << 3) - 1);
+	out[(i * 1) + (0 * 32) + 640] = tmp_0;
+	tmp_0                         = (register_0 >> 31) & ((1ULL << 1) - 1); // row 21 straddles two words: its low bit is the top bit of this word
+	register_0                    = *(in + (0 * 32) + (i * 1) + 64); // third input word
+	tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 1; // ...and its high 2 bits are the low 2 bits of the next word
+	out[(i * 1) + (0 * 32) + 672] = tmp_0;
+	tmp_0                         = (register_0 >> 2) & ((1ULL << 3) - 1);
+	out[(i * 1) + (0 * 32) + 704] = tmp_0;
+	tmp_0                         = (register_0 >> 5) & ((1ULL << 3) - 1);
+	out[(i * 1) + (0 * 32) + 736] = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 3) - 1);
+	out[(i * 1) + (0 * 32) + 768] = tmp_0;
+	tmp_0                         = (register_0 >> 11) & ((1ULL << 3) - 1);
+	out[(i * 1) + (0 * 32) + 800] = tmp_0;
+	tmp_0                         = (register_0 >> 14) & ((1ULL << 3) - 1);
+	out[(i * 1) + (0 * 32) + 832] = tmp_0;
+	tmp_0                         = (register_0 >> 17) & ((1ULL << 3) - 1);
+	out[(i * 1) + (0 * 32) + 864] = tmp_0;
+	tmp_0                         = (register_0 >> 20) & ((1ULL << 3) - 1);
+	out[(i * 1) + (0 * 32) + 896] = tmp_0;
+	tmp_0                         = (register_0 >> 23) & ((1ULL << 3) - 1);
+	out[(i * 1) + (0 * 32) + 928] = tmp_0;
+	tmp_0                         = (register_0 >> 26) & ((1ULL << 3) - 1);
+	out[(i * 1) + (0 * 32) + 960] = tmp_0;
+	tmp_0                         = (register_0 >> 29) & ((1ULL << 3) - 1);
+	out[(i * 1) + (0 * 32) + 992] = tmp_0; // row 31 ends exactly at the top of the third word
+}
+inline __device__ void unpack_4bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) { // Bit width 4: thread i reads in[i + 32*w] for w = 0..3, splits each word into eight 4-bit fields (low bits first) and stores them zero-extended to out[i + 32*j], j = 0..31.
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p);
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p);
+	[[maybe_unused]] uint32_t register_0; // the packed input word currently being decoded
+	[[maybe_unused]] uint32_t tmp_0; // one extracted value, zero-extended
+	[[maybe_unused]] uint32_t base_0 = 0ULL; // unused in this variant
+
+	int i = threadIdx.x; // thread index; selects the input/output column this thread handles
+
+	register_0                    = *(in + (0 * 32) + (i * 1) + 0); // first input word: rows 0..7
+	tmp_0                         = (register_0) & ((1ULL << 4) - 1); // keep the low 4 bits; the 64-bit mask result is narrowed to uint32_t on assignment
+	out[(i * 1) + (0 * 32) + 0]   = tmp_0;
+	tmp_0                         = (register_0 >> 4) & ((1ULL << 4) - 1);
+	out[(i * 1) + (0 * 32) + 32]  = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 4) - 1);
+	out[(i * 1) + (0 * 32) + 64]  = tmp_0;
+	tmp_0                         = (register_0 >> 12) & ((1ULL << 4) - 1);
+	out[(i * 1) + (0 * 32) + 96]  = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 4) - 1);
+	out[(i * 1) + (0 * 32) + 128] = tmp_0;
+	tmp_0                         = (register_0 >> 20) & ((1ULL << 4) - 1);
+	out[(i * 1) + (0 * 32) + 160] = tmp_0;
+	tmp_0                         = (register_0 >> 24) & ((1ULL << 4) - 1);
+	out[(i * 1) + (0 * 32) + 192] = tmp_0;
+	tmp_0                         = (register_0 >> 28) & ((1ULL << 4) - 1);
+	out[(i * 1) + (0 * 32) + 224] = tmp_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 32); // second input word: rows 8..15
+	tmp_0                         = (register_0) & ((1ULL << 4) - 1);
+	out[(i * 1) + (0 * 32) + 256] = tmp_0;
+	tmp_0                         = (register_0 >> 4) & ((1ULL << 4) - 1);
+	out[(i * 1) + (0 * 32) + 288] = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 4) - 1);
+	out[(i * 1) + (0 * 32) + 320] = tmp_0;
+	tmp_0                         = (register_0 >> 12) & ((1ULL << 4) - 1);
+	out[(i * 1) + (0 * 32) + 352] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 4) - 1);
+	out[(i * 1) + (0 * 32) + 384] = tmp_0;
+	tmp_0                         = (register_0 >> 20) & ((1ULL << 4) - 1);
+	out[(i * 1) + (0 * 32) + 416] = tmp_0;
+	tmp_0                         = (register_0 >> 24) & ((1ULL << 4) - 1);
+	out[(i * 1) + (0 * 32) + 448] = tmp_0;
+	tmp_0                         = (register_0 >> 28) & ((1ULL << 4) - 1);
+	out[(i * 1) + (0 * 32) + 480] = tmp_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 64); // third input word: rows 16..23
+	tmp_0                         = (register_0) & ((1ULL << 4) - 1);
+	out[(i * 1) + (0 * 32) + 512] = tmp_0;
+	tmp_0                         = (register_0 >> 4) & ((1ULL << 4) - 1);
+	out[(i * 1) + (0 * 32) + 544] = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 4) - 1);
+	out[(i * 1) + (0 * 32) + 576] = tmp_0;
+	tmp_0                         = (register_0 >> 12) & ((1ULL << 4) - 1);
+	out[(i * 1) + (0 * 32) + 608] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 4) - 1);
+	out[(i * 1) + (0 * 32) + 640] = tmp_0;
+	tmp_0                         = (register_0 >> 20) & ((1ULL << 4) - 1);
+	out[(i * 1) + (0 * 32) + 672] = tmp_0;
+	tmp_0                         = (register_0 >> 24) & ((1ULL << 4) - 1);
+	out[(i * 1) + (0 * 32) + 704] = tmp_0;
+	tmp_0                         = (register_0 >> 28) & ((1ULL << 4) - 1);
+	out[(i * 1) + (0 * 32) + 736] = tmp_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 96); // fourth input word: rows 24..31
+	tmp_0                         = (register_0) & ((1ULL << 4) - 1);
+	out[(i * 1) + (0 * 32) + 768] = tmp_0;
+	tmp_0                         = (register_0 >> 4) & ((1ULL << 4) - 1);
+	out[(i * 1) + (0 * 32) + 800] = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 4) - 1);
+	out[(i * 1) + (0 * 32) + 832] = tmp_0;
+	tmp_0                         = (register_0 >> 12) & ((1ULL << 4) - 1);
+	out[(i * 1) + (0 * 32) + 864] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 4) - 1);
+	out[(i * 1) + (0 * 32) + 896] = tmp_0;
+	tmp_0                         = (register_0 >> 20) & ((1ULL << 4) - 1);
+	out[(i * 1) + (0 * 32) + 928] = tmp_0;
+	tmp_0                         = (register_0 >> 24) & ((1ULL << 4) - 1);
+	out[(i * 1) + (0 * 32) + 960] = tmp_0;
+	tmp_0                         = (register_0 >> 28) & ((1ULL << 4) - 1);
+	out[(i * 1) + (0 * 32) + 992] = tmp_0; // row 31, the last one
+}
+inline __device__ void unpack_5bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) { // Bit width 5: thread i reads in[i + 32*w] for w = 0..4 as one 160-bit stream (low bits first) and stores its 32 consecutive 5-bit fields zero-extended to out[i + 32*j], j = 0..31.
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p);
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p);
+	[[maybe_unused]] uint32_t register_0; // the packed input word currently being decoded
+	[[maybe_unused]] uint32_t tmp_0; // one extracted value, zero-extended
+	[[maybe_unused]] uint32_t base_0 = 0ULL; // unused in this variant
+
+	int i = threadIdx.x; // thread index; selects the input/output column this thread handles
+
+	register_0                    = *(in + (0 * 32) + (i * 1) + 0); // first input word
+	tmp_0                         = (register_0) & ((1ULL << 5) - 1); // keep the low 5 bits; the 64-bit mask result is narrowed to uint32_t on assignment
+	out[(i * 1) + (0 * 32) + 0]   = tmp_0;
+	tmp_0                         = (register_0 >> 5) & ((1ULL << 5) - 1);
+	out[(i * 1) + (0 * 32) + 32]  = tmp_0;
+	tmp_0                         = (register_0 >> 10) & ((1ULL << 5) - 1);
+	out[(i * 1) + (0 * 32) + 64]  = tmp_0;
+	tmp_0                         = (register_0 >> 15) & ((1ULL << 5) - 1);
+	out[(i * 1) + (0 * 32) + 96]  = tmp_0;
+	tmp_0                         = (register_0 >> 20) & ((1ULL << 5) - 1);
+	out[(i * 1) + (0 * 32) + 128] = tmp_0;
+	tmp_0                         = (register_0 >> 25) & ((1ULL << 5) - 1);
+	out[(i * 1) + (0 * 32) + 160] = tmp_0;
+	tmp_0                         = (register_0 >> 30) & ((1ULL << 2) - 1); // row 6 straddles two words: low 2 bits come from the top of this word
+	register_0                    = *(in + (0 * 32) + (i * 1) + 32); // second input word
+	tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 2; // ...and the high 3 bits from the bottom of the next word
+	out[(i * 1) + (0 * 32) + 192] = tmp_0;
+	tmp_0                         = (register_0 >> 3) & ((1ULL << 5) - 1);
+	out[(i * 1) + (0 * 32) + 224] = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 5) - 1);
+	out[(i * 1) + (0 * 32) + 256] = tmp_0;
+	tmp_0                         = (register_0 >> 13) & ((1ULL << 5) - 1);
+	out[(i * 1) + (0 * 32) + 288] = tmp_0;
+	tmp_0                         = (register_0 >> 18) & ((1ULL << 5) - 1);
+	out[(i * 1) + (0 * 32) + 320] = tmp_0;
+	tmp_0                         = (register_0 >> 23) & ((1ULL << 5) - 1);
+	out[(i * 1) + (0 * 32) + 352] = tmp_0;
+	tmp_0                         = (register_0 >> 28) & ((1ULL << 4) - 1); // row 12 straddles: low 4 bits from this word
+	register_0                    = *(in + (0 * 32) + (i * 1) + 64); // third input word
+	tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 4; // ...high 1 bit from the next word
+	out[(i * 1) + (0 * 32) + 384] = tmp_0;
+	tmp_0                         = (register_0 >> 1) & ((1ULL << 5) - 1);
+	out[(i * 1) + (0 * 32) + 416] = tmp_0;
+	tmp_0                         = (register_0 >> 6) & ((1ULL << 5) - 1);
+	out[(i * 1) + (0 * 32) + 448] = tmp_0;
+	tmp_0                         = (register_0 >> 11) & ((1ULL << 5) - 1);
+	out[(i * 1) + (0 * 32) + 480] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 5) - 1);
+	out[(i * 1) + (0 * 32) + 512] = tmp_0;
+	tmp_0                         = (register_0 >> 21) & ((1ULL << 5) - 1);
+	out[(i * 1) + (0 * 32) + 544] = tmp_0;
+	tmp_0                         = (register_0 >> 26) & ((1ULL << 5) - 1);
+	out[(i * 1) + (0 * 32) + 576] = tmp_0;
+	tmp_0                         = (register_0 >> 31) & ((1ULL << 1) - 1); // row 19 straddles: low 1 bit from this word
+	register_0                    = *(in + (0 * 32) + (i * 1) + 96); // fourth input word
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 1; // ...high 4 bits from the next word
+	out[(i * 1) + (0 * 32) + 608] = tmp_0;
+	tmp_0                         = (register_0 >> 4) & ((1ULL << 5) - 1);
+	out[(i * 1) + (0 * 32) + 640] = tmp_0;
+	tmp_0                         = (register_0 >> 9) & ((1ULL << 5) - 1);
+	out[(i * 1) + (0 * 32) + 672] = tmp_0;
+	tmp_0                         = (register_0 >> 14) & ((1ULL << 5) - 1);
+	out[(i * 1) + (0 * 32) + 704] = tmp_0;
+	tmp_0                         = (register_0 >> 19) & ((1ULL << 5) - 1);
+	out[(i * 1) + (0 * 32) + 736] = tmp_0;
+	tmp_0                         = (register_0 >> 24) & ((1ULL << 5) - 1);
+	out[(i * 1) + (0 * 32) + 768] = tmp_0;
+	tmp_0                         = (register_0 >> 29) & ((1ULL << 3) - 1); // row 25 straddles: low 3 bits from this word
+	register_0                    = *(in + (0 * 32) + (i * 1) + 128); // fifth input word
+	tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 3; // ...high 2 bits from the next word
+	out[(i * 1) + (0 * 32) + 800] = tmp_0;
+	tmp_0                         = (register_0 >> 2) & ((1ULL << 5) - 1);
+	out[(i * 1) + (0 * 32) + 832] = tmp_0;
+	tmp_0                         = (register_0 >> 7) & ((1ULL << 5) - 1);
+	out[(i * 1) + (0 * 32) + 864] = tmp_0;
+	tmp_0                         = (register_0 >> 12) & ((1ULL << 5) - 1);
+	out[(i * 1) + (0 * 32) + 896] = tmp_0;
+	tmp_0                         = (register_0 >> 17) & ((1ULL << 5) - 1);
+	out[(i * 1) + (0 * 32) + 928] = tmp_0;
+	tmp_0                         = (register_0 >> 22) & ((1ULL << 5) - 1);
+	out[(i * 1) + (0 * 32) + 960] = tmp_0;
+	tmp_0                         = (register_0 >> 27) & ((1ULL << 5) - 1);
+	out[(i * 1) + (0 * 32) + 992] = tmp_0; // row 31 ends exactly at the top of the fifth word
+}
+inline __device__ void unpack_6bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) { // Bit width 6: thread i reads in[i + 32*w] for w = 0..5 as one 192-bit stream (low bits first) and stores its 32 consecutive 6-bit fields zero-extended to out[i + 32*j], j = 0..31.
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p);
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p);
+	[[maybe_unused]] uint32_t register_0; // the packed input word currently being decoded
+	[[maybe_unused]] uint32_t tmp_0; // one extracted value, zero-extended
+	[[maybe_unused]] uint32_t base_0 = 0ULL; // unused in this variant
+
+	int i = threadIdx.x; // thread index; selects the input/output column this thread handles
+
+	register_0                    = *(in + (0 * 32) + (i * 1) + 0); // first input word
+	tmp_0                         = (register_0) & ((1ULL << 6) - 1); // keep the low 6 bits; the 64-bit mask result is narrowed to uint32_t on assignment
+	out[(i * 1) + (0 * 32) + 0]   = tmp_0;
+	tmp_0                         = (register_0 >> 6) & ((1ULL << 6) - 1);
+	out[(i * 1) + (0 * 32) + 32]  = tmp_0;
+	tmp_0                         = (register_0 >> 12) & ((1ULL << 6) - 1);
+	out[(i * 1) + (0 * 32) + 64]  = tmp_0;
+	tmp_0                         = (register_0 >> 18) & ((1ULL << 6) - 1);
+	out[(i * 1) + (0 * 32) + 96]  = tmp_0;
+	tmp_0                         = (register_0 >> 24) & ((1ULL << 6) - 1);
+	out[(i * 1) + (0 * 32) + 128] = tmp_0;
+	tmp_0                         = (register_0 >> 30) & ((1ULL << 2) - 1); // row 5 straddles two words: low 2 bits come from the top of this word
+	register_0                    = *(in + (0 * 32) + (i * 1) + 32); // second input word
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 2; // ...and the high 4 bits from the bottom of the next word
+	out[(i * 1) + (0 * 32) + 160] = tmp_0;
+	tmp_0                         = (register_0 >> 4) & ((1ULL << 6) - 1);
+	out[(i * 1) + (0 * 32) + 192] = tmp_0;
+	tmp_0                         = (register_0 >> 10) & ((1ULL << 6) - 1);
+	out[(i * 1) + (0 * 32) + 224] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 6) - 1);
+	out[(i * 1) + (0 * 32) + 256] = tmp_0;
+	tmp_0                         = (register_0 >> 22) & ((1ULL << 6) - 1);
+	out[(i * 1) + (0 * 32) + 288] = tmp_0;
+	tmp_0                         = (register_0 >> 28) & ((1ULL << 4) - 1); // row 10 straddles: low 4 bits from this word
+	register_0                    = *(in + (0 * 32) + (i * 1) + 64); // third input word
+	tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 4; // ...high 2 bits from the next word
+	out[(i * 1) + (0 * 32) + 320] = tmp_0;
+	tmp_0                         = (register_0 >> 2) & ((1ULL << 6) - 1);
+	out[(i * 1) + (0 * 32) + 352] = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 6) - 1);
+	out[(i * 1) + (0 * 32) + 384] = tmp_0;
+	tmp_0                         = (register_0 >> 14) & ((1ULL << 6) - 1);
+	out[(i * 1) + (0 * 32) + 416] = tmp_0;
+	tmp_0                         = (register_0 >> 20) & ((1ULL << 6) - 1);
+	out[(i * 1) + (0 * 32) + 448] = tmp_0;
+	tmp_0                         = (register_0 >> 26) & ((1ULL << 6) - 1);
+	out[(i * 1) + (0 * 32) + 480] = tmp_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 96); // fourth input word; row 16 starts at bit 0, so the shift pattern of the first three words repeats
+	tmp_0                         = (register_0) & ((1ULL << 6) - 1);
+	out[(i * 1) + (0 * 32) + 512] = tmp_0;
+	tmp_0                         = (register_0 >> 6) & ((1ULL << 6) - 1);
+	out[(i * 1) + (0 * 32) + 544] = tmp_0;
+	tmp_0                         = (register_0 >> 12) & ((1ULL << 6) - 1);
+	out[(i * 1) + (0 * 32) + 576] = tmp_0;
+	tmp_0                         = (register_0 >> 18) & ((1ULL << 6) - 1);
+	out[(i * 1) + (0 * 32) + 608] = tmp_0;
+	tmp_0                         = (register_0 >> 24) & ((1ULL << 6) - 1);
+	out[(i * 1) + (0 * 32) + 640] = tmp_0;
+	tmp_0                         = (register_0 >> 30) & ((1ULL << 2) - 1); // row 21 straddles: low 2 bits from this word
+	register_0                    = *(in + (0 * 32) + (i * 1) + 128); // fifth input word
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 2; // ...high 4 bits from the next word
+	out[(i * 1) + (0 * 32) + 672] = tmp_0;
+	tmp_0                         = (register_0 >> 4) & ((1ULL << 6) - 1);
+	out[(i * 1) + (0 * 32) + 704] = tmp_0;
+	tmp_0                         = (register_0 >> 10) & ((1ULL << 6) - 1);
+	out[(i * 1) + (0 * 32) + 736] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 6) - 1);
+	out[(i * 1) + (0 * 32) + 768] = tmp_0;
+	tmp_0                         = (register_0 >> 22) & ((1ULL << 6) - 1);
+	out[(i * 1) + (0 * 32) + 800] = tmp_0;
+	tmp_0                         = (register_0 >> 28) & ((1ULL << 4) - 1); // row 26 straddles: low 4 bits from this word
+	register_0                    = *(in + (0 * 32) + (i * 1) + 160); // sixth input word
+	tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 4; // ...high 2 bits from the next word
+	out[(i * 1) + (0 * 32) + 832] = tmp_0;
+	tmp_0                         = (register_0 >> 2) & ((1ULL << 6) - 1);
+	out[(i * 1) + (0 * 32) + 864] = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 6) - 1);
+	out[(i * 1) + (0 * 32) + 896] = tmp_0;
+	tmp_0                         = (register_0 >> 14) & ((1ULL << 6) - 1);
+	out[(i * 1) + (0 * 32) + 928] = tmp_0;
+	tmp_0                         = (register_0 >> 20) & ((1ULL << 6) - 1);
+	out[(i * 1) + (0 * 32) + 960] = tmp_0;
+	tmp_0                         = (register_0 >> 26) & ((1ULL << 6) - 1);
+	out[(i * 1) + (0 * 32) + 992] = tmp_0; // row 31 ends exactly at the top of the sixth word
+}
+inline __device__ void unpack_7bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) { // Bit width 7: thread i reads in[i + 32*w] for w = 0..6 as one 224-bit stream (low bits first) and stores its 32 consecutive 7-bit fields zero-extended to out[i + 32*j], j = 0..31.
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p);
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p);
+	[[maybe_unused]] uint32_t register_0; // the packed input word currently being decoded
+	[[maybe_unused]] uint32_t tmp_0; // one extracted value, zero-extended
+	[[maybe_unused]] uint32_t base_0 = 0ULL; // unused in this variant
+
+	int i = threadIdx.x; // thread index; selects the input/output column this thread handles
+
+	register_0                   = *(in + (0 * 32) + (i * 1) + 0); // first input word
+	tmp_0                        = (register_0) & ((1ULL << 7) - 1); // keep the low 7 bits; the 64-bit mask result is narrowed to uint32_t on assignment
+	out[(i * 1) + (0 * 32) + 0]  = tmp_0;
+	tmp_0                        = (register_0 >> 7) & ((1ULL << 7) - 1);
+	out[(i * 1) + (0 * 32) + 32] = tmp_0;
+	tmp_0                        = (register_0 >> 14) & ((1ULL << 7) - 1);
+	out[(i * 1) + (0 * 32) + 64] = tmp_0;
+	tmp_0                        = (register_0 >> 21) & ((1ULL << 7) - 1);
+	out[(i * 1) + (0 * 32) + 96] = tmp_0;
+	tmp_0                        = (register_0 >> 28) & ((1ULL << 4) - 1); // row 4 straddles two words: low 4 bits come from the top of this word
+	register_0                   = *(in + (0 * 32) + (i * 1) + 32); // second input word
+	tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 4; // ...and the high 3 bits from the bottom of the next word
+	out[(i * 1) + (0 * 32) + 128] = tmp_0;
+	tmp_0                         = (register_0 >> 3) & ((1ULL << 7) - 1);
+	out[(i * 1) + (0 * 32) + 160] = tmp_0;
+	tmp_0                         = (register_0 >> 10) & ((1ULL << 7) - 1);
+	out[(i * 1) + (0 * 32) + 192] = tmp_0;
+	tmp_0                         = (register_0 >> 17) & ((1ULL << 7) - 1);
+	out[(i * 1) + (0 * 32) + 224] = tmp_0;
+	tmp_0                         = (register_0 >> 24) & ((1ULL << 7) - 1);
+	out[(i * 1) + (0 * 32) + 256] = tmp_0;
+	tmp_0                         = (register_0 >> 31) & ((1ULL << 1) - 1); // row 9 straddles: low 1 bit from this word
+	register_0                    = *(in + (0 * 32) + (i * 1) + 64); // third input word
+	tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 1; // ...high 6 bits from the next word
+	out[(i * 1) + (0 * 32) + 288] = tmp_0;
+	tmp_0                         = (register_0 >> 6) & ((1ULL << 7) - 1);
+	out[(i * 1) + (0 * 32) + 320] = tmp_0;
+	tmp_0                         = (register_0 >> 13) & ((1ULL << 7) - 1);
+	out[(i * 1) + (0 * 32) + 352] = tmp_0;
+	tmp_0                         = (register_0 >> 20) & ((1ULL << 7) - 1);
+	out[(i * 1) + (0 * 32) + 384] = tmp_0;
+	tmp_0                         = (register_0 >> 27) & ((1ULL << 5) - 1); // row 13 straddles: low 5 bits from this word
+	register_0                    = *(in + (0 * 32) + (i * 1) + 96); // fourth input word
+	tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 5; // ...high 2 bits from the next word
+	out[(i * 1) + (0 * 32) + 416] = tmp_0;
+	tmp_0                         = (register_0 >> 2) & ((1ULL << 7) - 1);
+	out[(i * 1) + (0 * 32) + 448] = tmp_0;
+	tmp_0                         = (register_0 >> 9) & ((1ULL << 7) - 1);
+	out[(i * 1) + (0 * 32) + 480] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 7) - 1);
+	out[(i * 1) + (0 * 32) + 512] = tmp_0;
+	tmp_0                         = (register_0 >> 23) & ((1ULL << 7) - 1);
+	out[(i * 1) + (0 * 32) + 544] = tmp_0;
+	tmp_0                         = (register_0 >> 30) & ((1ULL << 2) - 1); // row 18 straddles: low 2 bits from this word
+	register_0                    = *(in + (0 * 32) + (i * 1) + 128); // fifth input word
+	tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 2; // ...high 5 bits from the next word
+	out[(i * 1) + (0 * 32) + 576] = tmp_0;
+	tmp_0                         = (register_0 >> 5) & ((1ULL << 7) - 1);
+	out[(i * 1) + (0 * 32) + 608] = tmp_0;
+	tmp_0                         = (register_0 >> 12) & ((1ULL << 7) - 1);
+	out[(i * 1) + (0 * 32) + 640] = tmp_0;
+	tmp_0                         = (register_0 >> 19) & ((1ULL << 7) - 1);
+	out[(i * 1) + (0 * 32) + 672] = tmp_0;
+	tmp_0                         = (register_0 >> 26) & ((1ULL << 6) - 1); // row 22 straddles: low 6 bits from this word
+	register_0                    = *(in + (0 * 32) + (i * 1) + 160); // sixth input word
+	tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 6; // ...high 1 bit from the next word
+	out[(i * 1) + (0 * 32) + 704] = tmp_0;
+	tmp_0                         = (register_0 >> 1) & ((1ULL << 7) - 1);
+	out[(i * 1) + (0 * 32) + 736] = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 7) - 1);
+	out[(i * 1) + (0 * 32) + 768] = tmp_0;
+	tmp_0                         = (register_0 >> 15) & ((1ULL << 7) - 1);
+	out[(i * 1) + (0 * 32) + 800] = tmp_0;
+	tmp_0                         = (register_0 >> 22) & ((1ULL << 7) - 1);
+	out[(i * 1) + (0 * 32) + 832] = tmp_0;
+	tmp_0                         = (register_0 >> 29) & ((1ULL << 3) - 1); // row 27 straddles: low 3 bits from this word
+	register_0                    = *(in + (0 * 32) + (i * 1) + 192); // seventh input word
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 3; // ...high 4 bits from the next word
+	out[(i * 1) + (0 * 32) + 864] = tmp_0;
+	tmp_0                         = (register_0 >> 4) & ((1ULL << 7) - 1);
+	out[(i * 1) + (0 * 32) + 896] = tmp_0;
+	tmp_0                         = (register_0 >> 11) & ((1ULL << 7) - 1);
+	out[(i * 1) + (0 * 32) + 928] = tmp_0;
+	tmp_0                         = (register_0 >> 18) & ((1ULL << 7) - 1);
+	out[(i * 1) + (0 * 32) + 960] = tmp_0;
+	tmp_0                         = (register_0 >> 25) & ((1ULL << 7) - 1);
+	out[(i * 1) + (0 * 32) + 992] = tmp_0; // row 31 ends exactly at the top of the seventh word
+}
+inline __device__ void unpack_8bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) {
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p);
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p);
+	[[maybe_unused]] uint32_t register_0;
+	[[maybe_unused]] uint32_t tmp_0;
+	[[maybe_unused]] uint32_t base_0 = 0ULL;
+
+	int i = threadIdx.x; // THREAD INDEX
+
+	register_0                    = *(in + (0 * 32) + (i * 1) + 0);
+	tmp_0                         = (register_0) & ((1ULL << 8) - 1);
+	out[(i * 1) + (0 * 32) + 0]   = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 8) - 1);
+	out[(i * 1) + (0 * 32) + 32]  = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 8) - 1);
+	out[(i * 1) + (0 * 32) + 64]  = tmp_0;
+	tmp_0                         = (register_0 >> 24) & ((1ULL << 8) - 1);
+	out[(i * 1) + (0 * 32) + 96]  = tmp_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 32);
+	tmp_0                         = (register_0) & ((1ULL << 8) - 1);
+	out[(i * 1) + (0 * 32) + 128] = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 8) - 1);
+	out[(i * 1) + (0 * 32) + 160] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 8) - 1);
+	out[(i * 1) + (0 * 32) + 192] = tmp_0;
+	tmp_0                         = (register_0 >> 24) & ((1ULL << 8) - 1);
+	out[(i * 1) + (0 * 32) + 224] = tmp_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 64);
+	tmp_0                         = (register_0) & ((1ULL << 8) - 1);
+	out[(i * 1) + (0 * 32) + 256] = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 8) - 1);
+	out[(i * 1) + (0 * 32) + 288] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 8) - 1);
+	out[(i * 1) + (0 * 32) + 320] = tmp_0;
+	tmp_0                         = (register_0 >> 24) & ((1ULL << 8) - 1);
+	out[(i * 1) + (0 * 32) + 352] = tmp_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 96);
+	tmp_0                         = (register_0) & ((1ULL << 8) - 1);
+	out[(i * 1) + (0 * 32) + 384] = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 8) - 1);
+	out[(i * 1) + (0 * 32) + 416] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 8) - 1);
+	out[(i * 1) + (0 * 32) + 448] = tmp_0;
+	tmp_0                         = (register_0 >> 24) & ((1ULL << 8) - 1);
+	out[(i * 1) + (0 * 32) + 480] = tmp_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 128);
+	tmp_0                         = (register_0) & ((1ULL << 8) - 1);
+	out[(i * 1) + (0 * 32) + 512] = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 8) - 1);
+	out[(i * 1) + (0 * 32) + 544] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 8) - 1);
+	out[(i * 1) + (0 * 32) + 576] = tmp_0;
+	tmp_0                         = (register_0 >> 24) & ((1ULL << 8) - 1);
+	out[(i * 1) + (0 * 32) + 608] = tmp_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 160);
+	tmp_0                         = (register_0) & ((1ULL << 8) - 1);
+	out[(i * 1) + (0 * 32) + 640] = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 8) - 1);
+	out[(i * 1) + (0 * 32) + 672] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 8) - 1);
+	out[(i * 1) + (0 * 32) + 704] = tmp_0;
+	tmp_0                         = (register_0 >> 24) & ((1ULL << 8) - 1);
+	out[(i * 1) + (0 * 32) + 736] = tmp_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 192);
+	tmp_0                         = (register_0) & ((1ULL << 8) - 1);
+	out[(i * 1) + (0 * 32) + 768] = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 8) - 1);
+	out[(i * 1) + (0 * 32) + 800] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 8) - 1);
+	out[(i * 1) + (0 * 32) + 832] = tmp_0;
+	tmp_0                         = (register_0 >> 24) & ((1ULL << 8) - 1);
+	out[(i * 1) + (0 * 32) + 864] = tmp_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 224);
+	tmp_0                         = (register_0) & ((1ULL << 8) - 1);
+	out[(i * 1) + (0 * 32) + 896] = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 8) - 1);
+	out[(i * 1) + (0 * 32) + 928] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 8) - 1);
+	out[(i * 1) + (0 * 32) + 960] = tmp_0;
+	tmp_0                         = (register_0 >> 24) & ((1ULL << 8) - 1);
+	out[(i * 1) + (0 * 32) + 992] = tmp_0;
+}
+inline __device__ void unpack_9bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) {
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p);
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p);
+	[[maybe_unused]] uint32_t register_0;
+	[[maybe_unused]] uint32_t tmp_0;
+	[[maybe_unused]] uint32_t base_0 = 0ULL;
+
+	int i = threadIdx.x; // THREAD INDEX
+
+	register_0                   = *(in + (0 * 32) + (i * 1) + 0);
+	tmp_0                        = (register_0) & ((1ULL << 9) - 1);
+	out[(i * 1) + (0 * 32) + 0]  = tmp_0;
+	tmp_0                        = (register_0 >> 9) & ((1ULL << 9) - 1);
+	out[(i * 1) + (0 * 32) + 32] = tmp_0;
+	tmp_0                        = (register_0 >> 18) & ((1ULL << 9) - 1);
+	out[(i * 1) + (0 * 32) + 64] = tmp_0;
+	tmp_0                        = (register_0 >> 27) & ((1ULL << 5) - 1);
+	register_0                   = *(in + (0 * 32) + (i * 1) + 32);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 5;
+	out[(i * 1) + (0 * 32) + 96]  = tmp_0;
+	tmp_0                         = (register_0 >> 4) & ((1ULL << 9) - 1);
+	out[(i * 1) + (0 * 32) + 128] = tmp_0;
+	tmp_0                         = (register_0 >> 13) & ((1ULL << 9) - 1);
+	out[(i * 1) + (0 * 32) + 160] = tmp_0;
+	tmp_0                         = (register_0 >> 22) & ((1ULL << 9) - 1);
+	out[(i * 1) + (0 * 32) + 192] = tmp_0;
+	tmp_0                         = (register_0 >> 31) & ((1ULL << 1) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 64);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 1;
+	out[(i * 1) + (0 * 32) + 224] = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 9) - 1);
+	out[(i * 1) + (0 * 32) + 256] = tmp_0;
+	tmp_0                         = (register_0 >> 17) & ((1ULL << 9) - 1);
+	out[(i * 1) + (0 * 32) + 288] = tmp_0;
+	tmp_0                         = (register_0 >> 26) & ((1ULL << 6) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 96);
+	tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 6;
+	out[(i * 1) + (0 * 32) + 320] = tmp_0;
+	tmp_0                         = (register_0 >> 3) & ((1ULL << 9) - 1);
+	out[(i * 1) + (0 * 32) + 352] = tmp_0;
+	tmp_0                         = (register_0 >> 12) & ((1ULL << 9) - 1);
+	out[(i * 1) + (0 * 32) + 384] = tmp_0;
+	tmp_0                         = (register_0 >> 21) & ((1ULL << 9) - 1);
+	out[(i * 1) + (0 * 32) + 416] = tmp_0;
+	tmp_0                         = (register_0 >> 30) & ((1ULL << 2) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 128);
+	tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 2;
+	out[(i * 1) + (0 * 32) + 448] = tmp_0;
+	tmp_0                         = (register_0 >> 7) & ((1ULL << 9) - 1);
+	out[(i * 1) + (0 * 32) + 480] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 9) - 1);
+	out[(i * 1) + (0 * 32) + 512] = tmp_0;
+	tmp_0                         = (register_0 >> 25) & ((1ULL << 7) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 160);
+	tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 7;
+	out[(i * 1) + (0 * 32) + 544] = tmp_0;
+	tmp_0                         = (register_0 >> 2) & ((1ULL << 9) - 1);
+	out[(i * 1) + (0 * 32) + 576] = tmp_0;
+	tmp_0                         = (register_0 >> 11) & ((1ULL << 9) - 1);
+	out[(i * 1) + (0 * 32) + 608] = tmp_0;
+	tmp_0                         = (register_0 >> 20) & ((1ULL << 9) - 1);
+	out[(i * 1) + (0 * 32) + 640] = tmp_0;
+	tmp_0                         = (register_0 >> 29) & ((1ULL << 3) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 192);
+	tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 3;
+	out[(i * 1) + (0 * 32) + 672] = tmp_0;
+	tmp_0                         = (register_0 >> 6) & ((1ULL << 9) - 1);
+	out[(i * 1) + (0 * 32) + 704] = tmp_0;
+	tmp_0                         = (register_0 >> 15) & ((1ULL << 9) - 1);
+	out[(i * 1) + (0 * 32) + 736] = tmp_0;
+	tmp_0                         = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 224);
+	tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 8;
+	out[(i * 1) + (0 * 32) + 768] = tmp_0;
+	tmp_0                         = (register_0 >> 1) & ((1ULL << 9) - 1);
+	out[(i * 1) + (0 * 32) + 800] = tmp_0;
+	tmp_0                         = (register_0 >> 10) & ((1ULL << 9) - 1);
+	out[(i * 1) + (0 * 32) + 832] = tmp_0;
+	tmp_0                         = (register_0 >> 19) & ((1ULL << 9) - 1);
+	out[(i * 1) + (0 * 32) + 864] = tmp_0;
+	tmp_0                         = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 256);
+	tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 4;
+	out[(i * 1) + (0 * 32) + 896] = tmp_0;
+	tmp_0                         = (register_0 >> 5) & ((1ULL << 9) - 1);
+	out[(i * 1) + (0 * 32) + 928] = tmp_0;
+	tmp_0                         = (register_0 >> 14) & ((1ULL << 9) - 1);
+	out[(i * 1) + (0 * 32) + 960] = tmp_0;
+	tmp_0                         = (register_0 >> 23) & ((1ULL << 9) - 1);
+	out[(i * 1) + (0 * 32) + 992] = tmp_0;
+}
+inline __device__ void unpack_10bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) {
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p);
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p);
+	[[maybe_unused]] uint32_t register_0;
+	[[maybe_unused]] uint32_t tmp_0;
+	[[maybe_unused]] uint32_t base_0 = 0ULL;
+
+	int i = threadIdx.x; // THREAD INDEX
+
+	register_0                   = *(in + (0 * 32) + (i * 1) + 0);
+	tmp_0                        = (register_0) & ((1ULL << 10) - 1);
+	out[(i * 1) + (0 * 32) + 0]  = tmp_0;
+	tmp_0                        = (register_0 >> 10) & ((1ULL << 10) - 1);
+	out[(i * 1) + (0 * 32) + 32] = tmp_0;
+	tmp_0                        = (register_0 >> 20) & ((1ULL << 10) - 1);
+	out[(i * 1) + (0 * 32) + 64] = tmp_0;
+	tmp_0                        = (register_0 >> 30) & ((1ULL << 2) - 1);
+	register_0                   = *(in + (0 * 32) + (i * 1) + 32);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 2;
+	out[(i * 1) + (0 * 32) + 96]  = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 10) - 1);
+	out[(i * 1) + (0 * 32) + 128] = tmp_0;
+	tmp_0                         = (register_0 >> 18) & ((1ULL << 10) - 1);
+	out[(i * 1) + (0 * 32) + 160] = tmp_0;
+	tmp_0                         = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 64);
+	tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 4;
+	out[(i * 1) + (0 * 32) + 192] = tmp_0;
+	tmp_0                         = (register_0 >> 6) & ((1ULL << 10) - 1);
+	out[(i * 1) + (0 * 32) + 224] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 10) - 1);
+	out[(i * 1) + (0 * 32) + 256] = tmp_0;
+	tmp_0                         = (register_0 >> 26) & ((1ULL << 6) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 96);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 6;
+	out[(i * 1) + (0 * 32) + 288] = tmp_0;
+	tmp_0                         = (register_0 >> 4) & ((1ULL << 10) - 1);
+	out[(i * 1) + (0 * 32) + 320] = tmp_0;
+	tmp_0                         = (register_0 >> 14) & ((1ULL << 10) - 1);
+	out[(i * 1) + (0 * 32) + 352] = tmp_0;
+	tmp_0                         = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 128);
+	tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 8;
+	out[(i * 1) + (0 * 32) + 384] = tmp_0;
+	tmp_0                         = (register_0 >> 2) & ((1ULL << 10) - 1);
+	out[(i * 1) + (0 * 32) + 416] = tmp_0;
+	tmp_0                         = (register_0 >> 12) & ((1ULL << 10) - 1);
+	out[(i * 1) + (0 * 32) + 448] = tmp_0;
+	tmp_0                         = (register_0 >> 22) & ((1ULL << 10) - 1);
+	out[(i * 1) + (0 * 32) + 480] = tmp_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 160);
+	tmp_0                         = (register_0) & ((1ULL << 10) - 1);
+	out[(i * 1) + (0 * 32) + 512] = tmp_0;
+	tmp_0                         = (register_0 >> 10) & ((1ULL << 10) - 1);
+	out[(i * 1) + (0 * 32) + 544] = tmp_0;
+	tmp_0                         = (register_0 >> 20) & ((1ULL << 10) - 1);
+	out[(i * 1) + (0 * 32) + 576] = tmp_0;
+	tmp_0                         = (register_0 >> 30) & ((1ULL << 2) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 192);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 2;
+	out[(i * 1) + (0 * 32) + 608] = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 10) - 1);
+	out[(i * 1) + (0 * 32) + 640] = tmp_0;
+	tmp_0                         = (register_0 >> 18) & ((1ULL << 10) - 1);
+	out[(i * 1) + (0 * 32) + 672] = tmp_0;
+	tmp_0                         = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 224);
+	tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 4;
+	out[(i * 1) + (0 * 32) + 704] = tmp_0;
+	tmp_0                         = (register_0 >> 6) & ((1ULL << 10) - 1);
+	out[(i * 1) + (0 * 32) + 736] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 10) - 1);
+	out[(i * 1) + (0 * 32) + 768] = tmp_0;
+	tmp_0                         = (register_0 >> 26) & ((1ULL << 6) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 256);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 6;
+	out[(i * 1) + (0 * 32) + 800] = tmp_0;
+	tmp_0                         = (register_0 >> 4) & ((1ULL << 10) - 1);
+	out[(i * 1) + (0 * 32) + 832] = tmp_0;
+	tmp_0                         = (register_0 >> 14) & ((1ULL << 10) - 1);
+	out[(i * 1) + (0 * 32) + 864] = tmp_0;
+	tmp_0                         = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 288);
+	tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 8;
+	out[(i * 1) + (0 * 32) + 896] = tmp_0;
+	tmp_0                         = (register_0 >> 2) & ((1ULL << 10) - 1);
+	out[(i * 1) + (0 * 32) + 928] = tmp_0;
+	tmp_0                         = (register_0 >> 12) & ((1ULL << 10) - 1);
+	out[(i * 1) + (0 * 32) + 960] = tmp_0;
+	tmp_0                         = (register_0 >> 22) & ((1ULL << 10) - 1);
+	out[(i * 1) + (0 * 32) + 992] = tmp_0;
+}
+inline __device__ void unpack_11bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) {
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p);
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p);
+	[[maybe_unused]] uint32_t register_0;
+	[[maybe_unused]] uint32_t tmp_0;
+	[[maybe_unused]] uint32_t base_0 = 0ULL;
+
+	int i = threadIdx.x; // THREAD INDEX
+
+	register_0                   = *(in + (0 * 32) + (i * 1) + 0);
+	tmp_0                        = (register_0) & ((1ULL << 11) - 1);
+	out[(i * 1) + (0 * 32) + 0]  = tmp_0;
+	tmp_0                        = (register_0 >> 11) & ((1ULL << 11) - 1);
+	out[(i * 1) + (0 * 32) + 32] = tmp_0;
+	tmp_0                        = (register_0 >> 22) & ((1ULL << 10) - 1);
+	register_0                   = *(in + (0 * 32) + (i * 1) + 32);
+	tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 10;
+	out[(i * 1) + (0 * 32) + 64]  = tmp_0;
+	tmp_0                         = (register_0 >> 1) & ((1ULL << 11) - 1);
+	out[(i * 1) + (0 * 32) + 96]  = tmp_0;
+	tmp_0                         = (register_0 >> 12) & ((1ULL << 11) - 1);
+	out[(i * 1) + (0 * 32) + 128] = tmp_0;
+	tmp_0                         = (register_0 >> 23) & ((1ULL << 9) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 64);
+	tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 9;
+	out[(i * 1) + (0 * 32) + 160] = tmp_0;
+	tmp_0                         = (register_0 >> 2) & ((1ULL << 11) - 1);
+	out[(i * 1) + (0 * 32) + 192] = tmp_0;
+	tmp_0                         = (register_0 >> 13) & ((1ULL << 11) - 1);
+	out[(i * 1) + (0 * 32) + 224] = tmp_0;
+	tmp_0                         = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 96);
+	tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 8;
+	out[(i * 1) + (0 * 32) + 256] = tmp_0;
+	tmp_0                         = (register_0 >> 3) & ((1ULL << 11) - 1);
+	out[(i * 1) + (0 * 32) + 288] = tmp_0;
+	tmp_0                         = (register_0 >> 14) & ((1ULL << 11) - 1);
+	out[(i * 1) + (0 * 32) + 320] = tmp_0;
+	tmp_0                         = (register_0 >> 25) & ((1ULL << 7) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 128);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 7;
+	out[(i * 1) + (0 * 32) + 352] = tmp_0;
+	tmp_0                         = (register_0 >> 4) & ((1ULL << 11) - 1);
+	out[(i * 1) + (0 * 32) + 384] = tmp_0;
+	tmp_0                         = (register_0 >> 15) & ((1ULL << 11) - 1);
+	out[(i * 1) + (0 * 32) + 416] = tmp_0;
+	tmp_0                         = (register_0 >> 26) & ((1ULL << 6) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 160);
+	tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 6;
+	out[(i * 1) + (0 * 32) + 448] = tmp_0;
+	tmp_0                         = (register_0 >> 5) & ((1ULL << 11) - 1);
+	out[(i * 1) + (0 * 32) + 480] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 11) - 1);
+	out[(i * 1) + (0 * 32) + 512] = tmp_0;
+	tmp_0                         = (register_0 >> 27) & ((1ULL << 5) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 192);
+	tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 5;
+	out[(i * 1) + (0 * 32) + 544] = tmp_0;
+	tmp_0                         = (register_0 >> 6) & ((1ULL << 11) - 1);
+	out[(i * 1) + (0 * 32) + 576] = tmp_0;
+	tmp_0                         = (register_0 >> 17) & ((1ULL << 11) - 1);
+	out[(i * 1) + (0 * 32) + 608] = tmp_0;
+	tmp_0                         = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 224);
+	tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 4;
+	out[(i * 1) + (0 * 32) + 640] = tmp_0;
+	tmp_0                         = (register_0 >> 7) & ((1ULL << 11) - 1);
+	out[(i * 1) + (0 * 32) + 672] = tmp_0;
+	tmp_0                         = (register_0 >> 18) & ((1ULL << 11) - 1);
+	out[(i * 1) + (0 * 32) + 704] = tmp_0;
+	tmp_0                         = (register_0 >> 29) & ((1ULL << 3) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 256);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 3;
+	out[(i * 1) + (0 * 32) + 736] = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 11) - 1);
+	out[(i * 1) + (0 * 32) + 768] = tmp_0;
+	tmp_0                         = (register_0 >> 19) & ((1ULL << 11) - 1);
+	out[(i * 1) + (0 * 32) + 800] = tmp_0;
+	tmp_0                         = (register_0 >> 30) & ((1ULL << 2) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 288);
+	tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 2;
+	out[(i * 1) + (0 * 32) + 832] = tmp_0;
+	tmp_0                         = (register_0 >> 9) & ((1ULL << 11) - 1);
+	out[(i * 1) + (0 * 32) + 864] = tmp_0;
+	tmp_0                         = (register_0 >> 20) & ((1ULL << 11) - 1);
+	out[(i * 1) + (0 * 32) + 896] = tmp_0;
+	tmp_0                         = (register_0 >> 31) & ((1ULL << 1) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 320);
+	tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 1;
+	out[(i * 1) + (0 * 32) + 928] = tmp_0;
+	tmp_0                         = (register_0 >> 10) & ((1ULL << 11) - 1);
+	out[(i * 1) + (0 * 32) + 960] = tmp_0;
+	tmp_0                         = (register_0 >> 21) & ((1ULL << 11) - 1);
+	out[(i * 1) + (0 * 32) + 992] = tmp_0;
+}
+inline __device__ void unpack_12bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) {
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p);
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p);
+	[[maybe_unused]] uint32_t register_0;
+	[[maybe_unused]] uint32_t tmp_0;
+	[[maybe_unused]] uint32_t base_0 = 0ULL;
+
+	int i = threadIdx.x; // THREAD INDEX
+
+	register_0                   = *(in + (0 * 32) + (i * 1) + 0);
+	tmp_0                        = (register_0) & ((1ULL << 12) - 1);
+	out[(i * 1) + (0 * 32) + 0]  = tmp_0;
+	tmp_0                        = (register_0 >> 12) & ((1ULL << 12) - 1);
+	out[(i * 1) + (0 * 32) + 32] = tmp_0;
+	tmp_0                        = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0                   = *(in + (0 * 32) + (i * 1) + 32);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 8;
+	out[(i * 1) + (0 * 32) + 64]  = tmp_0;
+	tmp_0                         = (register_0 >> 4) & ((1ULL << 12) - 1);
+	out[(i * 1) + (0 * 32) + 96]  = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 12) - 1);
+	out[(i * 1) + (0 * 32) + 128] = tmp_0;
+	tmp_0                         = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 64);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 4;
+	out[(i * 1) + (0 * 32) + 160] = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 12) - 1);
+	out[(i * 1) + (0 * 32) + 192] = tmp_0;
+	tmp_0                         = (register_0 >> 20) & ((1ULL << 12) - 1);
+	out[(i * 1) + (0 * 32) + 224] = tmp_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 96);
+	tmp_0                         = (register_0) & ((1ULL << 12) - 1);
+	out[(i * 1) + (0 * 32) + 256] = tmp_0;
+	tmp_0                         = (register_0 >> 12) & ((1ULL << 12) - 1);
+	out[(i * 1) + (0 * 32) + 288] = tmp_0;
+	tmp_0                         = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 128);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 8;
+	out[(i * 1) + (0 * 32) + 320] = tmp_0;
+	tmp_0                         = (register_0 >> 4) & ((1ULL << 12) - 1);
+	out[(i * 1) + (0 * 32) + 352] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 12) - 1);
+	out[(i * 1) + (0 * 32) + 384] = tmp_0;
+	tmp_0                         = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 160);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 4;
+	out[(i * 1) + (0 * 32) + 416] = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 12) - 1);
+	out[(i * 1) + (0 * 32) + 448] = tmp_0;
+	tmp_0                         = (register_0 >> 20) & ((1ULL << 12) - 1);
+	out[(i * 1) + (0 * 32) + 480] = tmp_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 192);
+	tmp_0                         = (register_0) & ((1ULL << 12) - 1);
+	out[(i * 1) + (0 * 32) + 512] = tmp_0;
+	tmp_0                         = (register_0 >> 12) & ((1ULL << 12) - 1);
+	out[(i * 1) + (0 * 32) + 544] = tmp_0;
+	tmp_0                         = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 224);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 8;
+	out[(i * 1) + (0 * 32) + 576] = tmp_0;
+	tmp_0                         = (register_0 >> 4) & ((1ULL << 12) - 1);
+	out[(i * 1) + (0 * 32) + 608] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 12) - 1);
+	out[(i * 1) + (0 * 32) + 640] = tmp_0;
+	tmp_0                         = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 256);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 4;
+	out[(i * 1) + (0 * 32) + 672] = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 12) - 1);
+	out[(i * 1) + (0 * 32) + 704] = tmp_0;
+	tmp_0                         = (register_0 >> 20) & ((1ULL << 12) - 1);
+	out[(i * 1) + (0 * 32) + 736] = tmp_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 288);
+	tmp_0                         = (register_0) & ((1ULL << 12) - 1);
+	out[(i * 1) + (0 * 32) + 768] = tmp_0;
+	tmp_0                         = (register_0 >> 12) & ((1ULL << 12) - 1);
+	out[(i * 1) + (0 * 32) + 800] = tmp_0;
+	tmp_0                         = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 320);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 8;
+	out[(i * 1) + (0 * 32) + 832] = tmp_0;
+	tmp_0                         = (register_0 >> 4) & ((1ULL << 12) - 1);
+	out[(i * 1) + (0 * 32) + 864] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 12) - 1);
+	out[(i * 1) + (0 * 32) + 896] = tmp_0;
+	tmp_0                         = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 352);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 4;
+	out[(i * 1) + (0 * 32) + 928] = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 12) - 1);
+	out[(i * 1) + (0 * 32) + 960] = tmp_0;
+	tmp_0                         = (register_0 >> 20) & ((1ULL << 12) - 1);
+	out[(i * 1) + (0 * 32) + 992] = tmp_0;
+}
+inline __device__ void unpack_13bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) {
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p);
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p);
+	[[maybe_unused]] uint32_t register_0;
+	[[maybe_unused]] uint32_t tmp_0;
+	[[maybe_unused]] uint32_t base_0 = 0ULL;
+
+	int i = threadIdx.x; // THREAD INDEX
+
+	register_0                   = *(in + (0 * 32) + (i * 1) + 0);
+	tmp_0                        = (register_0) & ((1ULL << 13) - 1);
+	out[(i * 1) + (0 * 32) + 0]  = tmp_0;
+	tmp_0                        = (register_0 >> 13) & ((1ULL << 13) - 1);
+	out[(i * 1) + (0 * 32) + 32] = tmp_0;
+	tmp_0                        = (register_0 >> 26) & ((1ULL << 6) - 1);
+	register_0                   = *(in + (0 * 32) + (i * 1) + 32);
+	tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 6;
+	out[(i * 1) + (0 * 32) + 64] = tmp_0;
+	tmp_0                        = (register_0 >> 7) & ((1ULL << 13) - 1);
+	out[(i * 1) + (0 * 32) + 96] = tmp_0;
+	tmp_0                        = (register_0 >> 20) & ((1ULL << 12) - 1);
+	register_0                   = *(in + (0 * 32) + (i * 1) + 64);
+	tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 12;
+	out[(i * 1) + (0 * 32) + 128] = tmp_0;
+	tmp_0                         = (register_0 >> 1) & ((1ULL << 13) - 1);
+	out[(i * 1) + (0 * 32) + 160] = tmp_0;
+	tmp_0                         = (register_0 >> 14) & ((1ULL << 13) - 1);
+	out[(i * 1) + (0 * 32) + 192] = tmp_0;
+	tmp_0                         = (register_0 >> 27) & ((1ULL << 5) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 96);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 5;
+	out[(i * 1) + (0 * 32) + 224] = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 13) - 1);
+	out[(i * 1) + (0 * 32) + 256] = tmp_0;
+	tmp_0                         = (register_0 >> 21) & ((1ULL << 11) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 128);
+	tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 11;
+	out[(i * 1) + (0 * 32) + 288] = tmp_0;
+	tmp_0                         = (register_0 >> 2) & ((1ULL << 13) - 1);
+	out[(i * 1) + (0 * 32) + 320] = tmp_0;
+	tmp_0                         = (register_0 >> 15) & ((1ULL << 13) - 1);
+	out[(i * 1) + (0 * 32) + 352] = tmp_0;
+	tmp_0                         = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 160);
+	tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 4;
+	out[(i * 1) + (0 * 32) + 384] = tmp_0;
+	tmp_0                         = (register_0 >> 9) & ((1ULL << 13) - 1);
+	out[(i * 1) + (0 * 32) + 416] = tmp_0;
+	tmp_0                         = (register_0 >> 22) & ((1ULL << 10) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 192);
+	tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 10;
+	out[(i * 1) + (0 * 32) + 448] = tmp_0;
+	tmp_0                         = (register_0 >> 3) & ((1ULL << 13) - 1);
+	out[(i * 1) + (0 * 32) + 480] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 13) - 1);
+	out[(i * 1) + (0 * 32) + 512] = tmp_0;
+	tmp_0                         = (register_0 >> 29) & ((1ULL << 3) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 224);
+	tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 3;
+	out[(i * 1) + (0 * 32) + 544] = tmp_0;
+	tmp_0                         = (register_0 >> 10) & ((1ULL << 13) - 1);
+	out[(i * 1) + (0 * 32) + 576] = tmp_0;
+	tmp_0                         = (register_0 >> 23) & ((1ULL << 9) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 256);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 9;
+	out[(i * 1) + (0 * 32) + 608] = tmp_0;
+	tmp_0                         = (register_0 >> 4) & ((1ULL << 13) - 1);
+	out[(i * 1) + (0 * 32) + 640] = tmp_0;
+	tmp_0                         = (register_0 >> 17) & ((1ULL << 13) - 1);
+	out[(i * 1) + (0 * 32) + 672] = tmp_0;
+	tmp_0                         = (register_0 >> 30) & ((1ULL << 2) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 288);
+	tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 2;
+	out[(i * 1) + (0 * 32) + 704] = tmp_0;
+	tmp_0                         = (register_0 >> 11) & ((1ULL << 13) - 1);
+	out[(i * 1) + (0 * 32) + 736] = tmp_0;
+	tmp_0                         = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 320);
+	tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 8;
+	out[(i * 1) + (0 * 32) + 768] = tmp_0;
+	tmp_0                         = (register_0 >> 5) & ((1ULL << 13) - 1);
+	out[(i * 1) + (0 * 32) + 800] = tmp_0;
+	tmp_0                         = (register_0 >> 18) & ((1ULL << 13) - 1);
+	out[(i * 1) + (0 * 32) + 832] = tmp_0;
+	tmp_0                         = (register_0 >> 31) & ((1ULL << 1) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 352);
+	tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 1;
+	out[(i * 1) + (0 * 32) + 864] = tmp_0;
+	tmp_0                         = (register_0 >> 12) & ((1ULL << 13) - 1);
+	out[(i * 1) + (0 * 32) + 896] = tmp_0;
+	tmp_0                         = (register_0 >> 25) & ((1ULL << 7) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 384);
+	tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 7;
+	out[(i * 1) + (0 * 32) + 928] = tmp_0;
+	tmp_0                         = (register_0 >> 6) & ((1ULL << 13) - 1);
+	out[(i * 1) + (0 * 32) + 960] = tmp_0;
+	tmp_0                         = (register_0 >> 19) & ((1ULL << 13) - 1);
+	out[(i * 1) + (0 * 32) + 992] = tmp_0;
+}
+inline __device__ void unpack_14bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) { // Per thread: reads 14 packed 32-bit words (in[i + 32*k]) and writes 32 unpacked 14-bit values (out[i + 32*j]).
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p); // same-type cast: plain alias of a_out_p
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p); // same-type cast: plain alias of a_in_p
+	[[maybe_unused]] uint32_t register_0; // most recently loaded packed input word
+	[[maybe_unused]] uint32_t tmp_0; // value currently being assembled
+	[[maybe_unused]] uint32_t base_0 = 0ULL; // never read in this function
+
+	int i = threadIdx.x; // THREAD INDEX: added to every in/out offset below
+
+	register_0                   = *(in + (0 * 32) + (i * 1) + 0); // packed word 0
+	tmp_0                        = (register_0) & ((1ULL << 14) - 1); // mask is 64-bit (1ULL); result narrows to uint32_t on assignment
+	out[(i * 1) + (0 * 32) + 0]  = tmp_0;
+	tmp_0                        = (register_0 >> 14) & ((1ULL << 14) - 1);
+	out[(i * 1) + (0 * 32) + 32] = tmp_0;
+	tmp_0                        = (register_0 >> 28) & ((1ULL << 4) - 1); // top 4 bits of word 0 are the low bits of a value that spans two words
+	register_0                   = *(in + (0 * 32) + (i * 1) + 32); // packed word 1 (must be loaded after the extraction above)
+	tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 4; // low 10 bits of word 1 supply the high part of the 14-bit value
+	out[(i * 1) + (0 * 32) + 64] = tmp_0;
+	tmp_0                        = (register_0 >> 10) & ((1ULL << 14) - 1);
+	out[(i * 1) + (0 * 32) + 96] = tmp_0;
+	tmp_0                        = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0                   = *(in + (0 * 32) + (i * 1) + 64); // packed word 2
+	tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 8;
+	out[(i * 1) + (0 * 32) + 128] = tmp_0;
+	tmp_0                         = (register_0 >> 6) & ((1ULL << 14) - 1);
+	out[(i * 1) + (0 * 32) + 160] = tmp_0;
+	tmp_0                         = (register_0 >> 20) & ((1ULL << 12) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 96); // packed word 3
+	tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 12;
+	out[(i * 1) + (0 * 32) + 192] = tmp_0;
+	tmp_0                         = (register_0 >> 2) & ((1ULL << 14) - 1);
+	out[(i * 1) + (0 * 32) + 224] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 14) - 1);
+	out[(i * 1) + (0 * 32) + 256] = tmp_0;
+	tmp_0                         = (register_0 >> 30) & ((1ULL << 2) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 128); // packed word 4
+	tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 2;
+	out[(i * 1) + (0 * 32) + 288] = tmp_0;
+	tmp_0                         = (register_0 >> 12) & ((1ULL << 14) - 1);
+	out[(i * 1) + (0 * 32) + 320] = tmp_0;
+	tmp_0                         = (register_0 >> 26) & ((1ULL << 6) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 160); // packed word 5
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 6;
+	out[(i * 1) + (0 * 32) + 352] = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 14) - 1);
+	out[(i * 1) + (0 * 32) + 384] = tmp_0;
+	tmp_0                         = (register_0 >> 22) & ((1ULL << 10) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 192); // packed word 6
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 10;
+	out[(i * 1) + (0 * 32) + 416] = tmp_0;
+	tmp_0                         = (register_0 >> 4) & ((1ULL << 14) - 1);
+	out[(i * 1) + (0 * 32) + 448] = tmp_0;
+	tmp_0                         = (register_0 >> 18) & ((1ULL << 14) - 1);
+	out[(i * 1) + (0 * 32) + 480] = tmp_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 224); // packed word 7: value boundary is word-aligned again, pattern repeats
+	tmp_0                         = (register_0) & ((1ULL << 14) - 1);
+	out[(i * 1) + (0 * 32) + 512] = tmp_0;
+	tmp_0                         = (register_0 >> 14) & ((1ULL << 14) - 1);
+	out[(i * 1) + (0 * 32) + 544] = tmp_0;
+	tmp_0                         = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 256); // packed word 8
+	tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 4;
+	out[(i * 1) + (0 * 32) + 576] = tmp_0;
+	tmp_0                         = (register_0 >> 10) & ((1ULL << 14) - 1);
+	out[(i * 1) + (0 * 32) + 608] = tmp_0;
+	tmp_0                         = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 288); // packed word 9
+	tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 8;
+	out[(i * 1) + (0 * 32) + 640] = tmp_0;
+	tmp_0                         = (register_0 >> 6) & ((1ULL << 14) - 1);
+	out[(i * 1) + (0 * 32) + 672] = tmp_0;
+	tmp_0                         = (register_0 >> 20) & ((1ULL << 12) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 320); // packed word 10
+	tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 12;
+	out[(i * 1) + (0 * 32) + 704] = tmp_0;
+	tmp_0                         = (register_0 >> 2) & ((1ULL << 14) - 1);
+	out[(i * 1) + (0 * 32) + 736] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 14) - 1);
+	out[(i * 1) + (0 * 32) + 768] = tmp_0;
+	tmp_0                         = (register_0 >> 30) & ((1ULL << 2) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 352); // packed word 11
+	tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 2;
+	out[(i * 1) + (0 * 32) + 800] = tmp_0;
+	tmp_0                         = (register_0 >> 12) & ((1ULL << 14) - 1);
+	out[(i * 1) + (0 * 32) + 832] = tmp_0;
+	tmp_0                         = (register_0 >> 26) & ((1ULL << 6) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 384); // packed word 12
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 6;
+	out[(i * 1) + (0 * 32) + 864] = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 14) - 1);
+	out[(i * 1) + (0 * 32) + 896] = tmp_0;
+	tmp_0                         = (register_0 >> 22) & ((1ULL << 10) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 416); // packed word 13 (last word read)
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 10;
+	out[(i * 1) + (0 * 32) + 928] = tmp_0;
+	tmp_0                         = (register_0 >> 4) & ((1ULL << 14) - 1);
+	out[(i * 1) + (0 * 32) + 960] = tmp_0;
+	tmp_0                         = (register_0 >> 18) & ((1ULL << 14) - 1);
+	out[(i * 1) + (0 * 32) + 992] = tmp_0; // 32nd and final value for this thread
+}
+inline __device__ void unpack_15bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) { // Per thread: reads 15 packed 32-bit words (in[i + 32*k]) and writes 32 unpacked 15-bit values (out[i + 32*j]).
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p); // same-type cast: plain alias of a_out_p
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p); // same-type cast: plain alias of a_in_p
+	[[maybe_unused]] uint32_t register_0; // most recently loaded packed input word
+	[[maybe_unused]] uint32_t tmp_0; // value currently being assembled
+	[[maybe_unused]] uint32_t base_0 = 0ULL; // never read in this function
+
+	int i = threadIdx.x; // THREAD INDEX: added to every in/out offset below
+
+	register_0                   = *(in + (0 * 32) + (i * 1) + 0); // packed word 0
+	tmp_0                        = (register_0) & ((1ULL << 15) - 1); // mask is 64-bit (1ULL); result narrows to uint32_t on assignment
+	out[(i * 1) + (0 * 32) + 0]  = tmp_0;
+	tmp_0                        = (register_0 >> 15) & ((1ULL << 15) - 1);
+	out[(i * 1) + (0 * 32) + 32] = tmp_0;
+	tmp_0                        = (register_0 >> 30) & ((1ULL << 2) - 1); // top 2 bits of word 0 are the low bits of a value that spans two words
+	register_0                   = *(in + (0 * 32) + (i * 1) + 32); // packed word 1 (must be loaded after the extraction above)
+	tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 2; // low 13 bits of word 1 supply the high part of the 15-bit value
+	out[(i * 1) + (0 * 32) + 64] = tmp_0;
+	tmp_0                        = (register_0 >> 13) & ((1ULL << 15) - 1);
+	out[(i * 1) + (0 * 32) + 96] = tmp_0;
+	tmp_0                        = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0                   = *(in + (0 * 32) + (i * 1) + 64); // packed word 2
+	tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 4;
+	out[(i * 1) + (0 * 32) + 128] = tmp_0;
+	tmp_0                         = (register_0 >> 11) & ((1ULL << 15) - 1);
+	out[(i * 1) + (0 * 32) + 160] = tmp_0;
+	tmp_0                         = (register_0 >> 26) & ((1ULL << 6) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 96); // packed word 3
+	tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 6;
+	out[(i * 1) + (0 * 32) + 192] = tmp_0;
+	tmp_0                         = (register_0 >> 9) & ((1ULL << 15) - 1);
+	out[(i * 1) + (0 * 32) + 224] = tmp_0;
+	tmp_0                         = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 128); // packed word 4
+	tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 8;
+	out[(i * 1) + (0 * 32) + 256] = tmp_0;
+	tmp_0                         = (register_0 >> 7) & ((1ULL << 15) - 1);
+	out[(i * 1) + (0 * 32) + 288] = tmp_0;
+	tmp_0                         = (register_0 >> 22) & ((1ULL << 10) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 160); // packed word 5
+	tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 10;
+	out[(i * 1) + (0 * 32) + 320] = tmp_0;
+	tmp_0                         = (register_0 >> 5) & ((1ULL << 15) - 1);
+	out[(i * 1) + (0 * 32) + 352] = tmp_0;
+	tmp_0                         = (register_0 >> 20) & ((1ULL << 12) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 192); // packed word 6
+	tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 12;
+	out[(i * 1) + (0 * 32) + 384] = tmp_0;
+	tmp_0                         = (register_0 >> 3) & ((1ULL << 15) - 1);
+	out[(i * 1) + (0 * 32) + 416] = tmp_0;
+	tmp_0                         = (register_0 >> 18) & ((1ULL << 14) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 224); // packed word 7
+	tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 14;
+	out[(i * 1) + (0 * 32) + 448] = tmp_0;
+	tmp_0                         = (register_0 >> 1) & ((1ULL << 15) - 1);
+	out[(i * 1) + (0 * 32) + 480] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 15) - 1);
+	out[(i * 1) + (0 * 32) + 512] = tmp_0;
+	tmp_0                         = (register_0 >> 31) & ((1ULL << 1) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 256); // packed word 8
+	tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 1;
+	out[(i * 1) + (0 * 32) + 544] = tmp_0;
+	tmp_0                         = (register_0 >> 14) & ((1ULL << 15) - 1);
+	out[(i * 1) + (0 * 32) + 576] = tmp_0;
+	tmp_0                         = (register_0 >> 29) & ((1ULL << 3) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 288); // packed word 9
+	tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 3;
+	out[(i * 1) + (0 * 32) + 608] = tmp_0;
+	tmp_0                         = (register_0 >> 12) & ((1ULL << 15) - 1);
+	out[(i * 1) + (0 * 32) + 640] = tmp_0;
+	tmp_0                         = (register_0 >> 27) & ((1ULL << 5) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 320); // packed word 10
+	tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 5;
+	out[(i * 1) + (0 * 32) + 672] = tmp_0;
+	tmp_0                         = (register_0 >> 10) & ((1ULL << 15) - 1);
+	out[(i * 1) + (0 * 32) + 704] = tmp_0;
+	tmp_0                         = (register_0 >> 25) & ((1ULL << 7) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 352); // packed word 11
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 7;
+	out[(i * 1) + (0 * 32) + 736] = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 15) - 1);
+	out[(i * 1) + (0 * 32) + 768] = tmp_0;
+	tmp_0                         = (register_0 >> 23) & ((1ULL << 9) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 384); // packed word 12
+	tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 9;
+	out[(i * 1) + (0 * 32) + 800] = tmp_0;
+	tmp_0                         = (register_0 >> 6) & ((1ULL << 15) - 1);
+	out[(i * 1) + (0 * 32) + 832] = tmp_0;
+	tmp_0                         = (register_0 >> 21) & ((1ULL << 11) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 416); // packed word 13
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 11;
+	out[(i * 1) + (0 * 32) + 864] = tmp_0;
+	tmp_0                         = (register_0 >> 4) & ((1ULL << 15) - 1);
+	out[(i * 1) + (0 * 32) + 896] = tmp_0;
+	tmp_0                         = (register_0 >> 19) & ((1ULL << 13) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 448); // packed word 14 (last word read)
+	tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 13;
+	out[(i * 1) + (0 * 32) + 928] = tmp_0;
+	tmp_0                         = (register_0 >> 2) & ((1ULL << 15) - 1);
+	out[(i * 1) + (0 * 32) + 960] = tmp_0;
+	tmp_0                         = (register_0 >> 17) & ((1ULL << 15) - 1);
+	out[(i * 1) + (0 * 32) + 992] = tmp_0; // 32nd and final value for this thread
+}
+inline __device__ void unpack_16bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) { // Per thread: reads 16 packed 32-bit words (in[i + 32*k]) and writes 32 unpacked 16-bit values (out[i + 32*j]); each word yields exactly two values.
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p); // same-type cast: plain alias of a_out_p
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p); // same-type cast: plain alias of a_in_p
+	[[maybe_unused]] uint32_t register_0; // most recently loaded packed input word
+	[[maybe_unused]] uint32_t tmp_0; // value currently being extracted
+	[[maybe_unused]] uint32_t base_0 = 0ULL; // never read in this function
+
+	int i = threadIdx.x; // THREAD INDEX: added to every in/out offset below
+
+	register_0                    = *(in + (0 * 32) + (i * 1) + 0); // packed word 0
+	tmp_0                         = (register_0) & ((1ULL << 16) - 1); // low half; mask is 64-bit (1ULL), result narrows to uint32_t on assignment
+	out[(i * 1) + (0 * 32) + 0]   = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 16) - 1); // high half
+	out[(i * 1) + (0 * 32) + 32]  = tmp_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 32); // packed word 1
+	tmp_0                         = (register_0) & ((1ULL << 16) - 1);
+	out[(i * 1) + (0 * 32) + 64]  = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 16) - 1);
+	out[(i * 1) + (0 * 32) + 96]  = tmp_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 64); // packed word 2
+	tmp_0                         = (register_0) & ((1ULL << 16) - 1);
+	out[(i * 1) + (0 * 32) + 128] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 16) - 1);
+	out[(i * 1) + (0 * 32) + 160] = tmp_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 96); // packed word 3
+	tmp_0                         = (register_0) & ((1ULL << 16) - 1);
+	out[(i * 1) + (0 * 32) + 192] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 16) - 1);
+	out[(i * 1) + (0 * 32) + 224] = tmp_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 128); // packed word 4
+	tmp_0                         = (register_0) & ((1ULL << 16) - 1);
+	out[(i * 1) + (0 * 32) + 256] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 16) - 1);
+	out[(i * 1) + (0 * 32) + 288] = tmp_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 160); // packed word 5
+	tmp_0                         = (register_0) & ((1ULL << 16) - 1);
+	out[(i * 1) + (0 * 32) + 320] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 16) - 1);
+	out[(i * 1) + (0 * 32) + 352] = tmp_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 192); // packed word 6
+	tmp_0                         = (register_0) & ((1ULL << 16) - 1);
+	out[(i * 1) + (0 * 32) + 384] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 16) - 1);
+	out[(i * 1) + (0 * 32) + 416] = tmp_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 224); // packed word 7
+	tmp_0                         = (register_0) & ((1ULL << 16) - 1);
+	out[(i * 1) + (0 * 32) + 448] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 16) - 1);
+	out[(i * 1) + (0 * 32) + 480] = tmp_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 256); // packed word 8
+	tmp_0                         = (register_0) & ((1ULL << 16) - 1);
+	out[(i * 1) + (0 * 32) + 512] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 16) - 1);
+	out[(i * 1) + (0 * 32) + 544] = tmp_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 288); // packed word 9
+	tmp_0                         = (register_0) & ((1ULL << 16) - 1);
+	out[(i * 1) + (0 * 32) + 576] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 16) - 1);
+	out[(i * 1) + (0 * 32) + 608] = tmp_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 320); // packed word 10
+	tmp_0                         = (register_0) & ((1ULL << 16) - 1);
+	out[(i * 1) + (0 * 32) + 640] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 16) - 1);
+	out[(i * 1) + (0 * 32) + 672] = tmp_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 352); // packed word 11
+	tmp_0                         = (register_0) & ((1ULL << 16) - 1);
+	out[(i * 1) + (0 * 32) + 704] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 16) - 1);
+	out[(i * 1) + (0 * 32) + 736] = tmp_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 384); // packed word 12
+	tmp_0                         = (register_0) & ((1ULL << 16) - 1);
+	out[(i * 1) + (0 * 32) + 768] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 16) - 1);
+	out[(i * 1) + (0 * 32) + 800] = tmp_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 416); // packed word 13
+	tmp_0                         = (register_0) & ((1ULL << 16) - 1);
+	out[(i * 1) + (0 * 32) + 832] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 16) - 1);
+	out[(i * 1) + (0 * 32) + 864] = tmp_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 448); // packed word 14
+	tmp_0                         = (register_0) & ((1ULL << 16) - 1);
+	out[(i * 1) + (0 * 32) + 896] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 16) - 1);
+	out[(i * 1) + (0 * 32) + 928] = tmp_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 480); // packed word 15 (last word read)
+	tmp_0                         = (register_0) & ((1ULL << 16) - 1);
+	out[(i * 1) + (0 * 32) + 960] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 16) - 1);
+	out[(i * 1) + (0 * 32) + 992] = tmp_0; // 32nd and final value for this thread
+}
+inline __device__ void unpack_17bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) { // Per thread: reads 17 packed 32-bit words (in[i + 32*k]) and writes 32 unpacked 17-bit values (out[i + 32*j]).
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p); // same-type cast: plain alias of a_out_p
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p); // same-type cast: plain alias of a_in_p
+	[[maybe_unused]] uint32_t register_0; // most recently loaded packed input word
+	[[maybe_unused]] uint32_t tmp_0; // value currently being assembled
+	[[maybe_unused]] uint32_t base_0 = 0ULL; // never read in this function
+
+	int i = threadIdx.x; // THREAD INDEX: added to every in/out offset below
+
+	register_0                  = *(in + (0 * 32) + (i * 1) + 0); // packed word 0
+	tmp_0                       = (register_0) & ((1ULL << 17) - 1); // mask is 64-bit (1ULL); result narrows to uint32_t on assignment
+	out[(i * 1) + (0 * 32) + 0] = tmp_0;
+	tmp_0                       = (register_0 >> 17) & ((1ULL << 15) - 1); // top 15 bits of word 0 are the low bits of a value that spans two words
+	register_0                  = *(in + (0 * 32) + (i * 1) + 32); // packed word 1 (must be loaded after the extraction above)
+	tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 15; // low 2 bits of word 1 supply the high part of the 17-bit value
+	out[(i * 1) + (0 * 32) + 32] = tmp_0;
+	tmp_0                        = (register_0 >> 2) & ((1ULL << 17) - 1);
+	out[(i * 1) + (0 * 32) + 64] = tmp_0;
+	tmp_0                        = (register_0 >> 19) & ((1ULL << 13) - 1);
+	register_0                   = *(in + (0 * 32) + (i * 1) + 64); // packed word 2
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 13;
+	out[(i * 1) + (0 * 32) + 96]  = tmp_0;
+	tmp_0                         = (register_0 >> 4) & ((1ULL << 17) - 1);
+	out[(i * 1) + (0 * 32) + 128] = tmp_0;
+	tmp_0                         = (register_0 >> 21) & ((1ULL << 11) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 96); // packed word 3
+	tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 11;
+	out[(i * 1) + (0 * 32) + 160] = tmp_0;
+	tmp_0                         = (register_0 >> 6) & ((1ULL << 17) - 1);
+	out[(i * 1) + (0 * 32) + 192] = tmp_0;
+	tmp_0                         = (register_0 >> 23) & ((1ULL << 9) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 128); // packed word 4
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 9;
+	out[(i * 1) + (0 * 32) + 224] = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 17) - 1);
+	out[(i * 1) + (0 * 32) + 256] = tmp_0;
+	tmp_0                         = (register_0 >> 25) & ((1ULL << 7) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 160); // packed word 5
+	tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 7;
+	out[(i * 1) + (0 * 32) + 288] = tmp_0;
+	tmp_0                         = (register_0 >> 10) & ((1ULL << 17) - 1);
+	out[(i * 1) + (0 * 32) + 320] = tmp_0;
+	tmp_0                         = (register_0 >> 27) & ((1ULL << 5) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 192); // packed word 6
+	tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 5;
+	out[(i * 1) + (0 * 32) + 352] = tmp_0;
+	tmp_0                         = (register_0 >> 12) & ((1ULL << 17) - 1);
+	out[(i * 1) + (0 * 32) + 384] = tmp_0;
+	tmp_0                         = (register_0 >> 29) & ((1ULL << 3) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 224); // packed word 7
+	tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 3;
+	out[(i * 1) + (0 * 32) + 416] = tmp_0;
+	tmp_0                         = (register_0 >> 14) & ((1ULL << 17) - 1);
+	out[(i * 1) + (0 * 32) + 448] = tmp_0;
+	tmp_0                         = (register_0 >> 31) & ((1ULL << 1) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 256); // packed word 8
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 1;
+	out[(i * 1) + (0 * 32) + 480] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 288); // packed word 9
+	tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 16;
+	out[(i * 1) + (0 * 32) + 512] = tmp_0;
+	tmp_0                         = (register_0 >> 1) & ((1ULL << 17) - 1);
+	out[(i * 1) + (0 * 32) + 544] = tmp_0;
+	tmp_0                         = (register_0 >> 18) & ((1ULL << 14) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 320); // packed word 10
+	tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 14;
+	out[(i * 1) + (0 * 32) + 576] = tmp_0;
+	tmp_0                         = (register_0 >> 3) & ((1ULL << 17) - 1);
+	out[(i * 1) + (0 * 32) + 608] = tmp_0;
+	tmp_0                         = (register_0 >> 20) & ((1ULL << 12) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 352); // packed word 11
+	tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 12;
+	out[(i * 1) + (0 * 32) + 640] = tmp_0;
+	tmp_0                         = (register_0 >> 5) & ((1ULL << 17) - 1);
+	out[(i * 1) + (0 * 32) + 672] = tmp_0;
+	tmp_0                         = (register_0 >> 22) & ((1ULL << 10) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 384); // packed word 12
+	tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 10;
+	out[(i * 1) + (0 * 32) + 704] = tmp_0;
+	tmp_0                         = (register_0 >> 7) & ((1ULL << 17) - 1);
+	out[(i * 1) + (0 * 32) + 736] = tmp_0;
+	tmp_0                         = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 416); // packed word 13
+	tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 8;
+	out[(i * 1) + (0 * 32) + 768] = tmp_0;
+	tmp_0                         = (register_0 >> 9) & ((1ULL << 17) - 1);
+	out[(i * 1) + (0 * 32) + 800] = tmp_0;
+	tmp_0                         = (register_0 >> 26) & ((1ULL << 6) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 448); // packed word 14
+	tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 6;
+	out[(i * 1) + (0 * 32) + 832] = tmp_0;
+	tmp_0                         = (register_0 >> 11) & ((1ULL << 17) - 1);
+	out[(i * 1) + (0 * 32) + 864] = tmp_0;
+	tmp_0                         = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 480); // packed word 15
+	tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 4;
+	out[(i * 1) + (0 * 32) + 896] = tmp_0;
+	tmp_0                         = (register_0 >> 13) & ((1ULL << 17) - 1);
+	out[(i * 1) + (0 * 32) + 928] = tmp_0;
+	tmp_0                         = (register_0 >> 30) & ((1ULL << 2) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 512); // packed word 16 (last word read)
+	tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 2;
+	out[(i * 1) + (0 * 32) + 960] = tmp_0;
+	tmp_0                         = (register_0 >> 15) & ((1ULL << 17) - 1);
+	out[(i * 1) + (0 * 32) + 992] = tmp_0; // 32nd and final value for this thread
+}
+inline __device__ void unpack_18bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) { // Per thread: reads 18 packed 32-bit words (in[i + 32*k]) and writes 32 unpacked 18-bit values (out[i + 32*j]).
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p); // same-type cast: plain alias of a_out_p
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p); // same-type cast: plain alias of a_in_p
+	[[maybe_unused]] uint32_t register_0; // most recently loaded packed input word
+	[[maybe_unused]] uint32_t tmp_0; // value currently being assembled
+	[[maybe_unused]] uint32_t base_0 = 0ULL; // never read in this function
+
+	int i = threadIdx.x; // THREAD INDEX: added to every in/out offset below
+
+	register_0                  = *(in + (0 * 32) + (i * 1) + 0); // packed word 0
+	tmp_0                       = (register_0) & ((1ULL << 18) - 1); // mask is 64-bit (1ULL); result narrows to uint32_t on assignment
+	out[(i * 1) + (0 * 32) + 0] = tmp_0;
+	tmp_0                       = (register_0 >> 18) & ((1ULL << 14) - 1); // top 14 bits of word 0 are the low bits of a value that spans two words
+	register_0                  = *(in + (0 * 32) + (i * 1) + 32); // packed word 1 (must be loaded after the extraction above)
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 14; // low 4 bits of word 1 supply the high part of the 18-bit value
+	out[(i * 1) + (0 * 32) + 32] = tmp_0;
+	tmp_0                        = (register_0 >> 4) & ((1ULL << 18) - 1);
+	out[(i * 1) + (0 * 32) + 64] = tmp_0;
+	tmp_0                        = (register_0 >> 22) & ((1ULL << 10) - 1);
+	register_0                   = *(in + (0 * 32) + (i * 1) + 64); // packed word 2
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 10;
+	out[(i * 1) + (0 * 32) + 96]  = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 18) - 1);
+	out[(i * 1) + (0 * 32) + 128] = tmp_0;
+	tmp_0                         = (register_0 >> 26) & ((1ULL << 6) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 96); // packed word 3
+	tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 6;
+	out[(i * 1) + (0 * 32) + 160] = tmp_0;
+	tmp_0                         = (register_0 >> 12) & ((1ULL << 18) - 1);
+	out[(i * 1) + (0 * 32) + 192] = tmp_0;
+	tmp_0                         = (register_0 >> 30) & ((1ULL << 2) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 128); // packed word 4
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 2;
+	out[(i * 1) + (0 * 32) + 224] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 160); // packed word 5
+	tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 16;
+	out[(i * 1) + (0 * 32) + 256] = tmp_0;
+	tmp_0                         = (register_0 >> 2) & ((1ULL << 18) - 1);
+	out[(i * 1) + (0 * 32) + 288] = tmp_0;
+	tmp_0                         = (register_0 >> 20) & ((1ULL << 12) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 192); // packed word 6
+	tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 12;
+	out[(i * 1) + (0 * 32) + 320] = tmp_0;
+	tmp_0                         = (register_0 >> 6) & ((1ULL << 18) - 1);
+	out[(i * 1) + (0 * 32) + 352] = tmp_0;
+	tmp_0                         = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 224); // packed word 7
+	tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 8;
+	out[(i * 1) + (0 * 32) + 384] = tmp_0;
+	tmp_0                         = (register_0 >> 10) & ((1ULL << 18) - 1);
+	out[(i * 1) + (0 * 32) + 416] = tmp_0;
+	tmp_0                         = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 256); // packed word 8
+	tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 4;
+	out[(i * 1) + (0 * 32) + 448] = tmp_0;
+	tmp_0                         = (register_0 >> 14) & ((1ULL << 18) - 1);
+	out[(i * 1) + (0 * 32) + 480] = tmp_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 288); // packed word 9: value boundary is word-aligned again, pattern repeats
+	tmp_0                         = (register_0) & ((1ULL << 18) - 1);
+	out[(i * 1) + (0 * 32) + 512] = tmp_0;
+	tmp_0                         = (register_0 >> 18) & ((1ULL << 14) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 320); // packed word 10
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 14;
+	out[(i * 1) + (0 * 32) + 544] = tmp_0;
+	tmp_0                         = (register_0 >> 4) & ((1ULL << 18) - 1);
+	out[(i * 1) + (0 * 32) + 576] = tmp_0;
+	tmp_0                         = (register_0 >> 22) & ((1ULL << 10) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 352); // packed word 11
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 10;
+	out[(i * 1) + (0 * 32) + 608] = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 18) - 1);
+	out[(i * 1) + (0 * 32) + 640] = tmp_0;
+	tmp_0                         = (register_0 >> 26) & ((1ULL << 6) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 384); // packed word 12
+	tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 6;
+	out[(i * 1) + (0 * 32) + 672] = tmp_0;
+	tmp_0                         = (register_0 >> 12) & ((1ULL << 18) - 1);
+	out[(i * 1) + (0 * 32) + 704] = tmp_0;
+	tmp_0                         = (register_0 >> 30) & ((1ULL << 2) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 416); // packed word 13
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 2;
+	out[(i * 1) + (0 * 32) + 736] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 448); // packed word 14
+	tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 16;
+	out[(i * 1) + (0 * 32) + 768] = tmp_0;
+	tmp_0                         = (register_0 >> 2) & ((1ULL << 18) - 1);
+	out[(i * 1) + (0 * 32) + 800] = tmp_0;
+	tmp_0                         = (register_0 >> 20) & ((1ULL << 12) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 480); // packed word 15
+	tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 12;
+	out[(i * 1) + (0 * 32) + 832] = tmp_0;
+	tmp_0                         = (register_0 >> 6) & ((1ULL << 18) - 1);
+	out[(i * 1) + (0 * 32) + 864] = tmp_0;
+	tmp_0                         = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 512); // packed word 16
+	tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 8;
+	out[(i * 1) + (0 * 32) + 896] = tmp_0;
+	tmp_0                         = (register_0 >> 10) & ((1ULL << 18) - 1);
+	out[(i * 1) + (0 * 32) + 928] = tmp_0;
+	tmp_0                         = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 544); // packed word 17 (last word read)
+	tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 4;
+	out[(i * 1) + (0 * 32) + 960] = tmp_0;
+	tmp_0                         = (register_0 >> 14) & ((1ULL << 18) - 1);
+	out[(i * 1) + (0 * 32) + 992] = tmp_0; // 32nd and final value for this thread
+}
+inline __device__ void unpack_19bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) {
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p);
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p);
+	[[maybe_unused]] uint32_t register_0;
+	[[maybe_unused]] uint32_t tmp_0;
+	[[maybe_unused]] uint32_t base_0 = 0ULL;
+
+	int i = threadIdx.x; // THREAD INDEX
+
+	register_0                  = *(in + (0 * 32) + (i * 1) + 0);
+	tmp_0                       = (register_0) & ((1ULL << 19) - 1);
+	out[(i * 1) + (0 * 32) + 0] = tmp_0;
+	tmp_0                       = (register_0 >> 19) & ((1ULL << 13) - 1);
+	register_0                  = *(in + (0 * 32) + (i * 1) + 32);
+	tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 13;
+	out[(i * 1) + (0 * 32) + 32] = tmp_0;
+	tmp_0                        = (register_0 >> 6) & ((1ULL << 19) - 1);
+	out[(i * 1) + (0 * 32) + 64] = tmp_0;
+	tmp_0                        = (register_0 >> 25) & ((1ULL << 7) - 1);
+	register_0                   = *(in + (0 * 32) + (i * 1) + 64);
+	tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 7;
+	out[(i * 1) + (0 * 32) + 96]  = tmp_0;
+	tmp_0                         = (register_0 >> 12) & ((1ULL << 19) - 1);
+	out[(i * 1) + (0 * 32) + 128] = tmp_0;
+	tmp_0                         = (register_0 >> 31) & ((1ULL << 1) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 96);
+	tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 1;
+	out[(i * 1) + (0 * 32) + 160] = tmp_0;
+	tmp_0                         = (register_0 >> 18) & ((1ULL << 14) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 128);
+	tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 14;
+	out[(i * 1) + (0 * 32) + 192] = tmp_0;
+	tmp_0                         = (register_0 >> 5) & ((1ULL << 19) - 1);
+	out[(i * 1) + (0 * 32) + 224] = tmp_0;
+	tmp_0                         = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 160);
+	tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 8;
+	out[(i * 1) + (0 * 32) + 256] = tmp_0;
+	tmp_0                         = (register_0 >> 11) & ((1ULL << 19) - 1);
+	out[(i * 1) + (0 * 32) + 288] = tmp_0;
+	tmp_0                         = (register_0 >> 30) & ((1ULL << 2) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 192);
+	tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 2;
+	out[(i * 1) + (0 * 32) + 320] = tmp_0;
+	tmp_0                         = (register_0 >> 17) & ((1ULL << 15) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 224);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 15;
+	out[(i * 1) + (0 * 32) + 352] = tmp_0;
+	tmp_0                         = (register_0 >> 4) & ((1ULL << 19) - 1);
+	out[(i * 1) + (0 * 32) + 384] = tmp_0;
+	tmp_0                         = (register_0 >> 23) & ((1ULL << 9) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 256);
+	tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 9;
+	out[(i * 1) + (0 * 32) + 416] = tmp_0;
+	tmp_0                         = (register_0 >> 10) & ((1ULL << 19) - 1);
+	out[(i * 1) + (0 * 32) + 448] = tmp_0;
+	tmp_0                         = (register_0 >> 29) & ((1ULL << 3) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 288);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 3;
+	out[(i * 1) + (0 * 32) + 480] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 320);
+	tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 16;
+	out[(i * 1) + (0 * 32) + 512] = tmp_0;
+	tmp_0                         = (register_0 >> 3) & ((1ULL << 19) - 1);
+	out[(i * 1) + (0 * 32) + 544] = tmp_0;
+	tmp_0                         = (register_0 >> 22) & ((1ULL << 10) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 352);
+	tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 10;
+	out[(i * 1) + (0 * 32) + 576] = tmp_0;
+	tmp_0                         = (register_0 >> 9) & ((1ULL << 19) - 1);
+	out[(i * 1) + (0 * 32) + 608] = tmp_0;
+	tmp_0                         = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 384);
+	tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 4;
+	out[(i * 1) + (0 * 32) + 640] = tmp_0;
+	tmp_0                         = (register_0 >> 15) & ((1ULL << 17) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 416);
+	tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 17;
+	out[(i * 1) + (0 * 32) + 672] = tmp_0;
+	tmp_0                         = (register_0 >> 2) & ((1ULL << 19) - 1);
+	out[(i * 1) + (0 * 32) + 704] = tmp_0;
+	tmp_0                         = (register_0 >> 21) & ((1ULL << 11) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 448);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 11;
+	out[(i * 1) + (0 * 32) + 736] = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 19) - 1);
+	out[(i * 1) + (0 * 32) + 768] = tmp_0;
+	tmp_0                         = (register_0 >> 27) & ((1ULL << 5) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 480);
+	tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 5;
+	out[(i * 1) + (0 * 32) + 800] = tmp_0;
+	tmp_0                         = (register_0 >> 14) & ((1ULL << 18) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 512);
+	tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 18;
+	out[(i * 1) + (0 * 32) + 832] = tmp_0;
+	tmp_0                         = (register_0 >> 1) & ((1ULL << 19) - 1);
+	out[(i * 1) + (0 * 32) + 864] = tmp_0;
+	tmp_0                         = (register_0 >> 20) & ((1ULL << 12) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 544);
+	tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 12;
+	out[(i * 1) + (0 * 32) + 896] = tmp_0;
+	tmp_0                         = (register_0 >> 7) & ((1ULL << 19) - 1);
+	out[(i * 1) + (0 * 32) + 928] = tmp_0;
+	tmp_0                         = (register_0 >> 26) & ((1ULL << 6) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 576);
+	tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 6;
+	out[(i * 1) + (0 * 32) + 960] = tmp_0;
+	tmp_0                         = (register_0 >> 13) & ((1ULL << 19) - 1);
+	out[(i * 1) + (0 * 32) + 992] = tmp_0;
+}
+inline __device__ void unpack_20bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) {
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p); // cast is a no-op: a_out_p is already uint32_t*
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p); // cast is a no-op: a_in_p is already const uint32_t*
+	[[maybe_unused]] uint32_t register_0; // the packed input word currently being consumed
+	[[maybe_unused]] uint32_t tmp_0; // the output value being assembled (may carry bits across a word reload)
+	[[maybe_unused]] uint32_t base_0 = 0ULL; // never referenced below
+	// Per thread: unpacks 32 values of 20 bits each from 20 packed words, in[i + 32*k] for k = 0..19, into out[i + 32*j] for j = 0..31.
+	int i = threadIdx.x; // offset added to every in/out index; NOTE(review): the stride of 32 suggests i in [0, 32) is expected - unchecked here, confirm at the launch site
+	// Fields are taken LSB-first; a value that straddles two words is stitched together in tmp_0. No bounds checks are performed on in or out.
+	register_0                  = *(in + (0 * 32) + (i * 1) + 0);
+	tmp_0                       = (register_0) & ((1ULL << 20) - 1);
+	out[(i * 1) + (0 * 32) + 0] = tmp_0;
+	tmp_0                       = (register_0 >> 20) & ((1ULL << 12) - 1); // top 12 bits of word 0 = low part of value 1
+	register_0                  = *(in + (0 * 32) + (i * 1) + 32);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 12; // low 8 bits of word 1 complete value 1
+	out[(i * 1) + (0 * 32) + 32] = tmp_0;
+	tmp_0                        = (register_0 >> 8) & ((1ULL << 20) - 1);
+	out[(i * 1) + (0 * 32) + 64] = tmp_0;
+	tmp_0                        = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0                   = *(in + (0 * 32) + (i * 1) + 64);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 4;
+	out[(i * 1) + (0 * 32) + 96] = tmp_0;
+	tmp_0                        = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0                   = *(in + (0 * 32) + (i * 1) + 96);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 16;
+	out[(i * 1) + (0 * 32) + 128] = tmp_0;
+	tmp_0                         = (register_0 >> 4) & ((1ULL << 20) - 1);
+	out[(i * 1) + (0 * 32) + 160] = tmp_0;
+	tmp_0                         = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 128);
+	tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 8;
+	out[(i * 1) + (0 * 32) + 192] = tmp_0;
+	tmp_0                         = (register_0 >> 12) & ((1ULL << 20) - 1);
+	out[(i * 1) + (0 * 32) + 224] = tmp_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 160); // value 8 starts on a word boundary (8 * 20 = 5 * 32): no carried bits
+	tmp_0                         = (register_0) & ((1ULL << 20) - 1);
+	out[(i * 1) + (0 * 32) + 256] = tmp_0;
+	tmp_0                         = (register_0 >> 20) & ((1ULL << 12) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 192);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 12;
+	out[(i * 1) + (0 * 32) + 288] = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 20) - 1);
+	out[(i * 1) + (0 * 32) + 320] = tmp_0;
+	tmp_0                         = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 224);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 4;
+	out[(i * 1) + (0 * 32) + 352] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 256);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 16;
+	out[(i * 1) + (0 * 32) + 384] = tmp_0;
+	tmp_0                         = (register_0 >> 4) & ((1ULL << 20) - 1);
+	out[(i * 1) + (0 * 32) + 416] = tmp_0;
+	tmp_0                         = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 288);
+	tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 8;
+	out[(i * 1) + (0 * 32) + 448] = tmp_0;
+	tmp_0                         = (register_0 >> 12) & ((1ULL << 20) - 1);
+	out[(i * 1) + (0 * 32) + 480] = tmp_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 320); // value 16 is word-aligned again: same 5-word pattern repeats
+	tmp_0                         = (register_0) & ((1ULL << 20) - 1);
+	out[(i * 1) + (0 * 32) + 512] = tmp_0;
+	tmp_0                         = (register_0 >> 20) & ((1ULL << 12) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 352);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 12;
+	out[(i * 1) + (0 * 32) + 544] = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 20) - 1);
+	out[(i * 1) + (0 * 32) + 576] = tmp_0;
+	tmp_0                         = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 384);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 4;
+	out[(i * 1) + (0 * 32) + 608] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 416);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 16;
+	out[(i * 1) + (0 * 32) + 640] = tmp_0;
+	tmp_0                         = (register_0 >> 4) & ((1ULL << 20) - 1);
+	out[(i * 1) + (0 * 32) + 672] = tmp_0;
+	tmp_0                         = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 448);
+	tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 8;
+	out[(i * 1) + (0 * 32) + 704] = tmp_0;
+	tmp_0                         = (register_0 >> 12) & ((1ULL << 20) - 1);
+	out[(i * 1) + (0 * 32) + 736] = tmp_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 480); // value 24 is word-aligned: last repetition of the 5-word pattern
+	tmp_0                         = (register_0) & ((1ULL << 20) - 1);
+	out[(i * 1) + (0 * 32) + 768] = tmp_0;
+	tmp_0                         = (register_0 >> 20) & ((1ULL << 12) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 512);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 12;
+	out[(i * 1) + (0 * 32) + 800] = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 20) - 1);
+	out[(i * 1) + (0 * 32) + 832] = tmp_0;
+	tmp_0                         = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 544);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 4;
+	out[(i * 1) + (0 * 32) + 864] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 576);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 16;
+	out[(i * 1) + (0 * 32) + 896] = tmp_0;
+	tmp_0                         = (register_0 >> 4) & ((1ULL << 20) - 1);
+	out[(i * 1) + (0 * 32) + 928] = tmp_0;
+	tmp_0                         = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 608);
+	tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 8;
+	out[(i * 1) + (0 * 32) + 960] = tmp_0;
+	tmp_0                         = (register_0 >> 12) & ((1ULL << 20) - 1); // top 20 bits of the last word = value 31
+	out[(i * 1) + (0 * 32) + 992] = tmp_0;
+}
+inline __device__ void unpack_21bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) {
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p); // cast is a no-op: a_out_p is already uint32_t*
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p); // cast is a no-op: a_in_p is already const uint32_t*
+	[[maybe_unused]] uint32_t register_0; // the packed input word currently being consumed
+	[[maybe_unused]] uint32_t tmp_0; // the output value being assembled (may carry bits across a word reload)
+	[[maybe_unused]] uint32_t base_0 = 0ULL; // never referenced below
+	// Per thread: unpacks 32 values of 21 bits each from 21 packed words, in[i + 32*k] for k = 0..20, into out[i + 32*j] for j = 0..31.
+	int i = threadIdx.x; // offset added to every in/out index; NOTE(review): the stride of 32 suggests i in [0, 32) is expected - unchecked here, confirm at the launch site
+	// Fields are taken LSB-first. Only value 0 starts on a word boundary, so every later word load is merged with bits carried in tmp_0. No bounds checks.
+	register_0                  = *(in + (0 * 32) + (i * 1) + 0);
+	tmp_0                       = (register_0) & ((1ULL << 21) - 1);
+	out[(i * 1) + (0 * 32) + 0] = tmp_0;
+	tmp_0                       = (register_0 >> 21) & ((1ULL << 11) - 1); // top 11 bits of word 0 = low part of value 1
+	register_0                  = *(in + (0 * 32) + (i * 1) + 32);
+	tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 11; // low 10 bits of word 1 complete value 1
+	out[(i * 1) + (0 * 32) + 32] = tmp_0;
+	tmp_0                        = (register_0 >> 10) & ((1ULL << 21) - 1);
+	out[(i * 1) + (0 * 32) + 64] = tmp_0;
+	tmp_0                        = (register_0 >> 31) & ((1ULL << 1) - 1);
+	register_0                   = *(in + (0 * 32) + (i * 1) + 64);
+	tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 1;
+	out[(i * 1) + (0 * 32) + 96] = tmp_0;
+	tmp_0                        = (register_0 >> 20) & ((1ULL << 12) - 1);
+	register_0                   = *(in + (0 * 32) + (i * 1) + 96);
+	tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 12;
+	out[(i * 1) + (0 * 32) + 128] = tmp_0;
+	tmp_0                         = (register_0 >> 9) & ((1ULL << 21) - 1);
+	out[(i * 1) + (0 * 32) + 160] = tmp_0;
+	tmp_0                         = (register_0 >> 30) & ((1ULL << 2) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 128);
+	tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 2;
+	out[(i * 1) + (0 * 32) + 192] = tmp_0;
+	tmp_0                         = (register_0 >> 19) & ((1ULL << 13) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 160);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 13;
+	out[(i * 1) + (0 * 32) + 224] = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 21) - 1);
+	out[(i * 1) + (0 * 32) + 256] = tmp_0;
+	tmp_0                         = (register_0 >> 29) & ((1ULL << 3) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 192);
+	tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 3;
+	out[(i * 1) + (0 * 32) + 288] = tmp_0;
+	tmp_0                         = (register_0 >> 18) & ((1ULL << 14) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 224);
+	tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 14;
+	out[(i * 1) + (0 * 32) + 320] = tmp_0;
+	tmp_0                         = (register_0 >> 7) & ((1ULL << 21) - 1);
+	out[(i * 1) + (0 * 32) + 352] = tmp_0;
+	tmp_0                         = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 256);
+	tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 4;
+	out[(i * 1) + (0 * 32) + 384] = tmp_0;
+	tmp_0                         = (register_0 >> 17) & ((1ULL << 15) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 288);
+	tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 15;
+	out[(i * 1) + (0 * 32) + 416] = tmp_0;
+	tmp_0                         = (register_0 >> 6) & ((1ULL << 21) - 1);
+	out[(i * 1) + (0 * 32) + 448] = tmp_0;
+	tmp_0                         = (register_0 >> 27) & ((1ULL << 5) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 320);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 5;
+	out[(i * 1) + (0 * 32) + 480] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 352);
+	tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 16;
+	out[(i * 1) + (0 * 32) + 512] = tmp_0;
+	tmp_0                         = (register_0 >> 5) & ((1ULL << 21) - 1);
+	out[(i * 1) + (0 * 32) + 544] = tmp_0;
+	tmp_0                         = (register_0 >> 26) & ((1ULL << 6) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 384);
+	tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 6;
+	out[(i * 1) + (0 * 32) + 576] = tmp_0;
+	tmp_0                         = (register_0 >> 15) & ((1ULL << 17) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 416);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 17;
+	out[(i * 1) + (0 * 32) + 608] = tmp_0;
+	tmp_0                         = (register_0 >> 4) & ((1ULL << 21) - 1);
+	out[(i * 1) + (0 * 32) + 640] = tmp_0;
+	tmp_0                         = (register_0 >> 25) & ((1ULL << 7) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 448);
+	tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 7;
+	out[(i * 1) + (0 * 32) + 672] = tmp_0;
+	tmp_0                         = (register_0 >> 14) & ((1ULL << 18) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 480);
+	tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 18;
+	out[(i * 1) + (0 * 32) + 704] = tmp_0;
+	tmp_0                         = (register_0 >> 3) & ((1ULL << 21) - 1);
+	out[(i * 1) + (0 * 32) + 736] = tmp_0;
+	tmp_0                         = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 512);
+	tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 8;
+	out[(i * 1) + (0 * 32) + 768] = tmp_0;
+	tmp_0                         = (register_0 >> 13) & ((1ULL << 19) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 544);
+	tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 19;
+	out[(i * 1) + (0 * 32) + 800] = tmp_0;
+	tmp_0                         = (register_0 >> 2) & ((1ULL << 21) - 1);
+	out[(i * 1) + (0 * 32) + 832] = tmp_0;
+	tmp_0                         = (register_0 >> 23) & ((1ULL << 9) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 576);
+	tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 9;
+	out[(i * 1) + (0 * 32) + 864] = tmp_0;
+	tmp_0                         = (register_0 >> 12) & ((1ULL << 20) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 608);
+	tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 20;
+	out[(i * 1) + (0 * 32) + 896] = tmp_0;
+	tmp_0                         = (register_0 >> 1) & ((1ULL << 21) - 1);
+	out[(i * 1) + (0 * 32) + 928] = tmp_0;
+	tmp_0                         = (register_0 >> 22) & ((1ULL << 10) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 640);
+	tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 10;
+	out[(i * 1) + (0 * 32) + 960] = tmp_0;
+	tmp_0                         = (register_0 >> 11) & ((1ULL << 21) - 1); // top 21 bits of the last word = value 31
+	out[(i * 1) + (0 * 32) + 992] = tmp_0;
+}
+inline __device__ void unpack_22bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) {
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p); // cast is a no-op: a_out_p is already uint32_t*
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p); // cast is a no-op: a_in_p is already const uint32_t*
+	[[maybe_unused]] uint32_t register_0; // the packed input word currently being consumed
+	[[maybe_unused]] uint32_t tmp_0; // the output value being assembled (may carry bits across a word reload)
+	[[maybe_unused]] uint32_t base_0 = 0ULL; // never referenced below
+	// Per thread: unpacks 32 values of 22 bits each from 22 packed words, in[i + 32*k] for k = 0..21, into out[i + 32*j] for j = 0..31.
+	int i = threadIdx.x; // offset added to every in/out index; NOTE(review): the stride of 32 suggests i in [0, 32) is expected - unchecked here, confirm at the launch site
+	// Fields are taken LSB-first; a value that straddles two words is stitched together in tmp_0. No bounds checks are performed on in or out.
+	register_0                  = *(in + (0 * 32) + (i * 1) + 0);
+	tmp_0                       = (register_0) & ((1ULL << 22) - 1);
+	out[(i * 1) + (0 * 32) + 0] = tmp_0;
+	tmp_0                       = (register_0 >> 22) & ((1ULL << 10) - 1); // top 10 bits of word 0 = low part of value 1
+	register_0                  = *(in + (0 * 32) + (i * 1) + 32);
+	tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 10; // low 12 bits of word 1 complete value 1
+	out[(i * 1) + (0 * 32) + 32] = tmp_0;
+	tmp_0                        = (register_0 >> 12) & ((1ULL << 20) - 1);
+	register_0                   = *(in + (0 * 32) + (i * 1) + 64);
+	tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 20;
+	out[(i * 1) + (0 * 32) + 64] = tmp_0;
+	tmp_0                        = (register_0 >> 2) & ((1ULL << 22) - 1);
+	out[(i * 1) + (0 * 32) + 96] = tmp_0;
+	tmp_0                        = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0                   = *(in + (0 * 32) + (i * 1) + 96);
+	tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 8;
+	out[(i * 1) + (0 * 32) + 128] = tmp_0;
+	tmp_0                         = (register_0 >> 14) & ((1ULL << 18) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 128);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 18;
+	out[(i * 1) + (0 * 32) + 160] = tmp_0;
+	tmp_0                         = (register_0 >> 4) & ((1ULL << 22) - 1);
+	out[(i * 1) + (0 * 32) + 192] = tmp_0;
+	tmp_0                         = (register_0 >> 26) & ((1ULL << 6) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 160);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 6;
+	out[(i * 1) + (0 * 32) + 224] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 192);
+	tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 16;
+	out[(i * 1) + (0 * 32) + 256] = tmp_0;
+	tmp_0                         = (register_0 >> 6) & ((1ULL << 22) - 1);
+	out[(i * 1) + (0 * 32) + 288] = tmp_0;
+	tmp_0                         = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 224);
+	tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 4;
+	out[(i * 1) + (0 * 32) + 320] = tmp_0;
+	tmp_0                         = (register_0 >> 18) & ((1ULL << 14) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 256);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 14;
+	out[(i * 1) + (0 * 32) + 352] = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 22) - 1);
+	out[(i * 1) + (0 * 32) + 384] = tmp_0;
+	tmp_0                         = (register_0 >> 30) & ((1ULL << 2) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 288);
+	tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 2;
+	out[(i * 1) + (0 * 32) + 416] = tmp_0;
+	tmp_0                         = (register_0 >> 20) & ((1ULL << 12) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 320);
+	tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 12;
+	out[(i * 1) + (0 * 32) + 448] = tmp_0;
+	tmp_0                         = (register_0 >> 10) & ((1ULL << 22) - 1);
+	out[(i * 1) + (0 * 32) + 480] = tmp_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 352); // value 16 starts on a word boundary (16 * 22 = 11 * 32): no carried bits, pattern repeats
+	tmp_0                         = (register_0) & ((1ULL << 22) - 1);
+	out[(i * 1) + (0 * 32) + 512] = tmp_0;
+	tmp_0                         = (register_0 >> 22) & ((1ULL << 10) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 384);
+	tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 10;
+	out[(i * 1) + (0 * 32) + 544] = tmp_0;
+	tmp_0                         = (register_0 >> 12) & ((1ULL << 20) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 416);
+	tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 20;
+	out[(i * 1) + (0 * 32) + 576] = tmp_0;
+	tmp_0                         = (register_0 >> 2) & ((1ULL << 22) - 1);
+	out[(i * 1) + (0 * 32) + 608] = tmp_0;
+	tmp_0                         = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 448);
+	tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 8;
+	out[(i * 1) + (0 * 32) + 640] = tmp_0;
+	tmp_0                         = (register_0 >> 14) & ((1ULL << 18) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 480);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 18;
+	out[(i * 1) + (0 * 32) + 672] = tmp_0;
+	tmp_0                         = (register_0 >> 4) & ((1ULL << 22) - 1);
+	out[(i * 1) + (0 * 32) + 704] = tmp_0;
+	tmp_0                         = (register_0 >> 26) & ((1ULL << 6) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 512);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 6;
+	out[(i * 1) + (0 * 32) + 736] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 544);
+	tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 16;
+	out[(i * 1) + (0 * 32) + 768] = tmp_0;
+	tmp_0                         = (register_0 >> 6) & ((1ULL << 22) - 1);
+	out[(i * 1) + (0 * 32) + 800] = tmp_0;
+	tmp_0                         = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 576);
+	tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 4;
+	out[(i * 1) + (0 * 32) + 832] = tmp_0;
+	tmp_0                         = (register_0 >> 18) & ((1ULL << 14) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 608);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 14;
+	out[(i * 1) + (0 * 32) + 864] = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 22) - 1);
+	out[(i * 1) + (0 * 32) + 896] = tmp_0;
+	tmp_0                         = (register_0 >> 30) & ((1ULL << 2) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 640);
+	tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 2;
+	out[(i * 1) + (0 * 32) + 928] = tmp_0;
+	tmp_0                         = (register_0 >> 20) & ((1ULL << 12) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 672);
+	tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 12;
+	out[(i * 1) + (0 * 32) + 960] = tmp_0;
+	tmp_0                         = (register_0 >> 10) & ((1ULL << 22) - 1); // top 22 bits of the last word = value 31
+	out[(i * 1) + (0 * 32) + 992] = tmp_0;
+}
+inline __device__ void unpack_23bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) {
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p); // cast is a no-op: a_out_p is already uint32_t*
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p); // cast is a no-op: a_in_p is already const uint32_t*
+	[[maybe_unused]] uint32_t register_0; // the packed input word currently being consumed
+	[[maybe_unused]] uint32_t tmp_0; // the output value being assembled (may carry bits across a word reload)
+	[[maybe_unused]] uint32_t base_0 = 0ULL; // never referenced below
+	// Per thread: unpacks 32 values of 23 bits each from 23 packed words, in[i + 32*k] for k = 0..22, into out[i + 32*j] for j = 0..31.
+	int i = threadIdx.x; // offset added to every in/out index; NOTE(review): the stride of 32 suggests i in [0, 32) is expected - unchecked here, confirm at the launch site
+	// Fields are taken LSB-first. Only value 0 starts on a word boundary, so every later word load is merged with bits carried in tmp_0. No bounds checks.
+	register_0                  = *(in + (0 * 32) + (i * 1) + 0);
+	tmp_0                       = (register_0) & ((1ULL << 23) - 1);
+	out[(i * 1) + (0 * 32) + 0] = tmp_0;
+	tmp_0                       = (register_0 >> 23) & ((1ULL << 9) - 1); // top 9 bits of word 0 = low part of value 1
+	register_0                  = *(in + (0 * 32) + (i * 1) + 32);
+	tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 9; // low 14 bits of word 1 complete value 1
+	out[(i * 1) + (0 * 32) + 32] = tmp_0;
+	tmp_0                        = (register_0 >> 14) & ((1ULL << 18) - 1);
+	register_0                   = *(in + (0 * 32) + (i * 1) + 64);
+	tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 18;
+	out[(i * 1) + (0 * 32) + 64] = tmp_0;
+	tmp_0                        = (register_0 >> 5) & ((1ULL << 23) - 1);
+	out[(i * 1) + (0 * 32) + 96] = tmp_0;
+	tmp_0                        = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0                   = *(in + (0 * 32) + (i * 1) + 96);
+	tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 4;
+	out[(i * 1) + (0 * 32) + 128] = tmp_0;
+	tmp_0                         = (register_0 >> 19) & ((1ULL << 13) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 128);
+	tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 13;
+	out[(i * 1) + (0 * 32) + 160] = tmp_0;
+	tmp_0                         = (register_0 >> 10) & ((1ULL << 22) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 160);
+	tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 22;
+	out[(i * 1) + (0 * 32) + 192] = tmp_0;
+	tmp_0                         = (register_0 >> 1) & ((1ULL << 23) - 1);
+	out[(i * 1) + (0 * 32) + 224] = tmp_0;
+	tmp_0                         = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 192);
+	tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 8;
+	out[(i * 1) + (0 * 32) + 256] = tmp_0;
+	tmp_0                         = (register_0 >> 15) & ((1ULL << 17) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 224);
+	tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 17;
+	out[(i * 1) + (0 * 32) + 288] = tmp_0;
+	tmp_0                         = (register_0 >> 6) & ((1ULL << 23) - 1);
+	out[(i * 1) + (0 * 32) + 320] = tmp_0;
+	tmp_0                         = (register_0 >> 29) & ((1ULL << 3) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 256);
+	tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 3;
+	out[(i * 1) + (0 * 32) + 352] = tmp_0;
+	tmp_0                         = (register_0 >> 20) & ((1ULL << 12) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 288);
+	tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 12;
+	out[(i * 1) + (0 * 32) + 384] = tmp_0;
+	tmp_0                         = (register_0 >> 11) & ((1ULL << 21) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 320);
+	tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 21;
+	out[(i * 1) + (0 * 32) + 416] = tmp_0;
+	tmp_0                         = (register_0 >> 2) & ((1ULL << 23) - 1);
+	out[(i * 1) + (0 * 32) + 448] = tmp_0;
+	tmp_0                         = (register_0 >> 25) & ((1ULL << 7) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 352);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 7;
+	out[(i * 1) + (0 * 32) + 480] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 384);
+	tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 16;
+	out[(i * 1) + (0 * 32) + 512] = tmp_0;
+	tmp_0                         = (register_0 >> 7) & ((1ULL << 23) - 1);
+	out[(i * 1) + (0 * 32) + 544] = tmp_0;
+	tmp_0                         = (register_0 >> 30) & ((1ULL << 2) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 416);
+	tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 2;
+	out[(i * 1) + (0 * 32) + 576] = tmp_0;
+	tmp_0                         = (register_0 >> 21) & ((1ULL << 11) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 448);
+	tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 11;
+	out[(i * 1) + (0 * 32) + 608] = tmp_0;
+	tmp_0                         = (register_0 >> 12) & ((1ULL << 20) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 480);
+	tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 20;
+	out[(i * 1) + (0 * 32) + 640] = tmp_0;
+	tmp_0                         = (register_0 >> 3) & ((1ULL << 23) - 1);
+	out[(i * 1) + (0 * 32) + 672] = tmp_0;
+	tmp_0                         = (register_0 >> 26) & ((1ULL << 6) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 512);
+	tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 6;
+	out[(i * 1) + (0 * 32) + 704] = tmp_0;
+	tmp_0                         = (register_0 >> 17) & ((1ULL << 15) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 544);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 15;
+	out[(i * 1) + (0 * 32) + 736] = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 23) - 1);
+	out[(i * 1) + (0 * 32) + 768] = tmp_0;
+	tmp_0                         = (register_0 >> 31) & ((1ULL << 1) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 576);
+	tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 1;
+	out[(i * 1) + (0 * 32) + 800] = tmp_0;
+	tmp_0                         = (register_0 >> 22) & ((1ULL << 10) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 608);
+	tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 10;
+	out[(i * 1) + (0 * 32) + 832] = tmp_0;
+	tmp_0                         = (register_0 >> 13) & ((1ULL << 19) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 640);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 19;
+	out[(i * 1) + (0 * 32) + 864] = tmp_0;
+	tmp_0                         = (register_0 >> 4) & ((1ULL << 23) - 1);
+	out[(i * 1) + (0 * 32) + 896] = tmp_0;
+	tmp_0                         = (register_0 >> 27) & ((1ULL << 5) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 672);
+	tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 5;
+	out[(i * 1) + (0 * 32) + 928] = tmp_0;
+	tmp_0                         = (register_0 >> 18) & ((1ULL << 14) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 704);
+	tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 14;
+	out[(i * 1) + (0 * 32) + 960] = tmp_0;
+	tmp_0                         = (register_0 >> 9) & ((1ULL << 23) - 1); // top 23 bits of the last word = value 31
+	out[(i * 1) + (0 * 32) + 992] = tmp_0;
+}
+inline __device__ void unpack_24bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) {
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p); // same pointer as a_out_p; written at out[i + 32*k], k = 0..31
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p); // same pointer as a_in_p; read at in[i + 32*w], w = 0..23
+	[[maybe_unused]] uint32_t register_0; // most recently loaded packed word
+	[[maybe_unused]] uint32_t tmp_0; // value being assembled; the 1ULL masks are 64-bit and narrow to 32 bits when stored here
+	[[maybe_unused]] uint32_t base_0 = 0ULL; // never read in this function
+	// Unpacks 32 values of 24 bits each for column i: 24 packed words in[i + 32*w] become 32 words out[i + 32*k].
+	int i = threadIdx.x; // THREAD INDEX; NOTE(review): the stride of 32 suggests threadIdx.x is expected in 0..31 - confirm at the launch site
+	// Values sit LSB-first in the word sequence; one that crosses a word boundary is OR-ed together from both words.
+	register_0                  = *(in + (0 * 32) + (i * 1) + 0); // word 0
+	tmp_0                       = (register_0) & ((1ULL << 24) - 1); // value 0 = bits 0..23 of word 0
+	out[(i * 1) + (0 * 32) + 0] = tmp_0;
+	tmp_0                       = (register_0 >> 24) & ((1ULL << 8) - 1); // value 1, low 8 bits = bits 24..31 of word 0
+	register_0                  = *(in + (0 * 32) + (i * 1) + 32);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8; // value 1, high 16 bits = bits 0..15 of word 1
+	out[(i * 1) + (0 * 32) + 32] = tmp_0;
+	tmp_0                        = (register_0 >> 16) & ((1ULL << 16) - 1); // value 2, low 16 bits = bits 16..31 of word 1
+	register_0                   = *(in + (0 * 32) + (i * 1) + 64);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16; // value 2, high 8 bits = bits 0..7 of word 2
+	out[(i * 1) + (0 * 32) + 64]  = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 24) - 1); // value 3 = bits 8..31 of word 2
+	out[(i * 1) + (0 * 32) + 96]  = tmp_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 96); // word 3 starts on a value boundary: the 3-word / 4-value pattern above repeats 7 more times
+	tmp_0                         = (register_0) & ((1ULL << 24) - 1);
+	out[(i * 1) + (0 * 32) + 128] = tmp_0;
+	tmp_0                         = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 128);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8;
+	out[(i * 1) + (0 * 32) + 160] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 160);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16;
+	out[(i * 1) + (0 * 32) + 192] = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 24) - 1);
+	out[(i * 1) + (0 * 32) + 224] = tmp_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 192);
+	tmp_0                         = (register_0) & ((1ULL << 24) - 1);
+	out[(i * 1) + (0 * 32) + 256] = tmp_0;
+	tmp_0                         = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 224);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8;
+	out[(i * 1) + (0 * 32) + 288] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 256);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16;
+	out[(i * 1) + (0 * 32) + 320] = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 24) - 1);
+	out[(i * 1) + (0 * 32) + 352] = tmp_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 288);
+	tmp_0                         = (register_0) & ((1ULL << 24) - 1);
+	out[(i * 1) + (0 * 32) + 384] = tmp_0;
+	tmp_0                         = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 320);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8;
+	out[(i * 1) + (0 * 32) + 416] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 352);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16;
+	out[(i * 1) + (0 * 32) + 448] = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 24) - 1);
+	out[(i * 1) + (0 * 32) + 480] = tmp_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 384);
+	tmp_0                         = (register_0) & ((1ULL << 24) - 1);
+	out[(i * 1) + (0 * 32) + 512] = tmp_0;
+	tmp_0                         = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 416);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8;
+	out[(i * 1) + (0 * 32) + 544] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 448);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16;
+	out[(i * 1) + (0 * 32) + 576] = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 24) - 1);
+	out[(i * 1) + (0 * 32) + 608] = tmp_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 480);
+	tmp_0                         = (register_0) & ((1ULL << 24) - 1);
+	out[(i * 1) + (0 * 32) + 640] = tmp_0;
+	tmp_0                         = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 512);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8;
+	out[(i * 1) + (0 * 32) + 672] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 544);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16;
+	out[(i * 1) + (0 * 32) + 704] = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 24) - 1);
+	out[(i * 1) + (0 * 32) + 736] = tmp_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 576);
+	tmp_0                         = (register_0) & ((1ULL << 24) - 1);
+	out[(i * 1) + (0 * 32) + 768] = tmp_0;
+	tmp_0                         = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 608);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8;
+	out[(i * 1) + (0 * 32) + 800] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 640);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16;
+	out[(i * 1) + (0 * 32) + 832] = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 24) - 1);
+	out[(i * 1) + (0 * 32) + 864] = tmp_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 672);
+	tmp_0                         = (register_0) & ((1ULL << 24) - 1);
+	out[(i * 1) + (0 * 32) + 896] = tmp_0;
+	tmp_0                         = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 704);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8;
+	out[(i * 1) + (0 * 32) + 928] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 736);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16;
+	out[(i * 1) + (0 * 32) + 960] = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 24) - 1); // value 31 = bits 8..31 of word 23
+	out[(i * 1) + (0 * 32) + 992] = tmp_0;
+}
+inline __device__ void unpack_25bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) {
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p); // same pointer as a_out_p; written at out[i + 32*k], k = 0..31
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p); // same pointer as a_in_p; read at in[i + 32*w], w = 0..24
+	[[maybe_unused]] uint32_t register_0; // most recently loaded packed word
+	[[maybe_unused]] uint32_t tmp_0; // value being assembled; the 1ULL masks are 64-bit and narrow to 32 bits when stored here
+	[[maybe_unused]] uint32_t base_0 = 0ULL; // never read in this function
+	// Unpacks 32 values of 25 bits each for column i: 25 packed words in[i + 32*w] become 32 words out[i + 32*k].
+	int i = threadIdx.x; // THREAD INDEX; NOTE(review): the stride of 32 suggests threadIdx.x is expected in 0..31 - confirm at the launch site
+	// Values sit LSB-first in the word sequence; every load after the first completes a value begun in the previous word.
+	register_0                  = *(in + (0 * 32) + (i * 1) + 0); // word 0
+	tmp_0                       = (register_0) & ((1ULL << 25) - 1); // value 0 = bits 0..24 of word 0
+	out[(i * 1) + (0 * 32) + 0] = tmp_0;
+	tmp_0                       = (register_0 >> 25) & ((1ULL << 7) - 1); // value 1, low 7 bits = bits 25..31 of word 0
+	register_0                  = *(in + (0 * 32) + (i * 1) + 32);
+	tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 7; // value 1, high 18 bits = bits 0..17 of word 1
+	out[(i * 1) + (0 * 32) + 32] = tmp_0;
+	tmp_0                        = (register_0 >> 18) & ((1ULL << 14) - 1);
+	register_0                   = *(in + (0 * 32) + (i * 1) + 64);
+	tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 14;
+	out[(i * 1) + (0 * 32) + 64] = tmp_0;
+	tmp_0                        = (register_0 >> 11) & ((1ULL << 21) - 1);
+	register_0                   = *(in + (0 * 32) + (i * 1) + 96);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 21;
+	out[(i * 1) + (0 * 32) + 96]  = tmp_0;
+	tmp_0                         = (register_0 >> 4) & ((1ULL << 25) - 1); // value 4 lies entirely inside word 3 (bits 4..28)
+	out[(i * 1) + (0 * 32) + 128] = tmp_0;
+	tmp_0                         = (register_0 >> 29) & ((1ULL << 3) - 1); // value 5, low 3 bits = bits 29..31 of word 3
+	register_0                    = *(in + (0 * 32) + (i * 1) + 128);
+	tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 3;
+	out[(i * 1) + (0 * 32) + 160] = tmp_0;
+	tmp_0                         = (register_0 >> 22) & ((1ULL << 10) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 160);
+	tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 10;
+	out[(i * 1) + (0 * 32) + 192] = tmp_0;
+	tmp_0                         = (register_0 >> 15) & ((1ULL << 17) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 192);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 17;
+	out[(i * 1) + (0 * 32) + 224] = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 24) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 224);
+	tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 24;
+	out[(i * 1) + (0 * 32) + 256] = tmp_0;
+	tmp_0                         = (register_0 >> 1) & ((1ULL << 25) - 1);
+	out[(i * 1) + (0 * 32) + 288] = tmp_0;
+	tmp_0                         = (register_0 >> 26) & ((1ULL << 6) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 256);
+	tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 6;
+	out[(i * 1) + (0 * 32) + 320] = tmp_0;
+	tmp_0                         = (register_0 >> 19) & ((1ULL << 13) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 288);
+	tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 13;
+	out[(i * 1) + (0 * 32) + 352] = tmp_0;
+	tmp_0                         = (register_0 >> 12) & ((1ULL << 20) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 320);
+	tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 20;
+	out[(i * 1) + (0 * 32) + 384] = tmp_0;
+	tmp_0                         = (register_0 >> 5) & ((1ULL << 25) - 1);
+	out[(i * 1) + (0 * 32) + 416] = tmp_0;
+	tmp_0                         = (register_0 >> 30) & ((1ULL << 2) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 352);
+	tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 2;
+	out[(i * 1) + (0 * 32) + 448] = tmp_0;
+	tmp_0                         = (register_0 >> 23) & ((1ULL << 9) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 384);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 9;
+	out[(i * 1) + (0 * 32) + 480] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 416);
+	tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 16;
+	out[(i * 1) + (0 * 32) + 512] = tmp_0;
+	tmp_0                         = (register_0 >> 9) & ((1ULL << 23) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 448);
+	tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 23;
+	out[(i * 1) + (0 * 32) + 544] = tmp_0;
+	tmp_0                         = (register_0 >> 2) & ((1ULL << 25) - 1);
+	out[(i * 1) + (0 * 32) + 576] = tmp_0;
+	tmp_0                         = (register_0 >> 27) & ((1ULL << 5) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 480);
+	tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 5;
+	out[(i * 1) + (0 * 32) + 608] = tmp_0;
+	tmp_0                         = (register_0 >> 20) & ((1ULL << 12) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 512);
+	tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 12;
+	out[(i * 1) + (0 * 32) + 640] = tmp_0;
+	tmp_0                         = (register_0 >> 13) & ((1ULL << 19) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 544);
+	tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 19;
+	out[(i * 1) + (0 * 32) + 672] = tmp_0;
+	tmp_0                         = (register_0 >> 6) & ((1ULL << 25) - 1);
+	out[(i * 1) + (0 * 32) + 704] = tmp_0;
+	tmp_0                         = (register_0 >> 31) & ((1ULL << 1) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 576);
+	tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 1;
+	out[(i * 1) + (0 * 32) + 736] = tmp_0;
+	tmp_0                         = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 608);
+	tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 8;
+	out[(i * 1) + (0 * 32) + 768] = tmp_0;
+	tmp_0                         = (register_0 >> 17) & ((1ULL << 15) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 640);
+	tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 15;
+	out[(i * 1) + (0 * 32) + 800] = tmp_0;
+	tmp_0                         = (register_0 >> 10) & ((1ULL << 22) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 672);
+	tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 22;
+	out[(i * 1) + (0 * 32) + 832] = tmp_0;
+	tmp_0                         = (register_0 >> 3) & ((1ULL << 25) - 1);
+	out[(i * 1) + (0 * 32) + 864] = tmp_0;
+	tmp_0                         = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 704);
+	tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 4;
+	out[(i * 1) + (0 * 32) + 896] = tmp_0;
+	tmp_0                         = (register_0 >> 21) & ((1ULL << 11) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 736);
+	tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 11;
+	out[(i * 1) + (0 * 32) + 928] = tmp_0;
+	tmp_0                         = (register_0 >> 14) & ((1ULL << 18) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 768); // word 24, the last one read
+	tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 18;
+	out[(i * 1) + (0 * 32) + 960] = tmp_0;
+	tmp_0                         = (register_0 >> 7) & ((1ULL << 25) - 1); // value 31 = bits 7..31 of word 24
+	out[(i * 1) + (0 * 32) + 992] = tmp_0;
+}
+inline __device__ void unpack_26bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) {
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p); // same pointer as a_out_p; written at out[i + 32*k], k = 0..31
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p); // same pointer as a_in_p; read at in[i + 32*w], w = 0..25
+	[[maybe_unused]] uint32_t register_0; // most recently loaded packed word
+	[[maybe_unused]] uint32_t tmp_0; // value being assembled; the 1ULL masks are 64-bit and narrow to 32 bits when stored here
+	[[maybe_unused]] uint32_t base_0 = 0ULL; // never read in this function
+	// Unpacks 32 values of 26 bits each for column i: 26 packed words in[i + 32*w] become 32 words out[i + 32*k].
+	int i = threadIdx.x; // THREAD INDEX; NOTE(review): the stride of 32 suggests threadIdx.x is expected in 0..31 - confirm at the launch site
+	// Values sit LSB-first in the word sequence; one that crosses a word boundary is OR-ed together from both words.
+	register_0                  = *(in + (0 * 32) + (i * 1) + 0); // word 0
+	tmp_0                       = (register_0) & ((1ULL << 26) - 1); // value 0 = bits 0..25 of word 0
+	out[(i * 1) + (0 * 32) + 0] = tmp_0;
+	tmp_0                       = (register_0 >> 26) & ((1ULL << 6) - 1); // value 1, low 6 bits = bits 26..31 of word 0
+	register_0                  = *(in + (0 * 32) + (i * 1) + 32);
+	tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 6; // value 1, high 20 bits = bits 0..19 of word 1
+	out[(i * 1) + (0 * 32) + 32] = tmp_0;
+	tmp_0                        = (register_0 >> 20) & ((1ULL << 12) - 1);
+	register_0                   = *(in + (0 * 32) + (i * 1) + 64);
+	tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 12;
+	out[(i * 1) + (0 * 32) + 64] = tmp_0;
+	tmp_0                        = (register_0 >> 14) & ((1ULL << 18) - 1);
+	register_0                   = *(in + (0 * 32) + (i * 1) + 96);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 18;
+	out[(i * 1) + (0 * 32) + 96] = tmp_0;
+	tmp_0                        = (register_0 >> 8) & ((1ULL << 24) - 1);
+	register_0                   = *(in + (0 * 32) + (i * 1) + 128);
+	tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 24;
+	out[(i * 1) + (0 * 32) + 128] = tmp_0;
+	tmp_0                         = (register_0 >> 2) & ((1ULL << 26) - 1); // value 5 lies entirely inside word 4 (bits 2..27)
+	out[(i * 1) + (0 * 32) + 160] = tmp_0;
+	tmp_0                         = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 160);
+	tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 4;
+	out[(i * 1) + (0 * 32) + 192] = tmp_0;
+	tmp_0                         = (register_0 >> 22) & ((1ULL << 10) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 192);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 10;
+	out[(i * 1) + (0 * 32) + 224] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 224);
+	tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 16;
+	out[(i * 1) + (0 * 32) + 256] = tmp_0;
+	tmp_0                         = (register_0 >> 10) & ((1ULL << 22) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 256);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 22;
+	out[(i * 1) + (0 * 32) + 288] = tmp_0;
+	tmp_0                         = (register_0 >> 4) & ((1ULL << 26) - 1);
+	out[(i * 1) + (0 * 32) + 320] = tmp_0;
+	tmp_0                         = (register_0 >> 30) & ((1ULL << 2) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 288);
+	tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 2;
+	out[(i * 1) + (0 * 32) + 352] = tmp_0;
+	tmp_0                         = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 320);
+	tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 8;
+	out[(i * 1) + (0 * 32) + 384] = tmp_0;
+	tmp_0                         = (register_0 >> 18) & ((1ULL << 14) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 352);
+	tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 14;
+	out[(i * 1) + (0 * 32) + 416] = tmp_0;
+	tmp_0                         = (register_0 >> 12) & ((1ULL << 20) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 384);
+	tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 20;
+	out[(i * 1) + (0 * 32) + 448] = tmp_0;
+	tmp_0                         = (register_0 >> 6) & ((1ULL << 26) - 1); // value 15 = bits 6..31 of word 12
+	out[(i * 1) + (0 * 32) + 480] = tmp_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 416); // word 13 starts on a value boundary: the 13-word / 16-value pattern above repeats once
+	tmp_0                         = (register_0) & ((1ULL << 26) - 1);
+	out[(i * 1) + (0 * 32) + 512] = tmp_0;
+	tmp_0                         = (register_0 >> 26) & ((1ULL << 6) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 448);
+	tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 6;
+	out[(i * 1) + (0 * 32) + 544] = tmp_0;
+	tmp_0                         = (register_0 >> 20) & ((1ULL << 12) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 480);
+	tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 12;
+	out[(i * 1) + (0 * 32) + 576] = tmp_0;
+	tmp_0                         = (register_0 >> 14) & ((1ULL << 18) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 512);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 18;
+	out[(i * 1) + (0 * 32) + 608] = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 24) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 544);
+	tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 24;
+	out[(i * 1) + (0 * 32) + 640] = tmp_0;
+	tmp_0                         = (register_0 >> 2) & ((1ULL << 26) - 1);
+	out[(i * 1) + (0 * 32) + 672] = tmp_0;
+	tmp_0                         = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 576);
+	tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 4;
+	out[(i * 1) + (0 * 32) + 704] = tmp_0;
+	tmp_0                         = (register_0 >> 22) & ((1ULL << 10) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 608);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 10;
+	out[(i * 1) + (0 * 32) + 736] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 640);
+	tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 16;
+	out[(i * 1) + (0 * 32) + 768] = tmp_0;
+	tmp_0                         = (register_0 >> 10) & ((1ULL << 22) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 672);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 22;
+	out[(i * 1) + (0 * 32) + 800] = tmp_0;
+	tmp_0                         = (register_0 >> 4) & ((1ULL << 26) - 1);
+	out[(i * 1) + (0 * 32) + 832] = tmp_0;
+	tmp_0                         = (register_0 >> 30) & ((1ULL << 2) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 704);
+	tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 2;
+	out[(i * 1) + (0 * 32) + 864] = tmp_0;
+	tmp_0                         = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 736);
+	tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 8;
+	out[(i * 1) + (0 * 32) + 896] = tmp_0;
+	tmp_0                         = (register_0 >> 18) & ((1ULL << 14) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 768);
+	tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 14;
+	out[(i * 1) + (0 * 32) + 928] = tmp_0;
+	tmp_0                         = (register_0 >> 12) & ((1ULL << 20) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 800);
+	tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 20;
+	out[(i * 1) + (0 * 32) + 960] = tmp_0;
+	tmp_0                         = (register_0 >> 6) & ((1ULL << 26) - 1); // value 31 = bits 6..31 of word 25
+	out[(i * 1) + (0 * 32) + 992] = tmp_0;
+}
+inline __device__ void unpack_27bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) {
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p); // same pointer as a_out_p; written at out[i + 32*k], k = 0..31
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p); // same pointer as a_in_p; read at in[i + 32*w], w = 0..26
+	[[maybe_unused]] uint32_t register_0; // most recently loaded packed word
+	[[maybe_unused]] uint32_t tmp_0; // value being assembled; the 1ULL masks are 64-bit and narrow to 32 bits when stored here
+	[[maybe_unused]] uint32_t base_0 = 0ULL; // never read in this function
+	// Unpacks 32 values of 27 bits each for column i: 27 packed words in[i + 32*w] become 32 words out[i + 32*k].
+	int i = threadIdx.x; // THREAD INDEX; NOTE(review): the stride of 32 suggests threadIdx.x is expected in 0..31 - confirm at the launch site
+	// Values sit LSB-first in the word sequence; every load after the first completes a value begun in the previous word.
+	register_0                  = *(in + (0 * 32) + (i * 1) + 0); // word 0
+	tmp_0                       = (register_0) & ((1ULL << 27) - 1); // value 0 = bits 0..26 of word 0
+	out[(i * 1) + (0 * 32) + 0] = tmp_0;
+	tmp_0                       = (register_0 >> 27) & ((1ULL << 5) - 1); // value 1, low 5 bits = bits 27..31 of word 0
+	register_0                  = *(in + (0 * 32) + (i * 1) + 32);
+	tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 5; // value 1, high 22 bits = bits 0..21 of word 1
+	out[(i * 1) + (0 * 32) + 32] = tmp_0;
+	tmp_0                        = (register_0 >> 22) & ((1ULL << 10) - 1);
+	register_0                   = *(in + (0 * 32) + (i * 1) + 64);
+	tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 10;
+	out[(i * 1) + (0 * 32) + 64] = tmp_0;
+	tmp_0                        = (register_0 >> 17) & ((1ULL << 15) - 1);
+	register_0                   = *(in + (0 * 32) + (i * 1) + 96);
+	tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 15;
+	out[(i * 1) + (0 * 32) + 96] = tmp_0;
+	tmp_0                        = (register_0 >> 12) & ((1ULL << 20) - 1);
+	register_0                   = *(in + (0 * 32) + (i * 1) + 128);
+	tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 20;
+	out[(i * 1) + (0 * 32) + 128] = tmp_0;
+	tmp_0                         = (register_0 >> 7) & ((1ULL << 25) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 160);
+	tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 25;
+	out[(i * 1) + (0 * 32) + 160] = tmp_0;
+	tmp_0                         = (register_0 >> 2) & ((1ULL << 27) - 1); // value 6 lies entirely inside word 5 (bits 2..28)
+	out[(i * 1) + (0 * 32) + 192] = tmp_0;
+	tmp_0                         = (register_0 >> 29) & ((1ULL << 3) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 192);
+	tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 3;
+	out[(i * 1) + (0 * 32) + 224] = tmp_0;
+	tmp_0                         = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 224);
+	tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 8;
+	out[(i * 1) + (0 * 32) + 256] = tmp_0;
+	tmp_0                         = (register_0 >> 19) & ((1ULL << 13) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 256);
+	tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 13;
+	out[(i * 1) + (0 * 32) + 288] = tmp_0;
+	tmp_0                         = (register_0 >> 14) & ((1ULL << 18) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 288);
+	tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 18;
+	out[(i * 1) + (0 * 32) + 320] = tmp_0;
+	tmp_0                         = (register_0 >> 9) & ((1ULL << 23) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 320);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 23;
+	out[(i * 1) + (0 * 32) + 352] = tmp_0;
+	tmp_0                         = (register_0 >> 4) & ((1ULL << 27) - 1);
+	out[(i * 1) + (0 * 32) + 384] = tmp_0;
+	tmp_0                         = (register_0 >> 31) & ((1ULL << 1) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 352);
+	tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 1;
+	out[(i * 1) + (0 * 32) + 416] = tmp_0;
+	tmp_0                         = (register_0 >> 26) & ((1ULL << 6) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 384);
+	tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 6;
+	out[(i * 1) + (0 * 32) + 448] = tmp_0;
+	tmp_0                         = (register_0 >> 21) & ((1ULL << 11) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 416);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 11;
+	out[(i * 1) + (0 * 32) + 480] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 448);
+	tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 16;
+	out[(i * 1) + (0 * 32) + 512] = tmp_0;
+	tmp_0                         = (register_0 >> 11) & ((1ULL << 21) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 480);
+	tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 21;
+	out[(i * 1) + (0 * 32) + 544] = tmp_0;
+	tmp_0                         = (register_0 >> 6) & ((1ULL << 26) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 512);
+	tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 26;
+	out[(i * 1) + (0 * 32) + 576] = tmp_0;
+	tmp_0                         = (register_0 >> 1) & ((1ULL << 27) - 1);
+	out[(i * 1) + (0 * 32) + 608] = tmp_0;
+	tmp_0                         = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 544);
+	tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 4;
+	out[(i * 1) + (0 * 32) + 640] = tmp_0;
+	tmp_0                         = (register_0 >> 23) & ((1ULL << 9) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 576);
+	tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 9;
+	out[(i * 1) + (0 * 32) + 672] = tmp_0;
+	tmp_0                         = (register_0 >> 18) & ((1ULL << 14) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 608);
+	tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 14;
+	out[(i * 1) + (0 * 32) + 704] = tmp_0;
+	tmp_0                         = (register_0 >> 13) & ((1ULL << 19) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 640);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 19;
+	out[(i * 1) + (0 * 32) + 736] = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 24) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 672);
+	tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 24;
+	out[(i * 1) + (0 * 32) + 768] = tmp_0;
+	tmp_0                         = (register_0 >> 3) & ((1ULL << 27) - 1);
+	out[(i * 1) + (0 * 32) + 800] = tmp_0;
+	tmp_0                         = (register_0 >> 30) & ((1ULL << 2) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 704);
+	tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 2;
+	out[(i * 1) + (0 * 32) + 832] = tmp_0;
+	tmp_0                         = (register_0 >> 25) & ((1ULL << 7) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 736);
+	tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 7;
+	out[(i * 1) + (0 * 32) + 864] = tmp_0;
+	tmp_0                         = (register_0 >> 20) & ((1ULL << 12) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 768);
+	tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 12;
+	out[(i * 1) + (0 * 32) + 896] = tmp_0;
+	tmp_0                         = (register_0 >> 15) & ((1ULL << 17) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 800);
+	tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 17;
+	out[(i * 1) + (0 * 32) + 928] = tmp_0;
+	tmp_0                         = (register_0 >> 10) & ((1ULL << 22) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 832); // word 26, the last one read
+	tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 22;
+	out[(i * 1) + (0 * 32) + 960] = tmp_0;
+	tmp_0                         = (register_0 >> 5) & ((1ULL << 27) - 1); // value 31 = bits 5..31 of word 26
+	out[(i * 1) + (0 * 32) + 992] = tmp_0;
+}
+inline __device__ void unpack_28bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) {
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p); // same pointer as a_out_p; written at out[i + 32*k], k = 0..31
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p); // same pointer as a_in_p; read at in[i + 32*w], w = 0..27
+	[[maybe_unused]] uint32_t register_0; // most recently loaded packed word
+	[[maybe_unused]] uint32_t tmp_0; // value being assembled; the 1ULL masks are 64-bit and narrow to 32 bits when stored here
+	[[maybe_unused]] uint32_t base_0 = 0ULL; // never read in this function
+	// Unpacks 32 values of 28 bits each for column i: 28 packed words in[i + 32*w] become 32 words out[i + 32*k].
+	int i = threadIdx.x; // THREAD INDEX; NOTE(review): the stride of 32 suggests threadIdx.x is expected in 0..31 - confirm at the launch site
+	// Values sit LSB-first in the word sequence; one that crosses a word boundary is OR-ed together from both words.
+	register_0                  = *(in + (0 * 32) + (i * 1) + 0); // word 0
+	tmp_0                       = (register_0) & ((1ULL << 28) - 1); // value 0 = bits 0..27 of word 0
+	out[(i * 1) + (0 * 32) + 0] = tmp_0;
+	tmp_0                       = (register_0 >> 28) & ((1ULL << 4) - 1); // value 1, low 4 bits = bits 28..31 of word 0
+	register_0                  = *(in + (0 * 32) + (i * 1) + 32);
+	tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 4; // value 1, high 24 bits = bits 0..23 of word 1
+	out[(i * 1) + (0 * 32) + 32] = tmp_0;
+	tmp_0                        = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0                   = *(in + (0 * 32) + (i * 1) + 64);
+	tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 8;
+	out[(i * 1) + (0 * 32) + 64] = tmp_0;
+	tmp_0                        = (register_0 >> 20) & ((1ULL << 12) - 1);
+	register_0                   = *(in + (0 * 32) + (i * 1) + 96);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 12;
+	out[(i * 1) + (0 * 32) + 96] = tmp_0;
+	tmp_0                        = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0                   = *(in + (0 * 32) + (i * 1) + 128);
+	tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 16;
+	out[(i * 1) + (0 * 32) + 128] = tmp_0;
+	tmp_0                         = (register_0 >> 12) & ((1ULL << 20) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 160);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 20;
+	out[(i * 1) + (0 * 32) + 160] = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 24) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 192);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 24;
+	out[(i * 1) + (0 * 32) + 192] = tmp_0;
+	tmp_0                         = (register_0 >> 4) & ((1ULL << 28) - 1); // value 7 = bits 4..31 of word 6
+	out[(i * 1) + (0 * 32) + 224] = tmp_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 224); // word 7 starts on a value boundary: the 7-word / 8-value pattern above repeats 3 more times
+	tmp_0                         = (register_0) & ((1ULL << 28) - 1);
+	out[(i * 1) + (0 * 32) + 256] = tmp_0;
+	tmp_0                         = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 256);
+	tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 4;
+	out[(i * 1) + (0 * 32) + 288] = tmp_0;
+	tmp_0                         = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 288);
+	tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 8;
+	out[(i * 1) + (0 * 32) + 320] = tmp_0;
+	tmp_0                         = (register_0 >> 20) & ((1ULL << 12) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 320);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 12;
+	out[(i * 1) + (0 * 32) + 352] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 352);
+	tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 16;
+	out[(i * 1) + (0 * 32) + 384] = tmp_0;
+	tmp_0                         = (register_0 >> 12) & ((1ULL << 20) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 384);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 20;
+	out[(i * 1) + (0 * 32) + 416] = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 24) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 416);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 24;
+	out[(i * 1) + (0 * 32) + 448] = tmp_0;
+	tmp_0                         = (register_0 >> 4) & ((1ULL << 28) - 1);
+	out[(i * 1) + (0 * 32) + 480] = tmp_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 448);
+	tmp_0                         = (register_0) & ((1ULL << 28) - 1);
+	out[(i * 1) + (0 * 32) + 512] = tmp_0;
+	tmp_0                         = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 480);
+	tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 4;
+	out[(i * 1) + (0 * 32) + 544] = tmp_0;
+	tmp_0                         = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 512);
+	tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 8;
+	out[(i * 1) + (0 * 32) + 576] = tmp_0;
+	tmp_0                         = (register_0 >> 20) & ((1ULL << 12) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 544);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 12;
+	out[(i * 1) + (0 * 32) + 608] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 576);
+	tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 16;
+	out[(i * 1) + (0 * 32) + 640] = tmp_0;
+	tmp_0                         = (register_0 >> 12) & ((1ULL << 20) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 608);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 20;
+	out[(i * 1) + (0 * 32) + 672] = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 24) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 640);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 24;
+	out[(i * 1) + (0 * 32) + 704] = tmp_0;
+	tmp_0                         = (register_0 >> 4) & ((1ULL << 28) - 1);
+	out[(i * 1) + (0 * 32) + 736] = tmp_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 672);
+	tmp_0                         = (register_0) & ((1ULL << 28) - 1);
+	out[(i * 1) + (0 * 32) + 768] = tmp_0;
+	tmp_0                         = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 704);
+	tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 4;
+	out[(i * 1) + (0 * 32) + 800] = tmp_0;
+	tmp_0                         = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 736);
+	tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 8;
+	out[(i * 1) + (0 * 32) + 832] = tmp_0;
+	tmp_0                         = (register_0 >> 20) & ((1ULL << 12) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 768);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 12;
+	out[(i * 1) + (0 * 32) + 864] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 800);
+	tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 16;
+	out[(i * 1) + (0 * 32) + 896] = tmp_0;
+	tmp_0                         = (register_0 >> 12) & ((1ULL << 20) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 832);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 20;
+	out[(i * 1) + (0 * 32) + 928] = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 24) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 864);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 24;
+	out[(i * 1) + (0 * 32) + 960] = tmp_0;
+	tmp_0                         = (register_0 >> 4) & ((1ULL << 28) - 1); // value 31 = bits 4..31 of word 27
+	out[(i * 1) + (0 * 32) + 992] = tmp_0;
+}
+inline __device__ void unpack_29bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) {
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p); // same-type cast; out aliases a_out_p
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p); // same-type cast; in aliases a_in_p
+	[[maybe_unused]] uint32_t register_0; // most recently loaded packed input word
+	[[maybe_unused]] uint32_t tmp_0; // value being assembled before it is stored to out
+	[[maybe_unused]] uint32_t base_0 = 0ULL; // never read in this function
+	// Unpacks 29-bit values: each thread loads 29 words from in[i + 32*k] and stores 32 values to out[i + 32*k].
+	int i = threadIdx.x; // this thread's x index in the block; added to every in/out offset below
+	// A value split across two words takes its low bits from the top of the old word and the rest from the next word.
+	register_0                  = *(in + (0 * 32) + (i * 1) + 0);
+	tmp_0                       = (register_0) & ((1ULL << 29) - 1);
+	out[(i * 1) + (0 * 32) + 0] = tmp_0;
+	tmp_0                       = (register_0 >> 29) & ((1ULL << 3) - 1);
+	register_0                  = *(in + (0 * 32) + (i * 1) + 32);
+	tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 3;
+	out[(i * 1) + (0 * 32) + 32] = tmp_0;
+	tmp_0                        = (register_0 >> 26) & ((1ULL << 6) - 1);
+	register_0                   = *(in + (0 * 32) + (i * 1) + 64);
+	tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 6;
+	out[(i * 1) + (0 * 32) + 64] = tmp_0;
+	tmp_0                        = (register_0 >> 23) & ((1ULL << 9) - 1);
+	register_0                   = *(in + (0 * 32) + (i * 1) + 96);
+	tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 9;
+	out[(i * 1) + (0 * 32) + 96] = tmp_0;
+	tmp_0                        = (register_0 >> 20) & ((1ULL << 12) - 1);
+	register_0                   = *(in + (0 * 32) + (i * 1) + 128);
+	tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 12;
+	out[(i * 1) + (0 * 32) + 128] = tmp_0;
+	tmp_0                         = (register_0 >> 17) & ((1ULL << 15) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 160);
+	tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 15;
+	out[(i * 1) + (0 * 32) + 160] = tmp_0;
+	tmp_0                         = (register_0 >> 14) & ((1ULL << 18) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 192);
+	tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 18;
+	out[(i * 1) + (0 * 32) + 192] = tmp_0;
+	tmp_0                         = (register_0 >> 11) & ((1ULL << 21) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 224);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 21;
+	out[(i * 1) + (0 * 32) + 224] = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 24) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 256);
+	tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 24;
+	out[(i * 1) + (0 * 32) + 256] = tmp_0;
+	tmp_0                         = (register_0 >> 5) & ((1ULL << 27) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 288);
+	tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 27;
+	out[(i * 1) + (0 * 32) + 288] = tmp_0;
+	tmp_0                         = (register_0 >> 2) & ((1ULL << 29) - 1); // whole value inside this word (bits 2..30)
+	out[(i * 1) + (0 * 32) + 320] = tmp_0;
+	tmp_0                         = (register_0 >> 31) & ((1ULL << 1) - 1); // bit 31 is the low bit of the next value
+	register_0                    = *(in + (0 * 32) + (i * 1) + 320);
+	tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 1;
+	out[(i * 1) + (0 * 32) + 352] = tmp_0;
+	tmp_0                         = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 352);
+	tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 4;
+	out[(i * 1) + (0 * 32) + 384] = tmp_0;
+	tmp_0                         = (register_0 >> 25) & ((1ULL << 7) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 384);
+	tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 7;
+	out[(i * 1) + (0 * 32) + 416] = tmp_0;
+	tmp_0                         = (register_0 >> 22) & ((1ULL << 10) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 416);
+	tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 10;
+	out[(i * 1) + (0 * 32) + 448] = tmp_0;
+	tmp_0                         = (register_0 >> 19) & ((1ULL << 13) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 448);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 13;
+	out[(i * 1) + (0 * 32) + 480] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 480);
+	tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 16;
+	out[(i * 1) + (0 * 32) + 512] = tmp_0;
+	tmp_0                         = (register_0 >> 13) & ((1ULL << 19) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 512);
+	tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 19;
+	out[(i * 1) + (0 * 32) + 544] = tmp_0;
+	tmp_0                         = (register_0 >> 10) & ((1ULL << 22) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 544);
+	tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 22;
+	out[(i * 1) + (0 * 32) + 576] = tmp_0;
+	tmp_0                         = (register_0 >> 7) & ((1ULL << 25) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 576);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 25;
+	out[(i * 1) + (0 * 32) + 608] = tmp_0;
+	tmp_0                         = (register_0 >> 4) & ((1ULL << 28) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 608);
+	tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 28;
+	out[(i * 1) + (0 * 32) + 640] = tmp_0;
+	tmp_0                         = (register_0 >> 1) & ((1ULL << 29) - 1); // whole value inside this word (bits 1..29)
+	out[(i * 1) + (0 * 32) + 672] = tmp_0;
+	tmp_0                         = (register_0 >> 30) & ((1ULL << 2) - 1); // bits 30..31 are the low bits of the next value
+	register_0                    = *(in + (0 * 32) + (i * 1) + 640);
+	tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 2;
+	out[(i * 1) + (0 * 32) + 704] = tmp_0;
+	tmp_0                         = (register_0 >> 27) & ((1ULL << 5) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 672);
+	tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 5;
+	out[(i * 1) + (0 * 32) + 736] = tmp_0;
+	tmp_0                         = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 704);
+	tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 8;
+	out[(i * 1) + (0 * 32) + 768] = tmp_0;
+	tmp_0                         = (register_0 >> 21) & ((1ULL << 11) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 736);
+	tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 11;
+	out[(i * 1) + (0 * 32) + 800] = tmp_0;
+	tmp_0                         = (register_0 >> 18) & ((1ULL << 14) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 768);
+	tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 14;
+	out[(i * 1) + (0 * 32) + 832] = tmp_0;
+	tmp_0                         = (register_0 >> 15) & ((1ULL << 17) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 800);
+	tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 17;
+	out[(i * 1) + (0 * 32) + 864] = tmp_0;
+	tmp_0                         = (register_0 >> 12) & ((1ULL << 20) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 832);
+	tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 20;
+	out[(i * 1) + (0 * 32) + 896] = tmp_0;
+	tmp_0                         = (register_0 >> 9) & ((1ULL << 23) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 864);
+	tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 23;
+	out[(i * 1) + (0 * 32) + 928] = tmp_0;
+	tmp_0                         = (register_0 >> 6) & ((1ULL << 26) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 896);
+	tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 26;
+	out[(i * 1) + (0 * 32) + 960] = tmp_0;
+	tmp_0                         = (register_0 >> 3) & ((1ULL << 29) - 1); // last value: top 29 bits of the 29th word
+	out[(i * 1) + (0 * 32) + 992] = tmp_0;
+}
+inline __device__ void unpack_30bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) {
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p); // same-type cast; out aliases a_out_p
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p); // same-type cast; in aliases a_in_p
+	[[maybe_unused]] uint32_t register_0; // most recently loaded packed input word
+	[[maybe_unused]] uint32_t tmp_0; // value being assembled before it is stored to out
+	[[maybe_unused]] uint32_t base_0 = 0ULL; // never read in this function
+	// Unpacks 30-bit values: each thread loads 30 words from in[i + 32*k] and stores 32 values to out[i + 32*k].
+	int i = threadIdx.x; // this thread's x index in the block; added to every in/out offset below
+	// A value split across two words takes its low bits from the top of the old word and the rest from the next word.
+	register_0                  = *(in + (0 * 32) + (i * 1) + 0);
+	tmp_0                       = (register_0) & ((1ULL << 30) - 1);
+	out[(i * 1) + (0 * 32) + 0] = tmp_0;
+	tmp_0                       = (register_0 >> 30) & ((1ULL << 2) - 1);
+	register_0                  = *(in + (0 * 32) + (i * 1) + 32);
+	tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 2;
+	out[(i * 1) + (0 * 32) + 32] = tmp_0;
+	tmp_0                        = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0                   = *(in + (0 * 32) + (i * 1) + 64);
+	tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 4;
+	out[(i * 1) + (0 * 32) + 64] = tmp_0;
+	tmp_0                        = (register_0 >> 26) & ((1ULL << 6) - 1);
+	register_0                   = *(in + (0 * 32) + (i * 1) + 96);
+	tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 6;
+	out[(i * 1) + (0 * 32) + 96] = tmp_0;
+	tmp_0                        = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0                   = *(in + (0 * 32) + (i * 1) + 128);
+	tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 8;
+	out[(i * 1) + (0 * 32) + 128] = tmp_0;
+	tmp_0                         = (register_0 >> 22) & ((1ULL << 10) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 160);
+	tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 10;
+	out[(i * 1) + (0 * 32) + 160] = tmp_0;
+	tmp_0                         = (register_0 >> 20) & ((1ULL << 12) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 192);
+	tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 12;
+	out[(i * 1) + (0 * 32) + 192] = tmp_0;
+	tmp_0                         = (register_0 >> 18) & ((1ULL << 14) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 224);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 14;
+	out[(i * 1) + (0 * 32) + 224] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 256);
+	tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 16;
+	out[(i * 1) + (0 * 32) + 256] = tmp_0;
+	tmp_0                         = (register_0 >> 14) & ((1ULL << 18) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 288);
+	tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 18;
+	out[(i * 1) + (0 * 32) + 288] = tmp_0;
+	tmp_0                         = (register_0 >> 12) & ((1ULL << 20) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 320);
+	tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 20;
+	out[(i * 1) + (0 * 32) + 320] = tmp_0;
+	tmp_0                         = (register_0 >> 10) & ((1ULL << 22) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 352);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 22;
+	out[(i * 1) + (0 * 32) + 352] = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 24) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 384);
+	tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 24;
+	out[(i * 1) + (0 * 32) + 384] = tmp_0;
+	tmp_0                         = (register_0 >> 6) & ((1ULL << 26) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 416);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 26;
+	out[(i * 1) + (0 * 32) + 416] = tmp_0;
+	tmp_0                         = (register_0 >> 4) & ((1ULL << 28) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 448);
+	tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 28;
+	out[(i * 1) + (0 * 32) + 448] = tmp_0;
+	tmp_0                         = (register_0 >> 2) & ((1ULL << 30) - 1); // whole value in the top 30 bits of this word
+	out[(i * 1) + (0 * 32) + 480] = tmp_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 480); // 16 values used exactly 15 words; the pattern restarts on a word boundary
+	tmp_0                         = (register_0) & ((1ULL << 30) - 1);
+	out[(i * 1) + (0 * 32) + 512] = tmp_0;
+	tmp_0                         = (register_0 >> 30) & ((1ULL << 2) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 512);
+	tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 2;
+	out[(i * 1) + (0 * 32) + 544] = tmp_0;
+	tmp_0                         = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 544);
+	tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 4;
+	out[(i * 1) + (0 * 32) + 576] = tmp_0;
+	tmp_0                         = (register_0 >> 26) & ((1ULL << 6) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 576);
+	tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 6;
+	out[(i * 1) + (0 * 32) + 608] = tmp_0;
+	tmp_0                         = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 608);
+	tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 8;
+	out[(i * 1) + (0 * 32) + 640] = tmp_0;
+	tmp_0                         = (register_0 >> 22) & ((1ULL << 10) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 640);
+	tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 10;
+	out[(i * 1) + (0 * 32) + 672] = tmp_0;
+	tmp_0                         = (register_0 >> 20) & ((1ULL << 12) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 672);
+	tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 12;
+	out[(i * 1) + (0 * 32) + 704] = tmp_0;
+	tmp_0                         = (register_0 >> 18) & ((1ULL << 14) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 704);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 14;
+	out[(i * 1) + (0 * 32) + 736] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 736);
+	tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 16;
+	out[(i * 1) + (0 * 32) + 768] = tmp_0;
+	tmp_0                         = (register_0 >> 14) & ((1ULL << 18) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 768);
+	tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 18;
+	out[(i * 1) + (0 * 32) + 800] = tmp_0;
+	tmp_0                         = (register_0 >> 12) & ((1ULL << 20) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 800);
+	tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 20;
+	out[(i * 1) + (0 * 32) + 832] = tmp_0;
+	tmp_0                         = (register_0 >> 10) & ((1ULL << 22) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 832);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 22;
+	out[(i * 1) + (0 * 32) + 864] = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 24) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 864);
+	tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 24;
+	out[(i * 1) + (0 * 32) + 896] = tmp_0;
+	tmp_0                         = (register_0 >> 6) & ((1ULL << 26) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 896);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 26;
+	out[(i * 1) + (0 * 32) + 928] = tmp_0;
+	tmp_0                         = (register_0 >> 4) & ((1ULL << 28) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 928);
+	tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 28;
+	out[(i * 1) + (0 * 32) + 960] = tmp_0;
+	tmp_0                         = (register_0 >> 2) & ((1ULL << 30) - 1); // last value: top 30 bits of the 30th word
+	out[(i * 1) + (0 * 32) + 992] = tmp_0;
+}
+inline __device__ void unpack_31bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) {
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p); // same-type cast; out aliases a_out_p
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p); // same-type cast; in aliases a_in_p
+	[[maybe_unused]] uint32_t register_0; // most recently loaded packed input word
+	[[maybe_unused]] uint32_t tmp_0; // value being assembled before it is stored to out
+	[[maybe_unused]] uint32_t base_0 = 0ULL; // never read in this function
+	// Unpacks 31-bit values: each thread loads 31 words from in[i + 32*k] and stores 32 values to out[i + 32*k].
+	int i = threadIdx.x; // this thread's x index in the block; added to every in/out offset below
+	// A value split across two words takes its low bits from the top of the old word and the rest from the next word.
+	register_0                  = *(in + (0 * 32) + (i * 1) + 0);
+	tmp_0                       = (register_0) & ((1ULL << 31) - 1);
+	out[(i * 1) + (0 * 32) + 0] = tmp_0;
+	tmp_0                       = (register_0 >> 31) & ((1ULL << 1) - 1);
+	register_0                  = *(in + (0 * 32) + (i * 1) + 32);
+	tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 1;
+	out[(i * 1) + (0 * 32) + 32] = tmp_0;
+	tmp_0                        = (register_0 >> 30) & ((1ULL << 2) - 1);
+	register_0                   = *(in + (0 * 32) + (i * 1) + 64);
+	tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 2;
+	out[(i * 1) + (0 * 32) + 64] = tmp_0;
+	tmp_0                        = (register_0 >> 29) & ((1ULL << 3) - 1);
+	register_0                   = *(in + (0 * 32) + (i * 1) + 96);
+	tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 3;
+	out[(i * 1) + (0 * 32) + 96] = tmp_0;
+	tmp_0                        = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0                   = *(in + (0 * 32) + (i * 1) + 128);
+	tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 4;
+	out[(i * 1) + (0 * 32) + 128] = tmp_0;
+	tmp_0                         = (register_0 >> 27) & ((1ULL << 5) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 160);
+	tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 5;
+	out[(i * 1) + (0 * 32) + 160] = tmp_0;
+	tmp_0                         = (register_0 >> 26) & ((1ULL << 6) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 192);
+	tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 6;
+	out[(i * 1) + (0 * 32) + 192] = tmp_0;
+	tmp_0                         = (register_0 >> 25) & ((1ULL << 7) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 224);
+	tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 7;
+	out[(i * 1) + (0 * 32) + 224] = tmp_0;
+	tmp_0                         = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 256);
+	tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 8;
+	out[(i * 1) + (0 * 32) + 256] = tmp_0;
+	tmp_0                         = (register_0 >> 23) & ((1ULL << 9) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 288);
+	tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 9;
+	out[(i * 1) + (0 * 32) + 288] = tmp_0;
+	tmp_0                         = (register_0 >> 22) & ((1ULL << 10) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 320);
+	tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 10;
+	out[(i * 1) + (0 * 32) + 320] = tmp_0;
+	tmp_0                         = (register_0 >> 21) & ((1ULL << 11) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 352);
+	tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 11;
+	out[(i * 1) + (0 * 32) + 352] = tmp_0;
+	tmp_0                         = (register_0 >> 20) & ((1ULL << 12) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 384);
+	tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 12;
+	out[(i * 1) + (0 * 32) + 384] = tmp_0;
+	tmp_0                         = (register_0 >> 19) & ((1ULL << 13) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 416);
+	tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 13;
+	out[(i * 1) + (0 * 32) + 416] = tmp_0;
+	tmp_0                         = (register_0 >> 18) & ((1ULL << 14) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 448);
+	tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 14;
+	out[(i * 1) + (0 * 32) + 448] = tmp_0;
+	tmp_0                         = (register_0 >> 17) & ((1ULL << 15) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 480);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 15;
+	out[(i * 1) + (0 * 32) + 480] = tmp_0;
+	tmp_0                         = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 512);
+	tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 16;
+	out[(i * 1) + (0 * 32) + 512] = tmp_0;
+	tmp_0                         = (register_0 >> 15) & ((1ULL << 17) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 544);
+	tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 17;
+	out[(i * 1) + (0 * 32) + 544] = tmp_0;
+	tmp_0                         = (register_0 >> 14) & ((1ULL << 18) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 576);
+	tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 18;
+	out[(i * 1) + (0 * 32) + 576] = tmp_0;
+	tmp_0                         = (register_0 >> 13) & ((1ULL << 19) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 608);
+	tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 19;
+	out[(i * 1) + (0 * 32) + 608] = tmp_0;
+	tmp_0                         = (register_0 >> 12) & ((1ULL << 20) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 640);
+	tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 20;
+	out[(i * 1) + (0 * 32) + 640] = tmp_0;
+	tmp_0                         = (register_0 >> 11) & ((1ULL << 21) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 672);
+	tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 21;
+	out[(i * 1) + (0 * 32) + 672] = tmp_0;
+	tmp_0                         = (register_0 >> 10) & ((1ULL << 22) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 704);
+	tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 22;
+	out[(i * 1) + (0 * 32) + 704] = tmp_0;
+	tmp_0                         = (register_0 >> 9) & ((1ULL << 23) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 736);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 23;
+	out[(i * 1) + (0 * 32) + 736] = tmp_0;
+	tmp_0                         = (register_0 >> 8) & ((1ULL << 24) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 768);
+	tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 24;
+	out[(i * 1) + (0 * 32) + 768] = tmp_0;
+	tmp_0                         = (register_0 >> 7) & ((1ULL << 25) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 800);
+	tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 25;
+	out[(i * 1) + (0 * 32) + 800] = tmp_0;
+	tmp_0                         = (register_0 >> 6) & ((1ULL << 26) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 832);
+	tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 26;
+	out[(i * 1) + (0 * 32) + 832] = tmp_0;
+	tmp_0                         = (register_0 >> 5) & ((1ULL << 27) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 864);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 27;
+	out[(i * 1) + (0 * 32) + 864] = tmp_0;
+	tmp_0                         = (register_0 >> 4) & ((1ULL << 28) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 896);
+	tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 28;
+	out[(i * 1) + (0 * 32) + 896] = tmp_0;
+	tmp_0                         = (register_0 >> 3) & ((1ULL << 29) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 928);
+	tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 29;
+	out[(i * 1) + (0 * 32) + 928] = tmp_0;
+	tmp_0                         = (register_0 >> 2) & ((1ULL << 30) - 1);
+	register_0                    = *(in + (0 * 32) + (i * 1) + 960);
+	tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 30;
+	out[(i * 1) + (0 * 32) + 960] = tmp_0;
+	tmp_0                         = (register_0 >> 1) & ((1ULL << 31) - 1); // last value: top 31 bits of the 31st word
+	out[(i * 1) + (0 * 32) + 992] = tmp_0;
+}
+inline __device__ void unpack_32bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) {
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p); // same-type cast; out aliases a_out_p
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p); // same-type cast; in aliases a_in_p
+	[[maybe_unused]] uint32_t register_0; // holds each word between its load and its store
+	[[maybe_unused]] uint32_t tmp_0; // never used in this function
+	[[maybe_unused]] uint32_t base_0 = 0ULL; // never read in this function
+	// 32-bit case: no shifting or masking; each thread copies 32 words from in[i + 32*k] to out[i + 32*k].
+	int i = threadIdx.x; // this thread's x index in the block; added to every in/out offset below
+
+	register_0                    = *(in + (0 * 32) + (i * 1) + 0);
+	out[(i * 1) + (0 * 32) + 0]   = register_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 32);
+	out[(i * 1) + (0 * 32) + 32]  = register_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 64);
+	out[(i * 1) + (0 * 32) + 64]  = register_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 96);
+	out[(i * 1) + (0 * 32) + 96]  = register_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 128);
+	out[(i * 1) + (0 * 32) + 128] = register_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 160);
+	out[(i * 1) + (0 * 32) + 160] = register_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 192);
+	out[(i * 1) + (0 * 32) + 192] = register_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 224);
+	out[(i * 1) + (0 * 32) + 224] = register_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 256);
+	out[(i * 1) + (0 * 32) + 256] = register_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 288);
+	out[(i * 1) + (0 * 32) + 288] = register_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 320);
+	out[(i * 1) + (0 * 32) + 320] = register_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 352);
+	out[(i * 1) + (0 * 32) + 352] = register_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 384);
+	out[(i * 1) + (0 * 32) + 384] = register_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 416);
+	out[(i * 1) + (0 * 32) + 416] = register_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 448);
+	out[(i * 1) + (0 * 32) + 448] = register_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 480);
+	out[(i * 1) + (0 * 32) + 480] = register_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 512);
+	out[(i * 1) + (0 * 32) + 512] = register_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 544);
+	out[(i * 1) + (0 * 32) + 544] = register_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 576);
+	out[(i * 1) + (0 * 32) + 576] = register_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 608);
+	out[(i * 1) + (0 * 32) + 608] = register_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 640);
+	out[(i * 1) + (0 * 32) + 640] = register_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 672);
+	out[(i * 1) + (0 * 32) + 672] = register_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 704);
+	out[(i * 1) + (0 * 32) + 704] = register_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 736);
+	out[(i * 1) + (0 * 32) + 736] = register_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 768);
+	out[(i * 1) + (0 * 32) + 768] = register_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 800);
+	out[(i * 1) + (0 * 32) + 800] = register_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 832);
+	out[(i * 1) + (0 * 32) + 832] = register_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 864);
+	out[(i * 1) + (0 * 32) + 864] = register_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 896);
+	out[(i * 1) + (0 * 32) + 896] = register_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 928);
+	out[(i * 1) + (0 * 32) + 928] = register_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 960);
+	out[(i * 1) + (0 * 32) + 960] = register_0;
+	register_0                    = *(in + (0 * 32) + (i * 1) + 992);
+	out[(i * 1) + (0 * 32) + 992] = register_0;
+}
+inline __device__ void unpack(const uint32_t* a_in_p, uint32_t* a_out_p, uint8_t bw) { // calls the unpack_<bw>bw_32ow_32crw_1uf routine selected by bw
+	switch (bw) { // NOTE(review): cases cover 0..32 and there is no default, so any other bw returns without writing to a_out_p
+	case 0:
+		unpack_0bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 1:
+		unpack_1bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 2:
+		unpack_2bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 3:
+		unpack_3bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 4:
+		unpack_4bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 5:
+		unpack_5bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 6:
+		unpack_6bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 7:
+		unpack_7bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 8:
+		unpack_8bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 9:
+		unpack_9bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 10:
+		unpack_10bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 11:
+		unpack_11bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 12:
+		unpack_12bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 13:
+		unpack_13bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 14:
+		unpack_14bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 15:
+		unpack_15bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 16:
+		unpack_16bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 17:
+		unpack_17bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 18:
+		unpack_18bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 19:
+		unpack_19bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 20:
+		unpack_20bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 21:
+		unpack_21bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 22:
+		unpack_22bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 23:
+		unpack_23bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 24:
+		unpack_24bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 25:
+		unpack_25bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 26:
+		unpack_26bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 27:
+		unpack_27bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 28:
+		unpack_28bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 29:
+		unpack_29bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 30:
+		unpack_30bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 31:
+		unpack_31bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 32:
+		unpack_32bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	}
+}
+
+}}}; // namespace generated::unpack::cuda::normal
+
+__global__ void unpack_global(const uint32_t* __restrict in, uint32_t* __restrict out, uint8_t bw) {
+	// Offsets are computed in 64 bits: the previous 32-bit int arithmetic overflowed once blockIdx.x reached 2^21.
+	const uint64_t blc_idx = blockIdx.x;
+	in += (blc_idx * bw) << 5; // skip blc_idx blocks of 32 * bw packed input words
+	out += blc_idx << 10;      // skip blc_idx blocks of 1024 output words
+	generated::unpack::cuda::normal::unpack(in, out, bw); // per-thread unpack of this block's slice, selected by bw
+}
+
+__device__ __forceinline__ void unpack_device(const uint32_t* __restrict in, uint32_t* __restrict out, uint8_t bw) {
+	generated::unpack::cuda::normal::unpack(in, out, bw); // forwards unchanged to the bw dispatcher; no block offset is applied here (unlike unpack_global)
+}
+
+__device__ __forceinline__ void unpack_device(const int32_t* __restrict in, int32_t* __restrict out, uint8_t bw) {
+	generated::unpack::cuda::normal::unpack( // int32_t overload: both buffers are reinterpreted as uint32_t and passed to the same dispatcher
+	    reinterpret_cast<const uint32_t*>(in), reinterpret_cast<uint32_t*>(out), bw);
+}
diff --git a/fastlanes/src/include/fls_gen/unpack/unpack.hpp b/fastlanes/src/include/fls_gen/unpack/unpack.hpp
new file mode 100644
index 0000000..bd5b2f5
--- /dev/null
+++ b/fastlanes/src/include/fls_gen/unpack/unpack.hpp
@@ -0,0 +1,13 @@
+#ifndef FLS_GEN_UNPACK_UNPACK_HPP
+#define FLS_GEN_UNPACK_UNPACK_HPP
+// Guard is derived from this header's path; the previous BITPACK_BITPACK_HPP did not identify the file and could clash with another header.
+#include <cstdint>
+// Declarations only: one unpack overload per element type (64/32/16/8-bit). `bw` is presumably the packed bit width; confirm in the definitions.
+namespace generated::unpack { namespace fallback::scalar {
+void unpack(const uint64_t* __restrict in, uint64_t* __restrict out, uint8_t bw);
+void unpack(const uint32_t* __restrict in, uint32_t* __restrict out, uint8_t bw);
+void unpack(const uint16_t* __restrict in, uint16_t* __restrict out, uint8_t bw);
+void unpack(const uint8_t* __restrict in, uint8_t* __restrict out, uint8_t bw);
+}} // namespace generated::unpack::fallback::scalar
+
+#endif // FLS_GEN_UNPACK_UNPACK_HPP
diff --git a/fastlanes/src/include/fls_gen/unpack/unpack_fused.cuh b/fastlanes/src/include/fls_gen/unpack/unpack_fused.cuh
new file mode 100644
index 0000000..55ad133
--- /dev/null
+++ b/fastlanes/src/include/fls_gen/unpack/unpack_fused.cuh
@@ -0,0 +1,3461 @@
+// generated!
+#pragma once
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+namespace generated { namespace unpack::cuda { namespace fused {
+__device__ __forceinline__ void unpack_0bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) {
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p);
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p);
+	[[maybe_unused]] uint32_t register_0;
+	[[maybe_unused]] uint32_t tmp_0;
+	[[maybe_unused]] uint32_t base_0 = 0ULL;
+	// Bit width 0: nothing is read from `in`; all 32 output slots are set to base_0 (zero).
+	// No thread index is computed in this variant (the non-zero bit widths use threadIdx.x to pick their input words).
+	// The writes go to out[0..31] with no per-thread offset; presumably `out` is thread-private storage; confirm at call sites.
+	out[0]  = base_0;
+	out[1]  = base_0;
+	out[2]  = base_0;
+	out[3]  = base_0;
+	out[4]  = base_0;
+	out[5]  = base_0;
+	out[6]  = base_0;
+	out[7]  = base_0;
+	out[8]  = base_0;
+	out[9]  = base_0;
+	out[10] = base_0;
+	out[11] = base_0;
+	out[12] = base_0;
+	out[13] = base_0;
+	out[14] = base_0;
+	out[15] = base_0;
+	out[16] = base_0;
+	out[17] = base_0;
+	out[18] = base_0;
+	out[19] = base_0;
+	out[20] = base_0;
+	out[21] = base_0;
+	out[22] = base_0;
+	out[23] = base_0;
+	out[24] = base_0;
+	out[25] = base_0;
+	out[26] = base_0;
+	out[27] = base_0;
+	out[28] = base_0;
+	out[29] = base_0;
+	out[30] = base_0;
+	out[31] = base_0;
+}
+__device__ __forceinline__ void unpack_1bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) {
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p);
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p);
+	[[maybe_unused]] uint32_t register_0;
+	[[maybe_unused]] uint32_t tmp_0;
+	[[maybe_unused]] uint32_t base_0 = 0ULL;
+	// Decodes 32 1-bit values from a single packed 32-bit word, in[i] with i = threadIdx.x, lowest bit first.
+	int i = threadIdx.x; // THREAD INDEX
+	// The 32 results go to out[0..31] with no per-thread offset; presumably `out` is thread-private storage; confirm at call sites.
+	register_0  = *(in + (0 * 32) + (i * 1) + 0);
+	tmp_0       = (register_0) & ((1ULL << 1) - 1);
+	out[0]  = tmp_0;
+	tmp_0       = (register_0 >> 1) & ((1ULL << 1) - 1);
+	out[1]  = tmp_0;
+	tmp_0       = (register_0 >> 2) & ((1ULL << 1) - 1);
+	out[2]  = tmp_0;
+	tmp_0       = (register_0 >> 3) & ((1ULL << 1) - 1);
+	out[3]  = tmp_0;
+	tmp_0       = (register_0 >> 4) & ((1ULL << 1) - 1);
+	out[4]  = tmp_0;
+	tmp_0       = (register_0 >> 5) & ((1ULL << 1) - 1);
+	out[5]  = tmp_0;
+	tmp_0       = (register_0 >> 6) & ((1ULL << 1) - 1);
+	out[6]  = tmp_0;
+	tmp_0       = (register_0 >> 7) & ((1ULL << 1) - 1);
+	out[7]  = tmp_0;
+	tmp_0       = (register_0 >> 8) & ((1ULL << 1) - 1);
+	out[8]  = tmp_0;
+	tmp_0       = (register_0 >> 9) & ((1ULL << 1) - 1);
+	out[9]  = tmp_0;
+	tmp_0       = (register_0 >> 10) & ((1ULL << 1) - 1);
+	out[10] = tmp_0;
+	tmp_0       = (register_0 >> 11) & ((1ULL << 1) - 1);
+	out[11] = tmp_0;
+	tmp_0       = (register_0 >> 12) & ((1ULL << 1) - 1);
+	out[12] = tmp_0;
+	tmp_0       = (register_0 >> 13) & ((1ULL << 1) - 1);
+	out[13] = tmp_0;
+	tmp_0       = (register_0 >> 14) & ((1ULL << 1) - 1);
+	out[14] = tmp_0;
+	tmp_0       = (register_0 >> 15) & ((1ULL << 1) - 1);
+	out[15] = tmp_0;
+	tmp_0       = (register_0 >> 16) & ((1ULL << 1) - 1);
+	out[16] = tmp_0;
+	tmp_0       = (register_0 >> 17) & ((1ULL << 1) - 1);
+	out[17] = tmp_0;
+	tmp_0       = (register_0 >> 18) & ((1ULL << 1) - 1);
+	out[18] = tmp_0;
+	tmp_0       = (register_0 >> 19) & ((1ULL << 1) - 1);
+	out[19] = tmp_0;
+	tmp_0       = (register_0 >> 20) & ((1ULL << 1) - 1);
+	out[20] = tmp_0;
+	tmp_0       = (register_0 >> 21) & ((1ULL << 1) - 1);
+	out[21] = tmp_0;
+	tmp_0       = (register_0 >> 22) & ((1ULL << 1) - 1);
+	out[22] = tmp_0;
+	tmp_0       = (register_0 >> 23) & ((1ULL << 1) - 1);
+	out[23] = tmp_0;
+	tmp_0       = (register_0 >> 24) & ((1ULL << 1) - 1);
+	out[24] = tmp_0;
+	tmp_0       = (register_0 >> 25) & ((1ULL << 1) - 1);
+	out[25] = tmp_0;
+	tmp_0       = (register_0 >> 26) & ((1ULL << 1) - 1);
+	out[26] = tmp_0;
+	tmp_0       = (register_0 >> 27) & ((1ULL << 1) - 1);
+	out[27] = tmp_0;
+	tmp_0       = (register_0 >> 28) & ((1ULL << 1) - 1);
+	out[28] = tmp_0;
+	tmp_0       = (register_0 >> 29) & ((1ULL << 1) - 1);
+	out[29] = tmp_0;
+	tmp_0       = (register_0 >> 30) & ((1ULL << 1) - 1);
+	out[30] = tmp_0;
+	tmp_0       = (register_0 >> 31) & ((1ULL << 1) - 1);
+	out[31] = tmp_0;
+}
+__device__ __forceinline__ void unpack_2bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) {
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p);
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p);
+	[[maybe_unused]] uint32_t register_0;
+	[[maybe_unused]] uint32_t tmp_0;
+	[[maybe_unused]] uint32_t base_0 = 0ULL;
+	// Decodes 32 2-bit values from 2 packed words, in[i] and in[i + 32] (i = threadIdx.x), lowest bits first, 16 values per word.
+	int i = threadIdx.x; // THREAD INDEX
+	// The 32 results go to out[0..31] with no per-thread offset; presumably `out` is thread-private storage; confirm at call sites.
+	register_0  = *(in + (0 * 32) + (i * 1) + 0);
+	tmp_0       = (register_0) & ((1ULL << 2) - 1);
+	out[0]  = tmp_0;
+	tmp_0       = (register_0 >> 2) & ((1ULL << 2) - 1);
+	out[1]  = tmp_0;
+	tmp_0       = (register_0 >> 4) & ((1ULL << 2) - 1);
+	out[2]  = tmp_0;
+	tmp_0       = (register_0 >> 6) & ((1ULL << 2) - 1);
+	out[3]  = tmp_0;
+	tmp_0       = (register_0 >> 8) & ((1ULL << 2) - 1);
+	out[4]  = tmp_0;
+	tmp_0       = (register_0 >> 10) & ((1ULL << 2) - 1);
+	out[5]  = tmp_0;
+	tmp_0       = (register_0 >> 12) & ((1ULL << 2) - 1);
+	out[6]  = tmp_0;
+	tmp_0       = (register_0 >> 14) & ((1ULL << 2) - 1);
+	out[7]  = tmp_0;
+	tmp_0       = (register_0 >> 16) & ((1ULL << 2) - 1);
+	out[8]  = tmp_0;
+	tmp_0       = (register_0 >> 18) & ((1ULL << 2) - 1);
+	out[9]  = tmp_0;
+	tmp_0       = (register_0 >> 20) & ((1ULL << 2) - 1);
+	out[10] = tmp_0;
+	tmp_0       = (register_0 >> 22) & ((1ULL << 2) - 1);
+	out[11] = tmp_0;
+	tmp_0       = (register_0 >> 24) & ((1ULL << 2) - 1);
+	out[12] = tmp_0;
+	tmp_0       = (register_0 >> 26) & ((1ULL << 2) - 1);
+	out[13] = tmp_0;
+	tmp_0       = (register_0 >> 28) & ((1ULL << 2) - 1);
+	out[14] = tmp_0;
+	tmp_0       = (register_0 >> 30) & ((1ULL << 2) - 1);
+	out[15] = tmp_0;
+	register_0  = *(in + (0 * 32) + (i * 1) + 32);
+	tmp_0       = (register_0) & ((1ULL << 2) - 1);
+	out[16] = tmp_0;
+	tmp_0       = (register_0 >> 2) & ((1ULL << 2) - 1);
+	out[17] = tmp_0;
+	tmp_0       = (register_0 >> 4) & ((1ULL << 2) - 1);
+	out[18] = tmp_0;
+	tmp_0       = (register_0 >> 6) & ((1ULL << 2) - 1);
+	out[19] = tmp_0;
+	tmp_0       = (register_0 >> 8) & ((1ULL << 2) - 1);
+	out[20] = tmp_0;
+	tmp_0       = (register_0 >> 10) & ((1ULL << 2) - 1);
+	out[21] = tmp_0;
+	tmp_0       = (register_0 >> 12) & ((1ULL << 2) - 1);
+	out[22] = tmp_0;
+	tmp_0       = (register_0 >> 14) & ((1ULL << 2) - 1);
+	out[23] = tmp_0;
+	tmp_0       = (register_0 >> 16) & ((1ULL << 2) - 1);
+	out[24] = tmp_0;
+	tmp_0       = (register_0 >> 18) & ((1ULL << 2) - 1);
+	out[25] = tmp_0;
+	tmp_0       = (register_0 >> 20) & ((1ULL << 2) - 1);
+	out[26] = tmp_0;
+	tmp_0       = (register_0 >> 22) & ((1ULL << 2) - 1);
+	out[27] = tmp_0;
+	tmp_0       = (register_0 >> 24) & ((1ULL << 2) - 1);
+	out[28] = tmp_0;
+	tmp_0       = (register_0 >> 26) & ((1ULL << 2) - 1);
+	out[29] = tmp_0;
+	tmp_0       = (register_0 >> 28) & ((1ULL << 2) - 1);
+	out[30] = tmp_0;
+	tmp_0       = (register_0 >> 30) & ((1ULL << 2) - 1);
+	out[31] = tmp_0;
+}
+__device__ __forceinline__ void unpack_3bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) {
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p);
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p);
+	[[maybe_unused]] uint32_t register_0;
+	[[maybe_unused]] uint32_t tmp_0;
+	[[maybe_unused]] uint32_t base_0 = 0ULL;
+	// Decodes 32 3-bit values from 3 packed words, in[i], in[i + 32], in[i + 64] (i = threadIdx.x), lowest bits first; values 10 and 21 straddle words.
+	int i = threadIdx.x; // THREAD INDEX
+	// The 32 results go to out[0..31] with no per-thread offset; presumably `out` is thread-private storage; confirm at call sites.
+	register_0 = *(in + (0 * 32) + (i * 1) + 0);
+	tmp_0      = (register_0) & ((1ULL << 3) - 1);
+	out[0] = tmp_0;
+	tmp_0      = (register_0 >> 3) & ((1ULL << 3) - 1);
+	out[1] = tmp_0;
+	tmp_0      = (register_0 >> 6) & ((1ULL << 3) - 1);
+	out[2] = tmp_0;
+	tmp_0      = (register_0 >> 9) & ((1ULL << 3) - 1);
+	out[3] = tmp_0;
+	tmp_0      = (register_0 >> 12) & ((1ULL << 3) - 1);
+	out[4] = tmp_0;
+	tmp_0      = (register_0 >> 15) & ((1ULL << 3) - 1);
+	out[5] = tmp_0;
+	tmp_0      = (register_0 >> 18) & ((1ULL << 3) - 1);
+	out[6] = tmp_0;
+	tmp_0      = (register_0 >> 21) & ((1ULL << 3) - 1);
+	out[7] = tmp_0;
+	tmp_0      = (register_0 >> 24) & ((1ULL << 3) - 1);
+	out[8] = tmp_0;
+	tmp_0      = (register_0 >> 27) & ((1ULL << 3) - 1);
+	out[9] = tmp_0;
+	tmp_0      = (register_0 >> 30) & ((1ULL << 2) - 1); // straddling value: its low 2 bits are the top 2 bits of this word
+	register_0 = *(in + (0 * 32) + (i * 1) + 32);
+	tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 2; // ...and its high bit is the lowest bit of the next word (same pattern repeats below)
+	out[10] = tmp_0;
+	tmp_0       = (register_0 >> 1) & ((1ULL << 3) - 1);
+	out[11] = tmp_0;
+	tmp_0       = (register_0 >> 4) & ((1ULL << 3) - 1);
+	out[12] = tmp_0;
+	tmp_0       = (register_0 >> 7) & ((1ULL << 3) - 1);
+	out[13] = tmp_0;
+	tmp_0       = (register_0 >> 10) & ((1ULL << 3) - 1);
+	out[14] = tmp_0;
+	tmp_0       = (register_0 >> 13) & ((1ULL << 3) - 1);
+	out[15] = tmp_0;
+	tmp_0       = (register_0 >> 16) & ((1ULL << 3) - 1);
+	out[16] = tmp_0;
+	tmp_0       = (register_0 >> 19) & ((1ULL << 3) - 1);
+	out[17] = tmp_0;
+	tmp_0       = (register_0 >> 22) & ((1ULL << 3) - 1);
+	out[18] = tmp_0;
+	tmp_0       = (register_0 >> 25) & ((1ULL << 3) - 1);
+	out[19] = tmp_0;
+	tmp_0       = (register_0 >> 28) & ((1ULL << 3) - 1);
+	out[20] = tmp_0;
+	tmp_0       = (register_0 >> 31) & ((1ULL << 1) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 64);
+	tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 1;
+	out[21] = tmp_0;
+	tmp_0       = (register_0 >> 2) & ((1ULL << 3) - 1);
+	out[22] = tmp_0;
+	tmp_0       = (register_0 >> 5) & ((1ULL << 3) - 1);
+	out[23] = tmp_0;
+	tmp_0       = (register_0 >> 8) & ((1ULL << 3) - 1);
+	out[24] = tmp_0;
+	tmp_0       = (register_0 >> 11) & ((1ULL << 3) - 1);
+	out[25] = tmp_0;
+	tmp_0       = (register_0 >> 14) & ((1ULL << 3) - 1);
+	out[26] = tmp_0;
+	tmp_0       = (register_0 >> 17) & ((1ULL << 3) - 1);
+	out[27] = tmp_0;
+	tmp_0       = (register_0 >> 20) & ((1ULL << 3) - 1);
+	out[28] = tmp_0;
+	tmp_0       = (register_0 >> 23) & ((1ULL << 3) - 1);
+	out[29] = tmp_0;
+	tmp_0       = (register_0 >> 26) & ((1ULL << 3) - 1);
+	out[30] = tmp_0;
+	tmp_0       = (register_0 >> 29) & ((1ULL << 3) - 1);
+	out[31] = tmp_0;
+}
+__device__ __forceinline__ void unpack_4bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) {
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p);
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p);
+	[[maybe_unused]] uint32_t register_0;
+	[[maybe_unused]] uint32_t tmp_0;
+	[[maybe_unused]] uint32_t base_0 = 0ULL;
+	// Decodes 32 4-bit values from 4 packed words, in[i + 32 * k] for k = 0..3 (i = threadIdx.x), lowest bits first, 8 values per word.
+	int i = threadIdx.x; // THREAD INDEX
+	// The 32 results go to out[0..31] with no per-thread offset; presumably `out` is thread-private storage; confirm at call sites.
+	register_0  = *(in + (0 * 32) + (i * 1) + 0);
+	tmp_0       = (register_0) & ((1ULL << 4) - 1);
+	out[0]  = tmp_0;
+	tmp_0       = (register_0 >> 4) & ((1ULL << 4) - 1);
+	out[1]  = tmp_0;
+	tmp_0       = (register_0 >> 8) & ((1ULL << 4) - 1);
+	out[2]  = tmp_0;
+	tmp_0       = (register_0 >> 12) & ((1ULL << 4) - 1);
+	out[3]  = tmp_0;
+	tmp_0       = (register_0 >> 16) & ((1ULL << 4) - 1);
+	out[4]  = tmp_0;
+	tmp_0       = (register_0 >> 20) & ((1ULL << 4) - 1);
+	out[5]  = tmp_0;
+	tmp_0       = (register_0 >> 24) & ((1ULL << 4) - 1);
+	out[6]  = tmp_0;
+	tmp_0       = (register_0 >> 28) & ((1ULL << 4) - 1);
+	out[7]  = tmp_0;
+	register_0  = *(in + (0 * 32) + (i * 1) + 32);
+	tmp_0       = (register_0) & ((1ULL << 4) - 1);
+	out[8]  = tmp_0;
+	tmp_0       = (register_0 >> 4) & ((1ULL << 4) - 1);
+	out[9]  = tmp_0;
+	tmp_0       = (register_0 >> 8) & ((1ULL << 4) - 1);
+	out[10] = tmp_0;
+	tmp_0       = (register_0 >> 12) & ((1ULL << 4) - 1);
+	out[11] = tmp_0;
+	tmp_0       = (register_0 >> 16) & ((1ULL << 4) - 1);
+	out[12] = tmp_0;
+	tmp_0       = (register_0 >> 20) & ((1ULL << 4) - 1);
+	out[13] = tmp_0;
+	tmp_0       = (register_0 >> 24) & ((1ULL << 4) - 1);
+	out[14] = tmp_0;
+	tmp_0       = (register_0 >> 28) & ((1ULL << 4) - 1);
+	out[15] = tmp_0;
+	register_0  = *(in + (0 * 32) + (i * 1) + 64);
+	tmp_0       = (register_0) & ((1ULL << 4) - 1);
+	out[16] = tmp_0;
+	tmp_0       = (register_0 >> 4) & ((1ULL << 4) - 1);
+	out[17] = tmp_0;
+	tmp_0       = (register_0 >> 8) & ((1ULL << 4) - 1);
+	out[18] = tmp_0;
+	tmp_0       = (register_0 >> 12) & ((1ULL << 4) - 1);
+	out[19] = tmp_0;
+	tmp_0       = (register_0 >> 16) & ((1ULL << 4) - 1);
+	out[20] = tmp_0;
+	tmp_0       = (register_0 >> 20) & ((1ULL << 4) - 1);
+	out[21] = tmp_0;
+	tmp_0       = (register_0 >> 24) & ((1ULL << 4) - 1);
+	out[22] = tmp_0;
+	tmp_0       = (register_0 >> 28) & ((1ULL << 4) - 1);
+	out[23] = tmp_0;
+	register_0  = *(in + (0 * 32) + (i * 1) + 96);
+	tmp_0       = (register_0) & ((1ULL << 4) - 1);
+	out[24] = tmp_0;
+	tmp_0       = (register_0 >> 4) & ((1ULL << 4) - 1);
+	out[25] = tmp_0;
+	tmp_0       = (register_0 >> 8) & ((1ULL << 4) - 1);
+	out[26] = tmp_0;
+	tmp_0       = (register_0 >> 12) & ((1ULL << 4) - 1);
+	out[27] = tmp_0;
+	tmp_0       = (register_0 >> 16) & ((1ULL << 4) - 1);
+	out[28] = tmp_0;
+	tmp_0       = (register_0 >> 20) & ((1ULL << 4) - 1);
+	out[29] = tmp_0;
+	tmp_0       = (register_0 >> 24) & ((1ULL << 4) - 1);
+	out[30] = tmp_0;
+	tmp_0       = (register_0 >> 28) & ((1ULL << 4) - 1);
+	out[31] = tmp_0;
+}
+__device__ __forceinline__ void unpack_5bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) {
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p);
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p);
+	[[maybe_unused]] uint32_t register_0;
+	[[maybe_unused]] uint32_t tmp_0;
+	[[maybe_unused]] uint32_t base_0 = 0ULL;
+	// Decodes 32 5-bit values from 5 packed words, in[i + 32 * k] for k = 0..4 (i = threadIdx.x), lowest bits first; values 6, 12, 19, 25 straddle words.
+	int i = threadIdx.x; // THREAD INDEX
+	// The 32 results go to out[0..31] with no per-thread offset; presumably `out` is thread-private storage; confirm at call sites.
+	register_0 = *(in + (0 * 32) + (i * 1) + 0);
+	tmp_0      = (register_0) & ((1ULL << 5) - 1);
+	out[0] = tmp_0;
+	tmp_0      = (register_0 >> 5) & ((1ULL << 5) - 1);
+	out[1] = tmp_0;
+	tmp_0      = (register_0 >> 10) & ((1ULL << 5) - 1);
+	out[2] = tmp_0;
+	tmp_0      = (register_0 >> 15) & ((1ULL << 5) - 1);
+	out[3] = tmp_0;
+	tmp_0      = (register_0 >> 20) & ((1ULL << 5) - 1);
+	out[4] = tmp_0;
+	tmp_0      = (register_0 >> 25) & ((1ULL << 5) - 1);
+	out[5] = tmp_0;
+	tmp_0      = (register_0 >> 30) & ((1ULL << 2) - 1); // straddling value: its low 2 bits are the top 2 bits of this word
+	register_0 = *(in + (0 * 32) + (i * 1) + 32);
+	tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 2; // ...and its high 3 bits are the low 3 bits of the next word (same pattern repeats below)
+	out[6]  = tmp_0;
+	tmp_0       = (register_0 >> 3) & ((1ULL << 5) - 1);
+	out[7]  = tmp_0;
+	tmp_0       = (register_0 >> 8) & ((1ULL << 5) - 1);
+	out[8]  = tmp_0;
+	tmp_0       = (register_0 >> 13) & ((1ULL << 5) - 1);
+	out[9]  = tmp_0;
+	tmp_0       = (register_0 >> 18) & ((1ULL << 5) - 1);
+	out[10] = tmp_0;
+	tmp_0       = (register_0 >> 23) & ((1ULL << 5) - 1);
+	out[11] = tmp_0;
+	tmp_0       = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 64);
+	tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 4;
+	out[12] = tmp_0;
+	tmp_0       = (register_0 >> 1) & ((1ULL << 5) - 1);
+	out[13] = tmp_0;
+	tmp_0       = (register_0 >> 6) & ((1ULL << 5) - 1);
+	out[14] = tmp_0;
+	tmp_0       = (register_0 >> 11) & ((1ULL << 5) - 1);
+	out[15] = tmp_0;
+	tmp_0       = (register_0 >> 16) & ((1ULL << 5) - 1);
+	out[16] = tmp_0;
+	tmp_0       = (register_0 >> 21) & ((1ULL << 5) - 1);
+	out[17] = tmp_0;
+	tmp_0       = (register_0 >> 26) & ((1ULL << 5) - 1);
+	out[18] = tmp_0;
+	tmp_0       = (register_0 >> 31) & ((1ULL << 1) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 96);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 1;
+	out[19] = tmp_0;
+	tmp_0       = (register_0 >> 4) & ((1ULL << 5) - 1);
+	out[20] = tmp_0;
+	tmp_0       = (register_0 >> 9) & ((1ULL << 5) - 1);
+	out[21] = tmp_0;
+	tmp_0       = (register_0 >> 14) & ((1ULL << 5) - 1);
+	out[22] = tmp_0;
+	tmp_0       = (register_0 >> 19) & ((1ULL << 5) - 1);
+	out[23] = tmp_0;
+	tmp_0       = (register_0 >> 24) & ((1ULL << 5) - 1);
+	out[24] = tmp_0;
+	tmp_0       = (register_0 >> 29) & ((1ULL << 3) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 128);
+	tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 3;
+	out[25] = tmp_0;
+	tmp_0       = (register_0 >> 2) & ((1ULL << 5) - 1);
+	out[26] = tmp_0;
+	tmp_0       = (register_0 >> 7) & ((1ULL << 5) - 1);
+	out[27] = tmp_0;
+	tmp_0       = (register_0 >> 12) & ((1ULL << 5) - 1);
+	out[28] = tmp_0;
+	tmp_0       = (register_0 >> 17) & ((1ULL << 5) - 1);
+	out[29] = tmp_0;
+	tmp_0       = (register_0 >> 22) & ((1ULL << 5) - 1);
+	out[30] = tmp_0;
+	tmp_0       = (register_0 >> 27) & ((1ULL << 5) - 1);
+	out[31] = tmp_0;
+}
+__device__ __forceinline__ void unpack_6bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) {
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p);
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p);
+	[[maybe_unused]] uint32_t register_0;
+	[[maybe_unused]] uint32_t tmp_0;
+	[[maybe_unused]] uint32_t base_0 = 0ULL;
+	// Decodes 32 6-bit values from 6 packed words, in[i + 32 * k] for k = 0..5 (i = threadIdx.x), lowest bits first; values 5, 10, 21, 26 straddle words.
+	int i = threadIdx.x; // THREAD INDEX
+	// The 32 results go to out[0..31] with no per-thread offset; presumably `out` is thread-private storage; confirm at call sites.
+	register_0 = *(in + (0 * 32) + (i * 1) + 0);
+	tmp_0      = (register_0) & ((1ULL << 6) - 1);
+	out[0] = tmp_0;
+	tmp_0      = (register_0 >> 6) & ((1ULL << 6) - 1);
+	out[1] = tmp_0;
+	tmp_0      = (register_0 >> 12) & ((1ULL << 6) - 1);
+	out[2] = tmp_0;
+	tmp_0      = (register_0 >> 18) & ((1ULL << 6) - 1);
+	out[3] = tmp_0;
+	tmp_0      = (register_0 >> 24) & ((1ULL << 6) - 1);
+	out[4] = tmp_0;
+	tmp_0      = (register_0 >> 30) & ((1ULL << 2) - 1); // straddling value: its low 2 bits are the top 2 bits of this word
+	register_0 = *(in + (0 * 32) + (i * 1) + 32);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 2; // ...and its high 4 bits are the low 4 bits of the next word (same pattern repeats below)
+	out[5] = tmp_0;
+	tmp_0      = (register_0 >> 4) & ((1ULL << 6) - 1);
+	out[6] = tmp_0;
+	tmp_0      = (register_0 >> 10) & ((1ULL << 6) - 1);
+	out[7] = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 6) - 1);
+	out[8] = tmp_0;
+	tmp_0      = (register_0 >> 22) & ((1ULL << 6) - 1);
+	out[9] = tmp_0;
+	tmp_0      = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 64);
+	tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 4;
+	out[10] = tmp_0;
+	tmp_0       = (register_0 >> 2) & ((1ULL << 6) - 1);
+	out[11] = tmp_0;
+	tmp_0       = (register_0 >> 8) & ((1ULL << 6) - 1);
+	out[12] = tmp_0;
+	tmp_0       = (register_0 >> 14) & ((1ULL << 6) - 1);
+	out[13] = tmp_0;
+	tmp_0       = (register_0 >> 20) & ((1ULL << 6) - 1);
+	out[14] = tmp_0;
+	tmp_0       = (register_0 >> 26) & ((1ULL << 6) - 1);
+	out[15] = tmp_0;
+	register_0  = *(in + (0 * 32) + (i * 1) + 96);
+	tmp_0       = (register_0) & ((1ULL << 6) - 1);
+	out[16] = tmp_0;
+	tmp_0       = (register_0 >> 6) & ((1ULL << 6) - 1);
+	out[17] = tmp_0;
+	tmp_0       = (register_0 >> 12) & ((1ULL << 6) - 1);
+	out[18] = tmp_0;
+	tmp_0       = (register_0 >> 18) & ((1ULL << 6) - 1);
+	out[19] = tmp_0;
+	tmp_0       = (register_0 >> 24) & ((1ULL << 6) - 1);
+	out[20] = tmp_0;
+	tmp_0       = (register_0 >> 30) & ((1ULL << 2) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 128);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 2;
+	out[21] = tmp_0;
+	tmp_0       = (register_0 >> 4) & ((1ULL << 6) - 1);
+	out[22] = tmp_0;
+	tmp_0       = (register_0 >> 10) & ((1ULL << 6) - 1);
+	out[23] = tmp_0;
+	tmp_0       = (register_0 >> 16) & ((1ULL << 6) - 1);
+	out[24] = tmp_0;
+	tmp_0       = (register_0 >> 22) & ((1ULL << 6) - 1);
+	out[25] = tmp_0;
+	tmp_0       = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 160);
+	tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 4;
+	out[26] = tmp_0;
+	tmp_0       = (register_0 >> 2) & ((1ULL << 6) - 1);
+	out[27] = tmp_0;
+	tmp_0       = (register_0 >> 8) & ((1ULL << 6) - 1);
+	out[28] = tmp_0;
+	tmp_0       = (register_0 >> 14) & ((1ULL << 6) - 1);
+	out[29] = tmp_0;
+	tmp_0       = (register_0 >> 20) & ((1ULL << 6) - 1);
+	out[30] = tmp_0;
+	tmp_0       = (register_0 >> 26) & ((1ULL << 6) - 1);
+	out[31] = tmp_0;
+}
+__device__ __forceinline__ void unpack_7bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) {
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p);
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p);
+	[[maybe_unused]] uint32_t register_0;
+	[[maybe_unused]] uint32_t tmp_0;
+	[[maybe_unused]] uint32_t base_0 = 0ULL;
+	// Decodes 32 7-bit values from 7 packed words, in[i + 32 * k] for k = 0..6 (i = threadIdx.x), lowest bits first; values 4, 9, 13, 18, 22, 27 straddle words.
+	int i = threadIdx.x; // THREAD INDEX
+	// The 32 results go to out[0..31] with no per-thread offset; presumably `out` is thread-private storage; confirm at call sites.
+	register_0 = *(in + (0 * 32) + (i * 1) + 0);
+	tmp_0      = (register_0) & ((1ULL << 7) - 1);
+	out[0] = tmp_0;
+	tmp_0      = (register_0 >> 7) & ((1ULL << 7) - 1);
+	out[1] = tmp_0;
+	tmp_0      = (register_0 >> 14) & ((1ULL << 7) - 1);
+	out[2] = tmp_0;
+	tmp_0      = (register_0 >> 21) & ((1ULL << 7) - 1);
+	out[3] = tmp_0;
+	tmp_0      = (register_0 >> 28) & ((1ULL << 4) - 1); // straddling value: its low 4 bits are the top 4 bits of this word
+	register_0 = *(in + (0 * 32) + (i * 1) + 32);
+	tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 4; // ...and its high 3 bits are the low 3 bits of the next word (same pattern repeats below)
+	out[4] = tmp_0;
+	tmp_0      = (register_0 >> 3) & ((1ULL << 7) - 1);
+	out[5] = tmp_0;
+	tmp_0      = (register_0 >> 10) & ((1ULL << 7) - 1);
+	out[6] = tmp_0;
+	tmp_0      = (register_0 >> 17) & ((1ULL << 7) - 1);
+	out[7] = tmp_0;
+	tmp_0      = (register_0 >> 24) & ((1ULL << 7) - 1);
+	out[8] = tmp_0;
+	tmp_0      = (register_0 >> 31) & ((1ULL << 1) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 64);
+	tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 1;
+	out[9]  = tmp_0;
+	tmp_0       = (register_0 >> 6) & ((1ULL << 7) - 1);
+	out[10] = tmp_0;
+	tmp_0       = (register_0 >> 13) & ((1ULL << 7) - 1);
+	out[11] = tmp_0;
+	tmp_0       = (register_0 >> 20) & ((1ULL << 7) - 1);
+	out[12] = tmp_0;
+	tmp_0       = (register_0 >> 27) & ((1ULL << 5) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 96);
+	tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 5;
+	out[13] = tmp_0;
+	tmp_0       = (register_0 >> 2) & ((1ULL << 7) - 1);
+	out[14] = tmp_0;
+	tmp_0       = (register_0 >> 9) & ((1ULL << 7) - 1);
+	out[15] = tmp_0;
+	tmp_0       = (register_0 >> 16) & ((1ULL << 7) - 1);
+	out[16] = tmp_0;
+	tmp_0       = (register_0 >> 23) & ((1ULL << 7) - 1);
+	out[17] = tmp_0;
+	tmp_0       = (register_0 >> 30) & ((1ULL << 2) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 128);
+	tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 2;
+	out[18] = tmp_0;
+	tmp_0       = (register_0 >> 5) & ((1ULL << 7) - 1);
+	out[19] = tmp_0;
+	tmp_0       = (register_0 >> 12) & ((1ULL << 7) - 1);
+	out[20] = tmp_0;
+	tmp_0       = (register_0 >> 19) & ((1ULL << 7) - 1);
+	out[21] = tmp_0;
+	tmp_0       = (register_0 >> 26) & ((1ULL << 6) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 160);
+	tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 6;
+	out[22] = tmp_0;
+	tmp_0       = (register_0 >> 1) & ((1ULL << 7) - 1);
+	out[23] = tmp_0;
+	tmp_0       = (register_0 >> 8) & ((1ULL << 7) - 1);
+	out[24] = tmp_0;
+	tmp_0       = (register_0 >> 15) & ((1ULL << 7) - 1);
+	out[25] = tmp_0;
+	tmp_0       = (register_0 >> 22) & ((1ULL << 7) - 1);
+	out[26] = tmp_0;
+	tmp_0       = (register_0 >> 29) & ((1ULL << 3) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 192);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 3;
+	out[27] = tmp_0;
+	tmp_0       = (register_0 >> 4) & ((1ULL << 7) - 1);
+	out[28] = tmp_0;
+	tmp_0       = (register_0 >> 11) & ((1ULL << 7) - 1);
+	out[29] = tmp_0;
+	tmp_0       = (register_0 >> 18) & ((1ULL << 7) - 1);
+	out[30] = tmp_0;
+	tmp_0       = (register_0 >> 25) & ((1ULL << 7) - 1);
+	out[31] = tmp_0;
+}
+__device__ __forceinline__ void unpack_8bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) {
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p);
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p);
+	[[maybe_unused]] uint32_t register_0;
+	[[maybe_unused]] uint32_t tmp_0;
+	[[maybe_unused]] uint32_t base_0 = 0ULL;
+	// Decodes 32 8-bit values from 8 packed words, in[i + 32 * k] for k = 0..7 (i = threadIdx.x), lowest byte first, 4 values per word.
+	int i = threadIdx.x; // THREAD INDEX
+	// The 32 results go to out[0..31] with no per-thread offset; presumably `out` is thread-private storage; confirm at call sites.
+	register_0  = *(in + (0 * 32) + (i * 1) + 0);
+	tmp_0       = (register_0) & ((1ULL << 8) - 1);
+	out[0]  = tmp_0;
+	tmp_0       = (register_0 >> 8) & ((1ULL << 8) - 1);
+	out[1]  = tmp_0;
+	tmp_0       = (register_0 >> 16) & ((1ULL << 8) - 1);
+	out[2]  = tmp_0;
+	tmp_0       = (register_0 >> 24) & ((1ULL << 8) - 1);
+	out[3]  = tmp_0;
+	register_0  = *(in + (0 * 32) + (i * 1) + 32);
+	tmp_0       = (register_0) & ((1ULL << 8) - 1);
+	out[4]  = tmp_0;
+	tmp_0       = (register_0 >> 8) & ((1ULL << 8) - 1);
+	out[5]  = tmp_0;
+	tmp_0       = (register_0 >> 16) & ((1ULL << 8) - 1);
+	out[6]  = tmp_0;
+	tmp_0       = (register_0 >> 24) & ((1ULL << 8) - 1);
+	out[7]  = tmp_0;
+	register_0  = *(in + (0 * 32) + (i * 1) + 64);
+	tmp_0       = (register_0) & ((1ULL << 8) - 1);
+	out[8]  = tmp_0;
+	tmp_0       = (register_0 >> 8) & ((1ULL << 8) - 1);
+	out[9]  = tmp_0;
+	tmp_0       = (register_0 >> 16) & ((1ULL << 8) - 1);
+	out[10] = tmp_0;
+	tmp_0       = (register_0 >> 24) & ((1ULL << 8) - 1);
+	out[11] = tmp_0;
+	register_0  = *(in + (0 * 32) + (i * 1) + 96);
+	tmp_0       = (register_0) & ((1ULL << 8) - 1);
+	out[12] = tmp_0;
+	tmp_0       = (register_0 >> 8) & ((1ULL << 8) - 1);
+	out[13] = tmp_0;
+	tmp_0       = (register_0 >> 16) & ((1ULL << 8) - 1);
+	out[14] = tmp_0;
+	tmp_0       = (register_0 >> 24) & ((1ULL << 8) - 1);
+	out[15] = tmp_0;
+	register_0  = *(in + (0 * 32) + (i * 1) + 128);
+	tmp_0       = (register_0) & ((1ULL << 8) - 1);
+	out[16] = tmp_0;
+	tmp_0       = (register_0 >> 8) & ((1ULL << 8) - 1);
+	out[17] = tmp_0;
+	tmp_0       = (register_0 >> 16) & ((1ULL << 8) - 1);
+	out[18] = tmp_0;
+	tmp_0       = (register_0 >> 24) & ((1ULL << 8) - 1);
+	out[19] = tmp_0;
+	register_0  = *(in + (0 * 32) + (i * 1) + 160);
+	tmp_0       = (register_0) & ((1ULL << 8) - 1);
+	out[20] = tmp_0;
+	tmp_0       = (register_0 >> 8) & ((1ULL << 8) - 1);
+	out[21] = tmp_0;
+	tmp_0       = (register_0 >> 16) & ((1ULL << 8) - 1);
+	out[22] = tmp_0;
+	tmp_0       = (register_0 >> 24) & ((1ULL << 8) - 1);
+	out[23] = tmp_0;
+	register_0  = *(in + (0 * 32) + (i * 1) + 192);
+	tmp_0       = (register_0) & ((1ULL << 8) - 1);
+	out[24] = tmp_0;
+	tmp_0       = (register_0 >> 8) & ((1ULL << 8) - 1);
+	out[25] = tmp_0;
+	tmp_0       = (register_0 >> 16) & ((1ULL << 8) - 1);
+	out[26] = tmp_0;
+	tmp_0       = (register_0 >> 24) & ((1ULL << 8) - 1);
+	out[27] = tmp_0;
+	register_0  = *(in + (0 * 32) + (i * 1) + 224);
+	tmp_0       = (register_0) & ((1ULL << 8) - 1);
+	out[28] = tmp_0;
+	tmp_0       = (register_0 >> 8) & ((1ULL << 8) - 1);
+	out[29] = tmp_0;
+	tmp_0       = (register_0 >> 16) & ((1ULL << 8) - 1);
+	out[30] = tmp_0;
+	tmp_0       = (register_0 >> 24) & ((1ULL << 8) - 1);
+	out[31] = tmp_0;
+}
+__device__ __forceinline__ void unpack_9bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) {
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p);
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p);
+	[[maybe_unused]] uint32_t register_0;
+	[[maybe_unused]] uint32_t tmp_0;
+	[[maybe_unused]] uint32_t base_0 = 0ULL;
+	// Decodes 32 9-bit values from 9 packed words, in[i + 32 * k] for k = 0..8 (i = threadIdx.x), lowest bits first; values 3, 7, 10, 14, 17, 21, 24, 28 straddle words.
+	int i = threadIdx.x; // THREAD INDEX
+	// The 32 results go to out[0..31] with no per-thread offset; presumably `out` is thread-private storage; confirm at call sites.
+	register_0 = *(in + (0 * 32) + (i * 1) + 0);
+	tmp_0      = (register_0) & ((1ULL << 9) - 1);
+	out[0] = tmp_0;
+	tmp_0      = (register_0 >> 9) & ((1ULL << 9) - 1);
+	out[1] = tmp_0;
+	tmp_0      = (register_0 >> 18) & ((1ULL << 9) - 1);
+	out[2] = tmp_0;
+	tmp_0      = (register_0 >> 27) & ((1ULL << 5) - 1); // straddling value: its low 5 bits are the top 5 bits of this word
+	register_0 = *(in + (0 * 32) + (i * 1) + 32);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 5; // ...and its high 4 bits are the low 4 bits of the next word (same pattern repeats below)
+	out[3] = tmp_0;
+	tmp_0      = (register_0 >> 4) & ((1ULL << 9) - 1);
+	out[4] = tmp_0;
+	tmp_0      = (register_0 >> 13) & ((1ULL << 9) - 1);
+	out[5] = tmp_0;
+	tmp_0      = (register_0 >> 22) & ((1ULL << 9) - 1);
+	out[6] = tmp_0;
+	tmp_0      = (register_0 >> 31) & ((1ULL << 1) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 64);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 1;
+	out[7] = tmp_0;
+	tmp_0      = (register_0 >> 8) & ((1ULL << 9) - 1);
+	out[8] = tmp_0;
+	tmp_0      = (register_0 >> 17) & ((1ULL << 9) - 1);
+	out[9] = tmp_0;
+	tmp_0      = (register_0 >> 26) & ((1ULL << 6) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 96);
+	tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 6;
+	out[10] = tmp_0;
+	tmp_0       = (register_0 >> 3) & ((1ULL << 9) - 1);
+	out[11] = tmp_0;
+	tmp_0       = (register_0 >> 12) & ((1ULL << 9) - 1);
+	out[12] = tmp_0;
+	tmp_0       = (register_0 >> 21) & ((1ULL << 9) - 1);
+	out[13] = tmp_0;
+	tmp_0       = (register_0 >> 30) & ((1ULL << 2) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 128);
+	tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 2;
+	out[14] = tmp_0;
+	tmp_0       = (register_0 >> 7) & ((1ULL << 9) - 1);
+	out[15] = tmp_0;
+	tmp_0       = (register_0 >> 16) & ((1ULL << 9) - 1);
+	out[16] = tmp_0;
+	tmp_0       = (register_0 >> 25) & ((1ULL << 7) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 160);
+	tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 7;
+	out[17] = tmp_0;
+	tmp_0       = (register_0 >> 2) & ((1ULL << 9) - 1);
+	out[18] = tmp_0;
+	tmp_0       = (register_0 >> 11) & ((1ULL << 9) - 1);
+	out[19] = tmp_0;
+	tmp_0       = (register_0 >> 20) & ((1ULL << 9) - 1);
+	out[20] = tmp_0;
+	tmp_0       = (register_0 >> 29) & ((1ULL << 3) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 192);
+	tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 3;
+	out[21] = tmp_0;
+	tmp_0       = (register_0 >> 6) & ((1ULL << 9) - 1);
+	out[22] = tmp_0;
+	tmp_0       = (register_0 >> 15) & ((1ULL << 9) - 1);
+	out[23] = tmp_0;
+	tmp_0       = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 224);
+	tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 8;
+	out[24] = tmp_0;
+	tmp_0       = (register_0 >> 1) & ((1ULL << 9) - 1);
+	out[25] = tmp_0;
+	tmp_0       = (register_0 >> 10) & ((1ULL << 9) - 1);
+	out[26] = tmp_0;
+	tmp_0       = (register_0 >> 19) & ((1ULL << 9) - 1);
+	out[27] = tmp_0;
+	tmp_0       = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 256);
+	tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 4;
+	out[28] = tmp_0;
+	tmp_0       = (register_0 >> 5) & ((1ULL << 9) - 1);
+	out[29] = tmp_0;
+	tmp_0       = (register_0 >> 14) & ((1ULL << 9) - 1);
+	out[30] = tmp_0;
+	tmp_0       = (register_0 >> 23) & ((1ULL << 9) - 1);
+	out[31] = tmp_0;
+}
+__device__ __forceinline__ void unpack_10bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) {
+	// Unpacks 32 10-bit values for the calling thread: packed word k is read from a_in_p[k * 32 + threadIdx.x]
+	// (10 words in total), values sit low bits first, and a value split across two words is stitched together.
+	// Results are written to a_out_p[0..31]; presumably a per-thread buffer -- TODO confirm against the caller.
+	const uint32_t* src  = a_in_p;
+	uint32_t*       dst  = a_out_p;
+	const int       lane = threadIdx.x; // this thread's x index
+	uint32_t        val;
+
+	uint32_t word = src[lane];
+	val = word & ((1u << 10) - 1u);
+	dst[0] = val;
+	val = (word >> 10) & ((1u << 10) - 1u);
+	dst[1] = val;
+	val = (word >> 20) & ((1u << 10) - 1u);
+	dst[2] = val;
+	val = (word >> 30) & ((1u << 2) - 1u);
+	word = src[lane + 32];
+	val |= (word & ((1u << 8) - 1u)) << 2;
+	dst[3] = val;
+	val = (word >> 8) & ((1u << 10) - 1u);
+	dst[4] = val;
+	val = (word >> 18) & ((1u << 10) - 1u);
+	dst[5] = val;
+	val = (word >> 28) & ((1u << 4) - 1u);
+	word = src[lane + 64];
+	val |= (word & ((1u << 6) - 1u)) << 4;
+	dst[6] = val;
+	val = (word >> 6) & ((1u << 10) - 1u);
+	dst[7] = val;
+	val = (word >> 16) & ((1u << 10) - 1u);
+	dst[8] = val;
+	val = (word >> 26) & ((1u << 6) - 1u);
+	word = src[lane + 96];
+	val |= (word & ((1u << 4) - 1u)) << 6;
+	dst[9] = val;
+	val = (word >> 4) & ((1u << 10) - 1u);
+	dst[10] = val;
+	val = (word >> 14) & ((1u << 10) - 1u);
+	dst[11] = val;
+	val = (word >> 24) & ((1u << 8) - 1u);
+	word = src[lane + 128];
+	val |= (word & ((1u << 2) - 1u)) << 8;
+	dst[12] = val;
+	val = (word >> 2) & ((1u << 10) - 1u);
+	dst[13] = val;
+	val = (word >> 12) & ((1u << 10) - 1u);
+	dst[14] = val;
+	val = (word >> 22) & ((1u << 10) - 1u);
+	dst[15] = val;
+	word = src[lane + 160];
+	val = word & ((1u << 10) - 1u);
+	dst[16] = val;
+	val = (word >> 10) & ((1u << 10) - 1u);
+	dst[17] = val;
+	val = (word >> 20) & ((1u << 10) - 1u);
+	dst[18] = val;
+	val = (word >> 30) & ((1u << 2) - 1u);
+	word = src[lane + 192];
+	val |= (word & ((1u << 8) - 1u)) << 2;
+	dst[19] = val;
+	val = (word >> 8) & ((1u << 10) - 1u);
+	dst[20] = val;
+	val = (word >> 18) & ((1u << 10) - 1u);
+	dst[21] = val;
+	val = (word >> 28) & ((1u << 4) - 1u);
+	word = src[lane + 224];
+	val |= (word & ((1u << 6) - 1u)) << 4;
+	dst[22] = val;
+	val = (word >> 6) & ((1u << 10) - 1u);
+	dst[23] = val;
+	val = (word >> 16) & ((1u << 10) - 1u);
+	dst[24] = val;
+	val = (word >> 26) & ((1u << 6) - 1u);
+	word = src[lane + 256];
+	val |= (word & ((1u << 4) - 1u)) << 6;
+	dst[25] = val;
+	val = (word >> 4) & ((1u << 10) - 1u);
+	dst[26] = val;
+	val = (word >> 14) & ((1u << 10) - 1u);
+	dst[27] = val;
+	val = (word >> 24) & ((1u << 8) - 1u);
+	word = src[lane + 288];
+	val |= (word & ((1u << 2) - 1u)) << 8;
+	dst[28] = val;
+	val = (word >> 2) & ((1u << 10) - 1u);
+	dst[29] = val;
+	val = (word >> 12) & ((1u << 10) - 1u);
+	dst[30] = val;
+	val = (word >> 22) & ((1u << 10) - 1u);
+	dst[31] = val;
+}
+__device__ __forceinline__ void unpack_11bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) {
+	// Unpacks 32 11-bit values for the calling thread: packed word k is read from a_in_p[k * 32 + threadIdx.x]
+	// (11 words in total), values sit low bits first, and a value split across two words is stitched together.
+	// Results are written to a_out_p[0..31]; presumably a per-thread buffer -- TODO confirm against the caller.
+	const uint32_t* src  = a_in_p;
+	uint32_t*       dst  = a_out_p;
+	const int       lane = threadIdx.x; // this thread's x index
+	uint32_t        val;
+
+	uint32_t word = src[lane];
+	val = word & ((1u << 11) - 1u);
+	dst[0] = val;
+	val = (word >> 11) & ((1u << 11) - 1u);
+	dst[1] = val;
+	val = (word >> 22) & ((1u << 10) - 1u);
+	word = src[lane + 32];
+	val |= (word & ((1u << 1) - 1u)) << 10;
+	dst[2] = val;
+	val = (word >> 1) & ((1u << 11) - 1u);
+	dst[3] = val;
+	val = (word >> 12) & ((1u << 11) - 1u);
+	dst[4] = val;
+	val = (word >> 23) & ((1u << 9) - 1u);
+	word = src[lane + 64];
+	val |= (word & ((1u << 2) - 1u)) << 9;
+	dst[5] = val;
+	val = (word >> 2) & ((1u << 11) - 1u);
+	dst[6] = val;
+	val = (word >> 13) & ((1u << 11) - 1u);
+	dst[7] = val;
+	val = (word >> 24) & ((1u << 8) - 1u);
+	word = src[lane + 96];
+	val |= (word & ((1u << 3) - 1u)) << 8;
+	dst[8] = val;
+	val = (word >> 3) & ((1u << 11) - 1u);
+	dst[9] = val;
+	val = (word >> 14) & ((1u << 11) - 1u);
+	dst[10] = val;
+	val = (word >> 25) & ((1u << 7) - 1u);
+	word = src[lane + 128];
+	val |= (word & ((1u << 4) - 1u)) << 7;
+	dst[11] = val;
+	val = (word >> 4) & ((1u << 11) - 1u);
+	dst[12] = val;
+	val = (word >> 15) & ((1u << 11) - 1u);
+	dst[13] = val;
+	val = (word >> 26) & ((1u << 6) - 1u);
+	word = src[lane + 160];
+	val |= (word & ((1u << 5) - 1u)) << 6;
+	dst[14] = val;
+	val = (word >> 5) & ((1u << 11) - 1u);
+	dst[15] = val;
+	val = (word >> 16) & ((1u << 11) - 1u);
+	dst[16] = val;
+	val = (word >> 27) & ((1u << 5) - 1u);
+	word = src[lane + 192];
+	val |= (word & ((1u << 6) - 1u)) << 5;
+	dst[17] = val;
+	val = (word >> 6) & ((1u << 11) - 1u);
+	dst[18] = val;
+	val = (word >> 17) & ((1u << 11) - 1u);
+	dst[19] = val;
+	val = (word >> 28) & ((1u << 4) - 1u);
+	word = src[lane + 224];
+	val |= (word & ((1u << 7) - 1u)) << 4;
+	dst[20] = val;
+	val = (word >> 7) & ((1u << 11) - 1u);
+	dst[21] = val;
+	val = (word >> 18) & ((1u << 11) - 1u);
+	dst[22] = val;
+	val = (word >> 29) & ((1u << 3) - 1u);
+	word = src[lane + 256];
+	val |= (word & ((1u << 8) - 1u)) << 3;
+	dst[23] = val;
+	val = (word >> 8) & ((1u << 11) - 1u);
+	dst[24] = val;
+	val = (word >> 19) & ((1u << 11) - 1u);
+	dst[25] = val;
+	val = (word >> 30) & ((1u << 2) - 1u);
+	word = src[lane + 288];
+	val |= (word & ((1u << 9) - 1u)) << 2;
+	dst[26] = val;
+	val = (word >> 9) & ((1u << 11) - 1u);
+	dst[27] = val;
+	val = (word >> 20) & ((1u << 11) - 1u);
+	dst[28] = val;
+	val = (word >> 31) & ((1u << 1) - 1u);
+	word = src[lane + 320];
+	val |= (word & ((1u << 10) - 1u)) << 1;
+	dst[29] = val;
+	val = (word >> 10) & ((1u << 11) - 1u);
+	dst[30] = val;
+	val = (word >> 21) & ((1u << 11) - 1u);
+	dst[31] = val;
+}
+__device__ __forceinline__ void unpack_12bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) {
+	// Unpacks 32 12-bit values for the calling thread: packed word k is read from a_in_p[k * 32 + threadIdx.x]
+	// (12 words in total), values sit low bits first, and a value split across two words is stitched together.
+	// Results are written to a_out_p[0..31]; presumably a per-thread buffer -- TODO confirm against the caller.
+	const uint32_t* src  = a_in_p;
+	uint32_t*       dst  = a_out_p;
+	const int       lane = threadIdx.x; // this thread's x index
+	uint32_t        val;
+
+	uint32_t word = src[lane];
+	val = word & ((1u << 12) - 1u);
+	dst[0] = val;
+	val = (word >> 12) & ((1u << 12) - 1u);
+	dst[1] = val;
+	val = (word >> 24) & ((1u << 8) - 1u);
+	word = src[lane + 32];
+	val |= (word & ((1u << 4) - 1u)) << 8;
+	dst[2] = val;
+	val = (word >> 4) & ((1u << 12) - 1u);
+	dst[3] = val;
+	val = (word >> 16) & ((1u << 12) - 1u);
+	dst[4] = val;
+	val = (word >> 28) & ((1u << 4) - 1u);
+	word = src[lane + 64];
+	val |= (word & ((1u << 8) - 1u)) << 4;
+	dst[5] = val;
+	val = (word >> 8) & ((1u << 12) - 1u);
+	dst[6] = val;
+	val = (word >> 20) & ((1u << 12) - 1u);
+	dst[7] = val;
+	word = src[lane + 96];
+	val = word & ((1u << 12) - 1u);
+	dst[8] = val;
+	val = (word >> 12) & ((1u << 12) - 1u);
+	dst[9] = val;
+	val = (word >> 24) & ((1u << 8) - 1u);
+	word = src[lane + 128];
+	val |= (word & ((1u << 4) - 1u)) << 8;
+	dst[10] = val;
+	val = (word >> 4) & ((1u << 12) - 1u);
+	dst[11] = val;
+	val = (word >> 16) & ((1u << 12) - 1u);
+	dst[12] = val;
+	val = (word >> 28) & ((1u << 4) - 1u);
+	word = src[lane + 160];
+	val |= (word & ((1u << 8) - 1u)) << 4;
+	dst[13] = val;
+	val = (word >> 8) & ((1u << 12) - 1u);
+	dst[14] = val;
+	val = (word >> 20) & ((1u << 12) - 1u);
+	dst[15] = val;
+	word = src[lane + 192];
+	val = word & ((1u << 12) - 1u);
+	dst[16] = val;
+	val = (word >> 12) & ((1u << 12) - 1u);
+	dst[17] = val;
+	val = (word >> 24) & ((1u << 8) - 1u);
+	word = src[lane + 224];
+	val |= (word & ((1u << 4) - 1u)) << 8;
+	dst[18] = val;
+	val = (word >> 4) & ((1u << 12) - 1u);
+	dst[19] = val;
+	val = (word >> 16) & ((1u << 12) - 1u);
+	dst[20] = val;
+	val = (word >> 28) & ((1u << 4) - 1u);
+	word = src[lane + 256];
+	val |= (word & ((1u << 8) - 1u)) << 4;
+	dst[21] = val;
+	val = (word >> 8) & ((1u << 12) - 1u);
+	dst[22] = val;
+	val = (word >> 20) & ((1u << 12) - 1u);
+	dst[23] = val;
+	word = src[lane + 288];
+	val = word & ((1u << 12) - 1u);
+	dst[24] = val;
+	val = (word >> 12) & ((1u << 12) - 1u);
+	dst[25] = val;
+	val = (word >> 24) & ((1u << 8) - 1u);
+	word = src[lane + 320];
+	val |= (word & ((1u << 4) - 1u)) << 8;
+	dst[26] = val;
+	val = (word >> 4) & ((1u << 12) - 1u);
+	dst[27] = val;
+	val = (word >> 16) & ((1u << 12) - 1u);
+	dst[28] = val;
+	val = (word >> 28) & ((1u << 4) - 1u);
+	word = src[lane + 352];
+	val |= (word & ((1u << 8) - 1u)) << 4;
+	dst[29] = val;
+	val = (word >> 8) & ((1u << 12) - 1u);
+	dst[30] = val;
+	val = (word >> 20) & ((1u << 12) - 1u);
+	dst[31] = val;
+}
+__device__ __forceinline__ void unpack_13bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) {
+	// Unpacks 32 13-bit values for the calling thread: packed word k is read from a_in_p[k * 32 + threadIdx.x]
+	// (13 words in total), values sit low bits first, and a value split across two words is stitched together.
+	// Results are written to a_out_p[0..31]; presumably a per-thread buffer -- TODO confirm against the caller.
+	const uint32_t* src  = a_in_p;
+	uint32_t*       dst  = a_out_p;
+	const int       lane = threadIdx.x; // this thread's x index
+	uint32_t        val;
+
+	uint32_t word = src[lane];
+	val = word & ((1u << 13) - 1u);
+	dst[0] = val;
+	val = (word >> 13) & ((1u << 13) - 1u);
+	dst[1] = val;
+	val = (word >> 26) & ((1u << 6) - 1u);
+	word = src[lane + 32];
+	val |= (word & ((1u << 7) - 1u)) << 6;
+	dst[2] = val;
+	val = (word >> 7) & ((1u << 13) - 1u);
+	dst[3] = val;
+	val = (word >> 20) & ((1u << 12) - 1u);
+	word = src[lane + 64];
+	val |= (word & ((1u << 1) - 1u)) << 12;
+	dst[4] = val;
+	val = (word >> 1) & ((1u << 13) - 1u);
+	dst[5] = val;
+	val = (word >> 14) & ((1u << 13) - 1u);
+	dst[6] = val;
+	val = (word >> 27) & ((1u << 5) - 1u);
+	word = src[lane + 96];
+	val |= (word & ((1u << 8) - 1u)) << 5;
+	dst[7] = val;
+	val = (word >> 8) & ((1u << 13) - 1u);
+	dst[8] = val;
+	val = (word >> 21) & ((1u << 11) - 1u);
+	word = src[lane + 128];
+	val |= (word & ((1u << 2) - 1u)) << 11;
+	dst[9] = val;
+	val = (word >> 2) & ((1u << 13) - 1u);
+	dst[10] = val;
+	val = (word >> 15) & ((1u << 13) - 1u);
+	dst[11] = val;
+	val = (word >> 28) & ((1u << 4) - 1u);
+	word = src[lane + 160];
+	val |= (word & ((1u << 9) - 1u)) << 4;
+	dst[12] = val;
+	val = (word >> 9) & ((1u << 13) - 1u);
+	dst[13] = val;
+	val = (word >> 22) & ((1u << 10) - 1u);
+	word = src[lane + 192];
+	val |= (word & ((1u << 3) - 1u)) << 10;
+	dst[14] = val;
+	val = (word >> 3) & ((1u << 13) - 1u);
+	dst[15] = val;
+	val = (word >> 16) & ((1u << 13) - 1u);
+	dst[16] = val;
+	val = (word >> 29) & ((1u << 3) - 1u);
+	word = src[lane + 224];
+	val |= (word & ((1u << 10) - 1u)) << 3;
+	dst[17] = val;
+	val = (word >> 10) & ((1u << 13) - 1u);
+	dst[18] = val;
+	val = (word >> 23) & ((1u << 9) - 1u);
+	word = src[lane + 256];
+	val |= (word & ((1u << 4) - 1u)) << 9;
+	dst[19] = val;
+	val = (word >> 4) & ((1u << 13) - 1u);
+	dst[20] = val;
+	val = (word >> 17) & ((1u << 13) - 1u);
+	dst[21] = val;
+	val = (word >> 30) & ((1u << 2) - 1u);
+	word = src[lane + 288];
+	val |= (word & ((1u << 11) - 1u)) << 2;
+	dst[22] = val;
+	val = (word >> 11) & ((1u << 13) - 1u);
+	dst[23] = val;
+	val = (word >> 24) & ((1u << 8) - 1u);
+	word = src[lane + 320];
+	val |= (word & ((1u << 5) - 1u)) << 8;
+	dst[24] = val;
+	val = (word >> 5) & ((1u << 13) - 1u);
+	dst[25] = val;
+	val = (word >> 18) & ((1u << 13) - 1u);
+	dst[26] = val;
+	val = (word >> 31) & ((1u << 1) - 1u);
+	word = src[lane + 352];
+	val |= (word & ((1u << 12) - 1u)) << 1;
+	dst[27] = val;
+	val = (word >> 12) & ((1u << 13) - 1u);
+	dst[28] = val;
+	val = (word >> 25) & ((1u << 7) - 1u);
+	word = src[lane + 384];
+	val |= (word & ((1u << 6) - 1u)) << 7;
+	dst[29] = val;
+	val = (word >> 6) & ((1u << 13) - 1u);
+	dst[30] = val;
+	val = (word >> 19) & ((1u << 13) - 1u);
+	dst[31] = val;
+}
+__device__ __forceinline__ void unpack_14bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) {
+	// Unpacks 32 14-bit values for the calling thread: packed word k is read from a_in_p[k * 32 + threadIdx.x]
+	// (14 words in total), values sit low bits first, and a value split across two words is stitched together.
+	// Results are written to a_out_p[0..31]; presumably a per-thread buffer -- TODO confirm against the caller.
+	const uint32_t* src  = a_in_p;
+	uint32_t*       dst  = a_out_p;
+	const int       lane = threadIdx.x; // this thread's x index
+	uint32_t        val;
+
+	uint32_t word = src[lane];
+	val = word & ((1u << 14) - 1u);
+	dst[0] = val;
+	val = (word >> 14) & ((1u << 14) - 1u);
+	dst[1] = val;
+	val = (word >> 28) & ((1u << 4) - 1u);
+	word = src[lane + 32];
+	val |= (word & ((1u << 10) - 1u)) << 4;
+	dst[2] = val;
+	val = (word >> 10) & ((1u << 14) - 1u);
+	dst[3] = val;
+	val = (word >> 24) & ((1u << 8) - 1u);
+	word = src[lane + 64];
+	val |= (word & ((1u << 6) - 1u)) << 8;
+	dst[4] = val;
+	val = (word >> 6) & ((1u << 14) - 1u);
+	dst[5] = val;
+	val = (word >> 20) & ((1u << 12) - 1u);
+	word = src[lane + 96];
+	val |= (word & ((1u << 2) - 1u)) << 12;
+	dst[6] = val;
+	val = (word >> 2) & ((1u << 14) - 1u);
+	dst[7] = val;
+	val = (word >> 16) & ((1u << 14) - 1u);
+	dst[8] = val;
+	val = (word >> 30) & ((1u << 2) - 1u);
+	word = src[lane + 128];
+	val |= (word & ((1u << 12) - 1u)) << 2;
+	dst[9] = val;
+	val = (word >> 12) & ((1u << 14) - 1u);
+	dst[10] = val;
+	val = (word >> 26) & ((1u << 6) - 1u);
+	word = src[lane + 160];
+	val |= (word & ((1u << 8) - 1u)) << 6;
+	dst[11] = val;
+	val = (word >> 8) & ((1u << 14) - 1u);
+	dst[12] = val;
+	val = (word >> 22) & ((1u << 10) - 1u);
+	word = src[lane + 192];
+	val |= (word & ((1u << 4) - 1u)) << 10;
+	dst[13] = val;
+	val = (word >> 4) & ((1u << 14) - 1u);
+	dst[14] = val;
+	val = (word >> 18) & ((1u << 14) - 1u);
+	dst[15] = val;
+	word = src[lane + 224];
+	val = word & ((1u << 14) - 1u);
+	dst[16] = val;
+	val = (word >> 14) & ((1u << 14) - 1u);
+	dst[17] = val;
+	val = (word >> 28) & ((1u << 4) - 1u);
+	word = src[lane + 256];
+	val |= (word & ((1u << 10) - 1u)) << 4;
+	dst[18] = val;
+	val = (word >> 10) & ((1u << 14) - 1u);
+	dst[19] = val;
+	val = (word >> 24) & ((1u << 8) - 1u);
+	word = src[lane + 288];
+	val |= (word & ((1u << 6) - 1u)) << 8;
+	dst[20] = val;
+	val = (word >> 6) & ((1u << 14) - 1u);
+	dst[21] = val;
+	val = (word >> 20) & ((1u << 12) - 1u);
+	word = src[lane + 320];
+	val |= (word & ((1u << 2) - 1u)) << 12;
+	dst[22] = val;
+	val = (word >> 2) & ((1u << 14) - 1u);
+	dst[23] = val;
+	val = (word >> 16) & ((1u << 14) - 1u);
+	dst[24] = val;
+	val = (word >> 30) & ((1u << 2) - 1u);
+	word = src[lane + 352];
+	val |= (word & ((1u << 12) - 1u)) << 2;
+	dst[25] = val;
+	val = (word >> 12) & ((1u << 14) - 1u);
+	dst[26] = val;
+	val = (word >> 26) & ((1u << 6) - 1u);
+	word = src[lane + 384];
+	val |= (word & ((1u << 8) - 1u)) << 6;
+	dst[27] = val;
+	val = (word >> 8) & ((1u << 14) - 1u);
+	dst[28] = val;
+	val = (word >> 22) & ((1u << 10) - 1u);
+	word = src[lane + 416];
+	val |= (word & ((1u << 4) - 1u)) << 10;
+	dst[29] = val;
+	val = (word >> 4) & ((1u << 14) - 1u);
+	dst[30] = val;
+	val = (word >> 18) & ((1u << 14) - 1u);
+	dst[31] = val;
+}
+__device__ __forceinline__ void unpack_15bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) {
+	// Unpacks 32 15-bit values for the calling thread: packed word k is read from a_in_p[k * 32 + threadIdx.x]
+	// (15 words in total), values sit low bits first, and a value split across two words is stitched together.
+	// Results are written to a_out_p[0..31]; presumably a per-thread buffer -- TODO confirm against the caller.
+	const uint32_t* src  = a_in_p;
+	uint32_t*       dst  = a_out_p;
+	const int       lane = threadIdx.x; // this thread's x index
+	uint32_t        val;
+
+	uint32_t word = src[lane];
+	val = word & ((1u << 15) - 1u);
+	dst[0] = val;
+	val = (word >> 15) & ((1u << 15) - 1u);
+	dst[1] = val;
+	val = (word >> 30) & ((1u << 2) - 1u);
+	word = src[lane + 32];
+	val |= (word & ((1u << 13) - 1u)) << 2;
+	dst[2] = val;
+	val = (word >> 13) & ((1u << 15) - 1u);
+	dst[3] = val;
+	val = (word >> 28) & ((1u << 4) - 1u);
+	word = src[lane + 64];
+	val |= (word & ((1u << 11) - 1u)) << 4;
+	dst[4] = val;
+	val = (word >> 11) & ((1u << 15) - 1u);
+	dst[5] = val;
+	val = (word >> 26) & ((1u << 6) - 1u);
+	word = src[lane + 96];
+	val |= (word & ((1u << 9) - 1u)) << 6;
+	dst[6] = val;
+	val = (word >> 9) & ((1u << 15) - 1u);
+	dst[7] = val;
+	val = (word >> 24) & ((1u << 8) - 1u);
+	word = src[lane + 128];
+	val |= (word & ((1u << 7) - 1u)) << 8;
+	dst[8] = val;
+	val = (word >> 7) & ((1u << 15) - 1u);
+	dst[9] = val;
+	val = (word >> 22) & ((1u << 10) - 1u);
+	word = src[lane + 160];
+	val |= (word & ((1u << 5) - 1u)) << 10;
+	dst[10] = val;
+	val = (word >> 5) & ((1u << 15) - 1u);
+	dst[11] = val;
+	val = (word >> 20) & ((1u << 12) - 1u);
+	word = src[lane + 192];
+	val |= (word & ((1u << 3) - 1u)) << 12;
+	dst[12] = val;
+	val = (word >> 3) & ((1u << 15) - 1u);
+	dst[13] = val;
+	val = (word >> 18) & ((1u << 14) - 1u);
+	word = src[lane + 224];
+	val |= (word & ((1u << 1) - 1u)) << 14;
+	dst[14] = val;
+	val = (word >> 1) & ((1u << 15) - 1u);
+	dst[15] = val;
+	val = (word >> 16) & ((1u << 15) - 1u);
+	dst[16] = val;
+	val = (word >> 31) & ((1u << 1) - 1u);
+	word = src[lane + 256];
+	val |= (word & ((1u << 14) - 1u)) << 1;
+	dst[17] = val;
+	val = (word >> 14) & ((1u << 15) - 1u);
+	dst[18] = val;
+	val = (word >> 29) & ((1u << 3) - 1u);
+	word = src[lane + 288];
+	val |= (word & ((1u << 12) - 1u)) << 3;
+	dst[19] = val;
+	val = (word >> 12) & ((1u << 15) - 1u);
+	dst[20] = val;
+	val = (word >> 27) & ((1u << 5) - 1u);
+	word = src[lane + 320];
+	val |= (word & ((1u << 10) - 1u)) << 5;
+	dst[21] = val;
+	val = (word >> 10) & ((1u << 15) - 1u);
+	dst[22] = val;
+	val = (word >> 25) & ((1u << 7) - 1u);
+	word = src[lane + 352];
+	val |= (word & ((1u << 8) - 1u)) << 7;
+	dst[23] = val;
+	val = (word >> 8) & ((1u << 15) - 1u);
+	dst[24] = val;
+	val = (word >> 23) & ((1u << 9) - 1u);
+	word = src[lane + 384];
+	val |= (word & ((1u << 6) - 1u)) << 9;
+	dst[25] = val;
+	val = (word >> 6) & ((1u << 15) - 1u);
+	dst[26] = val;
+	val = (word >> 21) & ((1u << 11) - 1u);
+	word = src[lane + 416];
+	val |= (word & ((1u << 4) - 1u)) << 11;
+	dst[27] = val;
+	val = (word >> 4) & ((1u << 15) - 1u);
+	dst[28] = val;
+	val = (word >> 19) & ((1u << 13) - 1u);
+	word = src[lane + 448];
+	val |= (word & ((1u << 2) - 1u)) << 13;
+	dst[29] = val;
+	val = (word >> 2) & ((1u << 15) - 1u);
+	dst[30] = val;
+	val = (word >> 17) & ((1u << 15) - 1u);
+	dst[31] = val;
+}
+__device__ __forceinline__ void unpack_16bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) {
+	// Unpacks 32 16-bit values for the calling thread: packed word k is read from a_in_p[k * 32 + threadIdx.x]
+	// (16 words in total); each word holds exactly two values, low half first, so nothing crosses a word.
+	// Results are written to a_out_p[0..31]; presumably a per-thread buffer -- TODO confirm against the caller.
+	const uint32_t* src  = a_in_p;
+	uint32_t*       dst  = a_out_p;
+	const int       lane = threadIdx.x; // this thread's x index
+	uint32_t        val;
+
+	uint32_t word = src[lane];
+	val = word & ((1u << 16) - 1u);
+	dst[0] = val;
+	val = (word >> 16) & ((1u << 16) - 1u);
+	dst[1] = val;
+	word = src[lane + 32];
+	val = word & ((1u << 16) - 1u);
+	dst[2] = val;
+	val = (word >> 16) & ((1u << 16) - 1u);
+	dst[3] = val;
+	word = src[lane + 64];
+	val = word & ((1u << 16) - 1u);
+	dst[4] = val;
+	val = (word >> 16) & ((1u << 16) - 1u);
+	dst[5] = val;
+	word = src[lane + 96];
+	val = word & ((1u << 16) - 1u);
+	dst[6] = val;
+	val = (word >> 16) & ((1u << 16) - 1u);
+	dst[7] = val;
+	word = src[lane + 128];
+	val = word & ((1u << 16) - 1u);
+	dst[8] = val;
+	val = (word >> 16) & ((1u << 16) - 1u);
+	dst[9] = val;
+	word = src[lane + 160];
+	val = word & ((1u << 16) - 1u);
+	dst[10] = val;
+	val = (word >> 16) & ((1u << 16) - 1u);
+	dst[11] = val;
+	word = src[lane + 192];
+	val = word & ((1u << 16) - 1u);
+	dst[12] = val;
+	val = (word >> 16) & ((1u << 16) - 1u);
+	dst[13] = val;
+	word = src[lane + 224];
+	val = word & ((1u << 16) - 1u);
+	dst[14] = val;
+	val = (word >> 16) & ((1u << 16) - 1u);
+	dst[15] = val;
+	word = src[lane + 256];
+	val = word & ((1u << 16) - 1u);
+	dst[16] = val;
+	val = (word >> 16) & ((1u << 16) - 1u);
+	dst[17] = val;
+	word = src[lane + 288];
+	val = word & ((1u << 16) - 1u);
+	dst[18] = val;
+	val = (word >> 16) & ((1u << 16) - 1u);
+	dst[19] = val;
+	word = src[lane + 320];
+	val = word & ((1u << 16) - 1u);
+	dst[20] = val;
+	val = (word >> 16) & ((1u << 16) - 1u);
+	dst[21] = val;
+	word = src[lane + 352];
+	val = word & ((1u << 16) - 1u);
+	dst[22] = val;
+	val = (word >> 16) & ((1u << 16) - 1u);
+	dst[23] = val;
+	word = src[lane + 384];
+	val = word & ((1u << 16) - 1u);
+	dst[24] = val;
+	val = (word >> 16) & ((1u << 16) - 1u);
+	dst[25] = val;
+	word = src[lane + 416];
+	val = word & ((1u << 16) - 1u);
+	dst[26] = val;
+	val = (word >> 16) & ((1u << 16) - 1u);
+	dst[27] = val;
+	word = src[lane + 448];
+	val = word & ((1u << 16) - 1u);
+	dst[28] = val;
+	val = (word >> 16) & ((1u << 16) - 1u);
+	dst[29] = val;
+	word = src[lane + 480];
+	val = word & ((1u << 16) - 1u);
+	dst[30] = val;
+	val = (word >> 16) & ((1u << 16) - 1u);
+	dst[31] = val;
+}
+__device__ __forceinline__ void unpack_17bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) {
+	// Unpacks 32 17-bit values for the calling thread: packed word k is read from a_in_p[k * 32 + threadIdx.x]
+	// (17 words in total), values sit low bits first, and a value split across two words is stitched together.
+	// Results are written to a_out_p[0..31]; presumably a per-thread buffer -- TODO confirm against the caller.
+	const uint32_t* src  = a_in_p;
+	uint32_t*       dst  = a_out_p;
+	const int       lane = threadIdx.x; // this thread's x index
+	uint32_t        val;
+
+	uint32_t word = src[lane];
+	val = word & ((1u << 17) - 1u);
+	dst[0] = val;
+	val = (word >> 17) & ((1u << 15) - 1u);
+	word = src[lane + 32];
+	val |= (word & ((1u << 2) - 1u)) << 15;
+	dst[1] = val;
+	val = (word >> 2) & ((1u << 17) - 1u);
+	dst[2] = val;
+	val = (word >> 19) & ((1u << 13) - 1u);
+	word = src[lane + 64];
+	val |= (word & ((1u << 4) - 1u)) << 13;
+	dst[3] = val;
+	val = (word >> 4) & ((1u << 17) - 1u);
+	dst[4] = val;
+	val = (word >> 21) & ((1u << 11) - 1u);
+	word = src[lane + 96];
+	val |= (word & ((1u << 6) - 1u)) << 11;
+	dst[5] = val;
+	val = (word >> 6) & ((1u << 17) - 1u);
+	dst[6] = val;
+	val = (word >> 23) & ((1u << 9) - 1u);
+	word = src[lane + 128];
+	val |= (word & ((1u << 8) - 1u)) << 9;
+	dst[7] = val;
+	val = (word >> 8) & ((1u << 17) - 1u);
+	dst[8] = val;
+	val = (word >> 25) & ((1u << 7) - 1u);
+	word = src[lane + 160];
+	val |= (word & ((1u << 10) - 1u)) << 7;
+	dst[9] = val;
+	val = (word >> 10) & ((1u << 17) - 1u);
+	dst[10] = val;
+	val = (word >> 27) & ((1u << 5) - 1u);
+	word = src[lane + 192];
+	val |= (word & ((1u << 12) - 1u)) << 5;
+	dst[11] = val;
+	val = (word >> 12) & ((1u << 17) - 1u);
+	dst[12] = val;
+	val = (word >> 29) & ((1u << 3) - 1u);
+	word = src[lane + 224];
+	val |= (word & ((1u << 14) - 1u)) << 3;
+	dst[13] = val;
+	val = (word >> 14) & ((1u << 17) - 1u);
+	dst[14] = val;
+	val = (word >> 31) & ((1u << 1) - 1u);
+	word = src[lane + 256];
+	val |= (word & ((1u << 16) - 1u)) << 1;
+	dst[15] = val;
+	val = (word >> 16) & ((1u << 16) - 1u);
+	word = src[lane + 288];
+	val |= (word & ((1u << 1) - 1u)) << 16;
+	dst[16] = val;
+	val = (word >> 1) & ((1u << 17) - 1u);
+	dst[17] = val;
+	val = (word >> 18) & ((1u << 14) - 1u);
+	word = src[lane + 320];
+	val |= (word & ((1u << 3) - 1u)) << 14;
+	dst[18] = val;
+	val = (word >> 3) & ((1u << 17) - 1u);
+	dst[19] = val;
+	val = (word >> 20) & ((1u << 12) - 1u);
+	word = src[lane + 352];
+	val |= (word & ((1u << 5) - 1u)) << 12;
+	dst[20] = val;
+	val = (word >> 5) & ((1u << 17) - 1u);
+	dst[21] = val;
+	val = (word >> 22) & ((1u << 10) - 1u);
+	word = src[lane + 384];
+	val |= (word & ((1u << 7) - 1u)) << 10;
+	dst[22] = val;
+	val = (word >> 7) & ((1u << 17) - 1u);
+	dst[23] = val;
+	val = (word >> 24) & ((1u << 8) - 1u);
+	word = src[lane + 416];
+	val |= (word & ((1u << 9) - 1u)) << 8;
+	dst[24] = val;
+	val = (word >> 9) & ((1u << 17) - 1u);
+	dst[25] = val;
+	val = (word >> 26) & ((1u << 6) - 1u);
+	word = src[lane + 448];
+	val |= (word & ((1u << 11) - 1u)) << 6;
+	dst[26] = val;
+	val = (word >> 11) & ((1u << 17) - 1u);
+	dst[27] = val;
+	val = (word >> 28) & ((1u << 4) - 1u);
+	word = src[lane + 480];
+	val |= (word & ((1u << 13) - 1u)) << 4;
+	dst[28] = val;
+	val = (word >> 13) & ((1u << 17) - 1u);
+	dst[29] = val;
+	val = (word >> 30) & ((1u << 2) - 1u);
+	word = src[lane + 512];
+	val |= (word & ((1u << 15) - 1u)) << 2;
+	dst[30] = val;
+	val = (word >> 15) & ((1u << 17) - 1u);
+	dst[31] = val;
+}
+__device__ __forceinline__ void unpack_18bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) { // Unpacks 32 values of 18 bits each from 18 input words into out[0..31].
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p); // same type as a_out_p; every thread writes indices 0..31, so presumably a per-thread buffer (confirm at call site)
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p); // same type as a_in_p; read at in[i + 32 * k] for k = 0..17
+	[[maybe_unused]] uint32_t register_0; // the input word currently being consumed
+	[[maybe_unused]] uint32_t tmp_0; // the value being assembled before it is stored
+	[[maybe_unused]] uint32_t base_0 = 0ULL; // never read in this function
+
+	int i = threadIdx.x; // THREAD INDEX: added to every multiple-of-32 load offset. NOTE(review): looks like this expects threadIdx.x in 0..31 - confirm
+
+	register_0 = *(in + (0 * 32) + (i * 1) + 0); // input word 0
+	tmp_0      = (register_0) & ((1ULL << 18) - 1);
+	out[0] = tmp_0;
+	tmp_0      = (register_0 >> 18) & ((1ULL << 14) - 1); // the top 14 bits of word 0 are the low bits of value 1
+	register_0 = *(in + (0 * 32) + (i * 1) + 32);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 14; // the remaining 4 bits come from the bottom of word 1
+	out[1] = tmp_0;
+	tmp_0      = (register_0 >> 4) & ((1ULL << 18) - 1);
+	out[2] = tmp_0;
+	tmp_0      = (register_0 >> 22) & ((1ULL << 10) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 64);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 10;
+	out[3] = tmp_0;
+	tmp_0      = (register_0 >> 8) & ((1ULL << 18) - 1);
+	out[4] = tmp_0;
+	tmp_0      = (register_0 >> 26) & ((1ULL << 6) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 96);
+	tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 6;
+	out[5] = tmp_0;
+	tmp_0      = (register_0 >> 12) & ((1ULL << 18) - 1);
+	out[6] = tmp_0;
+	tmp_0      = (register_0 >> 30) & ((1ULL << 2) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 128);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 2;
+	out[7] = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 160);
+	tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 16;
+	out[8] = tmp_0;
+	tmp_0      = (register_0 >> 2) & ((1ULL << 18) - 1);
+	out[9] = tmp_0;
+	tmp_0      = (register_0 >> 20) & ((1ULL << 12) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 192);
+	tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 12;
+	out[10] = tmp_0;
+	tmp_0       = (register_0 >> 6) & ((1ULL << 18) - 1);
+	out[11] = tmp_0;
+	tmp_0       = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 224);
+	tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 8;
+	out[12] = tmp_0;
+	tmp_0       = (register_0 >> 10) & ((1ULL << 18) - 1);
+	out[13] = tmp_0;
+	tmp_0       = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 256);
+	tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 4;
+	out[14] = tmp_0;
+	tmp_0       = (register_0 >> 14) & ((1ULL << 18) - 1);
+	out[15] = tmp_0;
+	register_0  = *(in + (0 * 32) + (i * 1) + 288); // value 16 starts on a word boundary; the shift pattern of values 0..15 repeats from here
+	tmp_0       = (register_0) & ((1ULL << 18) - 1);
+	out[16] = tmp_0;
+	tmp_0       = (register_0 >> 18) & ((1ULL << 14) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 320);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 14;
+	out[17] = tmp_0;
+	tmp_0       = (register_0 >> 4) & ((1ULL << 18) - 1);
+	out[18] = tmp_0;
+	tmp_0       = (register_0 >> 22) & ((1ULL << 10) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 352);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 10;
+	out[19] = tmp_0;
+	tmp_0       = (register_0 >> 8) & ((1ULL << 18) - 1);
+	out[20] = tmp_0;
+	tmp_0       = (register_0 >> 26) & ((1ULL << 6) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 384);
+	tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 6;
+	out[21] = tmp_0;
+	tmp_0       = (register_0 >> 12) & ((1ULL << 18) - 1);
+	out[22] = tmp_0;
+	tmp_0       = (register_0 >> 30) & ((1ULL << 2) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 416);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 2;
+	out[23] = tmp_0;
+	tmp_0       = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 448);
+	tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 16;
+	out[24] = tmp_0;
+	tmp_0       = (register_0 >> 2) & ((1ULL << 18) - 1);
+	out[25] = tmp_0;
+	tmp_0       = (register_0 >> 20) & ((1ULL << 12) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 480);
+	tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 12;
+	out[26] = tmp_0;
+	tmp_0       = (register_0 >> 6) & ((1ULL << 18) - 1);
+	out[27] = tmp_0;
+	tmp_0       = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 512);
+	tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 8;
+	out[28] = tmp_0;
+	tmp_0       = (register_0 >> 10) & ((1ULL << 18) - 1);
+	out[29] = tmp_0;
+	tmp_0       = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 544); // input word 17, the last one read
+	tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 4;
+	out[30] = tmp_0;
+	tmp_0       = (register_0 >> 14) & ((1ULL << 18) - 1); // value 31 ends exactly at bit 31 of the last word
+	out[31] = tmp_0;
+}
+__device__ __forceinline__ void unpack_19bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) { // Unpacks 32 values of 19 bits each from 19 input words into out[0..31].
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p); // same type as a_out_p; every thread writes indices 0..31, so presumably a per-thread buffer (confirm at call site)
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p); // same type as a_in_p; read at in[i + 32 * k] for k = 0..18
+	[[maybe_unused]] uint32_t register_0; // the input word currently being consumed
+	[[maybe_unused]] uint32_t tmp_0; // the value being assembled before it is stored
+	[[maybe_unused]] uint32_t base_0 = 0ULL; // never read in this function
+
+	int i = threadIdx.x; // THREAD INDEX: added to every multiple-of-32 load offset. NOTE(review): looks like this expects threadIdx.x in 0..31 - confirm
+
+	register_0 = *(in + (0 * 32) + (i * 1) + 0); // input word 0
+	tmp_0      = (register_0) & ((1ULL << 19) - 1);
+	out[0] = tmp_0;
+	tmp_0      = (register_0 >> 19) & ((1ULL << 13) - 1); // the top 13 bits of word 0 are the low bits of value 1
+	register_0 = *(in + (0 * 32) + (i * 1) + 32);
+	tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 13; // the remaining 6 bits come from the bottom of word 1
+	out[1] = tmp_0;
+	tmp_0      = (register_0 >> 6) & ((1ULL << 19) - 1);
+	out[2] = tmp_0;
+	tmp_0      = (register_0 >> 25) & ((1ULL << 7) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 64);
+	tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 7;
+	out[3] = tmp_0;
+	tmp_0      = (register_0 >> 12) & ((1ULL << 19) - 1);
+	out[4] = tmp_0;
+	tmp_0      = (register_0 >> 31) & ((1ULL << 1) - 1); // only bit 31 of this word belongs to value 5
+	register_0 = *(in + (0 * 32) + (i * 1) + 96);
+	tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 1;
+	out[5] = tmp_0;
+	tmp_0      = (register_0 >> 18) & ((1ULL << 14) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 128);
+	tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 14;
+	out[6] = tmp_0;
+	tmp_0      = (register_0 >> 5) & ((1ULL << 19) - 1);
+	out[7] = tmp_0;
+	tmp_0      = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 160);
+	tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 8;
+	out[8] = tmp_0;
+	tmp_0      = (register_0 >> 11) & ((1ULL << 19) - 1);
+	out[9] = tmp_0;
+	tmp_0      = (register_0 >> 30) & ((1ULL << 2) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 192);
+	tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 2;
+	out[10] = tmp_0;
+	tmp_0       = (register_0 >> 17) & ((1ULL << 15) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 224);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 15;
+	out[11] = tmp_0;
+	tmp_0       = (register_0 >> 4) & ((1ULL << 19) - 1);
+	out[12] = tmp_0;
+	tmp_0       = (register_0 >> 23) & ((1ULL << 9) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 256);
+	tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 9;
+	out[13] = tmp_0;
+	tmp_0       = (register_0 >> 10) & ((1ULL << 19) - 1);
+	out[14] = tmp_0;
+	tmp_0       = (register_0 >> 29) & ((1ULL << 3) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 288);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 3;
+	out[15] = tmp_0;
+	tmp_0       = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 320);
+	tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 16;
+	out[16] = tmp_0;
+	tmp_0       = (register_0 >> 3) & ((1ULL << 19) - 1);
+	out[17] = tmp_0;
+	tmp_0       = (register_0 >> 22) & ((1ULL << 10) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 352);
+	tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 10;
+	out[18] = tmp_0;
+	tmp_0       = (register_0 >> 9) & ((1ULL << 19) - 1);
+	out[19] = tmp_0;
+	tmp_0       = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 384);
+	tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 4;
+	out[20] = tmp_0;
+	tmp_0       = (register_0 >> 15) & ((1ULL << 17) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 416);
+	tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 17;
+	out[21] = tmp_0;
+	tmp_0       = (register_0 >> 2) & ((1ULL << 19) - 1);
+	out[22] = tmp_0;
+	tmp_0       = (register_0 >> 21) & ((1ULL << 11) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 448);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 11;
+	out[23] = tmp_0;
+	tmp_0       = (register_0 >> 8) & ((1ULL << 19) - 1);
+	out[24] = tmp_0;
+	tmp_0       = (register_0 >> 27) & ((1ULL << 5) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 480);
+	tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 5;
+	out[25] = tmp_0;
+	tmp_0       = (register_0 >> 14) & ((1ULL << 18) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 512);
+	tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 18;
+	out[26] = tmp_0;
+	tmp_0       = (register_0 >> 1) & ((1ULL << 19) - 1);
+	out[27] = tmp_0;
+	tmp_0       = (register_0 >> 20) & ((1ULL << 12) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 544);
+	tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 12;
+	out[28] = tmp_0;
+	tmp_0       = (register_0 >> 7) & ((1ULL << 19) - 1);
+	out[29] = tmp_0;
+	tmp_0       = (register_0 >> 26) & ((1ULL << 6) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 576); // input word 18, the last one read
+	tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 6;
+	out[30] = tmp_0;
+	tmp_0       = (register_0 >> 13) & ((1ULL << 19) - 1); // value 31 ends exactly at bit 31 of the last word
+	out[31] = tmp_0;
+}
+__device__ __forceinline__ void unpack_20bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) { // Unpacks 32 values of 20 bits each from 20 input words into out[0..31].
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p); // same type as a_out_p; every thread writes indices 0..31, so presumably a per-thread buffer (confirm at call site)
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p); // same type as a_in_p; read at in[i + 32 * k] for k = 0..19
+	[[maybe_unused]] uint32_t register_0; // the input word currently being consumed
+	[[maybe_unused]] uint32_t tmp_0; // the value being assembled before it is stored
+	[[maybe_unused]] uint32_t base_0 = 0ULL; // never read in this function
+
+	int i = threadIdx.x; // THREAD INDEX: added to every multiple-of-32 load offset. NOTE(review): looks like this expects threadIdx.x in 0..31 - confirm
+
+	register_0 = *(in + (0 * 32) + (i * 1) + 0); // input word 0
+	tmp_0      = (register_0) & ((1ULL << 20) - 1);
+	out[0] = tmp_0;
+	tmp_0      = (register_0 >> 20) & ((1ULL << 12) - 1); // the top 12 bits of word 0 are the low bits of value 1
+	register_0 = *(in + (0 * 32) + (i * 1) + 32);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 12; // the remaining 8 bits come from the bottom of word 1
+	out[1] = tmp_0;
+	tmp_0      = (register_0 >> 8) & ((1ULL << 20) - 1);
+	out[2] = tmp_0;
+	tmp_0      = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 64);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 4;
+	out[3] = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 96);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 16;
+	out[4] = tmp_0;
+	tmp_0      = (register_0 >> 4) & ((1ULL << 20) - 1);
+	out[5] = tmp_0;
+	tmp_0      = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 128);
+	tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 8;
+	out[6] = tmp_0;
+	tmp_0      = (register_0 >> 12) & ((1ULL << 20) - 1);
+	out[7] = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 160); // value 8 starts on a word boundary; the 8-values-per-5-words pattern repeats from here
+	tmp_0      = (register_0) & ((1ULL << 20) - 1);
+	out[8] = tmp_0;
+	tmp_0      = (register_0 >> 20) & ((1ULL << 12) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 192);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 12;
+	out[9]  = tmp_0;
+	tmp_0       = (register_0 >> 8) & ((1ULL << 20) - 1);
+	out[10] = tmp_0;
+	tmp_0       = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 224);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 4;
+	out[11] = tmp_0;
+	tmp_0       = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 256);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 16;
+	out[12] = tmp_0;
+	tmp_0       = (register_0 >> 4) & ((1ULL << 20) - 1);
+	out[13] = tmp_0;
+	tmp_0       = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 288);
+	tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 8;
+	out[14] = tmp_0;
+	tmp_0       = (register_0 >> 12) & ((1ULL << 20) - 1);
+	out[15] = tmp_0;
+	register_0  = *(in + (0 * 32) + (i * 1) + 320); // value 16 starts on a word boundary
+	tmp_0       = (register_0) & ((1ULL << 20) - 1);
+	out[16] = tmp_0;
+	tmp_0       = (register_0 >> 20) & ((1ULL << 12) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 352);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 12;
+	out[17] = tmp_0;
+	tmp_0       = (register_0 >> 8) & ((1ULL << 20) - 1);
+	out[18] = tmp_0;
+	tmp_0       = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 384);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 4;
+	out[19] = tmp_0;
+	tmp_0       = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 416);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 16;
+	out[20] = tmp_0;
+	tmp_0       = (register_0 >> 4) & ((1ULL << 20) - 1);
+	out[21] = tmp_0;
+	tmp_0       = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 448);
+	tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 8;
+	out[22] = tmp_0;
+	tmp_0       = (register_0 >> 12) & ((1ULL << 20) - 1);
+	out[23] = tmp_0;
+	register_0  = *(in + (0 * 32) + (i * 1) + 480); // value 24 starts on a word boundary
+	tmp_0       = (register_0) & ((1ULL << 20) - 1);
+	out[24] = tmp_0;
+	tmp_0       = (register_0 >> 20) & ((1ULL << 12) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 512);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 12;
+	out[25] = tmp_0;
+	tmp_0       = (register_0 >> 8) & ((1ULL << 20) - 1);
+	out[26] = tmp_0;
+	tmp_0       = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 544);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 4;
+	out[27] = tmp_0;
+	tmp_0       = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 576);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 16;
+	out[28] = tmp_0;
+	tmp_0       = (register_0 >> 4) & ((1ULL << 20) - 1);
+	out[29] = tmp_0;
+	tmp_0       = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 608); // input word 19, the last one read
+	tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 8;
+	out[30] = tmp_0;
+	tmp_0       = (register_0 >> 12) & ((1ULL << 20) - 1); // value 31 ends exactly at bit 31 of the last word
+	out[31] = tmp_0;
+}
+__device__ __forceinline__ void unpack_21bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) { // Unpacks 32 values of 21 bits each from 21 input words into out[0..31].
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p); // same type as a_out_p; every thread writes indices 0..31, so presumably a per-thread buffer (confirm at call site)
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p); // same type as a_in_p; read at in[i + 32 * k] for k = 0..20
+	[[maybe_unused]] uint32_t register_0; // the input word currently being consumed
+	[[maybe_unused]] uint32_t tmp_0; // the value being assembled before it is stored
+	[[maybe_unused]] uint32_t base_0 = 0ULL; // never read in this function
+
+	int i = threadIdx.x; // THREAD INDEX: added to every multiple-of-32 load offset. NOTE(review): looks like this expects threadIdx.x in 0..31 - confirm
+
+	register_0 = *(in + (0 * 32) + (i * 1) + 0); // input word 0
+	tmp_0      = (register_0) & ((1ULL << 21) - 1);
+	out[0] = tmp_0;
+	tmp_0      = (register_0 >> 21) & ((1ULL << 11) - 1); // the top 11 bits of word 0 are the low bits of value 1
+	register_0 = *(in + (0 * 32) + (i * 1) + 32);
+	tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 11; // the remaining 10 bits come from the bottom of word 1
+	out[1] = tmp_0;
+	tmp_0      = (register_0 >> 10) & ((1ULL << 21) - 1);
+	out[2] = tmp_0;
+	tmp_0      = (register_0 >> 31) & ((1ULL << 1) - 1); // only bit 31 of this word belongs to value 3
+	register_0 = *(in + (0 * 32) + (i * 1) + 64);
+	tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 1;
+	out[3] = tmp_0;
+	tmp_0      = (register_0 >> 20) & ((1ULL << 12) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 96);
+	tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 12;
+	out[4] = tmp_0;
+	tmp_0      = (register_0 >> 9) & ((1ULL << 21) - 1);
+	out[5] = tmp_0;
+	tmp_0      = (register_0 >> 30) & ((1ULL << 2) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 128);
+	tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 2;
+	out[6] = tmp_0;
+	tmp_0      = (register_0 >> 19) & ((1ULL << 13) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 160);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 13;
+	out[7] = tmp_0;
+	tmp_0      = (register_0 >> 8) & ((1ULL << 21) - 1);
+	out[8] = tmp_0;
+	tmp_0      = (register_0 >> 29) & ((1ULL << 3) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 192);
+	tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 3;
+	out[9] = tmp_0;
+	tmp_0      = (register_0 >> 18) & ((1ULL << 14) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 224);
+	tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 14;
+	out[10] = tmp_0;
+	tmp_0       = (register_0 >> 7) & ((1ULL << 21) - 1);
+	out[11] = tmp_0;
+	tmp_0       = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 256);
+	tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 4;
+	out[12] = tmp_0;
+	tmp_0       = (register_0 >> 17) & ((1ULL << 15) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 288);
+	tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 15;
+	out[13] = tmp_0;
+	tmp_0       = (register_0 >> 6) & ((1ULL << 21) - 1);
+	out[14] = tmp_0;
+	tmp_0       = (register_0 >> 27) & ((1ULL << 5) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 320);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 5;
+	out[15] = tmp_0;
+	tmp_0       = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 352);
+	tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 16;
+	out[16] = tmp_0;
+	tmp_0       = (register_0 >> 5) & ((1ULL << 21) - 1);
+	out[17] = tmp_0;
+	tmp_0       = (register_0 >> 26) & ((1ULL << 6) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 384);
+	tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 6;
+	out[18] = tmp_0;
+	tmp_0       = (register_0 >> 15) & ((1ULL << 17) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 416);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 17;
+	out[19] = tmp_0;
+	tmp_0       = (register_0 >> 4) & ((1ULL << 21) - 1);
+	out[20] = tmp_0;
+	tmp_0       = (register_0 >> 25) & ((1ULL << 7) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 448);
+	tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 7;
+	out[21] = tmp_0;
+	tmp_0       = (register_0 >> 14) & ((1ULL << 18) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 480);
+	tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 18;
+	out[22] = tmp_0;
+	tmp_0       = (register_0 >> 3) & ((1ULL << 21) - 1);
+	out[23] = tmp_0;
+	tmp_0       = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 512);
+	tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 8;
+	out[24] = tmp_0;
+	tmp_0       = (register_0 >> 13) & ((1ULL << 19) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 544);
+	tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 19;
+	out[25] = tmp_0;
+	tmp_0       = (register_0 >> 2) & ((1ULL << 21) - 1);
+	out[26] = tmp_0;
+	tmp_0       = (register_0 >> 23) & ((1ULL << 9) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 576);
+	tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 9;
+	out[27] = tmp_0;
+	tmp_0       = (register_0 >> 12) & ((1ULL << 20) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 608);
+	tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 20;
+	out[28] = tmp_0;
+	tmp_0       = (register_0 >> 1) & ((1ULL << 21) - 1);
+	out[29] = tmp_0;
+	tmp_0       = (register_0 >> 22) & ((1ULL << 10) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 640); // input word 20, the last one read
+	tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 10;
+	out[30] = tmp_0;
+	tmp_0       = (register_0 >> 11) & ((1ULL << 21) - 1); // value 31 ends exactly at bit 31 of the last word
+	out[31] = tmp_0;
+}
+__device__ __forceinline__ void unpack_22bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) { // Unpacks 32 values of 22 bits each from 22 input words into out[0..31].
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p); // same type as a_out_p; every thread writes indices 0..31, so presumably a per-thread buffer (confirm at call site)
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p); // same type as a_in_p; read at in[i + 32 * k] for k = 0..21
+	[[maybe_unused]] uint32_t register_0; // the input word currently being consumed
+	[[maybe_unused]] uint32_t tmp_0; // the value being assembled before it is stored
+	[[maybe_unused]] uint32_t base_0 = 0ULL; // never read in this function
+
+	int i = threadIdx.x; // THREAD INDEX: added to every multiple-of-32 load offset. NOTE(review): looks like this expects threadIdx.x in 0..31 - confirm
+
+	register_0 = *(in + (0 * 32) + (i * 1) + 0); // input word 0
+	tmp_0      = (register_0) & ((1ULL << 22) - 1);
+	out[0] = tmp_0;
+	tmp_0      = (register_0 >> 22) & ((1ULL << 10) - 1); // the top 10 bits of word 0 are the low bits of value 1
+	register_0 = *(in + (0 * 32) + (i * 1) + 32);
+	tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 10; // the remaining 12 bits come from the bottom of word 1
+	out[1] = tmp_0;
+	tmp_0      = (register_0 >> 12) & ((1ULL << 20) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 64);
+	tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 20;
+	out[2] = tmp_0;
+	tmp_0      = (register_0 >> 2) & ((1ULL << 22) - 1);
+	out[3] = tmp_0;
+	tmp_0      = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 96);
+	tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 8;
+	out[4] = tmp_0;
+	tmp_0      = (register_0 >> 14) & ((1ULL << 18) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 128);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 18;
+	out[5] = tmp_0;
+	tmp_0      = (register_0 >> 4) & ((1ULL << 22) - 1);
+	out[6] = tmp_0;
+	tmp_0      = (register_0 >> 26) & ((1ULL << 6) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 160);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 6;
+	out[7] = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 192);
+	tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 16;
+	out[8] = tmp_0;
+	tmp_0      = (register_0 >> 6) & ((1ULL << 22) - 1);
+	out[9] = tmp_0;
+	tmp_0      = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 224);
+	tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 4;
+	out[10] = tmp_0;
+	tmp_0       = (register_0 >> 18) & ((1ULL << 14) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 256);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 14;
+	out[11] = tmp_0;
+	tmp_0       = (register_0 >> 8) & ((1ULL << 22) - 1);
+	out[12] = tmp_0;
+	tmp_0       = (register_0 >> 30) & ((1ULL << 2) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 288);
+	tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 2;
+	out[13] = tmp_0;
+	tmp_0       = (register_0 >> 20) & ((1ULL << 12) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 320);
+	tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 12;
+	out[14] = tmp_0;
+	tmp_0       = (register_0 >> 10) & ((1ULL << 22) - 1);
+	out[15] = tmp_0;
+	register_0  = *(in + (0 * 32) + (i * 1) + 352); // value 16 starts on a word boundary; the shift pattern of values 0..15 repeats from here
+	tmp_0       = (register_0) & ((1ULL << 22) - 1);
+	out[16] = tmp_0;
+	tmp_0       = (register_0 >> 22) & ((1ULL << 10) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 384);
+	tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 10;
+	out[17] = tmp_0;
+	tmp_0       = (register_0 >> 12) & ((1ULL << 20) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 416);
+	tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 20;
+	out[18] = tmp_0;
+	tmp_0       = (register_0 >> 2) & ((1ULL << 22) - 1);
+	out[19] = tmp_0;
+	tmp_0       = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 448);
+	tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 8;
+	out[20] = tmp_0;
+	tmp_0       = (register_0 >> 14) & ((1ULL << 18) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 480);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 18;
+	out[21] = tmp_0;
+	tmp_0       = (register_0 >> 4) & ((1ULL << 22) - 1);
+	out[22] = tmp_0;
+	tmp_0       = (register_0 >> 26) & ((1ULL << 6) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 512);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 6;
+	out[23] = tmp_0;
+	tmp_0       = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 544);
+	tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 16;
+	out[24] = tmp_0;
+	tmp_0       = (register_0 >> 6) & ((1ULL << 22) - 1);
+	out[25] = tmp_0;
+	tmp_0       = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 576);
+	tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 4;
+	out[26] = tmp_0;
+	tmp_0       = (register_0 >> 18) & ((1ULL << 14) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 608);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 14;
+	out[27] = tmp_0;
+	tmp_0       = (register_0 >> 8) & ((1ULL << 22) - 1);
+	out[28] = tmp_0;
+	tmp_0       = (register_0 >> 30) & ((1ULL << 2) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 640);
+	tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 2;
+	out[29] = tmp_0;
+	tmp_0       = (register_0 >> 20) & ((1ULL << 12) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 672); // input word 21, the last one read
+	tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 12;
+	out[30] = tmp_0;
+	tmp_0       = (register_0 >> 10) & ((1ULL << 22) - 1); // value 31 ends exactly at bit 31 of the last word
+	out[31] = tmp_0;
+}
+__device__ __forceinline__ void unpack_23bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) { // Unpacks 32 values of 23 bits each from 23 input words into out[0..31].
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p); // same type as a_out_p; every thread writes indices 0..31, so presumably a per-thread buffer (confirm at call site)
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p); // same type as a_in_p; read at in[i + 32 * k] for k = 0..22
+	[[maybe_unused]] uint32_t register_0; // the input word currently being consumed
+	[[maybe_unused]] uint32_t tmp_0; // the value being assembled before it is stored
+	[[maybe_unused]] uint32_t base_0 = 0ULL; // never read in this function
+
+	int i = threadIdx.x; // THREAD INDEX: added to every multiple-of-32 load offset. NOTE(review): looks like this expects threadIdx.x in 0..31 - confirm
+
+	register_0 = *(in + (0 * 32) + (i * 1) + 0); // input word 0
+	tmp_0      = (register_0) & ((1ULL << 23) - 1);
+	out[0] = tmp_0;
+	tmp_0      = (register_0 >> 23) & ((1ULL << 9) - 1); // the top 9 bits of word 0 are the low bits of value 1
+	register_0 = *(in + (0 * 32) + (i * 1) + 32);
+	tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 9; // the remaining 14 bits come from the bottom of word 1
+	out[1] = tmp_0;
+	tmp_0      = (register_0 >> 14) & ((1ULL << 18) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 64);
+	tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 18;
+	out[2] = tmp_0;
+	tmp_0      = (register_0 >> 5) & ((1ULL << 23) - 1);
+	out[3] = tmp_0;
+	tmp_0      = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 96);
+	tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 4;
+	out[4] = tmp_0;
+	tmp_0      = (register_0 >> 19) & ((1ULL << 13) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 128);
+	tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 13;
+	out[5] = tmp_0;
+	tmp_0      = (register_0 >> 10) & ((1ULL << 22) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 160);
+	tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 22;
+	out[6] = tmp_0;
+	tmp_0      = (register_0 >> 1) & ((1ULL << 23) - 1);
+	out[7] = tmp_0;
+	tmp_0      = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 192);
+	tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 8;
+	out[8] = tmp_0;
+	tmp_0      = (register_0 >> 15) & ((1ULL << 17) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 224);
+	tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 17;
+	out[9]  = tmp_0;
+	tmp_0       = (register_0 >> 6) & ((1ULL << 23) - 1);
+	out[10] = tmp_0;
+	tmp_0       = (register_0 >> 29) & ((1ULL << 3) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 256);
+	tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 3;
+	out[11] = tmp_0;
+	tmp_0       = (register_0 >> 20) & ((1ULL << 12) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 288);
+	tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 12;
+	out[12] = tmp_0;
+	tmp_0       = (register_0 >> 11) & ((1ULL << 21) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 320);
+	tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 21;
+	out[13] = tmp_0;
+	tmp_0       = (register_0 >> 2) & ((1ULL << 23) - 1);
+	out[14] = tmp_0;
+	tmp_0       = (register_0 >> 25) & ((1ULL << 7) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 352);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 7;
+	out[15] = tmp_0;
+	tmp_0       = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 384);
+	tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 16;
+	out[16] = tmp_0;
+	tmp_0       = (register_0 >> 7) & ((1ULL << 23) - 1);
+	out[17] = tmp_0;
+	tmp_0       = (register_0 >> 30) & ((1ULL << 2) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 416);
+	tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 2;
+	out[18] = tmp_0;
+	tmp_0       = (register_0 >> 21) & ((1ULL << 11) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 448);
+	tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 11;
+	out[19] = tmp_0;
+	tmp_0       = (register_0 >> 12) & ((1ULL << 20) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 480);
+	tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 20;
+	out[20] = tmp_0;
+	tmp_0       = (register_0 >> 3) & ((1ULL << 23) - 1);
+	out[21] = tmp_0;
+	tmp_0       = (register_0 >> 26) & ((1ULL << 6) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 512);
+	tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 6;
+	out[22] = tmp_0;
+	tmp_0       = (register_0 >> 17) & ((1ULL << 15) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 544);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 15;
+	out[23] = tmp_0;
+	tmp_0       = (register_0 >> 8) & ((1ULL << 23) - 1);
+	out[24] = tmp_0;
+	tmp_0       = (register_0 >> 31) & ((1ULL << 1) - 1); // only bit 31 of this word belongs to value 25
+	register_0  = *(in + (0 * 32) + (i * 1) + 576);
+	tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 1;
+	out[25] = tmp_0;
+	tmp_0       = (register_0 >> 22) & ((1ULL << 10) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 608);
+	tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 10;
+	out[26] = tmp_0;
+	tmp_0       = (register_0 >> 13) & ((1ULL << 19) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 640);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 19;
+	out[27] = tmp_0;
+	tmp_0       = (register_0 >> 4) & ((1ULL << 23) - 1);
+	out[28] = tmp_0;
+	tmp_0       = (register_0 >> 27) & ((1ULL << 5) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 672);
+	tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 5;
+	out[29] = tmp_0;
+	tmp_0       = (register_0 >> 18) & ((1ULL << 14) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 704); // input word 22, the last one read
+	tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 14;
+	out[30] = tmp_0;
+	tmp_0       = (register_0 >> 9) & ((1ULL << 23) - 1); // value 31 ends exactly at bit 31 of the last word
+	out[31] = tmp_0;
+}
+__device__ __forceinline__ void unpack_24bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) { // Unpacks 32 values of 24 bits each from 24 input words into out[0..31].
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p); // same type as a_out_p; every thread writes indices 0..31, so presumably a per-thread buffer (confirm at call site)
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p); // same type as a_in_p; read at in[i + 32 * k] for k = 0..23
+	[[maybe_unused]] uint32_t register_0; // the input word currently being consumed
+	[[maybe_unused]] uint32_t tmp_0; // the value being assembled before it is stored
+	[[maybe_unused]] uint32_t base_0 = 0ULL; // never read in this function
+
+	int i = threadIdx.x; // THREAD INDEX: added to every multiple-of-32 load offset. NOTE(review): looks like this expects threadIdx.x in 0..31 - confirm
+
+	register_0 = *(in + (0 * 32) + (i * 1) + 0); // input word 0
+	tmp_0      = (register_0) & ((1ULL << 24) - 1);
+	out[0] = tmp_0;
+	tmp_0      = (register_0 >> 24) & ((1ULL << 8) - 1); // the top 8 bits of word 0 are the low bits of value 1
+	register_0 = *(in + (0 * 32) + (i * 1) + 32);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8; // the remaining 16 bits come from the bottom of word 1
+	out[1] = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 64);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16;
+	out[2] = tmp_0;
+	tmp_0      = (register_0 >> 8) & ((1ULL << 24) - 1);
+	out[3] = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 96); // value 4 starts on a word boundary; the 4-values-per-3-words pattern repeats from here to the end
+	tmp_0      = (register_0) & ((1ULL << 24) - 1);
+	out[4] = tmp_0;
+	tmp_0      = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 128);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8;
+	out[5] = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 160);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16;
+	out[6] = tmp_0;
+	tmp_0      = (register_0 >> 8) & ((1ULL << 24) - 1);
+	out[7] = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 192);
+	tmp_0      = (register_0) & ((1ULL << 24) - 1);
+	out[8] = tmp_0;
+	tmp_0      = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 224);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8;
+	out[9] = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 256);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16;
+	out[10] = tmp_0;
+	tmp_0       = (register_0 >> 8) & ((1ULL << 24) - 1);
+	out[11] = tmp_0;
+	register_0  = *(in + (0 * 32) + (i * 1) + 288);
+	tmp_0       = (register_0) & ((1ULL << 24) - 1);
+	out[12] = tmp_0;
+	tmp_0       = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 320);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8;
+	out[13] = tmp_0;
+	tmp_0       = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 352);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16;
+	out[14] = tmp_0;
+	tmp_0       = (register_0 >> 8) & ((1ULL << 24) - 1);
+	out[15] = tmp_0;
+	register_0  = *(in + (0 * 32) + (i * 1) + 384);
+	tmp_0       = (register_0) & ((1ULL << 24) - 1);
+	out[16] = tmp_0;
+	tmp_0       = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 416);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8;
+	out[17] = tmp_0;
+	tmp_0       = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 448);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16;
+	out[18] = tmp_0;
+	tmp_0       = (register_0 >> 8) & ((1ULL << 24) - 1);
+	out[19] = tmp_0;
+	register_0  = *(in + (0 * 32) + (i * 1) + 480);
+	tmp_0       = (register_0) & ((1ULL << 24) - 1);
+	out[20] = tmp_0;
+	tmp_0       = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 512);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8;
+	out[21] = tmp_0;
+	tmp_0       = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 544);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16;
+	out[22] = tmp_0;
+	tmp_0       = (register_0 >> 8) & ((1ULL << 24) - 1);
+	out[23] = tmp_0;
+	register_0  = *(in + (0 * 32) + (i * 1) + 576);
+	tmp_0       = (register_0) & ((1ULL << 24) - 1);
+	out[24] = tmp_0;
+	tmp_0       = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 608);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8;
+	out[25] = tmp_0;
+	tmp_0       = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 640);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16;
+	out[26] = tmp_0;
+	tmp_0       = (register_0 >> 8) & ((1ULL << 24) - 1);
+	out[27] = tmp_0;
+	register_0  = *(in + (0 * 32) + (i * 1) + 672);
+	tmp_0       = (register_0) & ((1ULL << 24) - 1);
+	out[28] = tmp_0;
+	tmp_0       = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 704);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8;
+	out[29] = tmp_0;
+	tmp_0       = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 736); // input word 23, the last one read
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16;
+	out[30] = tmp_0;
+	tmp_0       = (register_0 >> 8) & ((1ULL << 24) - 1); // value 31 ends exactly at bit 31 of the last word
+	out[31] = tmp_0;
+}
+__device__ __forceinline__ void unpack_25bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) { // Decodes 32 bit-packed 25-bit values into a_out_p[0..31], reading the 25 words a_in_p[threadIdx.x + 32*k] for k = 0..24.
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p); // a_out_p is already uint32_t*, so this cast is an identity
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p); // likewise an identity cast
+	[[maybe_unused]] uint32_t register_0; // the packed input word currently being consumed
+	[[maybe_unused]] uint32_t tmp_0; // the output value currently being assembled
+	[[maybe_unused]] uint32_t base_0 = 0ULL; // not referenced again in this function
+
+	int i = threadIdx.x; // offset added to every 32-word-strided read below; NOTE(review): presumably requires threadIdx.x < 32 so reads stay inside each 32-word row - confirm with caller
+
+	register_0 = *(in + (0 * 32) + (i * 1) + 0); // packed word 0
+	tmp_0      = (register_0) & ((1ULL << 25) - 1); // bits 0..24 of word 0 are value 0; the mask is 64-bit and the result narrows to uint32_t on assignment
+	out[0] = tmp_0; // index does not depend on i: every thread stores to out[0..31]; presumably a_out_p is per-thread storage - confirm with caller
+	tmp_0      = (register_0 >> 25) & ((1ULL << 7) - 1); // top 7 bits of word 0 become the low 7 bits of value 1
+	register_0 = *(in + (0 * 32) + (i * 1) + 32); // packed word 1; successive words for one thread are 32 words apart
+	tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 7; // low 18 bits of word 1 supply the remaining high bits of value 1 (7 + 18 = 25)
+	out[1] = tmp_0;
+	tmp_0      = (register_0 >> 18) & ((1ULL << 14) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 64);
+	tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 14;
+	out[2] = tmp_0;
+	tmp_0      = (register_0 >> 11) & ((1ULL << 21) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 96);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 21;
+	out[3] = tmp_0;
+	tmp_0      = (register_0 >> 4) & ((1ULL << 25) - 1); // value 4 lies entirely inside word 3 (bits 4..28), so no new word is loaded
+	out[4] = tmp_0;
+	tmp_0      = (register_0 >> 29) & ((1ULL << 3) - 1); // the 3 bits left in word 3 start value 5
+	register_0 = *(in + (0 * 32) + (i * 1) + 128);
+	tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 3;
+	out[5] = tmp_0;
+	tmp_0      = (register_0 >> 22) & ((1ULL << 10) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 160);
+	tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 10;
+	out[6] = tmp_0;
+	tmp_0      = (register_0 >> 15) & ((1ULL << 17) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 192);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 17;
+	out[7] = tmp_0;
+	tmp_0      = (register_0 >> 8) & ((1ULL << 24) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 224);
+	tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 24;
+	out[8] = tmp_0;
+	tmp_0      = (register_0 >> 1) & ((1ULL << 25) - 1);
+	out[9] = tmp_0;
+	tmp_0      = (register_0 >> 26) & ((1ULL << 6) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 256);
+	tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 6;
+	out[10] = tmp_0;
+	tmp_0       = (register_0 >> 19) & ((1ULL << 13) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 288);
+	tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 13;
+	out[11] = tmp_0;
+	tmp_0       = (register_0 >> 12) & ((1ULL << 20) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 320);
+	tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 20;
+	out[12] = tmp_0;
+	tmp_0       = (register_0 >> 5) & ((1ULL << 25) - 1);
+	out[13] = tmp_0;
+	tmp_0       = (register_0 >> 30) & ((1ULL << 2) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 352);
+	tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 2;
+	out[14] = tmp_0;
+	tmp_0       = (register_0 >> 23) & ((1ULL << 9) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 384);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 9;
+	out[15] = tmp_0;
+	tmp_0       = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 416);
+	tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 16;
+	out[16] = tmp_0;
+	tmp_0       = (register_0 >> 9) & ((1ULL << 23) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 448);
+	tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 23;
+	out[17] = tmp_0;
+	tmp_0       = (register_0 >> 2) & ((1ULL << 25) - 1);
+	out[18] = tmp_0;
+	tmp_0       = (register_0 >> 27) & ((1ULL << 5) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 480);
+	tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 5;
+	out[19] = tmp_0;
+	tmp_0       = (register_0 >> 20) & ((1ULL << 12) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 512);
+	tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 12;
+	out[20] = tmp_0;
+	tmp_0       = (register_0 >> 13) & ((1ULL << 19) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 544);
+	tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 19;
+	out[21] = tmp_0;
+	tmp_0       = (register_0 >> 6) & ((1ULL << 25) - 1);
+	out[22] = tmp_0;
+	tmp_0       = (register_0 >> 31) & ((1ULL << 1) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 576);
+	tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 1;
+	out[23] = tmp_0;
+	tmp_0       = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 608);
+	tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 8;
+	out[24] = tmp_0;
+	tmp_0       = (register_0 >> 17) & ((1ULL << 15) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 640);
+	tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 15;
+	out[25] = tmp_0;
+	tmp_0       = (register_0 >> 10) & ((1ULL << 22) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 672);
+	tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 22;
+	out[26] = tmp_0;
+	tmp_0       = (register_0 >> 3) & ((1ULL << 25) - 1);
+	out[27] = tmp_0;
+	tmp_0       = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 704);
+	tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 4;
+	out[28] = tmp_0;
+	tmp_0       = (register_0 >> 21) & ((1ULL << 11) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 736);
+	tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 11;
+	out[29] = tmp_0;
+	tmp_0       = (register_0 >> 14) & ((1ULL << 18) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 768); // word 24, the last of the 25 words read
+	tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 18;
+	out[30] = tmp_0;
+	tmp_0       = (register_0 >> 7) & ((1ULL << 25) - 1); // the top 25 bits of the last word are value 31, so every loaded bit has been used
+	out[31] = tmp_0;
+}
+__device__ __forceinline__ void unpack_26bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) { // Decodes 32 bit-packed 26-bit values into a_out_p[0..31], reading the 26 words a_in_p[threadIdx.x + 32*k] for k = 0..25.
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p); // a_out_p is already uint32_t*, so this cast is an identity
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p); // likewise an identity cast
+	[[maybe_unused]] uint32_t register_0; // the packed input word currently being consumed
+	[[maybe_unused]] uint32_t tmp_0; // the output value currently being assembled
+	[[maybe_unused]] uint32_t base_0 = 0ULL; // not referenced again in this function
+
+	int i = threadIdx.x; // offset added to every 32-word-strided read below; NOTE(review): presumably requires threadIdx.x < 32 so reads stay inside each 32-word row - confirm with caller
+
+	register_0 = *(in + (0 * 32) + (i * 1) + 0); // packed word 0
+	tmp_0      = (register_0) & ((1ULL << 26) - 1); // bits 0..25 of word 0 are value 0; the mask is 64-bit and the result narrows to uint32_t on assignment
+	out[0] = tmp_0; // index does not depend on i: every thread stores to out[0..31]; presumably a_out_p is per-thread storage - confirm with caller
+	tmp_0      = (register_0 >> 26) & ((1ULL << 6) - 1); // top 6 bits of word 0 become the low 6 bits of value 1
+	register_0 = *(in + (0 * 32) + (i * 1) + 32); // packed word 1; successive words for one thread are 32 words apart
+	tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 6; // low 20 bits of word 1 supply the remaining high bits of value 1 (6 + 20 = 26)
+	out[1] = tmp_0;
+	tmp_0      = (register_0 >> 20) & ((1ULL << 12) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 64);
+	tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 12;
+	out[2] = tmp_0;
+	tmp_0      = (register_0 >> 14) & ((1ULL << 18) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 96);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 18;
+	out[3] = tmp_0;
+	tmp_0      = (register_0 >> 8) & ((1ULL << 24) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 128);
+	tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 24;
+	out[4] = tmp_0;
+	tmp_0      = (register_0 >> 2) & ((1ULL << 26) - 1); // value 5 lies entirely inside word 4 (bits 2..27), so no new word is loaded
+	out[5] = tmp_0;
+	tmp_0      = (register_0 >> 28) & ((1ULL << 4) - 1); // the 4 bits left in word 4 start value 6
+	register_0 = *(in + (0 * 32) + (i * 1) + 160);
+	tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 4;
+	out[6] = tmp_0;
+	tmp_0      = (register_0 >> 22) & ((1ULL << 10) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 192);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 10;
+	out[7] = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 224);
+	tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 16;
+	out[8] = tmp_0;
+	tmp_0      = (register_0 >> 10) & ((1ULL << 22) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 256);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 22;
+	out[9]  = tmp_0;
+	tmp_0       = (register_0 >> 4) & ((1ULL << 26) - 1);
+	out[10] = tmp_0;
+	tmp_0       = (register_0 >> 30) & ((1ULL << 2) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 288);
+	tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 2;
+	out[11] = tmp_0;
+	tmp_0       = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 320);
+	tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 8;
+	out[12] = tmp_0;
+	tmp_0       = (register_0 >> 18) & ((1ULL << 14) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 352);
+	tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 14;
+	out[13] = tmp_0;
+	tmp_0       = (register_0 >> 12) & ((1ULL << 20) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 384);
+	tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 20;
+	out[14] = tmp_0;
+	tmp_0       = (register_0 >> 6) & ((1ULL << 26) - 1); // value 15 ends exactly at bit 31 of word 12, leaving nothing to carry
+	out[15] = tmp_0;
+	register_0  = *(in + (0 * 32) + (i * 1) + 416); // word 13 starts value 16 at bit 0; from here the shift/mask sequence of values 0..15 repeats
+	tmp_0       = (register_0) & ((1ULL << 26) - 1);
+	out[16] = tmp_0;
+	tmp_0       = (register_0 >> 26) & ((1ULL << 6) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 448);
+	tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 6;
+	out[17] = tmp_0;
+	tmp_0       = (register_0 >> 20) & ((1ULL << 12) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 480);
+	tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 12;
+	out[18] = tmp_0;
+	tmp_0       = (register_0 >> 14) & ((1ULL << 18) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 512);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 18;
+	out[19] = tmp_0;
+	tmp_0       = (register_0 >> 8) & ((1ULL << 24) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 544);
+	tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 24;
+	out[20] = tmp_0;
+	tmp_0       = (register_0 >> 2) & ((1ULL << 26) - 1);
+	out[21] = tmp_0;
+	tmp_0       = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 576);
+	tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 4;
+	out[22] = tmp_0;
+	tmp_0       = (register_0 >> 22) & ((1ULL << 10) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 608);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 10;
+	out[23] = tmp_0;
+	tmp_0       = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 640);
+	tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 16;
+	out[24] = tmp_0;
+	tmp_0       = (register_0 >> 10) & ((1ULL << 22) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 672);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 22;
+	out[25] = tmp_0;
+	tmp_0       = (register_0 >> 4) & ((1ULL << 26) - 1);
+	out[26] = tmp_0;
+	tmp_0       = (register_0 >> 30) & ((1ULL << 2) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 704);
+	tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 2;
+	out[27] = tmp_0;
+	tmp_0       = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 736);
+	tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 8;
+	out[28] = tmp_0;
+	tmp_0       = (register_0 >> 18) & ((1ULL << 14) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 768);
+	tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 14;
+	out[29] = tmp_0;
+	tmp_0       = (register_0 >> 12) & ((1ULL << 20) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 800); // word 25, the last of the 26 words read
+	tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 20;
+	out[30] = tmp_0;
+	tmp_0       = (register_0 >> 6) & ((1ULL << 26) - 1); // the top 26 bits of the last word are value 31, so every loaded bit has been used
+	out[31] = tmp_0;
+}
+__device__ __forceinline__ void unpack_27bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) { // Decodes 32 bit-packed 27-bit values into a_out_p[0..31], reading the 27 words a_in_p[threadIdx.x + 32*k] for k = 0..26.
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p); // a_out_p is already uint32_t*, so this cast is an identity
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p); // likewise an identity cast
+	[[maybe_unused]] uint32_t register_0; // the packed input word currently being consumed
+	[[maybe_unused]] uint32_t tmp_0; // the output value currently being assembled
+	[[maybe_unused]] uint32_t base_0 = 0ULL; // not referenced again in this function
+
+	int i = threadIdx.x; // offset added to every 32-word-strided read below; NOTE(review): presumably requires threadIdx.x < 32 so reads stay inside each 32-word row - confirm with caller
+
+	register_0 = *(in + (0 * 32) + (i * 1) + 0); // packed word 0
+	tmp_0      = (register_0) & ((1ULL << 27) - 1); // bits 0..26 of word 0 are value 0; the mask is 64-bit and the result narrows to uint32_t on assignment
+	out[0] = tmp_0; // index does not depend on i: every thread stores to out[0..31]; presumably a_out_p is per-thread storage - confirm with caller
+	tmp_0      = (register_0 >> 27) & ((1ULL << 5) - 1); // top 5 bits of word 0 become the low 5 bits of value 1
+	register_0 = *(in + (0 * 32) + (i * 1) + 32); // packed word 1; successive words for one thread are 32 words apart
+	tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 5; // low 22 bits of word 1 supply the remaining high bits of value 1 (5 + 22 = 27)
+	out[1] = tmp_0;
+	tmp_0      = (register_0 >> 22) & ((1ULL << 10) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 64);
+	tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 10;
+	out[2] = tmp_0;
+	tmp_0      = (register_0 >> 17) & ((1ULL << 15) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 96);
+	tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 15;
+	out[3] = tmp_0;
+	tmp_0      = (register_0 >> 12) & ((1ULL << 20) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 128);
+	tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 20;
+	out[4] = tmp_0;
+	tmp_0      = (register_0 >> 7) & ((1ULL << 25) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 160);
+	tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 25;
+	out[5] = tmp_0;
+	tmp_0      = (register_0 >> 2) & ((1ULL << 27) - 1); // value 6 lies entirely inside word 5 (bits 2..28), so no new word is loaded
+	out[6] = tmp_0;
+	tmp_0      = (register_0 >> 29) & ((1ULL << 3) - 1); // the 3 bits left in word 5 start value 7
+	register_0 = *(in + (0 * 32) + (i * 1) + 192);
+	tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 3;
+	out[7] = tmp_0;
+	tmp_0      = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 224);
+	tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 8;
+	out[8] = tmp_0;
+	tmp_0      = (register_0 >> 19) & ((1ULL << 13) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 256);
+	tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 13;
+	out[9] = tmp_0;
+	tmp_0      = (register_0 >> 14) & ((1ULL << 18) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 288);
+	tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 18;
+	out[10] = tmp_0;
+	tmp_0       = (register_0 >> 9) & ((1ULL << 23) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 320);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 23;
+	out[11] = tmp_0;
+	tmp_0       = (register_0 >> 4) & ((1ULL << 27) - 1);
+	out[12] = tmp_0;
+	tmp_0       = (register_0 >> 31) & ((1ULL << 1) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 352);
+	tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 1;
+	out[13] = tmp_0;
+	tmp_0       = (register_0 >> 26) & ((1ULL << 6) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 384);
+	tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 6;
+	out[14] = tmp_0;
+	tmp_0       = (register_0 >> 21) & ((1ULL << 11) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 416);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 11;
+	out[15] = tmp_0;
+	tmp_0       = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 448);
+	tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 16;
+	out[16] = tmp_0;
+	tmp_0       = (register_0 >> 11) & ((1ULL << 21) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 480);
+	tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 21;
+	out[17] = tmp_0;
+	tmp_0       = (register_0 >> 6) & ((1ULL << 26) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 512);
+	tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 26;
+	out[18] = tmp_0;
+	tmp_0       = (register_0 >> 1) & ((1ULL << 27) - 1);
+	out[19] = tmp_0;
+	tmp_0       = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 544);
+	tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 4;
+	out[20] = tmp_0;
+	tmp_0       = (register_0 >> 23) & ((1ULL << 9) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 576);
+	tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 9;
+	out[21] = tmp_0;
+	tmp_0       = (register_0 >> 18) & ((1ULL << 14) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 608);
+	tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 14;
+	out[22] = tmp_0;
+	tmp_0       = (register_0 >> 13) & ((1ULL << 19) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 640);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 19;
+	out[23] = tmp_0;
+	tmp_0       = (register_0 >> 8) & ((1ULL << 24) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 672);
+	tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 24;
+	out[24] = tmp_0;
+	tmp_0       = (register_0 >> 3) & ((1ULL << 27) - 1);
+	out[25] = tmp_0;
+	tmp_0       = (register_0 >> 30) & ((1ULL << 2) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 704);
+	tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 2;
+	out[26] = tmp_0;
+	tmp_0       = (register_0 >> 25) & ((1ULL << 7) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 736);
+	tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 7;
+	out[27] = tmp_0;
+	tmp_0       = (register_0 >> 20) & ((1ULL << 12) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 768);
+	tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 12;
+	out[28] = tmp_0;
+	tmp_0       = (register_0 >> 15) & ((1ULL << 17) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 800);
+	tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 17;
+	out[29] = tmp_0;
+	tmp_0       = (register_0 >> 10) & ((1ULL << 22) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 832); // word 26, the last of the 27 words read
+	tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 22;
+	out[30] = tmp_0;
+	tmp_0       = (register_0 >> 5) & ((1ULL << 27) - 1); // the top 27 bits of the last word are value 31, so every loaded bit has been used
+	out[31] = tmp_0;
+}
+__device__ __forceinline__ void unpack_28bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) { // Decodes 32 bit-packed 28-bit values into a_out_p[0..31], reading the 28 words a_in_p[threadIdx.x + 32*k] for k = 0..27.
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p); // a_out_p is already uint32_t*, so this cast is an identity
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p); // likewise an identity cast
+	[[maybe_unused]] uint32_t register_0; // the packed input word currently being consumed
+	[[maybe_unused]] uint32_t tmp_0; // the output value currently being assembled
+	[[maybe_unused]] uint32_t base_0 = 0ULL; // not referenced again in this function
+
+	int i = threadIdx.x; // offset added to every 32-word-strided read below; NOTE(review): presumably requires threadIdx.x < 32 so reads stay inside each 32-word row - confirm with caller
+
+	register_0 = *(in + (0 * 32) + (i * 1) + 0); // packed word 0
+	tmp_0      = (register_0) & ((1ULL << 28) - 1); // bits 0..27 of word 0 are value 0; the mask is 64-bit and the result narrows to uint32_t on assignment
+	out[0] = tmp_0; // index does not depend on i: every thread stores to out[0..31]; presumably a_out_p is per-thread storage - confirm with caller
+	tmp_0      = (register_0 >> 28) & ((1ULL << 4) - 1); // top 4 bits of word 0 become the low 4 bits of value 1
+	register_0 = *(in + (0 * 32) + (i * 1) + 32); // packed word 1; successive words for one thread are 32 words apart
+	tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 4; // low 24 bits of word 1 supply the remaining high bits of value 1 (4 + 24 = 28)
+	out[1] = tmp_0;
+	tmp_0      = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 64);
+	tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 8;
+	out[2] = tmp_0;
+	tmp_0      = (register_0 >> 20) & ((1ULL << 12) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 96);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 12;
+	out[3] = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 128);
+	tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 16;
+	out[4] = tmp_0;
+	tmp_0      = (register_0 >> 12) & ((1ULL << 20) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 160);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 20;
+	out[5] = tmp_0;
+	tmp_0      = (register_0 >> 8) & ((1ULL << 24) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 192);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 24;
+	out[6] = tmp_0;
+	tmp_0      = (register_0 >> 4) & ((1ULL << 28) - 1); // value 7 is bits 4..31 of word 6 and ends on the word boundary, leaving nothing to carry
+	out[7] = tmp_0;
+	register_0 = *(in + (0 * 32) + (i * 1) + 224); // word 7 starts value 8 at bit 0; the same 7-word / 8-value shift sequence repeats from here
+	tmp_0      = (register_0) & ((1ULL << 28) - 1);
+	out[8] = tmp_0;
+	tmp_0      = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 256);
+	tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 4;
+	out[9] = tmp_0;
+	tmp_0      = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 288);
+	tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 8;
+	out[10] = tmp_0;
+	tmp_0       = (register_0 >> 20) & ((1ULL << 12) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 320);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 12;
+	out[11] = tmp_0;
+	tmp_0       = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 352);
+	tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 16;
+	out[12] = tmp_0;
+	tmp_0       = (register_0 >> 12) & ((1ULL << 20) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 384);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 20;
+	out[13] = tmp_0;
+	tmp_0       = (register_0 >> 8) & ((1ULL << 24) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 416);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 24;
+	out[14] = tmp_0;
+	tmp_0       = (register_0 >> 4) & ((1ULL << 28) - 1);
+	out[15] = tmp_0;
+	register_0  = *(in + (0 * 32) + (i * 1) + 448); // value 16 starts at bit 0 of word 14
+	tmp_0       = (register_0) & ((1ULL << 28) - 1);
+	out[16] = tmp_0;
+	tmp_0       = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 480);
+	tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 4;
+	out[17] = tmp_0;
+	tmp_0       = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 512);
+	tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 8;
+	out[18] = tmp_0;
+	tmp_0       = (register_0 >> 20) & ((1ULL << 12) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 544);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 12;
+	out[19] = tmp_0;
+	tmp_0       = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 576);
+	tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 16;
+	out[20] = tmp_0;
+	tmp_0       = (register_0 >> 12) & ((1ULL << 20) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 608);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 20;
+	out[21] = tmp_0;
+	tmp_0       = (register_0 >> 8) & ((1ULL << 24) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 640);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 24;
+	out[22] = tmp_0;
+	tmp_0       = (register_0 >> 4) & ((1ULL << 28) - 1);
+	out[23] = tmp_0;
+	register_0  = *(in + (0 * 32) + (i * 1) + 672); // value 24 starts at bit 0 of word 21
+	tmp_0       = (register_0) & ((1ULL << 28) - 1);
+	out[24] = tmp_0;
+	tmp_0       = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 704);
+	tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 4;
+	out[25] = tmp_0;
+	tmp_0       = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 736);
+	tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 8;
+	out[26] = tmp_0;
+	tmp_0       = (register_0 >> 20) & ((1ULL << 12) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 768);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 12;
+	out[27] = tmp_0;
+	tmp_0       = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 800);
+	tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 16;
+	out[28] = tmp_0;
+	tmp_0       = (register_0 >> 12) & ((1ULL << 20) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 832);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 20;
+	out[29] = tmp_0;
+	tmp_0       = (register_0 >> 8) & ((1ULL << 24) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 864); // word 27, the last of the 28 words read
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 24;
+	out[30] = tmp_0;
+	tmp_0       = (register_0 >> 4) & ((1ULL << 28) - 1); // the top 28 bits of the last word are value 31, so every loaded bit has been used
+	out[31] = tmp_0;
+}
+__device__ __forceinline__ void unpack_29bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) { // Decodes 32 bit-packed 29-bit values into a_out_p[0..31], reading the 29 words a_in_p[threadIdx.x + 32*k] for k = 0..28.
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p); // a_out_p is already uint32_t*, so this cast is an identity
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p); // likewise an identity cast
+	[[maybe_unused]] uint32_t register_0; // the packed input word currently being consumed
+	[[maybe_unused]] uint32_t tmp_0; // the output value currently being assembled
+	[[maybe_unused]] uint32_t base_0 = 0ULL; // not referenced again in this function
+
+	int i = threadIdx.x; // offset added to every 32-word-strided read below; NOTE(review): presumably requires threadIdx.x < 32 so reads stay inside each 32-word row - confirm with caller
+
+	register_0 = *(in + (0 * 32) + (i * 1) + 0); // packed word 0
+	tmp_0      = (register_0) & ((1ULL << 29) - 1); // bits 0..28 of word 0 are value 0; the mask is 64-bit and the result narrows to uint32_t on assignment
+	out[0] = tmp_0; // index does not depend on i: every thread stores to out[0..31]; presumably a_out_p is per-thread storage - confirm with caller
+	tmp_0      = (register_0 >> 29) & ((1ULL << 3) - 1); // top 3 bits of word 0 become the low 3 bits of value 1
+	register_0 = *(in + (0 * 32) + (i * 1) + 32); // packed word 1; successive words for one thread are 32 words apart
+	tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 3; // low 26 bits of word 1 supply the remaining high bits of value 1 (3 + 26 = 29)
+	out[1] = tmp_0;
+	tmp_0      = (register_0 >> 26) & ((1ULL << 6) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 64);
+	tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 6;
+	out[2] = tmp_0;
+	tmp_0      = (register_0 >> 23) & ((1ULL << 9) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 96);
+	tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 9;
+	out[3] = tmp_0;
+	tmp_0      = (register_0 >> 20) & ((1ULL << 12) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 128);
+	tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 12;
+	out[4] = tmp_0;
+	tmp_0      = (register_0 >> 17) & ((1ULL << 15) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 160);
+	tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 15;
+	out[5] = tmp_0;
+	tmp_0      = (register_0 >> 14) & ((1ULL << 18) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 192);
+	tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 18;
+	out[6] = tmp_0;
+	tmp_0      = (register_0 >> 11) & ((1ULL << 21) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 224);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 21;
+	out[7] = tmp_0;
+	tmp_0      = (register_0 >> 8) & ((1ULL << 24) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 256);
+	tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 24;
+	out[8] = tmp_0;
+	tmp_0      = (register_0 >> 5) & ((1ULL << 27) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 288);
+	tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 27;
+	out[9]  = tmp_0;
+	tmp_0       = (register_0 >> 2) & ((1ULL << 29) - 1); // value 10 lies entirely inside word 9 (bits 2..30), so no new word is loaded
+	out[10] = tmp_0;
+	tmp_0       = (register_0 >> 31) & ((1ULL << 1) - 1); // the single bit left in word 9 starts value 11
+	register_0  = *(in + (0 * 32) + (i * 1) + 320);
+	tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 1;
+	out[11] = tmp_0;
+	tmp_0       = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 352);
+	tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 4;
+	out[12] = tmp_0;
+	tmp_0       = (register_0 >> 25) & ((1ULL << 7) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 384);
+	tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 7;
+	out[13] = tmp_0;
+	tmp_0       = (register_0 >> 22) & ((1ULL << 10) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 416);
+	tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 10;
+	out[14] = tmp_0;
+	tmp_0       = (register_0 >> 19) & ((1ULL << 13) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 448);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 13;
+	out[15] = tmp_0;
+	tmp_0       = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 480);
+	tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 16;
+	out[16] = tmp_0;
+	tmp_0       = (register_0 >> 13) & ((1ULL << 19) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 512);
+	tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 19;
+	out[17] = tmp_0;
+	tmp_0       = (register_0 >> 10) & ((1ULL << 22) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 544);
+	tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 22;
+	out[18] = tmp_0;
+	tmp_0       = (register_0 >> 7) & ((1ULL << 25) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 576);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 25;
+	out[19] = tmp_0;
+	tmp_0       = (register_0 >> 4) & ((1ULL << 28) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 608);
+	tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 28;
+	out[20] = tmp_0;
+	tmp_0       = (register_0 >> 1) & ((1ULL << 29) - 1);
+	out[21] = tmp_0;
+	tmp_0       = (register_0 >> 30) & ((1ULL << 2) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 640);
+	tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 2;
+	out[22] = tmp_0;
+	tmp_0       = (register_0 >> 27) & ((1ULL << 5) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 672);
+	tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 5;
+	out[23] = tmp_0;
+	tmp_0       = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 704);
+	tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 8;
+	out[24] = tmp_0;
+	tmp_0       = (register_0 >> 21) & ((1ULL << 11) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 736);
+	tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 11;
+	out[25] = tmp_0;
+	tmp_0       = (register_0 >> 18) & ((1ULL << 14) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 768);
+	tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 14;
+	out[26] = tmp_0;
+	tmp_0       = (register_0 >> 15) & ((1ULL << 17) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 800);
+	tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 17;
+	out[27] = tmp_0;
+	tmp_0       = (register_0 >> 12) & ((1ULL << 20) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 832);
+	tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 20;
+	out[28] = tmp_0;
+	tmp_0       = (register_0 >> 9) & ((1ULL << 23) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 864);
+	tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 23;
+	out[29] = tmp_0;
+	tmp_0       = (register_0 >> 6) & ((1ULL << 26) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 896); // word 28, the last of the 29 words read
+	tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 26;
+	out[30] = tmp_0;
+	tmp_0       = (register_0 >> 3) & ((1ULL << 29) - 1); // the top 29 bits of the last word are value 31, so every loaded bit has been used
+	out[31] = tmp_0;
+}
+__device__ __forceinline__ void unpack_30bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) { // Decodes 32 bit-packed 30-bit values into a_out_p[0..31], reading the 30 words a_in_p[threadIdx.x + 32*k] for k = 0..29.
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p); // a_out_p is already uint32_t*, so this cast is an identity
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p); // likewise an identity cast
+	[[maybe_unused]] uint32_t register_0; // the packed input word currently being consumed
+	[[maybe_unused]] uint32_t tmp_0; // the output value currently being assembled
+	[[maybe_unused]] uint32_t base_0 = 0ULL; // not referenced again in this function
+
+	int i = threadIdx.x; // offset added to every 32-word-strided read below; NOTE(review): presumably requires threadIdx.x < 32 so reads stay inside each 32-word row - confirm with caller
+
+	register_0 = *(in + (0 * 32) + (i * 1) + 0); // packed word 0
+	tmp_0      = (register_0) & ((1ULL << 30) - 1); // bits 0..29 of word 0 are value 0; the mask is 64-bit and the result narrows to uint32_t on assignment
+	out[0] = tmp_0; // index does not depend on i: every thread stores to out[0..31]; presumably a_out_p is per-thread storage - confirm with caller
+	tmp_0      = (register_0 >> 30) & ((1ULL << 2) - 1); // top 2 bits of word 0 become the low 2 bits of value 1
+	register_0 = *(in + (0 * 32) + (i * 1) + 32); // packed word 1; successive words for one thread are 32 words apart
+	tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 2; // low 28 bits of word 1 supply the remaining high bits of value 1 (2 + 28 = 30)
+	out[1] = tmp_0;
+	tmp_0      = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 64);
+	tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 4;
+	out[2] = tmp_0;
+	tmp_0      = (register_0 >> 26) & ((1ULL << 6) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 96);
+	tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 6;
+	out[3] = tmp_0;
+	tmp_0      = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 128);
+	tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 8;
+	out[4] = tmp_0;
+	tmp_0      = (register_0 >> 22) & ((1ULL << 10) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 160);
+	tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 10;
+	out[5] = tmp_0;
+	tmp_0      = (register_0 >> 20) & ((1ULL << 12) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 192);
+	tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 12;
+	out[6] = tmp_0;
+	tmp_0      = (register_0 >> 18) & ((1ULL << 14) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 224);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 14;
+	out[7] = tmp_0;
+	tmp_0      = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 256);
+	tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 16;
+	out[8] = tmp_0;
+	tmp_0      = (register_0 >> 14) & ((1ULL << 18) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 288);
+	tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 18;
+	out[9] = tmp_0;
+	tmp_0      = (register_0 >> 12) & ((1ULL << 20) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 320);
+	tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 20;
+	out[10] = tmp_0;
+	tmp_0       = (register_0 >> 10) & ((1ULL << 22) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 352);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 22;
+	out[11] = tmp_0;
+	tmp_0       = (register_0 >> 8) & ((1ULL << 24) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 384);
+	tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 24;
+	out[12] = tmp_0;
+	tmp_0       = (register_0 >> 6) & ((1ULL << 26) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 416);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 26;
+	out[13] = tmp_0;
+	tmp_0       = (register_0 >> 4) & ((1ULL << 28) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 448);
+	tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 28;
+	out[14] = tmp_0;
+	tmp_0       = (register_0 >> 2) & ((1ULL << 30) - 1); // value 15 is bits 2..31 of word 14 and ends on the word boundary, leaving nothing to carry
+	out[15] = tmp_0;
+	register_0  = *(in + (0 * 32) + (i * 1) + 480); // word 15 starts value 16 at bit 0; from here the shift/mask sequence of values 0..15 repeats
+	tmp_0       = (register_0) & ((1ULL << 30) - 1);
+	out[16] = tmp_0;
+	tmp_0       = (register_0 >> 30) & ((1ULL << 2) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 512);
+	tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 2;
+	out[17] = tmp_0;
+	tmp_0       = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 544);
+	tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 4;
+	out[18] = tmp_0;
+	tmp_0       = (register_0 >> 26) & ((1ULL << 6) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 576);
+	tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 6;
+	out[19] = tmp_0;
+	tmp_0       = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 608);
+	tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 8;
+	out[20] = tmp_0;
+	tmp_0       = (register_0 >> 22) & ((1ULL << 10) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 640);
+	tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 10;
+	out[21] = tmp_0;
+	tmp_0       = (register_0 >> 20) & ((1ULL << 12) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 672);
+	tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 12;
+	out[22] = tmp_0;
+	tmp_0       = (register_0 >> 18) & ((1ULL << 14) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 704);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 14;
+	out[23] = tmp_0;
+	tmp_0       = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 736);
+	tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 16;
+	out[24] = tmp_0;
+	tmp_0       = (register_0 >> 14) & ((1ULL << 18) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 768);
+	tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 18;
+	out[25] = tmp_0;
+	tmp_0       = (register_0 >> 12) & ((1ULL << 20) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 800);
+	tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 20;
+	out[26] = tmp_0;
+	tmp_0       = (register_0 >> 10) & ((1ULL << 22) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 832);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 22;
+	out[27] = tmp_0;
+	tmp_0       = (register_0 >> 8) & ((1ULL << 24) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 864);
+	tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 24;
+	out[28] = tmp_0;
+	tmp_0       = (register_0 >> 6) & ((1ULL << 26) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 896);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 26;
+	out[29] = tmp_0;
+	tmp_0       = (register_0 >> 4) & ((1ULL << 28) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 928); // word 29, the last of the 30 words read
+	tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 28;
+	out[30] = tmp_0;
+	tmp_0       = (register_0 >> 2) & ((1ULL << 30) - 1); // the top 30 bits of the last word are value 31, so every loaded bit has been used
+	out[31] = tmp_0;
+}
+__device__ __forceinline__ void unpack_31bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) {
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p);
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p);
+	[[maybe_unused]] uint32_t register_0;
+	[[maybe_unused]] uint32_t tmp_0;
+	[[maybe_unused]] uint32_t base_0 = 0ULL; // never read in this function
+	// Per-thread 31-bit unpack: lane i reads the 31 words in[i + 32*k] (k = 0..30) and writes 32 values to out[0..31].
+	int i = threadIdx.x; // THREAD INDEX
+	// A value that straddles two words is stitched from the previous word's high bits (tmp_0) and the next word's low bits.
+	register_0 = *(in + (0 * 32) + (i * 1) + 0);
+	tmp_0      = (register_0) & ((1ULL << 31) - 1); // value 0 is the low 31 bits of word 0
+	out[0] = tmp_0; // out[] carries no lane offset, so every thread needs its own 32-element output buffer
+	tmp_0      = (register_0 >> 31) & ((1ULL << 1) - 1); // the leftover top bit becomes bit 0 of value 1
+	register_0 = *(in + (0 * 32) + (i * 1) + 32);
+	tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 1; // low 30 bits of word 1 complete value 1
+	out[1] = tmp_0;
+	tmp_0      = (register_0 >> 30) & ((1ULL << 2) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 64);
+	tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 2;
+	out[2] = tmp_0;
+	tmp_0      = (register_0 >> 29) & ((1ULL << 3) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 96);
+	tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 3;
+	out[3] = tmp_0;
+	tmp_0      = (register_0 >> 28) & ((1ULL << 4) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 128);
+	tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 4;
+	out[4] = tmp_0;
+	tmp_0      = (register_0 >> 27) & ((1ULL << 5) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 160);
+	tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 5;
+	out[5] = tmp_0;
+	tmp_0      = (register_0 >> 26) & ((1ULL << 6) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 192);
+	tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 6;
+	out[6] = tmp_0;
+	tmp_0      = (register_0 >> 25) & ((1ULL << 7) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 224);
+	tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 7;
+	out[7] = tmp_0;
+	tmp_0      = (register_0 >> 24) & ((1ULL << 8) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 256);
+	tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 8;
+	out[8] = tmp_0;
+	tmp_0      = (register_0 >> 23) & ((1ULL << 9) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 288);
+	tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 9;
+	out[9] = tmp_0;
+	tmp_0      = (register_0 >> 22) & ((1ULL << 10) - 1);
+	register_0 = *(in + (0 * 32) + (i * 1) + 320);
+	tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 10;
+	out[10] = tmp_0;
+	tmp_0       = (register_0 >> 21) & ((1ULL << 11) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 352);
+	tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 11;
+	out[11] = tmp_0;
+	tmp_0       = (register_0 >> 20) & ((1ULL << 12) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 384);
+	tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 12;
+	out[12] = tmp_0;
+	tmp_0       = (register_0 >> 19) & ((1ULL << 13) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 416);
+	tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 13;
+	out[13] = tmp_0;
+	tmp_0       = (register_0 >> 18) & ((1ULL << 14) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 448);
+	tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 14;
+	out[14] = tmp_0;
+	tmp_0       = (register_0 >> 17) & ((1ULL << 15) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 480);
+	tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 15;
+	out[15] = tmp_0;
+	tmp_0       = (register_0 >> 16) & ((1ULL << 16) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 512);
+	tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 16;
+	out[16] = tmp_0;
+	tmp_0       = (register_0 >> 15) & ((1ULL << 17) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 544);
+	tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 17;
+	out[17] = tmp_0;
+	tmp_0       = (register_0 >> 14) & ((1ULL << 18) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 576);
+	tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 18;
+	out[18] = tmp_0;
+	tmp_0       = (register_0 >> 13) & ((1ULL << 19) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 608);
+	tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 19;
+	out[19] = tmp_0;
+	tmp_0       = (register_0 >> 12) & ((1ULL << 20) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 640);
+	tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 20;
+	out[20] = tmp_0;
+	tmp_0       = (register_0 >> 11) & ((1ULL << 21) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 672);
+	tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 21;
+	out[21] = tmp_0;
+	tmp_0       = (register_0 >> 10) & ((1ULL << 22) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 704);
+	tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 22;
+	out[22] = tmp_0;
+	tmp_0       = (register_0 >> 9) & ((1ULL << 23) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 736);
+	tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 23;
+	out[23] = tmp_0;
+	tmp_0       = (register_0 >> 8) & ((1ULL << 24) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 768);
+	tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 24;
+	out[24] = tmp_0;
+	tmp_0       = (register_0 >> 7) & ((1ULL << 25) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 800);
+	tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 25;
+	out[25] = tmp_0;
+	tmp_0       = (register_0 >> 6) & ((1ULL << 26) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 832);
+	tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 26;
+	out[26] = tmp_0;
+	tmp_0       = (register_0 >> 5) & ((1ULL << 27) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 864);
+	tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 27;
+	out[27] = tmp_0;
+	tmp_0       = (register_0 >> 4) & ((1ULL << 28) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 896);
+	tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 28;
+	out[28] = tmp_0;
+	tmp_0       = (register_0 >> 3) & ((1ULL << 29) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 928);
+	tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 29;
+	out[29] = tmp_0;
+	tmp_0       = (register_0 >> 2) & ((1ULL << 30) - 1);
+	register_0  = *(in + (0 * 32) + (i * 1) + 960);
+	tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 30;
+	out[30] = tmp_0;
+	tmp_0       = (register_0 >> 1) & ((1ULL << 31) - 1); // value 31 is the top 31 bits of the last word (word 30)
+	out[31] = tmp_0;
+}
+__device__ __forceinline__ void unpack_32bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) {
+	[[maybe_unused]] auto     out = reinterpret_cast<uint32_t*>(a_out_p);
+	[[maybe_unused]] auto     in  = reinterpret_cast<const uint32_t*>(a_in_p);
+	[[maybe_unused]] uint32_t register_0;
+	[[maybe_unused]] uint32_t tmp_0; // never used in this function
+	[[maybe_unused]] uint32_t base_0 = 0ULL; // never read in this function
+	// 32-bit case: values are stored unpacked, so this is a plain copy out[k] = in[i + 32*k] for k = 0..31.
+	int i = threadIdx.x; // THREAD INDEX
+	// No masking or shifting is needed; out[] carries no lane offset, so every thread needs its own 32-element buffer.
+	register_0  = *(in + (0 * 32) + (i * 1) + 0);
+	out[0]  = register_0;
+	register_0  = *(in + (0 * 32) + (i * 1) + 32);
+	out[1]  = register_0;
+	register_0  = *(in + (0 * 32) + (i * 1) + 64);
+	out[2]  = register_0;
+	register_0  = *(in + (0 * 32) + (i * 1) + 96);
+	out[3]  = register_0;
+	register_0  = *(in + (0 * 32) + (i * 1) + 128);
+	out[4]  = register_0;
+	register_0  = *(in + (0 * 32) + (i * 1) + 160);
+	out[5]  = register_0;
+	register_0  = *(in + (0 * 32) + (i * 1) + 192);
+	out[6]  = register_0;
+	register_0  = *(in + (0 * 32) + (i * 1) + 224);
+	out[7]  = register_0;
+	register_0  = *(in + (0 * 32) + (i * 1) + 256);
+	out[8]  = register_0;
+	register_0  = *(in + (0 * 32) + (i * 1) + 288);
+	out[9]  = register_0;
+	register_0  = *(in + (0 * 32) + (i * 1) + 320);
+	out[10] = register_0;
+	register_0  = *(in + (0 * 32) + (i * 1) + 352);
+	out[11] = register_0;
+	register_0  = *(in + (0 * 32) + (i * 1) + 384);
+	out[12] = register_0;
+	register_0  = *(in + (0 * 32) + (i * 1) + 416);
+	out[13] = register_0;
+	register_0  = *(in + (0 * 32) + (i * 1) + 448);
+	out[14] = register_0;
+	register_0  = *(in + (0 * 32) + (i * 1) + 480);
+	out[15] = register_0;
+	register_0  = *(in + (0 * 32) + (i * 1) + 512);
+	out[16] = register_0;
+	register_0  = *(in + (0 * 32) + (i * 1) + 544);
+	out[17] = register_0;
+	register_0  = *(in + (0 * 32) + (i * 1) + 576);
+	out[18] = register_0;
+	register_0  = *(in + (0 * 32) + (i * 1) + 608);
+	out[19] = register_0;
+	register_0  = *(in + (0 * 32) + (i * 1) + 640);
+	out[20] = register_0;
+	register_0  = *(in + (0 * 32) + (i * 1) + 672);
+	out[21] = register_0;
+	register_0  = *(in + (0 * 32) + (i * 1) + 704);
+	out[22] = register_0;
+	register_0  = *(in + (0 * 32) + (i * 1) + 736);
+	out[23] = register_0;
+	register_0  = *(in + (0 * 32) + (i * 1) + 768);
+	out[24] = register_0;
+	register_0  = *(in + (0 * 32) + (i * 1) + 800);
+	out[25] = register_0;
+	register_0  = *(in + (0 * 32) + (i * 1) + 832);
+	out[26] = register_0;
+	register_0  = *(in + (0 * 32) + (i * 1) + 864);
+	out[27] = register_0;
+	register_0  = *(in + (0 * 32) + (i * 1) + 896);
+	out[28] = register_0;
+	register_0  = *(in + (0 * 32) + (i * 1) + 928);
+	out[29] = register_0;
+	register_0  = *(in + (0 * 32) + (i * 1) + 960);
+	out[30] = register_0;
+	register_0  = *(in + (0 * 32) + (i * 1) + 992);
+	out[31] = register_0;
+}
+__device__ __forceinline__ void unpack(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p, uint8_t bw) { // dispatch on bit width bw
+	switch (bw) { // cases 0..32 only; with no default, any other bw calls nothing and leaves a_out_p unwritten
+	case 0:
+		unpack_0bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 1:
+		unpack_1bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 2:
+		unpack_2bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 3:
+		unpack_3bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 4:
+		unpack_4bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 5:
+		unpack_5bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 6:
+		unpack_6bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 7:
+		unpack_7bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 8:
+		unpack_8bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 9:
+		unpack_9bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 10:
+		unpack_10bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 11:
+		unpack_11bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 12:
+		unpack_12bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 13:
+		unpack_13bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 14:
+		unpack_14bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 15:
+		unpack_15bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 16:
+		unpack_16bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 17:
+		unpack_17bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 18:
+		unpack_18bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 19:
+		unpack_19bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 20:
+		unpack_20bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 21:
+		unpack_21bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 22:
+		unpack_22bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 23:
+		unpack_23bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 24:
+		unpack_24bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 25:
+		unpack_25bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 26:
+		unpack_26bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 27:
+		unpack_27bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 28:
+		unpack_28bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 29:
+		unpack_29bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 30:
+		unpack_30bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 31:
+		unpack_31bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	case 32:
+		unpack_32bw_32ow_32crw_1uf(a_in_p, a_out_p);
+		break;
+	}
+}
+
+}}}; // namespace generated::unpack::cuda::fused
+
+template <typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__  __forceinline__ void load_registers(int i, T* out, T* registers) {
+	T* const lane_out = out + i; // despite the name, this STORES registers[item] to out[item * BLOCK_THREADS + i]
+#pragma unroll
+	for (int item = 0; item < ITEMS_PER_THREAD; ++item) {
+		lane_out[item * BLOCK_THREADS] = registers[item];
+	}
+}
+
+__global__ void unpack_global(const uint32_t* __restrict in, uint32_t* __restrict out, uint8_t bw) {
+	const int      trd_idx = threadIdx.x;
+	const uint64_t blc_idx = blockIdx.x;    // 64-bit so the two offsets below cannot overflow int on very large grids
+	in  = in + ((blc_idx * bw) << 5);       // each block consumes bw * 32 packed input words
+	out = out + (blc_idx << 10);            // and produces 1024 output values
+	// Unpack this lane's 32 values into a local array, then store them to out[j * 32 + trd_idx]; assumes blockDim.x == 32 - TODO confirm.
+	uint32_t registers[32];
+	generated::unpack::cuda::fused::unpack(in, registers, bw);
+	load_registers<uint32_t, 32, 32>(trd_idx, out, registers);
+}
+
+__device__  __forceinline__ void unpack_device(const uint32_t* __restrict in, uint32_t* __restrict out, uint8_t bw) {
+	generated::unpack::cuda::fused::unpack(in, out, bw); // device-side entry point: forwards all arguments unchanged to the generated dispatcher
+}
+
+__device__  __forceinline__ void unpack_device(const int32_t* __restrict in, int32_t* __restrict out, uint8_t bw) {
+	generated::unpack::cuda::fused::unpack(reinterpret_cast<const uint32_t*>(in), reinterpret_cast<uint32_t*>(out), bw); // int32_t overload: views both buffers as uint32_t and forwards
+}
diff --git a/fastlanes/src/include/fls_gen/unrsum/unrsum.hpp b/fastlanes/src/include/fls_gen/unrsum/unrsum.hpp
new file mode 100644
index 0000000..8927d36
--- /dev/null
+++ b/fastlanes/src/include/fls_gen/unrsum/unrsum.hpp
@@ -0,0 +1,58 @@
+#ifndef FLS_GEN_UNRSUM_UNRSUM_HPP
+#define FLS_GEN_UNRSUM_UNRSUM_HPP
+
+#include <cstdint>
+
+namespace generated { namespace unrsum { // declarations only: none of these functions is defined in this header
+namespace fallback { namespace scalar { // out-of-place and in-place overloads for 8/16/32/64-bit unsigned elements
+void unrsum(const uint8_t* in, uint8_t* out);
+void unrsum(const uint16_t* in, uint16_t* out);
+void unrsum(const uint32_t* in, uint32_t* out);
+void unrsum(const uint64_t* in, uint64_t* out);
+void unrsum_inplace(uint8_t* in);
+void unrsum_inplace(uint16_t* in);
+void unrsum_inplace(uint32_t* in);
+void unrsum_inplace(uint64_t* in);
+}} // namespace fallback::scalar
+
+namespace helper { namespace scalar { // same overload set as fallback::scalar
+void unrsum(const uint8_t* in, uint8_t* out);
+void unrsum(const uint16_t* in, uint16_t* out);
+void unrsum(const uint32_t* in, uint32_t* out);
+void unrsum(const uint64_t* in, uint64_t* out);
+void unrsum_inplace(uint8_t* in);
+void unrsum_inplace(uint16_t* in);
+void unrsum_inplace(uint32_t* in);
+void unrsum_inplace(uint64_t* in);
+}} // namespace helper::scalar
+
+namespace neon { // out-of-place overloads only; no unrsum_inplace is declared here
+void unrsum(const uint8_t* in, uint8_t* out);
+void unrsum(const uint16_t* in, uint16_t* out);
+void unrsum(const uint32_t* in, uint32_t* out);
+void unrsum(const uint64_t* in, uint64_t* out);
+} // namespace neon
+
+namespace avx2 { // out-of-place overloads only
+void unrsum(const uint8_t* in, uint8_t* out);
+void unrsum(const uint16_t* in, uint16_t* out);
+void unrsum(const uint32_t* in, uint32_t* out);
+void unrsum(const uint64_t* in, uint64_t* out);
+} // namespace avx2
+
+namespace sse { // out-of-place overloads only
+void unrsum(const uint8_t* in, uint8_t* out);
+void unrsum(const uint16_t* in, uint16_t* out);
+void unrsum(const uint32_t* in, uint32_t* out);
+void unrsum(const uint64_t* in, uint64_t* out);
+} // namespace sse
+
+namespace avx512f { // out-of-place overloads only
+void unrsum(const uint8_t* in, uint8_t* out);
+void unrsum(const uint16_t* in, uint16_t* out);
+void unrsum(const uint32_t* in, uint32_t* out);
+void unrsum(const uint64_t* in, uint64_t* out);
+} // namespace avx512f
+}} // namespace generated::unrsum
+
+#endif
diff --git a/fastlanes/src/include/gpu_utils.h b/fastlanes/src/include/gpu_utils.h
new file mode 100644
index 0000000..f6eb9bc
--- /dev/null
+++ b/fastlanes/src/include/gpu_utils.h
@@ -0,0 +1,93 @@
+#pragma once
+
+#include "common.cuh"
+#include "error.cuh"
+#include <cub/util_allocator.cuh>
+
+#define SETUP_TIMING()                                                                                                 \
+	cudaEvent_t start, stop;                                                                                           \
+	cudaEventCreate(&start);                                                                                           \
+	cudaEventCreate(&stop); // declares `start`/`stop` in the enclosing scope (TIME_FUNC uses these names); return codes are not checked
+
+#define TIME_FUNC(f, t)                                                                                                \
+	{                                                                                                                  \
+		cudaEventRecord(start, 0);                                                                                     \
+		f;                                                                                                             \
+		cudaEventRecord(stop, 0);                                                                                      \
+		cudaEventSynchronize(stop);                                                                                    \
+		cudaEventElapsedTime(&t, start, stop);                                                                         \
+	} // runs statement f between two event records, waits for `stop`, and stores the elapsed time in t; needs start/stop in scope
+
+#define CLEANUP(vec)                                                                                                   \
+	if (vec) CubDebugExit(fastlanes::gpu::g_allocator.DeviceFree(vec)) // frees a non-null vec via the namespace-level allocator; NOTE(review): unbraced `if`, vec is evaluated twice
+
+#define ALLOCATE(vec, size) CubDebugExit(g_allocator.DeviceAllocate((void**)&vec, size)) // uses whichever `g_allocator` is visible at the call site (unqualified, unlike CLEANUP)
+
+template <typename T> // copies numEntries elements of T from host `src` into a fresh g_allocator allocation and returns the device pointer
+T* loadToGPU(const T* src, int numEntries, cub::CachingDeviceAllocator& g_allocator) {
+	void* device_mem; // filled in by DeviceAllocate below
+	CHECK_CUDA_ERROR(g_allocator.DeviceAllocate(&device_mem, sizeof(T) * numEntries));
+	CHECK_CUDA_ERROR(cudaMemcpy(device_mem, src, sizeof(T) * numEntries, cudaMemcpyHostToDevice));
+	return static_cast<T*>(device_mem);
+}
+
+inline void* gpu_load(const void* src, bsz_t size, cub::CachingDeviceAllocator& g_allocator) {
+	void* device_mem; // untyped variant of loadToGPU: `size` is forwarded as-is to DeviceAllocate and cudaMemcpy
+	CHECK_CUDA_ERROR(g_allocator.DeviceAllocate(&device_mem, size));
+	CHECK_CUDA_ERROR(cudaMemcpy(device_mem, src, size, cudaMemcpyHostToDevice));
+	return device_mem;
+}
+
+#define CHECK_ERROR()                                                                                                  \
+	{                                                                                                                  \
+		cudaDeviceSynchronize();                                                                                       \
+		cudaError_t error = cudaGetLastError();                                                                        \
+		if (error != cudaSuccess) {                                                                                    \
+			printf("CUDA error: %s\n", cudaGetErrorString(error));                                                     \
+			exit(-1);                                                                                                  \
+		}                                                                                                              \
+	} // synchronizes the device, then prints the last CUDA error to stdout and calls exit(-1) if one is pending
+
+#define CUDA_SAFE_CALL(call)                                                                                           \
+	do {                                                                                                               \
+		cudaError_t err = call;                                                                                        \
+		if (cudaSuccess != err) {                                                                                      \
+			fprintf(stderr, "Cuda error in file '%s' in line %i : %s.", __FILE__, __LINE__, cudaGetErrorString(err));  \
+			exit(EXIT_FAILURE);                                                                                        \
+		}                                                                                                              \
+	} while (0) // evaluates `call` once; on failure reports file/line/error text on stderr (no trailing newline) and exits
+
+namespace fastlanes::gpu {
+inline void* load_arr(void* src, uint64_t bsz) { // copies bsz bytes from host `src` into a fresh cudaMalloc allocation
+	void* dest = nullptr;
+	CHECK_CUDA_ERROR(cudaMalloc((void**)&dest, bsz)); // checked so a failed allocation is reported instead of flowing into cudaMemcpy
+	CHECK_CUDA_ERROR(cudaMemcpy(dest, src, bsz, cudaMemcpyHostToDevice));
+	return dest;
+}
+// Typed overload of load_arr: returns T*, but bsz is still a byte count. NOTE(review): not from g_allocator, so presumably freed with cudaFree - confirm.
+template <typename T>
+inline T* load_arr(T* src, uint64_t bsz) {
+	T* dest = nullptr;
+	CHECK_CUDA_ERROR(cudaMalloc((void**)&dest, bsz));
+	CHECK_CUDA_ERROR(cudaMemcpy(dest, src, bsz, cudaMemcpyHostToDevice));
+	return dest;
+}
+// Same copy as load_arr, but the memory comes from the supplied caching allocator; `size` is forwarded unchanged (not multiplied by sizeof(T)).
+template <typename T>
+inline T* load_to_gpu(const T* src, bsz_t size, cub::CachingDeviceAllocator& g_allocator) {
+	T* dest;
+	CHECK_CUDA_ERROR(g_allocator.DeviceAllocate((void**)&dest, size));
+	CHECK_CUDA_ERROR(cudaMemcpy(dest, src, size, cudaMemcpyHostToDevice));
+	return dest;
+}
+// Untyped overload of load_to_gpu.
+inline void* load_to_gpu(const void* src, bsz_t size, cub::CachingDeviceAllocator& g_allocator) {
+	void* dest;
+	CHECK_CUDA_ERROR(g_allocator.DeviceAllocate((void**)&dest, size));
+	CHECK_CUDA_ERROR(cudaMemcpy(dest, src, size, cudaMemcpyHostToDevice));
+	return dest;
+}
+
+inline cub::CachingDeviceAllocator g_allocator(true); // Caching allocator for device memory
+
+} // namespace fastlanes::gpu
\ No newline at end of file
diff --git a/fastlanes/src/include/query/query_2.hpp b/fastlanes/src/include/query/query_2.hpp
new file mode 100644
index 0000000..ee06a56
--- /dev/null
+++ b/fastlanes/src/include/query/query_2.hpp
@@ -0,0 +1,37 @@
+#ifndef QUERY_2_HPP
+#define QUERY_2_HPP
+
+#include "gpu_utils.h"
+#include "ssb_utils.h"
+#include <vector>
+
+using namespace std;
+using namespace fastlanes::gpu;
+using namespace fastlanes;
+
+namespace fastlanes::ssb {
+// One row of a three-column query result: two int columns followed by one unsigned long long column.
+struct SSBQuery2ResultRow {
+	int                col_0;
+	int                col_1;
+	unsigned long long col_2;
+
+	SSBQuery2ResultRow(int c0, int c1, unsigned long long c2)
+	    : col_0(c0)
+	    , col_1(c1)
+	    , col_2(c2) {}
+	bool operator==(const SSBQuery2ResultRow& other) const { // equal only when all three columns match
+		return !(col_0 != other.col_0 || col_1 != other.col_1 || col_2 != other.col_2);
+	}
+};
+// A result table is simply a vector of rows.
+using SSBQuery2ResultTable = std::vector<SSBQuery2ResultRow>;
+// Pairs a result table with an SSB instance; both are held by reference, so they must outlive this object.
+struct SSBQuery2 {
+	const SSBQuery2ResultTable& reuslt; // (sic) the spelling is part of the public interface
+	const SSB&                  ssb;
+};
+
+} // namespace fastlanes::ssb
+
+#endif // QUERY_2_HPP
diff --git a/fastlanes/src/include/query/query_21.hpp b/fastlanes/src/include/query/query_21.hpp
new file mode 100644
index 0000000..1699730
--- /dev/null
+++ b/fastlanes/src/include/query/query_21.hpp
@@ -0,0 +1,295 @@
+#ifndef QUERY_21_HPP
+#define QUERY_21_HPP
+
+#include "query/query_2.hpp"
+
+namespace fastlanes::ssb {
+// 280 literal rows {col_0, col_1, col_2}: col_1 runs 40..79 and, within each col_1 group, col_0 runs 1992..1998. Presumably the expected SSB Q2.1 output at scale factor 10 - confirm.
+inline SSBQuery2ResultTable ssb_q21_10_result_table {
+    {1992, 40, 6418103797}, //
+    {1993, 40, 6382034658}, //
+    {1994, 40, 6525665927}, //
+    {1995, 40, 6563646251}, //
+    {1996, 40, 6578456138}, //
+    {1997, 40, 6379340748}, //
+    {1998, 40, 3698210388}, //
+    {1992, 41, 7215774624}, //
+    {1993, 41, 7235770546}, //
+    {1994, 41, 7093880636}, //
+    {1995, 41, 7156912379}, //
+    {1996, 41, 7158257544}, //
+    {1997, 41, 6890416180}, //
+    {1998, 41, 4204327203}, //
+    {1992, 42, 6601010229}, //
+    {1993, 42, 6661380481}, //
+    {1994, 42, 6805618335}, //
+    {1995, 42, 6539366376}, //
+    {1996, 42, 6832348876}, //
+    {1997, 42, 6571540214}, //
+    {1998, 42, 3978744872}, //
+    {1992, 43, 7197665271}, //
+    {1993, 43, 6945853876}, //
+    {1994, 43, 6916324003}, //
+    {1995, 43, 6871377174}, //
+    {1996, 43, 6832025455}, //
+    {1997, 43, 7045010342}, //
+    {1998, 43, 4258248122}, //
+    {1992, 44, 6727625998}, //
+    {1993, 44, 6312893492}, //
+    {1994, 44, 6562962487}, //
+    {1995, 44, 6229499393}, //
+    {1996, 44, 6564940527}, //
+    {1997, 44, 6510029432}, //
+    {1998, 44, 3893760987}, //
+    {1992, 45, 6461380401}, //
+    {1993, 45, 6558772163}, //
+    {1994, 45, 6456432352}, //
+    {1995, 45, 6239560871}, //
+    {1996, 45, 6519111073}, //
+    {1997, 45, 6390521469}, //
+    {1998, 45, 3755042585}, //
+    {1992, 46, 7012618906}, //
+    {1993, 46, 6783420789}, //
+    {1994, 46, 6677778187}, //
+    {1995, 46, 7063055884}, //
+    {1996, 46, 6717830860}, //
+    {1997, 46, 6825894334}, //
+    {1998, 46, 4019728654}, //
+    {1992, 47, 6730883299}, //
+    {1993, 47, 6673551973}, //
+    {1994, 47, 6889765819}, //
+    {1995, 47, 6873007289}, //
+    {1996, 47, 6691029370}, //
+    {1997, 47, 6656963454}, //
+    {1998, 47, 3860509210}, //
+    {1992, 48, 6125452526}, //
+    {1993, 48, 6387782986}, //
+    {1994, 48, 6220999101}, //
+    {1995, 48, 6266773366}, //
+    {1996, 48, 6086559287}, //
+    {1997, 48, 6096975918}, //
+    {1998, 48, 3424123719}, //
+    {1992, 49, 6656732408}, //
+    {1993, 49, 6439163794}, //
+    {1994, 49, 7086198960}, //
+    {1995, 49, 6801494826}, //
+    {1996, 49, 6380057064}, //
+    {1997, 49, 6624824324}, //
+    {1998, 49, 4162459164}, //
+    {1992, 50, 6960570696}, //
+    {1993, 50, 7237748553}, //
+    {1994, 50, 6871669861}, //
+    {1995, 50, 7141121473}, //
+    {1996, 50, 7137131058}, //
+    {1997, 50, 6771705645}, //
+    {1998, 50, 4259523518}, //
+    {1992, 51, 6574132099}, //
+    {1993, 51, 6358522943}, //
+    {1994, 51, 6572086846}, //
+    {1995, 51, 6401898562}, //
+    {1996, 51, 6410252673}, //
+    {1997, 51, 6426732319}, //
+    {1998, 51, 3553327368}, //
+    {1992, 52, 7047940337}, //
+    {1993, 52, 6944255619}, //
+    {1994, 52, 6773948949}, //
+    {1995, 52, 6714537523}, //
+    {1996, 52, 6831573122}, //
+    {1997, 52, 6734349931}, //
+    {1998, 52, 3804080515}, //
+    {1992, 53, 6783046496}, //
+    {1993, 53, 6764674340}, //
+    {1994, 53, 6740138960}, //
+    {1995, 53, 7013631699}, //
+    {1996, 53, 6488850668}, //
+    {1997, 53, 6762927970}, //
+    {1998, 53, 3958631518}, //
+    {1992, 54, 7105405127}, //
+    {1993, 54, 7036373470}, //
+    {1994, 54, 7019061940}, //
+    {1995, 54, 6646909733}, //
+    {1996, 54, 6546458610}, //
+    {1997, 54, 6747693662}, //
+    {1998, 54, 3959735030}, //
+    {1992, 55, 7292018911}, //
+    {1993, 55, 6919737436}, //
+    {1994, 55, 6782433544}, //
+    {1995, 55, 6603309817}, //
+    {1996, 55, 6796414799}, //
+    {1997, 55, 6930467621}, //
+    {1998, 55, 3936336506}, //
+    {1992, 56, 7416913901}, //
+    {1993, 56, 7023034684}, //
+    {1994, 56, 6886859642}, //
+    {1995, 56, 7204223670}, //
+    {1996, 56, 7307030629}, //
+    {1997, 56, 7278012359}, //
+    {1998, 56, 4195358018}, //
+    {1992, 57, 7072285707}, //
+    {1993, 57, 6769724436}, //
+    {1994, 57, 7157349757}, //
+    {1995, 57, 6786320672}, //
+    {1996, 57, 7083167031}, //
+    {1997, 57, 6867387556}, //
+    {1998, 57, 3959867848}, //
+    {1992, 58, 6668044014}, //
+    {1993, 58, 6985920856}, //
+    {1994, 58, 6596737151}, //
+    {1995, 58, 6659827925}, //
+    {1996, 58, 6454616521}, //
+    {1997, 58, 6778311943}, //
+    {1998, 58, 3972101307}, //
+    {1992, 59, 6326657255}, //
+    {1993, 59, 6484432568}, //
+    {1994, 59, 6408868609}, //
+    {1995, 59, 6616633932}, //
+    {1996, 59, 6679260631}, //
+    {1997, 59, 6372927264}, //
+    {1998, 59, 3846617983}, //
+    {1992, 60, 7136768663}, //
+    {1993, 60, 6878576876}, //
+    {1994, 60, 6741527996}, //
+    {1995, 60, 6769490915}, //
+    {1996, 60, 6903254888}, //
+    {1997, 60, 6960241189}, //
+    {1998, 60, 4130144930}, //
+    {1992, 61, 6738756530}, //
+    {1993, 61, 6636715879}, //
+    {1994, 61, 6763648338}, //
+    {1995, 61, 6617746150}, //
+    {1996, 61, 6892805375}, //
+    {1997, 61, 6536932124}, //
+    {1998, 61, 3835627586}, //
+    {1992, 62, 6164211920}, //
+    {1993, 62, 6342711015}, //
+    {1994, 62, 6300859287}, //
+    {1995, 62, 6732584121}, //
+    {1996, 62, 6226736904}, //
+    {1997, 62, 6431206336}, //
+    {1998, 62, 3948296506}, //
+    {1992, 63, 7049023811}, //
+    {1993, 63, 7257074782}, //
+    {1994, 63, 7201434704}, //
+    {1995, 63, 7143627518}, //
+    {1996, 63, 7218551955}, //
+    {1997, 63, 7122881926}, //
+    {1998, 63, 4229058658}, //
+    {1992, 64, 6918049898}, //
+    {1993, 64, 6501113968}, //
+    {1994, 64, 7009023813}, //
+    {1995, 64, 6718571799}, //
+    {1996, 64, 6763605438}, //
+    {1997, 64, 7104613185}, //
+    {1998, 64, 3979838975}, //
+    {1992, 65, 7152976211}, //
+    {1993, 65, 7458190031}, //
+    {1994, 65, 6937644159}, //
+    {1995, 65, 6997669629}, //
+    {1996, 65, 6758440512}, //
+    {1997, 65, 6673821228}, //
+    {1998, 65, 4322914592}, //
+    {1992, 66, 6395458610}, //
+    {1993, 66, 6409727300}, //
+    {1994, 66, 6143698484}, //
+    {1995, 66, 6290853513}, //
+    {1996, 66, 6562894079}, //
+    {1997, 66, 6580940135}, //
+    {1998, 66, 3835167462}, //
+    {1992, 67, 6844550759}, //
+    {1993, 67, 7336259130}, //
+    {1994, 67, 7471700197}, //
+    {1995, 67, 7040096938}, //
+    {1996, 67, 7077296627}, //
+    {1997, 67, 7573027340}, //
+    {1998, 67, 4445654176}, //
+    {1992, 68, 6586538461}, //
+    {1993, 68, 6329817914}, //
+    {1994, 68, 6495180880}, //
+    {1995, 68, 6424478604}, //
+    {1996, 68, 6541879729}, //
+    {1997, 68, 6614661298}, //
+    {1998, 68, 3725370328}, //
+    {1992, 69, 6437149944}, //
+    {1993, 69, 6642939280}, //
+    {1994, 69, 6493295161}, //
+    {1995, 69, 6642164323}, //
+    {1996, 69, 6502125649}, //
+    {1997, 69, 6745438347}, //
+    {1998, 69, 3683114400}, //
+    {1992, 70, 6956560451}, //
+    {1993, 70, 7000267344}, //
+    {1994, 70, 6510307841}, //
+    {1995, 70, 6337688211}, //
+    {1996, 70, 7034861207}, //
+    {1997, 70, 6184992923}, //
+    {1998, 70, 3792560046}, //
+    {1992, 71, 7503528393}, //
+    {1993, 71, 7311857458}, //
+    {1994, 71, 7877750677}, //
+    {1995, 71, 7548275489}, //
+    {1996, 71, 7299486342}, //
+    {1997, 71, 7130260446}, //
+    {1998, 71, 4364145775}, //
+    {1992, 72, 6773151840}, //
+    {1993, 72, 6705723103}, //
+    {1994, 72, 6576032819}, //
+    {1995, 72, 6874053112}, //
+    {1996, 72, 6405666522}, //
+    {1997, 72, 6755654898}, //
+    {1998, 72, 4000181003}, //
+    {1992, 73, 6642879253}, //
+    {1993, 73, 6637877324}, //
+    {1994, 73, 6954631030}, //
+    {1995, 73, 6454765835}, //
+    {1996, 73, 6598056575}, //
+    {1997, 73, 6785666627}, //
+    {1998, 73, 3680466597}, //
+    {1992, 74, 7338251519}, //
+    {1993, 74, 7059280620}, //
+    {1994, 74, 7061358044}, //
+    {1995, 74, 7044215380}, //
+    {1996, 74, 7332374720}, //
+    {1997, 74, 7010120097}, //
+    {1998, 74, 4382878336}, //
+    {1992, 75, 6993738734}, //
+    {1993, 75, 7079353328}, //
+    {1994, 75, 7136558061}, //
+    {1995, 75, 7044955465}, //
+    {1996, 75, 6832731514}, //
+    {1997, 75, 6839863219}, //
+    {1998, 75, 4144643700}, //
+    {1992, 76, 7216305524}, //
+    {1993, 76, 7243703041}, //
+    {1994, 76, 7274388343}, //
+    {1995, 76, 7233806943}, //
+    {1996, 76, 6971700893}, //
+    {1997, 76, 7041104465}, //
+    {1998, 76, 3945668122}, //
+    {1992, 77, 5945520673}, //
+    {1993, 77, 6149061528}, //
+    {1994, 77, 5791875920}, //
+    {1995, 77, 5953806237}, //
+    {1996, 77, 6094857618}, //
+    {1997, 77, 6161765944}, //
+    {1998, 77, 3589143954}, //
+    {1992, 78, 6082662735}, //
+    {1993, 78, 6384605378}, //
+    {1994, 78, 6458393784}, //
+    {1995, 78, 6165718089}, //
+    {1996, 78, 6742644418}, //
+    {1997, 78, 6420560847}, //
+    {1998, 78, 3695789321}, //
+    {1992, 79, 7126061027}, //
+    {1993, 79, 7055353878}, //
+    {1994, 79, 7101978837}, //
+    {1995, 79, 6781640340}, //
+    {1996, 79, 7257992096}, //
+    {1997, 79, 6791270791}, //
+    {1998, 79, 4070644777}, //
+};
+
+inline SSBQuery2 ssb_q21_10 {ssb_q21_10_result_table, ssb_10}; // binds the table above to `ssb_10` by reference (see SSBQuery2)
+
+} // namespace fastlanes::ssb
+
+#endif // QUERY_21_HPP
diff --git a/fastlanes/src/include/query/query_3.hpp b/fastlanes/src/include/query/query_3.hpp
new file mode 100644
index 0000000..3a38f7c
--- /dev/null
+++ b/fastlanes/src/include/query/query_3.hpp
@@ -0,0 +1,44 @@
+#ifndef QUERY_3_HPP
+#define QUERY_3_HPP
+
+#include "gpu_utils.h"
+#include "ssb_utils.h"
+#include <vector>
+
+using namespace std;
+using namespace fastlanes::gpu;
+using namespace fastlanes;
+
+namespace fastlanes::ssb {
+
+struct SSBQuery3ResultRow { // one result row: three int columns followed by one unsigned 64-bit column
+	int                col_0;
+	int                col_1;
+	int                col_2;
+	unsigned long long col_3;
+
+	SSBQuery3ResultRow(int c0, int c1, int c2, unsigned long long c3) // positional constructor, one argument per column
+	    : col_0(c0)
+	    , col_1(c1)
+	    , col_2(c2)
+	    , col_3(c3) {}
+	bool operator==(const SSBQuery3ResultRow& other) const { // equal only when all four columns match
+		return !((col_0 != other.col_0) || (col_1 != other.col_1) || (col_2 != other.col_2) || (col_3 != other.col_3));
+	}
+};
+
+inline std::ostream& operator<<(std::ostream& stream, const SSBQuery3ResultRow& row) { // inline: this is a header, so a non-inline definition would be emitted by every including translation unit and clash at link time
+	stream << "{" << row.col_0 << ", " << row.col_1 << ", " << row.col_2 << ", " << row.col_3 << "}"; // written as {col_0, col_1, col_2, col_3}
+	return stream; // hand the stream back so insertions can be chained
+}
+
+using SSBQuery3ResultTable = std::vector<SSBQuery3ResultRow>;
+
+struct SSBQuery3 { // bundles a result table with an SSB descriptor; both are held by const reference, not copied
+	const SSBQuery3ResultTable& reuslt; // NOTE(review): "reuslt" looks like a misspelling of "result"; renaming it would break code that names this field
+	const SSB&                  ssb;
+};
+
+} // namespace fastlanes::ssb
+
+#endif // QUERY_3_HPP
diff --git a/fastlanes/src/include/query/query_31.hpp b/fastlanes/src/include/query/query_31.hpp
new file mode 100644
index 0000000..9c18c90
--- /dev/null
+++ b/fastlanes/src/include/query/query_31.hpp
@@ -0,0 +1,165 @@
+#ifndef QUERY_31_HPP
+#define QUERY_31_HPP
+
+#include "query/query_3.hpp"
+
+namespace fastlanes::ssb {
+
+inline SSBQuery3ResultTable ssb_q31_10_result_table {
+    {1992, 8, 8, 53840255574},   //
+    {1993, 8, 8, 53166216941},   //
+    {1994, 8, 8, 53437240310},   //
+    {1995, 8, 8, 53396799768},   //
+    {1996, 8, 8, 54110132821},   //
+    {1997, 8, 8, 53398173290},   //
+    {1992, 9, 8, 56083363742},   //
+    {1993, 9, 8, 55223660082},   //
+    {1994, 9, 8, 55339397030},   //
+    {1995, 9, 8, 54949301113},   //
+    {1996, 9, 8, 55903082845},   //
+    {1997, 9, 8, 54769022116},   //
+    {1992, 12, 8, 52837317579},  //
+    {1993, 12, 8, 53383468103},  //
+    {1994, 12, 8, 52207914158},  //
+    {1995, 12, 8, 52862670951},  //
+    {1996, 12, 8, 52829409093},  //
+    {1997, 12, 8, 53020471016},  //
+    {1992, 18, 8, 53363391476},  //
+    {1993, 18, 8, 52946193531},  //
+    {1994, 18, 8, 52997321941},  //
+    {1995, 18, 8, 53672123936},  //
+    {1996, 18, 8, 53436907487},  //
+    {1997, 18, 8, 54352229494},  //
+    {1992, 21, 8, 53796356168},  //
+    {1993, 21, 8, 54304168176},  //
+    {1994, 21, 8, 53974392943},  //
+    {1995, 21, 8, 53857720297},  //
+    {1996, 21, 8, 54093512752},  //
+    {1997, 21, 8, 53598437998},  //
+    {1992, 8, 9, 55334149561},   //
+    {1993, 8, 9, 55669527348},   //
+    {1994, 8, 9, 54838930433},   //
+    {1995, 8, 9, 55981258937},   //
+    {1996, 8, 9, 56549465183},   //
+    {1997, 8, 9, 55451474341},   //
+    {1992, 9, 9, 57493556858},   //
+    {1993, 9, 9, 58025342779},   //
+    {1994, 9, 9, 57308767649},   //
+    {1995, 9, 9, 57866394299},   //
+    {1996, 9, 9, 58676834632},   //
+    {1997, 9, 9, 57151657961},   //
+    {1992, 12, 9, 55470271862},  //
+    {1993, 12, 9, 55450422145},  //
+    {1994, 12, 9, 55166732599},  //
+    {1995, 12, 9, 55756628069},  //
+    {1996, 12, 9, 55295862862},  //
+    {1997, 12, 9, 53377511976},  //
+    {1992, 18, 9, 56294215648},  //
+    {1993, 18, 9, 56167494867},  //
+    {1994, 18, 9, 55456868802},  //
+    {1995, 18, 9, 55888788272},  //
+    {1996, 18, 9, 56240855720},  //
+    {1997, 18, 9, 55624174081},  //
+    {1992, 21, 9, 56528084092},  //
+    {1993, 21, 9, 57031719413},  //
+    {1994, 21, 9, 56459028335},  //
+    {1995, 21, 9, 57672132145},  //
+    {1996, 21, 9, 56293030145},  //
+    {1997, 21, 9, 56215096026},  //
+    {1992, 8, 12, 51104583944},  //
+    {1993, 8, 12, 52291194128},  //
+    {1994, 8, 12, 52149700327},  //
+    {1995, 8, 12, 51756734585},  //
+    {1996, 8, 12, 52743929158},  //
+    {1997, 8, 12, 53618521846},  //
+    {1992, 9, 12, 54393567369},  //
+    {1993, 9, 12, 53410059754},  //
+    {1994, 9, 12, 53932060476},  //
+    {1995, 9, 12, 54260687958},  //
+    {1996, 9, 12, 55113622290},  //
+    {1997, 9, 12, 53884139975},  //
+    {1992, 12, 12, 52781570092}, //
+    {1993, 12, 12, 52683527061}, //
+    {1994, 12, 12, 50283319443}, //
+    {1995, 12, 12, 51809888688}, //
+    {1996, 12, 12, 52500376734}, //
+    {1997, 12, 12, 50815598125}, //
+    {1992, 18, 12, 52836119396}, //
+    {1993, 18, 12, 52943818670}, //
+    {1994, 18, 12, 52153400982}, //
+    {1995, 18, 12, 51839229204}, //
+    {1996, 18, 12, 53030051819}, //
+    {1997, 18, 12, 53281809182}, //
+    {1992, 21, 12, 53377618064}, //
+    {1993, 21, 12, 53970340911}, //
+    {1994, 21, 12, 54078621677}, //
+    {1995, 21, 12, 53174393671}, //
+    {1996, 21, 12, 52256511400}, //
+    {1997, 21, 12, 53064919288}, //
+    {1992, 8, 18, 51758985311},  //
+    {1993, 8, 18, 52173652875},  //
+    {1994, 8, 18, 52604990324},  //
+    {1995, 8, 18, 52587898615},  //
+    {1996, 8, 18, 51780518836},  //
+    {1997, 8, 18, 51906203038},  //
+    {1992, 9, 18, 53887104795},  //
+    {1993, 9, 18, 53920040836},  //
+    {1994, 9, 18, 53898996978},  //
+    {1995, 9, 18, 54122679431},  //
+    {1996, 9, 18, 54303106396},  //
+    {1997, 9, 18, 54430180840},  //
+    {1992, 12, 18, 51465172557}, //
+    {1993, 12, 18, 51406709327}, //
+    {1994, 12, 18, 52099528581}, //
+    {1995, 12, 18, 51320895827}, //
+    {1996, 12, 18, 51326040782}, //
+    {1997, 12, 18, 52052860907}, //
+    {1992, 18, 18, 53202304966}, //
+    {1993, 18, 18, 52197097507}, //
+    {1994, 18, 18, 52525946124}, //
+    {1995, 18, 18, 52421548431}, //
+    {1996, 18, 18, 53671108592}, //
+    {1997, 18, 18, 52788981021}, //
+    {1992, 21, 18, 53635069027}, //
+    {1993, 21, 18, 53806768582}, //
+    {1994, 21, 18, 52744648993}, //
+    {1995, 21, 18, 52784240366}, //
+    {1996, 21, 18, 53641429016}, //
+    {1997, 21, 18, 52632344235}, //
+    {1992, 8, 21, 49640993819},  //
+    {1993, 8, 21, 50661144654},  //
+    {1994, 8, 21, 50362372598},  //
+    {1995, 8, 21, 50516483322},  //
+    {1996, 8, 21, 51123449982},  //
+    {1997, 8, 21, 51125299004},  //
+    {1992, 9, 21, 51006397394},  //
+    {1993, 9, 21, 51824859693},  //
+    {1994, 9, 21, 51996233504},  //
+    {1995, 9, 21, 51968286051},  //
+    {1996, 9, 21, 53096102262},  //
+    {1997, 9, 21, 51759284236},  //
+    {1992, 12, 21, 49650941206}, //
+    {1993, 12, 21, 50057832135}, //
+    {1994, 12, 21, 50097922236}, //
+    {1995, 12, 21, 48627408805}, //
+    {1996, 12, 21, 50197634124}, //
+    {1997, 12, 21, 49149703784}, //
+    {1992, 18, 21, 50875757574}, //
+    {1993, 18, 21, 50618892442}, //
+    {1994, 18, 21, 50420152423}, //
+    {1995, 18, 21, 50255208143}, //
+    {1996, 18, 21, 50798876344}, //
+    {1997, 18, 21, 50981837552}, //
+    {1992, 21, 21, 49991192802}, //
+    {1993, 21, 21, 51428673225}, //
+    {1994, 21, 21, 49946254861}, //
+    {1995, 21, 21, 51328670072}, //
+    {1996, 21, 21, 50456326144}, //
+    {1997, 21, 21, 50401588878}, //
+};
+
+inline SSBQuery3 ssb_q31_10 {ssb_q31_10_result_table, ssb_10}; // reuslt refers to ssb_q31_10_result_table, ssb refers to ssb_10
+
+} // namespace fastlanes::ssb
+
+#endif // QUERY_31_HPP
diff --git a/fastlanes/src/include/query/query_4.hpp b/fastlanes/src/include/query/query_4.hpp
new file mode 100644
index 0000000..cea0fe4
--- /dev/null
+++ b/fastlanes/src/include/query/query_4.hpp
@@ -0,0 +1,37 @@
+#ifndef QUERY_4_HPP
+#define QUERY_4_HPP
+
+#include "gpu_utils.h"
+#include "ssb_utils.h"
+#include <vector>
+
+using namespace std;
+using namespace fastlanes::gpu;
+using namespace fastlanes;
+
+namespace fastlanes::ssb {
+
+struct SSBQuery4ResultRow { // one result row: two int columns followed by one unsigned 64-bit column
+	int                col_0;
+	int                col_1;
+	unsigned long long col_2;
+
+	SSBQuery4ResultRow(int c0, int c1, unsigned long long c2) // positional constructor, one argument per column
+	    : col_0(c0)
+	    , col_1(c1)
+	    , col_2(c2) {}
+	bool operator==(const SSBQuery4ResultRow& other) const { // equal only when all three columns match
+		return !((col_0 != other.col_0) || (col_1 != other.col_1) || (col_2 != other.col_2));
+	}
+};
+
+using SSBQuery4ResultTable = std::vector<SSBQuery4ResultRow>;
+
+struct SSBQuery4 { // bundles a result table with an SSB descriptor; both are held by const reference, not copied
+	const SSBQuery4ResultTable& reuslt; // NOTE(review): "reuslt" looks like a misspelling of "result"; renaming it would break code that names this field
+	const SSB&                  ssb;
+};
+
+} // namespace fastlanes::ssb
+
+#endif // QUERY_4_HPP
diff --git a/fastlanes/src/include/query/query_41.hpp b/fastlanes/src/include/query/query_41.hpp
new file mode 100644
index 0000000..1bd3ed3
--- /dev/null
+++ b/fastlanes/src/include/query/query_41.hpp
@@ -0,0 +1,50 @@
+#ifndef QUERY_41_HPP
+#define QUERY_41_HPP
+
+#include "query/query_4.hpp"
+
+namespace fastlanes::ssb {
+
+inline SSBQuery4ResultTable ssb_q41_10_result_table { // each row is {col_0, col_1, col_2} of SSBQuery4ResultRow
+    {1992, 1, 103719745491},  //
+    {1993, 1, 104804149905},  //
+    {1994, 1, 102680809322},  //
+    {1995, 1, 104521470391},  //
+    {1996, 1, 105409529511},  //
+    {1997, 1, 103520208117},  //
+    {1998, 1, 60245313373},   //
+    {1992, 2, 106246161239},  //
+    {1993, 2, 106198050501},  //
+    {1994, 2, 106093079488},  //
+    {1995, 2, 107568611750},  //
+    {1996, 2, 106880639017},  //
+    {1997, 2, 106690124662},  //
+    {1998, 2, 61912349455},   //
+    {1992, 3, 106647931375},  //
+    {1993, 3, 107048690889},  //
+    {1994, 3, 104514167652},  //
+    {1995, 3, 105315997395},  //
+    {1996, 3, 105586646448},  //
+    {1997, 3, 106924659923},  //
+    {1998, 3, 62738136949},   //
+    {1992, 17, 104134609838}, //
+    {1993, 17, 104651610426}, //
+    {1994, 17, 104257308810}, //
+    {1995, 17, 104390879969}, //
+    {1996, 17, 105890415529}, //
+    {1997, 17, 104161057567}, //
+    {1998, 17, 62706700969},  //
+    {1992, 24, 105245006839}, //
+    {1993, 24, 104166556157}, //
+    {1994, 24, 107595107297}, //
+    {1995, 24, 104996502880}, //
+    {1996, 24, 104859848521}, //
+    {1997, 24, 105030361725}, //
+    {1998, 24, 62169336083},  //
+};
+
+inline SSBQuery4 ssb_q41_10 {ssb_q41_10_result_table, ssb_10}; // reuslt refers to ssb_q41_10_result_table, ssb refers to ssb_10
+
+} // namespace fastlanes::ssb
+
+#endif // QUERY_41_HPP
diff --git a/fastlanes/src/include/ssb_utils.h b/fastlanes/src/include/ssb_utils.h
new file mode 100644
index 0000000..4ebad99
--- /dev/null
+++ b/fastlanes/src/include/ssb_utils.h
@@ -0,0 +1,248 @@
+#pragma once
+
+#include <common.cuh>
+#include <fstream>
+#include <memory>
+#include <span>
+#include <ssb_utils.h>
+#include <string>
+
+class Dir { // plain record with public members: a numeric id plus a file path string
+public:
+	idx_t       id; // idx_t is declared elsewhere (not in this header)
+	std::string file_path;
+};
+
+using namespace std;
+
+#define STATS(COL) /* Scans h_lo_<COL>[0 .. hard_coded.n_tup_line_order), passes the min and max to FLS_SHOW, then shows RANGE_BIT(max - min); h_lo_<COL> and hard_coded must be visible where the macro is expanded. */                                                                                                     \
+	{                                                                                                                  \
+		int32_t lo_##COL##_min = h_lo_##COL[0];                                                                        \
+		int32_t lo_##COL##_max = h_lo_##COL[0];                                                                        \
+		for (size_t i {0}; i < hard_coded.n_tup_line_order; ++i) {                                                     \
+			lo_##COL##_min = std::min(lo_##COL##_min, h_lo_##COL[i]);                                                  \
+			lo_##COL##_max = std::max(lo_##COL##_max, h_lo_##COL[i]);                                                  \
+		}                                                                                                              \
+		FLS_SHOW(lo_##COL##_min)                                                                                       \
+		FLS_SHOW(lo_##COL##_max)                                                                                       \
+		uint16_t x = RANGE_BIT(lo_##COL##_max - lo_##COL##_min);                                                       \
+		FLS_SHOW(x)                                                                                                    \
+	}
+
+namespace fastlanes::ssb {
+
+#define SF 10
+
+#if SF == 10
+class SSB { // variant compiled when SF == 10: per-dataset fields plus compile-time min / max / bit-width constants for the lo_* columns
+public:
+	const uint64_t n_tup_line_order;
+	const string   name;
+	const Dir      dir;
+
+	static constexpr int32_t lo_orderdate_min = 19920101;
+	static constexpr int32_t lo_orderdate_max = 19980802;
+	static constexpr uint8_t lo_orderdate_bw  = 16; // max - min = 60701, which needs 16 bits
+
+	static constexpr int32_t lo_extendedprice_min = 90100;
+	static constexpr int32_t lo_extendedprice_max = 10494950;
+	static constexpr uint8_t lo_extendedprice_bw  = 24;
+
+	static constexpr int32_t lo_quantity_min = 1;
+	static constexpr int32_t lo_quantity_max = 50;
+	static constexpr uint8_t lo_quantity_bw  = 6;
+
+	static constexpr int32_t lo_discount_min = 1;
+	static constexpr int32_t lo_discount_max = 10;
+	static constexpr uint8_t lo_discount_bw  = 4;
+
+	static constexpr int32_t lo_partkey_min = 1;
+	static constexpr int32_t lo_partkey_max = 200000; // NOTE(review): max - min fits in 18 bits, yet lo_partkey_bw below is 20 — confirm this max is right for SF 10
+	static constexpr uint8_t lo_partkey_bw  = 20;
+
+	static constexpr int32_t lo_suppkey_min       = 1;
+	static constexpr int32_t lo_suppkey_max       = 2000; // NOTE(review): max - min fits in 11 bits, yet lo_real_suppkey_bw below is 15 — confirm this max is right for SF 10
+	static constexpr uint8_t lo_real_suppkey_bw   = 15;
+	static constexpr uint8_t lo_chosen_suppkey_bw = 16; // presumably the width actually used for packing (>= the "real" width) — confirm against callers
+
+	static constexpr int32_t lo_revenue_min = 81360;
+	static constexpr int32_t lo_revenue_max = 10474950;
+	static constexpr uint8_t lo_revenue_bw  = 24;
+
+	static constexpr int32_t lo_custkey_min       = 1;
+	static constexpr int32_t lo_custkey_max       = 299999;
+	static constexpr uint8_t lo_real_custkey_bw   = 19;
+	static constexpr uint8_t lo_chosen_custkey_bw = 20;
+
+	static constexpr int32_t lo_supplycost_min       = 54060;
+	static constexpr int32_t lo_supplycost_max       = 125939;
+	static constexpr uint8_t lo_supplycost_bw        = 17;
+	static constexpr uint8_t lo_chosen_supplycost_bw = 20;
+
+	int32_t n_vec; // last aggregate field; the only non-const data member
+};
+
+#elif SF == 1
+class SSB { // variant compiled when SF == 1; SF is defined as 10 just above, so this branch is currently inactive
+public:
+	const uint64_t n_tup_line_order;
+	const string   name;
+	const Dir      dir;
+
+	static constexpr int32_t lo_orderdate_min = 19920101;
+	static constexpr int32_t lo_orderdate_max = 19980802;
+	static constexpr uint8_t lo_orderdate_bw  = 16;
+
+	static constexpr int32_t lo_extendedprice_min = 90100;
+	static constexpr int32_t lo_extendedprice_max = 10494950;
+	static constexpr uint8_t lo_extendedprice_bw  = 24;
+
+	static constexpr int32_t lo_quantity_min = 1;
+	static constexpr int32_t lo_quantity_max = 50;
+	static constexpr uint8_t lo_quantity_bw  = 6;
+
+	static constexpr int32_t lo_discount_min = 1;
+	static constexpr int32_t lo_discount_max = 10;
+	static constexpr uint8_t lo_discount_bw  = 4;
+
+	static constexpr int32_t lo_partkey_min = 1;
+	static constexpr int32_t lo_partkey_max = 200000;
+	static constexpr uint8_t lo_partkey_bw  = 18;
+
+	static constexpr int32_t lo_suppkey_min = 1;
+	static constexpr int32_t lo_suppkey_max = 2000;
+	static constexpr uint8_t lo_suppkey_bw  = 11; // NOTE(review): the SF == 10 variant has lo_real_suppkey_bw / lo_chosen_suppkey_bw instead, so code naming those will not compile against this variant
+
+	static constexpr int32_t lo_revenue_min = 81360;
+	static constexpr int32_t lo_revenue_max = 10474950;
+	static constexpr uint8_t lo_revenue_bw  = 24;
+
+	static constexpr int32_t lo_custkey_min = 1;
+	static constexpr int32_t lo_custkey_max = 29999;
+	static constexpr uint8_t lo_custkey_bw  = 15; // the SF == 10 variant splits this into lo_real_custkey_bw / lo_chosen_custkey_bw
+
+	static constexpr int32_t lo_supplycost_min = 54060;
+	static constexpr int32_t lo_supplycost_max = 125939;
+	static constexpr uint8_t lo_supplycost_bw  = 17; // the SF == 10 variant additionally declares lo_chosen_supplycost_bw
+
+	int32_t n_vec; // last aggregate field; the only non-const data member
+};
+#endif
+
+namespace sample_data::ssb_0_1 { // locations of the sf_0_1 sample files, all built by string concatenation at static-initialisation time
+inline std::string relative_path = std::string("/sample-data/ssb/sf_") + std::string("0_1") + std::string("/");
+inline std::string path          = CMAKE_SOURCE_DIR + relative_path; // CMAKE_SOURCE_DIR is not defined in this header — presumably supplied by the build; confirm
+inline std::string table_path    = path + "tables/";
+inline std::string result_path   = path + "result/";
+inline std::string binary_path   = path + "binary/";
+inline Dir         customer_dir {0, table_path + "customer/customer.tbl"}; // Dir is {id, file_path}; ids 0..3 number the four tables
+inline Dir         lineorder_dir {1, table_path + "lineorder/lineorder.tbl"};
+inline Dir         part_dir {2, table_path + "part/part.tbl"};
+inline Dir         supplier_dir {3, table_path + "supplier/supplier.tbl"};
+
+} // namespace sample_data::ssb_0_1
+
+namespace sample_data::ssb_1 { // locations of the s1 data files; unlike ssb_0_1, the .tbl files sit directly under path
+inline std::string relative_path = std::string("/gpu/data/ssb/data/s1") + std::string("/");
+inline std::string path          = CMAKE_SOURCE_DIR + relative_path; // CMAKE_SOURCE_DIR is not defined in this header — presumably supplied by the build; confirm
+inline std::string table_path    = path;
+inline std::string result_path   = path + "result/";
+inline std::string binary_path   = path + "binary/";
+inline Dir         customer_dir {0, table_path + "customer.tbl"}; // Dir is {id, file_path}; ids 0..3 number the four tables
+inline Dir         lineorder_dir {1, table_path + "lineorder.tbl"};
+inline Dir         part_dir {2, table_path + "part.tbl"};
+inline Dir         supplier_dir {3, table_path + "supplier.tbl"};
+} // namespace sample_data::ssb_1
+
+namespace sample_data::ssb_10 { // locations of the s10 data files; the .tbl files sit directly under path
+inline std::string relative_path = std::string("/gpu/data/ssb/data/s10") + std::string("/");
+inline std::string path          = CMAKE_SOURCE_DIR + relative_path; // CMAKE_SOURCE_DIR is not defined in this header — presumably supplied by the build; confirm
+inline std::string table_path    = path;
+inline std::string result_path   = path + "result/";
+inline std::string binary_path   = path + "binary/";
+inline Dir         customer_dir {0, table_path + "customer.tbl"}; // Dir is {id, file_path}; ids 0..3 number the four tables
+inline Dir         lineorder_dir {1, table_path + "lineorder.tbl"};
+inline Dir         part_dir {2, table_path + "part.tbl"};
+inline Dir         supplier_dir {3, table_path + "supplier.tbl"};
+} // namespace sample_data::ssb_10
+
+inline SSB ssb_0_1 {600597, "SF_0_1", sample_data::ssb_0_1::lineorder_dir, 587}; // fields in declaration order: n_tup_line_order, name, dir, n_vec; 587 == ceil(600597 / 1024)
+inline SSB ssb_1 {6001171, "SF_1", sample_data::ssb_1::lineorder_dir, 5861}; // 5861 == ceil(6001171 / 1024); NOTE(review): built from whichever SSB variant SF selects (currently SF == 10)
+inline SSB ssb_10 {59986214, "SF_10", sample_data::ssb_10::lineorder_dir, 58581}; // 58581 == ceil(59986214 / 1024)
+
+struct SSBQuery1 { // pairs a single 64-bit result value with the SSB descriptor it belongs to
+	const uint64_t result;
+	const SSB&     ssb; // held by reference; the referenced SSB object must outlive this struct
+};
+
+inline SSBQuery1 ssb_q11_0_1 {41307262627, ssb_0_1}; // {result, ssb}
+inline SSBQuery1 ssb_q11_10 {4468236714181, ssb_10};
+
+inline SSBQuery1 ssb_q11_1 {446268068091, ssb_1};
+inline SSBQuery1 ssb_q12_1 {98314553869, ssb_1}; // q12 / q13 entries exist only for ssb_1 in this header
+inline SSBQuery1 ssb_q13_1 {24994512533, ssb_1};
+
+} // namespace fastlanes::ssb
+
+template <int SIZE>
+int32_t find_base(const int32_t in[]) { // returns the smallest of the first SIZE elements of in
+	const int32_t* const smallest = std::min_element(in, in + SIZE); // for SIZE == 0 this is in itself and the read below is out of range
+	return *smallest;
+}
+
+template <int SIZE, typename T>
+T find_max(T in[]) { // returns a copy of the largest of the first SIZE elements of in
+	T* const largest = std::max_element(in, in + SIZE); // for SIZE == 0 this is in itself and the read below is out of range
+	return *largest;
+}
+
+template <int SIZE>
+void subtract_base(int32_t in[], const int32_t base) { // in place: in[k] -= base for the first SIZE elements; throws std::runtime_error on an element below base
+	for (size_t idx = 0; idx < SIZE; idx++) {
+		if (!(in[idx] >= base)) { throw std::runtime_error("base is the minimum!"); } // elements before idx have already been rewritten at this point
+		in[idx] -= base;
+	}
+}
+
+template <int SIZE>
+void set_zero_after(int32_t in[], int32_t cur_idx) { // zeroes in[cur_idx .. SIZE)
+	// Nothing is written when cur_idx >= SIZE; the element at cur_idx itself is included.
+	int32_t pos {cur_idx};
+	while (pos < SIZE) { in[pos] = 0; ++pos; }
+}
+
+inline double average(const int32_t* arr, const n_t n) { // arithmetic mean of arr[0 .. n), computed in double from an exact integer sum
+	int64_t sum {0}; // 64-bit accumulator: a plain int overflows (undefined behaviour) once the sum of int32_t values passes INT_MAX
+	for (size_t i {0}; i < n; ++i) {
+		sum += arr[i];
+	}
+	return static_cast<double>(sum) / n; // no guard for n == 0: the caller must pass a non-empty range
+}
+
+template <int SIZE>
+int32_t find_bw(int32_t in[]) {
+	// Bit width needed for the first SIZE values of in, rounded up to a multiple of 4.
+	// The values are read through a uint32_t view, so a negative int32_t counts as a
+	// large unsigned number and yields 32.
+	const auto largest = find_max<SIZE, uint32_t>(reinterpret_cast<uint32_t*>(in));
+	const auto n_bits  = std::bit_width(largest);
+
+	if (n_bits == 0) { return 0; } // every value is zero
+
+	// Walk the candidate widths 4, 8, ..., 32 and return the first one that fits.
+	for (int32_t width = 4; width <= 32; width += 4) {
+		if (n_bits <= width) { return width; }
+	}
+
+	// Not reached for 32-bit values: std::bit_width of a uint32_t is at most 32.
+	return n_bits;
+}
+
+template <typename T>
+bool is_sorted(const T* arr, int n) { // true when arr[0 .. n) is in non-decreasing order; fewer than two elements always counts as sorted
+	const size_t count = n > 0 ? static_cast<size_t>(n) : 0; // clamp first: the old bound "n - 1" became a huge size_t for n == 0 and walked off the array
+	for (size_t i {0}; i + 1 < count; ++i) {
+		if (arr[i] > arr[i + 1]) { return false; }
+	}
+	return true;
+}
\ No newline at end of file
diff --git a/fastlanes/src/include/util.cuh b/fastlanes/src/include/util.cuh
new file mode 100644
index 0000000..aa3c4cd
--- /dev/null
+++ b/fastlanes/src/include/util.cuh
@@ -0,0 +1,92 @@
+#ifndef FLS_GPU_UTIL_CUH
+#define FLS_GPU_UTIL_CUH
+
+namespace fasltanes::gpu{
+
+/* Copies len elements of T from host memory at src into a fresh cudaMalloc allocation and returns the device pointer, or nullptr when the allocation or the copy fails. On success nothing here frees the buffer: ownership passes to the caller. */
+template <typename T>
+T* loadColumnToGPU(T* src, int len) {
+	T* dest = nullptr;
+	if (cudaMalloc((void**)&dest, sizeof(T) * len) != cudaSuccess) { return nullptr; } // allocation failed, nothing to release
+	if (cudaMemcpy(dest, src, sizeof(T) * len, cudaMemcpyHostToDevice) != cudaSuccess) { cudaFree(dest); return nullptr; } // do not leak, and do not hand back a buffer that was never filled
+	return dest;
+}
+/* Copies bsz bytes from host memory at src into a fresh cudaMalloc allocation and returns the device pointer, or nullptr when the allocation or the copy fails. On success nothing here frees the buffer: ownership passes to the caller. */
+inline void* load_arr(void* src, uint64_t bsz) {
+	void* dest = nullptr;
+	if (cudaMalloc((void**)&dest, bsz) != cudaSuccess) { return nullptr; } // allocation failed, nothing to release
+	if (cudaMemcpy(dest, src, bsz, cudaMemcpyHostToDevice) != cudaSuccess) { cudaFree(dest); return nullptr; } // do not leak, and do not hand back a buffer that was never filled
+	return dest;
+}
+
+template <typename T>
+inline T* load_arr(T* src, uint64_t bsz) { // typed overload: copies bsz BYTES (not elements) from host src to a new device buffer; returns nullptr on any CUDA failure
+	T* dest = nullptr;
+	if (cudaMalloc((void**)&dest, bsz) != cudaSuccess) { return nullptr; } // allocation failed, nothing to release
+	if (cudaMemcpy(dest, src, bsz, cudaMemcpyHostToDevice) != cudaSuccess) { cudaFree(dest); return nullptr; } // do not leak, and do not hand back a buffer that was never filled
+	return dest; // on success the caller owns the buffer; nothing here frees it
+}
+
+#define CUDA_SAFE_CALL(call) /* Evaluates call once; on any result other than cudaSuccess prints file, line and cudaGetErrorString to stderr and exits with EXIT_FAILURE. */ \
+	do {                                                                                                               \
+		cudaError_t cuda_safe_call_err_ = (call); /* parenthesised argument; long name so an `err` used inside call is not captured */ \
+		if (cudaSuccess != cuda_safe_call_err_) {                                                                      \
+			fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", __FILE__, __LINE__, cudaGetErrorString(cuda_safe_call_err_)); \
+			exit(EXIT_FAILURE);                                                                                        \
+		}                                                                                                              \
+	} while (0)
+
+#define SETUP_TIMING() /* Declares cudaEvent_t variables named start and stop in the expanding scope and creates both events; the results of cudaEventCreate are not checked. */                                                                                                 \
+	cudaEvent_t start, stop;                                                                                           \
+	cudaEventCreate(&start);                                                                                           \
+	cudaEventCreate(&stop);
+
+#define PERF(f, t) /* Runs statement f between cudaEventRecord(start) and cudaEventRecord(stop), waits for stop, then stores cudaEventElapsedTime in t; expects events named start and stop in scope, such as those SETUP_TIMING declares. */                                                                                                     \
+	{                                                                                                                  \
+		cudaEventRecord(start, 0);                                                                                     \
+		f;                                                                                                             \
+		cudaEventRecord(stop, 0);                                                                                      \
+		cudaEventSynchronize(stop);                                                                                    \
+		cudaEventElapsedTime(&t, start, stop);                                                                         \
+	}
+
+/*
+  Example: the NVIDIA Tesla M2050 uses DDR RAM with a memory clock rate of 1,546 MHz and a 384-bit wide memory interface. Using these data items,
+  the peak theoretical memory bandwidth of the NVIDIA Tesla M2050 is 148 GB/sec, as computed in the following.
+
+    BWTheoretical = 1546 * 10^6 * (384/8) * 2 / 10^9 = 148 GB/s
+ */
+inline double BWTheoretical(int memoryClockRate, int memoryBusWidth) { // peak bandwidth in GB/s; the clock is scaled by 1e3 here, so it is presumably expected in kHz — TODO confirm
+	return (2 * ((static_cast<double>(memoryBusWidth) / 8) * (memoryClockRate * 1e3))) / 1e9; // bytes per transfer * clock * 2 transfers per clock, scaled to 1e9 bytes
+}
+
+/*
+
+ BWEffective = (RB + WB) / (t * 10^9)
+
+ Here, BWEffective is the effective bandwidth in units of GB/s, RB is the number of bytes read per kernel,
+ WB is the number of bytes written per kernel, and t is the elapsed time given in seconds.
+
+ */
+inline double BWEffective(uint64_t read_bsz, uint64_t write_bsz, double milliseconds) { // effective bandwidth in GB/s; milliseconds is a double so fractional timings (cudaEventElapsedTime reports a float) are not truncated, possibly to 0
+	return static_cast<double>(read_bsz + write_bsz) / milliseconds / 1e6; // bytes per millisecond / 1e6 == bytes per second / 1e9
+}
+
+template <typename T>
+class DevicePtr { // thin wrapper holding one raw T*; it never allocates, frees or dereferences the pointer
+public:
+	T* p;
+
+	// explicit, so a raw T* is never converted to DevicePtr<T> implicitly
+	__device__ __host__ __inline__ explicit DevicePtr(T* raw)
+	    : p(raw) {}
+};
+
+template <typename T>
+__host__ inline auto make_device_ptr(T* p) { // host-side factory: deduces T from the argument and wraps p in a DevicePtr<T>
+	return DevicePtr<T> {p};
+}
+
+}
+
+#endif //FLS_GPU_UTIL_CUH
diff --git a/fastlanes/src/pack.cpp b/fastlanes/src/pack.cpp
new file mode 100644
index 0000000..11472c0
--- /dev/null
+++ b/fastlanes/src/pack.cpp
@@ -0,0 +1,29910 @@
+// generated!
+// NOLINTBEGIN
+#include "fls_gen/pack/pack.hpp"
+namespace generated { namespace pack::fallback { namespace scalar {
+void static pack_0bit_8ow(const uint8_t* __restrict in, uint8_t* __restrict out) {} // 0-bit case: empty body, reads and writes nothing
+void static pack_1bit_8ow(const uint8_t* __restrict in, uint8_t* __restrict out) { // for each i in [0, 128): bit k of out[i] is the low bit of in[128 * k + i], k = 0..7
+	uint8_t tmp = 0U;
+	uint8_t src;
+	for (int i = 0; i < 128; i++) {
+		src = *(in + 128 * 0 + i);
+		src = src & ((1ULL << 1) - 1); // keep only the low 1 bit
+		tmp = src;
+		src = *(in + 128 * 1 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 1U;
+		src = *(in + 128 * 2 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 2U;
+		src = *(in + 128 * 3 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 3U;
+		src = *(in + 128 * 4 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 4U;
+		src = *(in + 128 * 5 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 5U;
+		src = *(in + 128 * 6 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 6U;
+		src = *(in + 128 * 7 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 7U;
+		*(out + i) = tmp;
+		out -= 0; // no-op: out is never advanced in this function
+	}
+}
+void static pack_2bit_8ow(const uint8_t* __restrict in, uint8_t* __restrict out) { // for each i in [0, 128): the low 2 bits of in[128 * k + i], k = 0..7, are packed into out[i] and out[128 + i]
+	uint8_t tmp = 0U;
+	uint8_t src;
+	for (int i = 0; i < 128; i++) {
+		src = *(in + 128 * 0 + i);
+		src = src & ((1ULL << 2) - 1); // keep only the low 2 bits
+		tmp = src;
+		src = *(in + 128 * 1 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 2U;
+		src = *(in + 128 * 2 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 4U;
+		src = *(in + 128 * 3 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 6U;
+		*(out + i) = tmp;
+		out += 128; // move to the next group of 128 output bytes
+		src = *(in + 128 * 4 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp = src;
+		src = *(in + 128 * 5 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 2U;
+		src = *(in + 128 * 6 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 4U;
+		src = *(in + 128 * 7 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 6U;
+		*(out + i) = tmp;
+		out -= 128; // undo the advance so the next i starts from the original out
+	}
+}
+void static pack_3bit_8ow(const uint8_t* __restrict in, uint8_t* __restrict out) { // for each i in [0, 128): the low 3 bits of in[128 * k + i], k = 0..7, are packed into out[i], out[128 + i], out[256 + i]
+	uint8_t tmp = 0U;
+	uint8_t src;
+	for (int i = 0; i < 128; i++) {
+		src = *(in + 128 * 0 + i);
+		src = src & ((1ULL << 3) - 1); // keep only the low 3 bits
+		tmp = src;
+		src = *(in + 128 * 1 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 3U;
+		src = *(in + 128 * 2 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 6U; // only 2 of the 3 bits fit; the top bit is dropped by the uint8_t store and re-read below
+		*(out + i) = tmp;
+		out += 128;
+		src = *(in + 128 * 2 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp = src >> 2U; // remaining top bit of input 2 starts the next byte
+		src = *(in + 128 * 3 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 1U;
+		src = *(in + 128 * 4 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 4U;
+		src = *(in + 128 * 5 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 7U;
+		*(out + i) = tmp;
+		out += 128;
+		src = *(in + 128 * 5 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp = src >> 1U;
+		src = *(in + 128 * 6 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 2U;
+		src = *(in + 128 * 7 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 5U;
+		*(out + i) = tmp;
+		out -= 256; // undo both advances so the next i starts from the original out
+	}
+}
+void static pack_4bit_8ow(const uint8_t* __restrict in, uint8_t* __restrict out) { // for each i in [0, 128): the low 4 bits of in[128 * k + i], k = 0..7, are packed two per byte into out[128 * j + i], j = 0..3
+	uint8_t tmp = 0U;
+	uint8_t src;
+	for (int i = 0; i < 128; i++) {
+		src = *(in + 128 * 0 + i);
+		src = src & ((1ULL << 4) - 1); // keep only the low 4 bits
+		tmp = src;
+		src = *(in + 128 * 1 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 4U;
+		*(out + i) = tmp;
+		out += 128;
+		src = *(in + 128 * 2 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp = src;
+		src = *(in + 128 * 3 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 4U;
+		*(out + i) = tmp;
+		out += 128;
+		src = *(in + 128 * 4 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp = src;
+		src = *(in + 128 * 5 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 4U;
+		*(out + i) = tmp;
+		out += 128;
+		src = *(in + 128 * 6 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp = src;
+		src = *(in + 128 * 7 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 4U;
+		*(out + i) = tmp;
+		out -= 384; // undo the three advances so the next i starts from the original out
+	}
+}
+void static pack_5bit_8ow(const uint8_t* __restrict in, uint8_t* __restrict out) { // for each i in [0, 128): the low 5 bits of in[128 * k + i], k = 0..7, are packed into out[128 * j + i], j = 0..4
+	uint8_t tmp = 0U;
+	uint8_t src;
+	for (int i = 0; i < 128; i++) {
+		src = *(in + 128 * 0 + i);
+		src = src & ((1ULL << 5) - 1); // keep only the low 5 bits
+		tmp = src;
+		src = *(in + 128 * 1 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 5U; // only 3 of the 5 bits fit; the rest is re-read below
+		*(out + i) = tmp;
+		out += 128;
+		src = *(in + 128 * 1 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp = src >> 3U; // remaining 2 bits of input 1 start the next byte
+		src = *(in + 128 * 2 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 2U;
+		src = *(in + 128 * 3 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 7U;
+		*(out + i) = tmp;
+		out += 128;
+		src = *(in + 128 * 3 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp = src >> 1U;
+		src = *(in + 128 * 4 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 4U;
+		*(out + i) = tmp;
+		out += 128;
+		src = *(in + 128 * 4 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp = src >> 4U;
+		src = *(in + 128 * 5 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 1U;
+		src = *(in + 128 * 6 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 6U;
+		*(out + i) = tmp;
+		out += 128;
+		src = *(in + 128 * 6 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp = src >> 2U;
+		src = *(in + 128 * 7 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 3U;
+		*(out + i) = tmp;
+		out -= 512; // undo the four advances so the next i starts from the original out
+	}
+}
+void static pack_6bit_8ow(const uint8_t* __restrict in, uint8_t* __restrict out) { // for each i in [0, 128): the low 6 bits of in[128 * k + i], k = 0..7, are packed into out[128 * j + i], j = 0..5
+	uint8_t tmp = 0U;
+	uint8_t src;
+	for (int i = 0; i < 128; i++) {
+		src = *(in + 128 * 0 + i);
+		src = src & ((1ULL << 6) - 1); // keep only the low 6 bits
+		tmp = src;
+		src = *(in + 128 * 1 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 6U; // only 2 of the 6 bits fit; the rest is re-read below
+		*(out + i) = tmp;
+		out += 128;
+		src = *(in + 128 * 1 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp = src >> 2U; // remaining 4 bits of input 1 start the next byte
+		src = *(in + 128 * 2 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 4U;
+		*(out + i) = tmp;
+		out += 128;
+		src = *(in + 128 * 2 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp = src >> 4U;
+		src = *(in + 128 * 3 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 2U;
+		*(out + i) = tmp;
+		out += 128;
+		src = *(in + 128 * 4 + i); // inputs 0..3 filled exactly 3 bytes, so input 4 starts on a byte boundary
+		src = src & ((1ULL << 6) - 1);
+		tmp = src;
+		src = *(in + 128 * 5 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 6U;
+		*(out + i) = tmp;
+		out += 128;
+		src = *(in + 128 * 5 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp = src >> 2U;
+		src = *(in + 128 * 6 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 4U;
+		*(out + i) = tmp;
+		out += 128;
+		src = *(in + 128 * 6 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp = src >> 4U;
+		src = *(in + 128 * 7 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 2U;
+		*(out + i) = tmp;
+		out -= 640; // undo the five advances so the next i starts from the original out
+	}
+}
+void static pack_7bit_8ow(const uint8_t* __restrict in, uint8_t* __restrict out) { // for each i in [0, 128): the low 7 bits of in[128 * k + i], k = 0..7, are packed into out[128 * j + i], j = 0..6
+	uint8_t tmp = 0U;
+	uint8_t src;
+	for (int i = 0; i < 128; i++) {
+		src = *(in + 128 * 0 + i);
+		src = src & ((1ULL << 7) - 1); // keep only the low 7 bits
+		tmp = src;
+		src = *(in + 128 * 1 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 7U; // only 1 of the 7 bits fits; the rest is re-read below
+		*(out + i) = tmp;
+		out += 128;
+		src = *(in + 128 * 1 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp = src >> 1U; // remaining 6 bits of input 1 start the next byte
+		src = *(in + 128 * 2 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 6U;
+		*(out + i) = tmp;
+		out += 128;
+		src = *(in + 128 * 2 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp = src >> 2U;
+		src = *(in + 128 * 3 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 5U;
+		*(out + i) = tmp;
+		out += 128;
+		src = *(in + 128 * 3 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp = src >> 3U;
+		src = *(in + 128 * 4 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 4U;
+		*(out + i) = tmp;
+		out += 128;
+		src = *(in + 128 * 4 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp = src >> 4U;
+		src = *(in + 128 * 5 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 3U;
+		*(out + i) = tmp;
+		out += 128;
+		src = *(in + 128 * 5 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp = src >> 5U;
+		src = *(in + 128 * 6 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 2U;
+		*(out + i) = tmp;
+		out += 128;
+		src = *(in + 128 * 6 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp = src >> 6U;
+		src = *(in + 128 * 7 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 1U;
+		*(out + i) = tmp;
+		out -= 768; // undo the six advances so the next i starts from the original out
+	}
+}
+void static pack_8bit_8ow(const uint8_t* __restrict in, uint8_t* __restrict out) { // 8-bit case: no masking or shifting, out[128 * k + i] = in[128 * k + i] for k = 0..7 and i in [0, 128)
+	uint8_t tmp = 0U;
+	uint8_t src;
+	for (int i = 0; i < 128; i++) {
+		src        = *(in + 128 * 0 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 128; // move to the next group of 128 output bytes
+		src        = *(in + 128 * 1 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 128;
+		src        = *(in + 128 * 2 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 128;
+		src        = *(in + 128 * 3 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 128;
+		src        = *(in + 128 * 4 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 128;
+		src        = *(in + 128 * 5 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 128;
+		src        = *(in + 128 * 6 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 128;
+		src        = *(in + 128 * 7 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out -= 896; // undo the seven advances so the next i starts from the original out
+	}
+}
+void static pack_0bit_16ow(const uint16_t* __restrict in, uint16_t* __restrict out) {} // Zero-bit width: the body is intentionally empty, so neither in nor out is accessed.
+void static pack_1bit_16ow(const uint16_t* __restrict in, uint16_t* __restrict out) { // Packs bit 0 of 16 rows of 64 uint16_t values (row r at in[64 * r + i]) into 1 row of 64 words at out; row r lands in bit r.
+	uint16_t tmp = 0U; // output word currently being assembled
+	uint16_t src; // current input value, masked to 1 bit before use
+	for (int i = 0; i < 64; i++) { // each position i is packed independently of the others
+		src = *(in + 64 * 0 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp = src;
+		src = *(in + 64 * 1 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 1U;
+		src = *(in + 64 * 2 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 2U;
+		src = *(in + 64 * 3 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 3U;
+		src = *(in + 64 * 4 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 4U;
+		src = *(in + 64 * 5 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 5U;
+		src = *(in + 64 * 6 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 6U;
+		src = *(in + 64 * 7 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 7U;
+		src = *(in + 64 * 8 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 8U;
+		src = *(in + 64 * 9 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 9U;
+		src = *(in + 64 * 10 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 10U;
+		src = *(in + 64 * 11 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 11U;
+		src = *(in + 64 * 12 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 12U;
+		src = *(in + 64 * 13 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 13U;
+		src = *(in + 64 * 14 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 14U;
+		src = *(in + 64 * 15 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 15U;
+		*(out + i) = tmp; // the only store per position: all 16 bits fit in one word
+		out -= 0; // no-op: out was never advanced in this function
+	}
+}
+void static pack_2bit_16ow(const uint16_t* __restrict in, uint16_t* __restrict out) { // Packs the low 2 bits of 16 rows of 64 uint16_t values (row r at in[64 * r + i]) into 2 rows of 64 words at out.
+	uint16_t tmp = 0U; // output word currently being assembled
+	uint16_t src; // current input value, masked to 2 bits before use
+	for (int i = 0; i < 64; i++) { // each position i is packed independently of the others
+		src = *(in + 64 * 0 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp = src;
+		src = *(in + 64 * 1 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 2U;
+		src = *(in + 64 * 2 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 4U;
+		src = *(in + 64 * 3 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 6U;
+		src = *(in + 64 * 4 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 8U;
+		src = *(in + 64 * 5 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 10U;
+		src = *(in + 64 * 6 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 12U;
+		src = *(in + 64 * 7 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 14U;
+		*(out + i) = tmp; // rows 0..7 fill the first word exactly
+		out += 64; // step to the next output row
+		src = *(in + 64 * 8 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp = src;
+		src = *(in + 64 * 9 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 2U;
+		src = *(in + 64 * 10 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 4U;
+		src = *(in + 64 * 11 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 6U;
+		src = *(in + 64 * 12 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 8U;
+		src = *(in + 64 * 13 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 10U;
+		src = *(in + 64 * 14 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 12U;
+		src = *(in + 64 * 15 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 14U;
+		*(out + i) = tmp; // rows 8..15 fill the second word exactly
+		out -= 64; // undo the single row step before the next i
+	}
+}
+void static pack_3bit_16ow(const uint16_t* __restrict in, uint16_t* __restrict out) { // Packs the low 3 bits of 16 rows of 64 uint16_t values (row r at in[64 * r + i]) into 3 rows of 64 words at out.
+	uint16_t tmp = 0U; // output word currently being assembled
+	uint16_t src; // current input value, masked to 3 bits before use
+	for (int i = 0; i < 64; i++) { // each position i is packed independently of the others
+		src = *(in + 64 * 0 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp = src;
+		src = *(in + 64 * 1 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 3U;
+		src = *(in + 64 * 2 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 6U;
+		src = *(in + 64 * 3 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 9U;
+		src = *(in + 64 * 4 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 12U;
+		src = *(in + 64 * 5 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 15U; // only the low 1 bit of row 5 survives the narrowing back to uint16_t
+		*(out + i) = tmp;
+		out += 64; // step to the next output row
+		src = *(in + 64 * 5 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp = src >> 1U; // rest of row 5 (low 1 bit already stored)
+		src = *(in + 64 * 6 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 2U;
+		src = *(in + 64 * 7 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 5U;
+		src = *(in + 64 * 8 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 8U;
+		src = *(in + 64 * 9 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 11U;
+		src = *(in + 64 * 10 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 14U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 10 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp = src >> 2U; // rest of row 10 (low 2 bits already stored)
+		src = *(in + 64 * 11 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 1U;
+		src = *(in + 64 * 12 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 4U;
+		src = *(in + 64 * 13 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 7U;
+		src = *(in + 64 * 14 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 10U;
+		src = *(in + 64 * 15 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 13U;
+		*(out + i) = tmp;
+		out -= 128; // undo the 2 row steps before the next i
+	}
+}
+void static pack_4bit_16ow(const uint16_t* __restrict in, uint16_t* __restrict out) { // Packs the low 4 bits of 16 rows of 64 uint16_t values (row r at in[64 * r + i]) into 4 rows of 64 words at out.
+	uint16_t tmp = 0U; // output word currently being assembled
+	uint16_t src; // current input value, masked to 4 bits before use
+	for (int i = 0; i < 64; i++) { // each position i is packed independently of the others
+		src = *(in + 64 * 0 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp = src;
+		src = *(in + 64 * 1 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 4U;
+		src = *(in + 64 * 2 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 8U;
+		src = *(in + 64 * 3 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 12U;
+		*(out + i) = tmp; // four rows fill each word exactly, so no value straddles two words
+		out += 64; // step to the next output row
+		src = *(in + 64 * 4 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp = src;
+		src = *(in + 64 * 5 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 4U;
+		src = *(in + 64 * 6 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 8U;
+		src = *(in + 64 * 7 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 12U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 8 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp = src;
+		src = *(in + 64 * 9 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 4U;
+		src = *(in + 64 * 10 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 8U;
+		src = *(in + 64 * 11 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 12U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 12 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp = src;
+		src = *(in + 64 * 13 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 4U;
+		src = *(in + 64 * 14 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 8U;
+		src = *(in + 64 * 15 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 12U;
+		*(out + i) = tmp;
+		out -= 192; // undo the 3 row steps before the next i
+	}
+}
+void static pack_5bit_16ow(const uint16_t* __restrict in, uint16_t* __restrict out) { // Packs the low 5 bits of 16 rows of 64 uint16_t values (row r at in[64 * r + i]) into 5 rows of 64 words at out.
+	uint16_t tmp = 0U; // output word currently being assembled
+	uint16_t src; // current input value, masked to 5 bits before use
+	for (int i = 0; i < 64; i++) { // each position i is packed independently of the others
+		src = *(in + 64 * 0 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp = src;
+		src = *(in + 64 * 1 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 5U;
+		src = *(in + 64 * 2 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 10U;
+		src = *(in + 64 * 3 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 15U; // only the low 1 bit of row 3 survives the narrowing back to uint16_t
+		*(out + i) = tmp;
+		out += 64; // step to the next output row
+		src = *(in + 64 * 3 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp = src >> 1U; // rest of row 3 (low 1 bit already stored)
+		src = *(in + 64 * 4 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 4U;
+		src = *(in + 64 * 5 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 9U;
+		src = *(in + 64 * 6 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 14U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 6 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp = src >> 2U; // rest of row 6 (low 2 bits already stored)
+		src = *(in + 64 * 7 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 3U;
+		src = *(in + 64 * 8 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 8U;
+		src = *(in + 64 * 9 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 13U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 9 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp = src >> 3U; // rest of row 9 (low 3 bits already stored)
+		src = *(in + 64 * 10 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 2U;
+		src = *(in + 64 * 11 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 7U;
+		src = *(in + 64 * 12 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 12U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 12 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp = src >> 4U; // rest of row 12 (low 4 bits already stored)
+		src = *(in + 64 * 13 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 1U;
+		src = *(in + 64 * 14 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 6U;
+		src = *(in + 64 * 15 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 11U;
+		*(out + i) = tmp;
+		out -= 256; // undo the 4 row steps before the next i
+	}
+}
+void static pack_6bit_16ow(const uint16_t* __restrict in, uint16_t* __restrict out) { // Packs the low 6 bits of 16 rows of 64 uint16_t values (row r at in[64 * r + i]) into 6 rows of 64 words at out.
+	uint16_t tmp = 0U; // output word currently being assembled
+	uint16_t src; // current input value, masked to 6 bits before use
+	for (int i = 0; i < 64; i++) { // each position i is packed independently of the others
+		src = *(in + 64 * 0 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp = src;
+		src = *(in + 64 * 1 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 6U;
+		src = *(in + 64 * 2 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 12U; // only the low 4 bits of row 2 survive the narrowing back to uint16_t
+		*(out + i) = tmp;
+		out += 64; // step to the next output row
+		src = *(in + 64 * 2 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp = src >> 4U; // rest of row 2 (low 4 bits already stored)
+		src = *(in + 64 * 3 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 2U;
+		src = *(in + 64 * 4 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 8U;
+		src = *(in + 64 * 5 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 14U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 5 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp = src >> 2U; // rest of row 5 (low 2 bits already stored)
+		src = *(in + 64 * 6 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 4U;
+		src = *(in + 64 * 7 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 10U;
+		*(out + i) = tmp; // rows 0..7 end exactly on this word boundary
+		out += 64;
+		src = *(in + 64 * 8 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp = src; // rows 8..15 repeat the same bit layout as rows 0..7
+		src = *(in + 64 * 9 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 6U;
+		src = *(in + 64 * 10 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 12U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 10 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp = src >> 4U; // rest of row 10 (low 4 bits already stored)
+		src = *(in + 64 * 11 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 2U;
+		src = *(in + 64 * 12 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 8U;
+		src = *(in + 64 * 13 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 14U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 13 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp = src >> 2U; // rest of row 13 (low 2 bits already stored)
+		src = *(in + 64 * 14 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 4U;
+		src = *(in + 64 * 15 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 10U;
+		*(out + i) = tmp;
+		out -= 320; // undo the 5 row steps before the next i
+	}
+}
+void static pack_7bit_16ow(const uint16_t* __restrict in, uint16_t* __restrict out) { // Packs the low 7 bits of 16 rows of 64 uint16_t values (row r at in[64 * r + i]) into 7 rows of 64 words at out.
+	uint16_t tmp = 0U; // output word currently being assembled
+	uint16_t src; // current input value, masked to 7 bits before use
+	for (int i = 0; i < 64; i++) { // each position i is packed independently of the others
+		src = *(in + 64 * 0 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp = src;
+		src = *(in + 64 * 1 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 7U;
+		src = *(in + 64 * 2 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 14U; // only the low 2 bits of row 2 survive the narrowing back to uint16_t
+		*(out + i) = tmp;
+		out += 64; // step to the next output row
+		src = *(in + 64 * 2 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp = src >> 2U; // rest of row 2 (low 2 bits already stored)
+		src = *(in + 64 * 3 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 5U;
+		src = *(in + 64 * 4 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 12U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 4 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp = src >> 4U; // rest of row 4 (low 4 bits already stored)
+		src = *(in + 64 * 5 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 3U;
+		src = *(in + 64 * 6 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 10U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 6 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp = src >> 6U; // rest of row 6 (low 6 bits already stored)
+		src = *(in + 64 * 7 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 1U;
+		src = *(in + 64 * 8 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 8U;
+		src = *(in + 64 * 9 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 15U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 9 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp = src >> 1U; // rest of row 9 (low 1 bit already stored)
+		src = *(in + 64 * 10 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 6U;
+		src = *(in + 64 * 11 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 13U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 11 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp = src >> 3U; // rest of row 11 (low 3 bits already stored)
+		src = *(in + 64 * 12 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 4U;
+		src = *(in + 64 * 13 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 11U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 13 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp = src >> 5U; // rest of row 13 (low 5 bits already stored)
+		src = *(in + 64 * 14 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 2U;
+		src = *(in + 64 * 15 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 9U;
+		*(out + i) = tmp;
+		out -= 384; // undo the 6 row steps before the next i
+	}
+}
+void static pack_8bit_16ow(const uint16_t* __restrict in, uint16_t* __restrict out) { // Packs the low 8 bits of 16 rows of 64 uint16_t values (row r at in[64 * r + i]) into 8 rows of 64 words at out, two rows per word.
+	uint16_t tmp = 0U; // output word currently being assembled
+	uint16_t src; // current input value, masked to 8 bits before use
+	for (int i = 0; i < 64; i++) { // each position i is packed independently of the others
+		src = *(in + 64 * 0 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp = src; // even row goes in the low byte
+		src = *(in + 64 * 1 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 8U; // odd row goes in the high byte
+		*(out + i) = tmp;
+		out += 64; // step to the next output row
+		src = *(in + 64 * 2 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp = src;
+		src = *(in + 64 * 3 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 8U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 4 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp = src;
+		src = *(in + 64 * 5 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 8U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 6 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp = src;
+		src = *(in + 64 * 7 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 8U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 8 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp = src;
+		src = *(in + 64 * 9 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 8U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 10 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp = src;
+		src = *(in + 64 * 11 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 8U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 12 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp = src;
+		src = *(in + 64 * 13 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 8U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 14 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp = src;
+		src = *(in + 64 * 15 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 8U;
+		*(out + i) = tmp;
+		out -= 448; // undo the 7 row steps before the next i
+	}
+}
+void static pack_9bit_16ow(const uint16_t* __restrict in, uint16_t* __restrict out) { // Packs the low 9 bits of 16 rows of 64 uint16_t values (row r at in[64 * r + i]) into 9 rows of 64 words at out.
+	uint16_t tmp = 0U; // output word currently being assembled
+	uint16_t src; // current input value, masked to 9 bits before use
+	for (int i = 0; i < 64; i++) { // each position i is packed independently of the others
+		src = *(in + 64 * 0 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp = src;
+		src = *(in + 64 * 1 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 9U; // only the low 7 bits of row 1 survive the narrowing back to uint16_t
+		*(out + i) = tmp;
+		out += 64; // step to the next output row
+		src = *(in + 64 * 1 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp = src >> 7U; // rest of row 1 (low 7 bits already stored)
+		src = *(in + 64 * 2 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 2U;
+		src = *(in + 64 * 3 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 11U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 3 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp = src >> 5U; // rest of row 3 (low 5 bits already stored)
+		src = *(in + 64 * 4 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 4U;
+		src = *(in + 64 * 5 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 13U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 5 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp = src >> 3U; // rest of row 5 (low 3 bits already stored)
+		src = *(in + 64 * 6 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 6U;
+		src = *(in + 64 * 7 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 15U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 7 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp = src >> 1U; // rest of row 7 (low 1 bit already stored)
+		src = *(in + 64 * 8 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 8U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 8 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp = src >> 8U; // rest of row 8 (low 8 bits already stored)
+		src = *(in + 64 * 9 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 1U;
+		src = *(in + 64 * 10 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 10U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 10 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp = src >> 6U; // rest of row 10 (low 6 bits already stored)
+		src = *(in + 64 * 11 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 3U;
+		src = *(in + 64 * 12 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 12U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 12 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp = src >> 4U; // rest of row 12 (low 4 bits already stored)
+		src = *(in + 64 * 13 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 5U;
+		src = *(in + 64 * 14 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 14U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 14 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp = src >> 2U; // rest of row 14 (low 2 bits already stored)
+		src = *(in + 64 * 15 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 7U;
+		*(out + i) = tmp;
+		out -= 512; // undo the 8 row steps before the next i
+	}
+}
+void static pack_10bit_16ow(const uint16_t* __restrict in, uint16_t* __restrict out) { // Packs the low 10 bits of 16 rows of 64 uint16_t values (row r at in[64 * r + i]) into 10 rows of 64 words at out.
+	uint16_t tmp = 0U; // output word currently being assembled
+	uint16_t src; // current input value, masked to 10 bits before use
+	for (int i = 0; i < 64; i++) { // each position i is packed independently of the others
+		src = *(in + 64 * 0 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp = src;
+		src = *(in + 64 * 1 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 10U; // only the low 6 bits of row 1 survive the narrowing back to uint16_t
+		*(out + i) = tmp;
+		out += 64; // step to the next output row
+		src = *(in + 64 * 1 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp = src >> 6U; // rest of row 1 (low 6 bits already stored)
+		src = *(in + 64 * 2 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 4U;
+		src = *(in + 64 * 3 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 14U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 3 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp = src >> 2U; // rest of row 3 (low 2 bits already stored)
+		src = *(in + 64 * 4 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 8U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 4 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp = src >> 8U; // rest of row 4 (low 8 bits already stored)
+		src = *(in + 64 * 5 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 2U;
+		src = *(in + 64 * 6 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 12U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 6 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp = src >> 4U; // rest of row 6 (low 4 bits already stored)
+		src = *(in + 64 * 7 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 6U;
+		*(out + i) = tmp; // rows 0..7 end exactly on this word boundary
+		out += 64;
+		src = *(in + 64 * 8 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp = src; // rows 8..15 repeat the same bit layout as rows 0..7
+		src = *(in + 64 * 9 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 10U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 9 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp = src >> 6U; // rest of row 9 (low 6 bits already stored)
+		src = *(in + 64 * 10 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 4U;
+		src = *(in + 64 * 11 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 14U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 11 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp = src >> 2U; // rest of row 11 (low 2 bits already stored)
+		src = *(in + 64 * 12 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 8U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 12 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp = src >> 8U; // rest of row 12 (low 8 bits already stored)
+		src = *(in + 64 * 13 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 2U;
+		src = *(in + 64 * 14 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 12U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 14 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp = src >> 4U; // rest of row 14 (low 4 bits already stored)
+		src = *(in + 64 * 15 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 6U;
+		*(out + i) = tmp;
+		out -= 576; // undo the 9 row steps before the next i
+	}
+}
+void static pack_11bit_16ow(const uint16_t* __restrict in, uint16_t* __restrict out) { // Packs the low 11 bits of 16 rows of 64 uint16_t values (row r at in[64 * r + i]) into 11 rows of 64 words at out.
+	uint16_t tmp = 0U; // output word currently being assembled
+	uint16_t src; // current input value, masked to 11 bits before use
+	for (int i = 0; i < 64; i++) { // each position i is packed independently of the others
+		src = *(in + 64 * 0 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp = src;
+		src = *(in + 64 * 1 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 11U; // only the low 5 bits of row 1 survive the narrowing back to uint16_t
+		*(out + i) = tmp;
+		out += 64; // step to the next output row
+		src = *(in + 64 * 1 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp = src >> 5U; // rest of row 1 (low 5 bits already stored)
+		src = *(in + 64 * 2 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 6U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 2 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp = src >> 10U; // rest of row 2 (low 10 bits already stored)
+		src = *(in + 64 * 3 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 1U;
+		src = *(in + 64 * 4 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 12U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 4 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp = src >> 4U; // rest of row 4 (low 4 bits already stored)
+		src = *(in + 64 * 5 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 7U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 5 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp = src >> 9U; // rest of row 5 (low 9 bits already stored)
+		src = *(in + 64 * 6 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 2U;
+		src = *(in + 64 * 7 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 13U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 7 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp = src >> 3U; // rest of row 7 (low 3 bits already stored)
+		src = *(in + 64 * 8 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 8U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 8 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp = src >> 8U; // rest of row 8 (low 8 bits already stored)
+		src = *(in + 64 * 9 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 3U;
+		src = *(in + 64 * 10 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 14U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 10 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp = src >> 2U; // rest of row 10 (low 2 bits already stored)
+		src = *(in + 64 * 11 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 9U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 11 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp = src >> 7U; // rest of row 11 (low 7 bits already stored)
+		src = *(in + 64 * 12 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 4U;
+		src = *(in + 64 * 13 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 15U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 13 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp = src >> 1U; // rest of row 13 (low 1 bit already stored)
+		src = *(in + 64 * 14 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 10U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 14 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp = src >> 6U; // rest of row 14 (low 6 bits already stored)
+		src = *(in + 64 * 15 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 5U;
+		*(out + i) = tmp;
+		out -= 640; // undo the 10 row steps before the next i
+	}
+}
+void static pack_12bit_16ow(const uint16_t* __restrict in, uint16_t* __restrict out) { // Packs the low 12 bits of 16 rows of 64 uint16_t values (row r at in[64 * r + i]) into 12 rows of 64 words at out.
+	uint16_t tmp = 0U; // output word currently being assembled
+	uint16_t src; // current input value, masked to 12 bits before use
+	for (int i = 0; i < 64; i++) { // each position i is packed independently of the others
+		src = *(in + 64 * 0 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp = src;
+		src = *(in + 64 * 1 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 12U; // only the low 4 bits of row 1 survive the narrowing back to uint16_t
+		*(out + i) = tmp;
+		out += 64; // step to the next output row
+		src = *(in + 64 * 1 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp = src >> 4U; // rest of row 1 (low 4 bits already stored)
+		src = *(in + 64 * 2 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 8U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 2 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp = src >> 8U; // rest of row 2 (low 8 bits already stored)
+		src = *(in + 64 * 3 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 4U;
+		*(out + i) = tmp; // every group of 4 rows fills exactly 3 words
+		out += 64;
+		src = *(in + 64 * 4 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp = src;
+		src = *(in + 64 * 5 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 12U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 5 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp = src >> 4U; // rest of row 5 (low 4 bits already stored)
+		src = *(in + 64 * 6 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 8U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 6 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp = src >> 8U; // rest of row 6 (low 8 bits already stored)
+		src = *(in + 64 * 7 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 4U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 8 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp = src;
+		src = *(in + 64 * 9 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 12U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 9 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp = src >> 4U; // rest of row 9 (low 4 bits already stored)
+		src = *(in + 64 * 10 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 8U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 10 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp = src >> 8U; // rest of row 10 (low 8 bits already stored)
+		src = *(in + 64 * 11 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 4U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 12 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp = src;
+		src = *(in + 64 * 13 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 12U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 13 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp = src >> 4U; // rest of row 13 (low 4 bits already stored)
+		src = *(in + 64 * 14 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 8U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 14 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp = src >> 8U; // rest of row 14 (low 8 bits already stored)
+		src = *(in + 64 * 15 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 4U;
+		*(out + i) = tmp;
+		out -= 704; // undo the 11 row steps before the next i
+	}
+}
+void static pack_13bit_16ow(const uint16_t* __restrict in, uint16_t* __restrict out) { // Packs the low 13 bits of 16 rows of 64 uint16_t values (row r at in[64 * r + i]) into 13 rows of 64 words at out.
+	uint16_t tmp = 0U; // output word currently being assembled
+	uint16_t src; // current input value, masked to 13 bits before use
+	for (int i = 0; i < 64; i++) { // each position i is packed independently of the others
+		src = *(in + 64 * 0 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp = src;
+		src = *(in + 64 * 1 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 13U; // only the low 3 bits of row 1 survive the narrowing back to uint16_t
+		*(out + i) = tmp;
+		out += 64; // step to the next output row
+		src = *(in + 64 * 1 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp = src >> 3U; // rest of row 1 (low 3 bits already stored)
+		src = *(in + 64 * 2 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 10U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 2 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp = src >> 6U; // rest of row 2 (low 6 bits already stored)
+		src = *(in + 64 * 3 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 7U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 3 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp = src >> 9U; // rest of row 3 (low 9 bits already stored)
+		src = *(in + 64 * 4 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 4U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 4 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp = src >> 12U; // rest of row 4 (low 12 bits already stored)
+		src = *(in + 64 * 5 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 1U;
+		src = *(in + 64 * 6 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 14U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 6 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp = src >> 2U; // rest of row 6 (low 2 bits already stored)
+		src = *(in + 64 * 7 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 11U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 7 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp = src >> 5U; // rest of row 7 (low 5 bits already stored)
+		src = *(in + 64 * 8 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 8U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 8 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp = src >> 8U; // rest of row 8 (low 8 bits already stored)
+		src = *(in + 64 * 9 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 5U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 9 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp = src >> 11U; // rest of row 9 (low 11 bits already stored)
+		src = *(in + 64 * 10 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 2U;
+		src = *(in + 64 * 11 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 15U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 11 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp = src >> 1U; // rest of row 11 (low 1 bit already stored)
+		src = *(in + 64 * 12 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 12U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 12 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp = src >> 4U; // rest of row 12 (low 4 bits already stored)
+		src = *(in + 64 * 13 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 9U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 13 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp = src >> 7U; // rest of row 13 (low 7 bits already stored)
+		src = *(in + 64 * 14 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 6U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 14 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp = src >> 10U; // rest of row 14 (low 10 bits already stored)
+		src = *(in + 64 * 15 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 3U;
+		*(out + i) = tmp;
+		out -= 768; // undo the 12 row steps before the next i
+	}
+}
+void static pack_14bit_16ow(const uint16_t* __restrict in, uint16_t* __restrict out) { // Packs the low 14 bits of 16 rows of 64 uint16_t values (row r at in[64 * r + i]) into 14 rows of 64 words at out.
+	uint16_t tmp = 0U; // output word currently being assembled
+	uint16_t src; // current input value, masked to 14 bits before use
+	for (int i = 0; i < 64; i++) { // each position i is packed independently of the others
+		src = *(in + 64 * 0 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp = src;
+		src = *(in + 64 * 1 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 14U; // only the low 2 bits of row 1 survive the narrowing back to uint16_t
+		*(out + i) = tmp;
+		out += 64; // step to the next output row
+		src = *(in + 64 * 1 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp = src >> 2U; // rest of row 1 (low 2 bits already stored)
+		src = *(in + 64 * 2 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 12U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 2 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp = src >> 4U; // rest of row 2 (low 4 bits already stored)
+		src = *(in + 64 * 3 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 10U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 3 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp = src >> 6U; // rest of row 3 (low 6 bits already stored)
+		src = *(in + 64 * 4 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 8U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 4 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp = src >> 8U; // rest of row 4 (low 8 bits already stored)
+		src = *(in + 64 * 5 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 6U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 5 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp = src >> 10U; // rest of row 5 (low 10 bits already stored)
+		src = *(in + 64 * 6 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 4U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 6 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp = src >> 12U; // rest of row 6 (low 12 bits already stored)
+		src = *(in + 64 * 7 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 2U;
+		*(out + i) = tmp; // rows 0..7 end exactly on this word boundary
+		out += 64;
+		src = *(in + 64 * 8 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp = src; // rows 8..15 repeat the same bit layout as rows 0..7
+		src = *(in + 64 * 9 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 14U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 9 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp = src >> 2U; // rest of row 9 (low 2 bits already stored)
+		src = *(in + 64 * 10 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 12U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 10 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp = src >> 4U; // rest of row 10 (low 4 bits already stored)
+		src = *(in + 64 * 11 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 10U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 11 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp = src >> 6U; // rest of row 11 (low 6 bits already stored)
+		src = *(in + 64 * 12 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 8U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 12 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp = src >> 8U; // rest of row 12 (low 8 bits already stored)
+		src = *(in + 64 * 13 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 6U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 13 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp = src >> 10U; // rest of row 13 (low 10 bits already stored)
+		src = *(in + 64 * 14 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 4U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 14 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp = src >> 12U; // rest of row 14 (low 12 bits already stored)
+		src = *(in + 64 * 15 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 2U;
+		*(out + i) = tmp;
+		out -= 832; // undo the 13 row steps before the next i
+	}
+}
+void static pack_15bit_16ow(const uint16_t* __restrict in, uint16_t* __restrict out) { // Packs the low 15 bits of 16 input rows (64 lanes each) into 15 output rows of 64 uint16_t words.
+	uint16_t tmp = 0U; // output word being assembled; reassigned before every store
+	uint16_t src; // current input value, masked to 15 bits
+	for (int i = 0; i < 64; i++) { // i = lane; row k of a lane lives at offset 64 * k + i
+		src = *(in + 64 * 0 + i);
+		src = src & ((1ULL << 15) - 1); // keep only the low 15 bits
+		tmp = src; // value 0 occupies bits 0..14 of output word 0
+		src = *(in + 64 * 1 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 15U; // only bit 0 of value 1 fits (bit 15); higher bits are dropped when narrowing back to uint16_t
+		*(out + i) = tmp;
+		out += 64; // step to this lane's next output row
+		src = *(in + 64 * 1 + i); // re-read value 1 to emit the bits that did not fit
+		src = src & ((1ULL << 15) - 1);
+		tmp = src >> 1U; // remaining 14 bits of value 1 start the next word
+		src = *(in + 64 * 2 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 14U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 2 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp = src >> 2U;
+		src = *(in + 64 * 3 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 13U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 3 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp = src >> 3U;
+		src = *(in + 64 * 4 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 12U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 4 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp = src >> 4U;
+		src = *(in + 64 * 5 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 11U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 5 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp = src >> 5U;
+		src = *(in + 64 * 6 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 10U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 6 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp = src >> 6U;
+		src = *(in + 64 * 7 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 9U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 7 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp = src >> 7U;
+		src = *(in + 64 * 8 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 8U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 8 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp = src >> 8U;
+		src = *(in + 64 * 9 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 7U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 9 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp = src >> 9U;
+		src = *(in + 64 * 10 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 6U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 10 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp = src >> 10U;
+		src = *(in + 64 * 11 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 5U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 11 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp = src >> 11U;
+		src = *(in + 64 * 12 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 4U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 12 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp = src >> 12U;
+		src = *(in + 64 * 13 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 3U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 13 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp = src >> 13U;
+		src = *(in + 64 * 14 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 2U;
+		*(out + i) = tmp;
+		out += 64;
+		src = *(in + 64 * 14 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp = src >> 14U; // only the top bit of value 14 is left over
+		src = *(in + 64 * 15 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 1U; // value 15 fills bits 1..15, completing the 15th word exactly
+		*(out + i) = tmp;
+		out -= 896; // undo the 14 row steps (14 * 64) so the next lane starts from row 0
+	}
+}
+void static pack_16bit_16ow(const uint16_t* __restrict in, uint16_t* __restrict out) { // Full-width case: copies 16 rows of 64 uint16_t values from in to out unchanged (no mask, no shifts).
+	uint16_t tmp = 0U; // staging word; reassigned before every store
+	uint16_t src; // current input value
+	for (int i = 0; i < 64; i++) { // i = lane; row k of a lane lives at offset 64 * k + i
+		src        = *(in + 64 * 0 + i);
+		tmp        = src; // a 16-bit value fills the whole 16-bit output word
+		*(out + i) = tmp;
+		out += 64; // step to this lane's next output row
+		src        = *(in + 64 * 1 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 64;
+		src        = *(in + 64 * 2 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 64;
+		src        = *(in + 64 * 3 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 64;
+		src        = *(in + 64 * 4 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 64;
+		src        = *(in + 64 * 5 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 64;
+		src        = *(in + 64 * 6 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 64;
+		src        = *(in + 64 * 7 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 64;
+		src        = *(in + 64 * 8 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 64;
+		src        = *(in + 64 * 9 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 64;
+		src        = *(in + 64 * 10 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 64;
+		src        = *(in + 64 * 11 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 64;
+		src        = *(in + 64 * 12 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 64;
+		src        = *(in + 64 * 13 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 64;
+		src        = *(in + 64 * 14 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 64;
+		src        = *(in + 64 * 15 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out -= 960; // undo the 15 row steps (15 * 64) so the next lane starts from row 0
+	}
+}
+void static pack_0bit_32ow(const uint32_t* __restrict in, uint32_t* __restrict out) {} // 0-bit width: the body is empty, so in is never read and out is never written
+void static pack_1bit_32ow(const uint32_t* __restrict in, uint32_t* __restrict out) { // Packs bit 0 of 32 input rows (32 lanes each) into one output row of 32 uint32_t words; row k supplies bit k.
+	uint32_t tmp = 0U; // output word being assembled; reassigned at the top of every lane
+	uint32_t src; // current input value, masked to 1 bit
+	for (int i = 0; i < 32; i++) { // i = lane; row k of a lane lives at offset 32 * k + i
+		src = *(in + 32 * 0 + i);
+		src = src & ((1ULL << 1) - 1); // keep only bit 0
+		tmp = src; // row 0 becomes bit 0 of the output word
+		src = *(in + 32 * 1 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 1U; // row k is placed at bit position k
+		src = *(in + 32 * 2 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 2U;
+		src = *(in + 32 * 3 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 3U;
+		src = *(in + 32 * 4 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 4U;
+		src = *(in + 32 * 5 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 5U;
+		src = *(in + 32 * 6 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 6U;
+		src = *(in + 32 * 7 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 7U;
+		src = *(in + 32 * 8 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 8U;
+		src = *(in + 32 * 9 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 9U;
+		src = *(in + 32 * 10 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 10U;
+		src = *(in + 32 * 11 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 11U;
+		src = *(in + 32 * 12 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 12U;
+		src = *(in + 32 * 13 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 13U;
+		src = *(in + 32 * 14 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 14U;
+		src = *(in + 32 * 15 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 15U;
+		src = *(in + 32 * 16 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 16U;
+		src = *(in + 32 * 17 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 17U;
+		src = *(in + 32 * 18 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 18U;
+		src = *(in + 32 * 19 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 19U;
+		src = *(in + 32 * 20 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 20U;
+		src = *(in + 32 * 21 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 21U;
+		src = *(in + 32 * 22 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 22U;
+		src = *(in + 32 * 23 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 23U;
+		src = *(in + 32 * 24 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 24U;
+		src = *(in + 32 * 25 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 25U;
+		src = *(in + 32 * 26 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 26U;
+		src = *(in + 32 * 27 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 27U;
+		src = *(in + 32 * 28 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 28U;
+		src = *(in + 32 * 29 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 29U;
+		src = *(in + 32 * 30 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 30U;
+		src = *(in + 32 * 31 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 31U;
+		*(out + i) = tmp; // the single output word for this lane
+		out -= 0; // no-op: out was never advanced, so there is nothing to rewind
+	}
+}
+void static pack_2bit_32ow(const uint32_t* __restrict in, uint32_t* __restrict out) { // Packs the low 2 bits of 32 input rows (32 lanes each) into 2 output rows of 32 uint32_t words.
+	uint32_t tmp = 0U; // output word being assembled; reassigned before every store
+	uint32_t src; // current input value, masked to 2 bits
+	for (int i = 0; i < 32; i++) { // i = lane; row k of a lane lives at offset 32 * k + i
+		src = *(in + 32 * 0 + i);
+		src = src & ((1ULL << 2) - 1); // keep only the low 2 bits
+		tmp = src; // rows 0..15 fill output word 0, 2 bits each
+		src = *(in + 32 * 1 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 2U;
+		src = *(in + 32 * 2 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 4U;
+		src = *(in + 32 * 3 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 6U;
+		src = *(in + 32 * 4 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 8U;
+		src = *(in + 32 * 5 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 10U;
+		src = *(in + 32 * 6 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 12U;
+		src = *(in + 32 * 7 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 14U;
+		src = *(in + 32 * 8 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 16U;
+		src = *(in + 32 * 9 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 18U;
+		src = *(in + 32 * 10 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 20U;
+		src = *(in + 32 * 11 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 22U;
+		src = *(in + 32 * 12 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 24U;
+		src = *(in + 32 * 13 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 26U;
+		src = *(in + 32 * 14 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 28U;
+		src = *(in + 32 * 15 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 30U; // row 15 ends exactly at bit 31, so nothing spills over
+		*(out + i) = tmp;
+		out += 32; // step to this lane's second output row
+		src = *(in + 32 * 16 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp = src; // rows 16..31 fill output word 1 the same way
+		src = *(in + 32 * 17 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 2U;
+		src = *(in + 32 * 18 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 4U;
+		src = *(in + 32 * 19 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 6U;
+		src = *(in + 32 * 20 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 8U;
+		src = *(in + 32 * 21 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 10U;
+		src = *(in + 32 * 22 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 12U;
+		src = *(in + 32 * 23 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 14U;
+		src = *(in + 32 * 24 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 16U;
+		src = *(in + 32 * 25 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 18U;
+		src = *(in + 32 * 26 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 20U;
+		src = *(in + 32 * 27 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 22U;
+		src = *(in + 32 * 28 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 24U;
+		src = *(in + 32 * 29 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 26U;
+		src = *(in + 32 * 30 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 28U;
+		src = *(in + 32 * 31 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 30U;
+		*(out + i) = tmp;
+		out -= 32; // undo the single row step so the next lane starts from row 0
+	}
+}
+void static pack_3bit_32ow(const uint32_t* __restrict in, uint32_t* __restrict out) { // Packs the low 3 bits of 32 input rows (32 lanes each) into 3 output rows of 32 uint32_t words; rows 10 and 21 straddle word boundaries.
+	uint32_t tmp = 0U; // output word being assembled; reassigned before every store
+	uint32_t src; // current input value, masked to 3 bits
+	for (int i = 0; i < 32; i++) { // i = lane; row k of a lane lives at offset 32 * k + i
+		src = *(in + 32 * 0 + i);
+		src = src & ((1ULL << 3) - 1); // keep only the low 3 bits
+		tmp = src; // row 0 occupies bits 0..2 of output word 0
+		src = *(in + 32 * 1 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 3U;
+		src = *(in + 32 * 2 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 6U;
+		src = *(in + 32 * 3 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 9U;
+		src = *(in + 32 * 4 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 12U;
+		src = *(in + 32 * 5 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 15U;
+		src = *(in + 32 * 6 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 18U;
+		src = *(in + 32 * 7 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 21U;
+		src = *(in + 32 * 8 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 24U;
+		src = *(in + 32 * 9 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 27U;
+		src = *(in + 32 * 10 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 30U; // only the low 2 bits of row 10 fit (bits 30..31); the top bit is shifted out of the 32-bit word
+		*(out + i) = tmp;
+		out += 32; // step to this lane's next output row
+		src = *(in + 32 * 10 + i); // re-read row 10 to emit the bit that did not fit
+		src = src & ((1ULL << 3) - 1);
+		tmp = src >> 2U; // leftover top bit of row 10 becomes bit 0 of the next word
+		src = *(in + 32 * 11 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 1U;
+		src = *(in + 32 * 12 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 4U;
+		src = *(in + 32 * 13 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 7U;
+		src = *(in + 32 * 14 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 10U;
+		src = *(in + 32 * 15 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 13U;
+		src = *(in + 32 * 16 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 16U;
+		src = *(in + 32 * 17 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 19U;
+		src = *(in + 32 * 18 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 22U;
+		src = *(in + 32 * 19 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 25U;
+		src = *(in + 32 * 20 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 28U;
+		src = *(in + 32 * 21 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 31U; // only bit 0 of row 21 fits (bit 31)
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 21 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp = src >> 1U; // leftover top 2 bits of row 21 start the last word
+		src = *(in + 32 * 22 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 2U;
+		src = *(in + 32 * 23 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 5U;
+		src = *(in + 32 * 24 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 8U;
+		src = *(in + 32 * 25 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 11U;
+		src = *(in + 32 * 26 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 14U;
+		src = *(in + 32 * 27 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 17U;
+		src = *(in + 32 * 28 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 20U;
+		src = *(in + 32 * 29 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 23U;
+		src = *(in + 32 * 30 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 26U;
+		src = *(in + 32 * 31 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 29U; // row 31 ends exactly at bit 31 of the third word
+		*(out + i) = tmp;
+		out -= 64; // undo the 2 row steps (2 * 32) so the next lane starts from row 0
+	}
+}
+void static pack_4bit_32ow(const uint32_t* __restrict in, uint32_t* __restrict out) { // Packs the low 4 bits of 32 input rows (32 lanes each) into 4 output rows of 32 uint32_t words; 8 values per word, none straddling.
+	uint32_t tmp = 0U; // output word being assembled; reassigned before every store
+	uint32_t src; // current input value, masked to 4 bits
+	for (int i = 0; i < 32; i++) { // i = lane; row k of a lane lives at offset 32 * k + i
+		src = *(in + 32 * 0 + i);
+		src = src & ((1ULL << 4) - 1); // keep only the low 4 bits
+		tmp = src; // rows 0..7 fill output word 0
+		src = *(in + 32 * 1 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 4U;
+		src = *(in + 32 * 2 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 8U;
+		src = *(in + 32 * 3 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 12U;
+		src = *(in + 32 * 4 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 16U;
+		src = *(in + 32 * 5 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 20U;
+		src = *(in + 32 * 6 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 24U;
+		src = *(in + 32 * 7 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp;
+		out += 32; // step to this lane's next output row
+		src = *(in + 32 * 8 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp = src; // rows 8..15 fill output word 1
+		src = *(in + 32 * 9 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 4U;
+		src = *(in + 32 * 10 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 8U;
+		src = *(in + 32 * 11 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 12U;
+		src = *(in + 32 * 12 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 16U;
+		src = *(in + 32 * 13 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 20U;
+		src = *(in + 32 * 14 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 24U;
+		src = *(in + 32 * 15 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 16 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp = src; // rows 16..23 fill output word 2
+		src = *(in + 32 * 17 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 4U;
+		src = *(in + 32 * 18 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 8U;
+		src = *(in + 32 * 19 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 12U;
+		src = *(in + 32 * 20 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 16U;
+		src = *(in + 32 * 21 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 20U;
+		src = *(in + 32 * 22 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 24U;
+		src = *(in + 32 * 23 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 24 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp = src; // rows 24..31 fill output word 3
+		src = *(in + 32 * 25 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 4U;
+		src = *(in + 32 * 26 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 8U;
+		src = *(in + 32 * 27 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 12U;
+		src = *(in + 32 * 28 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 16U;
+		src = *(in + 32 * 29 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 20U;
+		src = *(in + 32 * 30 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 24U;
+		src = *(in + 32 * 31 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp;
+		out -= 96; // undo the 3 row steps (3 * 32) so the next lane starts from row 0
+	}
+}
+void static pack_5bit_32ow(const uint32_t* __restrict in, uint32_t* __restrict out) { // Packs the low 5 bits of 32 input rows (32 lanes each) into 5 output rows of 32 uint32_t words; rows 6, 12, 19 and 25 straddle word boundaries.
+	uint32_t tmp = 0U; // output word being assembled; reassigned before every store
+	uint32_t src; // current input value, masked to 5 bits
+	for (int i = 0; i < 32; i++) { // i = lane; row k of a lane lives at offset 32 * k + i
+		src = *(in + 32 * 0 + i);
+		src = src & ((1ULL << 5) - 1); // keep only the low 5 bits
+		tmp = src; // row 0 occupies bits 0..4 of output word 0
+		src = *(in + 32 * 1 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 5U;
+		src = *(in + 32 * 2 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 10U;
+		src = *(in + 32 * 3 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 15U;
+		src = *(in + 32 * 4 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 20U;
+		src = *(in + 32 * 5 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 25U;
+		src = *(in + 32 * 6 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 30U; // only the low 2 bits of row 6 fit; the upper 3 are shifted out of the 32-bit word
+		*(out + i) = tmp;
+		out += 32; // step to this lane's next output row
+		src = *(in + 32 * 6 + i); // re-read row 6 to emit the bits that did not fit
+		src = src & ((1ULL << 5) - 1);
+		tmp = src >> 2U; // leftover upper 3 bits of row 6 start the next word
+		src = *(in + 32 * 7 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 3U;
+		src = *(in + 32 * 8 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 8U;
+		src = *(in + 32 * 9 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 13U;
+		src = *(in + 32 * 10 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 18U;
+		src = *(in + 32 * 11 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 23U;
+		src = *(in + 32 * 12 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 28U; // low 4 bits of row 12 fit here
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 12 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp = src >> 4U; // leftover top bit of row 12
+		src = *(in + 32 * 13 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 1U;
+		src = *(in + 32 * 14 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 6U;
+		src = *(in + 32 * 15 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 11U;
+		src = *(in + 32 * 16 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 16U;
+		src = *(in + 32 * 17 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 21U;
+		src = *(in + 32 * 18 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 26U;
+		src = *(in + 32 * 19 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 31U; // only bit 0 of row 19 fits here
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 19 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp = src >> 1U; // leftover upper 4 bits of row 19
+		src = *(in + 32 * 20 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 4U;
+		src = *(in + 32 * 21 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 9U;
+		src = *(in + 32 * 22 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 14U;
+		src = *(in + 32 * 23 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 19U;
+		src = *(in + 32 * 24 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 24U;
+		src = *(in + 32 * 25 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 29U; // low 3 bits of row 25 fit here
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 25 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp = src >> 3U; // leftover upper 2 bits of row 25
+		src = *(in + 32 * 26 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 2U;
+		src = *(in + 32 * 27 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 7U;
+		src = *(in + 32 * 28 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 12U;
+		src = *(in + 32 * 29 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 17U;
+		src = *(in + 32 * 30 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 22U;
+		src = *(in + 32 * 31 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 27U; // row 31 ends exactly at bit 31 of the fifth word
+		*(out + i) = tmp;
+		out -= 128; // undo the 4 row steps (4 * 32) so the next lane starts from row 0
+	}
+}
+void static pack_6bit_32ow(const uint32_t* __restrict in, uint32_t* __restrict out) { // Packs the low 6 bits of 32 input rows (32 lanes each) into 6 output rows of 32 uint32_t words; rows 5, 10, 21 and 26 straddle word boundaries.
+	uint32_t tmp = 0U; // output word being assembled; reassigned before every store
+	uint32_t src; // current input value, masked to 6 bits
+	for (int i = 0; i < 32; i++) { // i = lane; row k of a lane lives at offset 32 * k + i
+		src = *(in + 32 * 0 + i);
+		src = src & ((1ULL << 6) - 1); // keep only the low 6 bits
+		tmp = src; // row 0 occupies bits 0..5 of output word 0
+		src = *(in + 32 * 1 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 6U;
+		src = *(in + 32 * 2 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 12U;
+		src = *(in + 32 * 3 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 18U;
+		src = *(in + 32 * 4 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 24U;
+		src = *(in + 32 * 5 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 30U; // only the low 2 bits of row 5 fit; the upper 4 are shifted out of the 32-bit word
+		*(out + i) = tmp;
+		out += 32; // step to this lane's next output row
+		src = *(in + 32 * 5 + i); // re-read row 5 to emit the bits that did not fit
+		src = src & ((1ULL << 6) - 1);
+		tmp = src >> 2U; // leftover upper 4 bits of row 5 start the next word
+		src = *(in + 32 * 6 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 4U;
+		src = *(in + 32 * 7 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 10U;
+		src = *(in + 32 * 8 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 16U;
+		src = *(in + 32 * 9 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 22U;
+		src = *(in + 32 * 10 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 28U; // low 4 bits of row 10 fit here
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 10 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp = src >> 4U; // leftover upper 2 bits of row 10
+		src = *(in + 32 * 11 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 2U;
+		src = *(in + 32 * 12 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 8U;
+		src = *(in + 32 * 13 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 14U;
+		src = *(in + 32 * 14 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 20U;
+		src = *(in + 32 * 15 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 26U; // row 15 ends exactly at bit 31, so the next word starts aligned
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 16 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp = src; // rows 16..31 repeat the same layout as rows 0..15
+		src = *(in + 32 * 17 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 6U;
+		src = *(in + 32 * 18 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 12U;
+		src = *(in + 32 * 19 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 18U;
+		src = *(in + 32 * 20 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 24U;
+		src = *(in + 32 * 21 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 30U; // low 2 bits of row 21 fit here
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 21 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp = src >> 2U; // leftover upper 4 bits of row 21
+		src = *(in + 32 * 22 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 4U;
+		src = *(in + 32 * 23 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 10U;
+		src = *(in + 32 * 24 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 16U;
+		src = *(in + 32 * 25 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 22U;
+		src = *(in + 32 * 26 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 28U; // low 4 bits of row 26 fit here
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 26 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp = src >> 4U; // leftover upper 2 bits of row 26
+		src = *(in + 32 * 27 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 2U;
+		src = *(in + 32 * 28 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 8U;
+		src = *(in + 32 * 29 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 14U;
+		src = *(in + 32 * 30 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 20U;
+		src = *(in + 32 * 31 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 26U; // row 31 ends exactly at bit 31 of the sixth word
+		*(out + i) = tmp;
+		out -= 160; // undo the 5 row steps (5 * 32) so the next lane starts from row 0
+	}
+}
+void static pack_7bit_32ow(const uint32_t* __restrict in, uint32_t* __restrict out) { // Packs the low 7 bits of 32 input rows (32 lanes each) into 7 output rows of 32 uint32_t words; rows 4, 9, 13, 18, 22 and 27 straddle word boundaries.
+	uint32_t tmp = 0U; // output word being assembled; reassigned before every store
+	uint32_t src; // current input value, masked to 7 bits
+	for (int i = 0; i < 32; i++) { // i = lane; row k of a lane lives at offset 32 * k + i
+		src = *(in + 32 * 0 + i);
+		src = src & ((1ULL << 7) - 1); // keep only the low 7 bits
+		tmp = src; // row 0 occupies bits 0..6 of output word 0
+		src = *(in + 32 * 1 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 7U;
+		src = *(in + 32 * 2 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 14U;
+		src = *(in + 32 * 3 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 21U;
+		src = *(in + 32 * 4 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 28U; // only the low 4 bits of row 4 fit; the upper 3 are shifted out of the 32-bit word
+		*(out + i) = tmp;
+		out += 32; // step to this lane's next output row
+		src = *(in + 32 * 4 + i); // re-read row 4 to emit the bits that did not fit
+		src = src & ((1ULL << 7) - 1);
+		tmp = src >> 4U; // leftover upper 3 bits of row 4 start the next word
+		src = *(in + 32 * 5 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 3U;
+		src = *(in + 32 * 6 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 10U;
+		src = *(in + 32 * 7 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 17U;
+		src = *(in + 32 * 8 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 24U;
+		src = *(in + 32 * 9 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 31U; // only bit 0 of row 9 fits here
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 9 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp = src >> 1U; // leftover upper 6 bits of row 9
+		src = *(in + 32 * 10 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 6U;
+		src = *(in + 32 * 11 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 13U;
+		src = *(in + 32 * 12 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 20U;
+		src = *(in + 32 * 13 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 27U; // low 5 bits of row 13 fit here
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 13 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp = src >> 5U; // leftover upper 2 bits of row 13
+		src = *(in + 32 * 14 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 2U;
+		src = *(in + 32 * 15 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 9U;
+		src = *(in + 32 * 16 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 16U;
+		src = *(in + 32 * 17 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 23U;
+		src = *(in + 32 * 18 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 30U; // low 2 bits of row 18 fit here
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 18 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp = src >> 2U; // leftover upper 5 bits of row 18
+		src = *(in + 32 * 19 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 5U;
+		src = *(in + 32 * 20 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 12U;
+		src = *(in + 32 * 21 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 19U;
+		src = *(in + 32 * 22 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 26U; // low 6 bits of row 22 fit here
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 22 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp = src >> 6U; // leftover top bit of row 22
+		src = *(in + 32 * 23 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 1U;
+		src = *(in + 32 * 24 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 8U;
+		src = *(in + 32 * 25 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 15U;
+		src = *(in + 32 * 26 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 22U;
+		src = *(in + 32 * 27 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 29U; // low 3 bits of row 27 fit here
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 27 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp = src >> 3U; // leftover upper 4 bits of row 27
+		src = *(in + 32 * 28 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 4U;
+		src = *(in + 32 * 29 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 11U;
+		src = *(in + 32 * 30 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 18U;
+		src = *(in + 32 * 31 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 25U; // row 31 ends exactly at bit 31 of the seventh word
+		*(out + i) = tmp;
+		out -= 192; // undo the 6 row steps (6 * 32) so the next lane starts from row 0
+	}
+}
+void static pack_8bit_32ow(const uint32_t* __restrict in, uint32_t* __restrict out) { // Packs the low 8 bits of 32 input rows (32 lanes each) into 8 output rows of 32 uint32_t words; 4 values per word, none straddling.
+	uint32_t tmp = 0U; // output word being assembled; reassigned before every store
+	uint32_t src; // current input value, masked to 8 bits
+	for (int i = 0; i < 32; i++) { // i = lane; row k of a lane lives at offset 32 * k + i
+		src = *(in + 32 * 0 + i);
+		src = src & ((1ULL << 8) - 1); // keep only the low 8 bits
+		tmp = src; // rows 0..3 fill output word 0, one byte each
+		src = *(in + 32 * 1 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 8U;
+		src = *(in + 32 * 2 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 16U;
+		src = *(in + 32 * 3 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 32; // step to this lane's next output row
+		src = *(in + 32 * 4 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp = src; // each following group of 4 rows fills one more word the same way
+		src = *(in + 32 * 5 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 8U;
+		src = *(in + 32 * 6 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 16U;
+		src = *(in + 32 * 7 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 8 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp = src;
+		src = *(in + 32 * 9 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 8U;
+		src = *(in + 32 * 10 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 16U;
+		src = *(in + 32 * 11 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 12 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp = src;
+		src = *(in + 32 * 13 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 8U;
+		src = *(in + 32 * 14 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 16U;
+		src = *(in + 32 * 15 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 16 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp = src;
+		src = *(in + 32 * 17 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 8U;
+		src = *(in + 32 * 18 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 16U;
+		src = *(in + 32 * 19 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 20 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp = src;
+		src = *(in + 32 * 21 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 8U;
+		src = *(in + 32 * 22 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 16U;
+		src = *(in + 32 * 23 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 24 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp = src;
+		src = *(in + 32 * 25 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 8U;
+		src = *(in + 32 * 26 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 16U;
+		src = *(in + 32 * 27 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 28 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp = src;
+		src = *(in + 32 * 29 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 8U;
+		src = *(in + 32 * 30 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 16U;
+		src = *(in + 32 * 31 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out -= 224; // undo the 7 row steps (7 * 32) so the next lane starts from row 0
+	}
+}
+void static pack_9bit_32ow(const uint32_t* __restrict in, uint32_t* __restrict out) { // Packs the low 9 bits of 1024 inputs (32 rows x 32 lanes) into 9 output words per lane.
+	uint32_t tmp = 0U; // output word under construction; this 0U is overwritten before it is read
+	uint32_t src; // current input value
+	for (int i = 0; i < 32; i++) { // i selects the lane; row k of lane i is read from in[32 * k + i]
+		src = *(in + 32 * 0 + i);
+		src = src & ((1ULL << 9) - 1); // keep only the low 9 bits
+		tmp = src;
+		src = *(in + 32 * 1 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 9U;
+		src = *(in + 32 * 2 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 18U;
+		src = *(in + 32 * 3 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 27U; // bits shifted past bit 31 are dropped here and re-emitted in the next word
+		*(out + i) = tmp; // word 0 of lane i
+		out += 32; // step to the next group of 32 output words
+		src = *(in + 32 * 3 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp = src >> 5U; // high bits of row 3 left over from the previous word
+		src = *(in + 32 * 4 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 4U;
+		src = *(in + 32 * 5 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 13U;
+		src = *(in + 32 * 6 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 22U;
+		src = *(in + 32 * 7 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 31U;
+		*(out + i) = tmp; // word 1 of lane i
+		out += 32;
+		src = *(in + 32 * 7 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp = src >> 1U; // high bits of row 7 left over from the previous word
+		src = *(in + 32 * 8 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 8U;
+		src = *(in + 32 * 9 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 17U;
+		src = *(in + 32 * 10 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 26U;
+		*(out + i) = tmp; // word 2 of lane i
+		out += 32;
+		src = *(in + 32 * 10 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp = src >> 6U; // high bits of row 10 left over from the previous word
+		src = *(in + 32 * 11 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 3U;
+		src = *(in + 32 * 12 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 12U;
+		src = *(in + 32 * 13 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 21U;
+		src = *(in + 32 * 14 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 30U;
+		*(out + i) = tmp; // word 3 of lane i
+		out += 32;
+		src = *(in + 32 * 14 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp = src >> 2U; // high bits of row 14 left over from the previous word
+		src = *(in + 32 * 15 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 7U;
+		src = *(in + 32 * 16 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 16U;
+		src = *(in + 32 * 17 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 25U;
+		*(out + i) = tmp; // word 4 of lane i
+		out += 32;
+		src = *(in + 32 * 17 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp = src >> 7U; // high bits of row 17 left over from the previous word
+		src = *(in + 32 * 18 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 2U;
+		src = *(in + 32 * 19 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 11U;
+		src = *(in + 32 * 20 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 20U;
+		src = *(in + 32 * 21 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 29U;
+		*(out + i) = tmp; // word 5 of lane i
+		out += 32;
+		src = *(in + 32 * 21 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp = src >> 3U; // high bits of row 21 left over from the previous word
+		src = *(in + 32 * 22 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 6U;
+		src = *(in + 32 * 23 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 15U;
+		src = *(in + 32 * 24 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp; // word 6 of lane i
+		out += 32;
+		src = *(in + 32 * 24 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp = src >> 8U; // high bits of row 24 left over from the previous word
+		src = *(in + 32 * 25 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 1U;
+		src = *(in + 32 * 26 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 10U;
+		src = *(in + 32 * 27 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 19U;
+		src = *(in + 32 * 28 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp; // word 7 of lane i
+		out += 32;
+		src = *(in + 32 * 28 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp = src >> 4U; // high bits of row 28 left over from the previous word
+		src = *(in + 32 * 29 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 5U;
+		src = *(in + 32 * 30 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 14U;
+		src = *(in + 32 * 31 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 23U;
+		*(out + i) = tmp; // word 8 of lane i
+		out -= 256; // undo the 8 advances of 32 so out is back at its starting position for the next lane
+	}
+}
+void static pack_10bit_32ow(const uint32_t* __restrict in, uint32_t* __restrict out) { // Packs the low 10 bits of 1024 inputs (32 rows x 32 lanes) into 10 output words per lane.
+	uint32_t tmp = 0U; // output word under construction; this 0U is overwritten before it is read
+	uint32_t src; // current input value
+	for (int i = 0; i < 32; i++) { // i selects the lane; row k of lane i is read from in[32 * k + i]
+		src = *(in + 32 * 0 + i);
+		src = src & ((1ULL << 10) - 1); // keep only the low 10 bits
+		tmp = src;
+		src = *(in + 32 * 1 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 10U;
+		src = *(in + 32 * 2 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 20U;
+		src = *(in + 32 * 3 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 30U; // bits shifted past bit 31 are dropped here and re-emitted in the next word
+		*(out + i) = tmp; // word 0 of lane i
+		out += 32; // step to the next group of 32 output words
+		src = *(in + 32 * 3 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp = src >> 2U; // high bits of row 3 left over from the previous word
+		src = *(in + 32 * 4 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 8U;
+		src = *(in + 32 * 5 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 18U;
+		src = *(in + 32 * 6 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp; // word 1 of lane i
+		out += 32;
+		src = *(in + 32 * 6 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp = src >> 4U; // high bits of row 6 left over from the previous word
+		src = *(in + 32 * 7 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 6U;
+		src = *(in + 32 * 8 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 16U;
+		src = *(in + 32 * 9 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 26U;
+		*(out + i) = tmp; // word 2 of lane i
+		out += 32;
+		src = *(in + 32 * 9 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp = src >> 6U; // high bits of row 9 left over from the previous word
+		src = *(in + 32 * 10 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 4U;
+		src = *(in + 32 * 11 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 14U;
+		src = *(in + 32 * 12 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp; // word 3 of lane i
+		out += 32;
+		src = *(in + 32 * 12 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp = src >> 8U; // high bits of row 12 left over from the previous word
+		src = *(in + 32 * 13 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 2U;
+		src = *(in + 32 * 14 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 12U;
+		src = *(in + 32 * 15 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 22U;
+		*(out + i) = tmp; // word 4 of lane i
+		out += 32;
+		src = *(in + 32 * 16 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp = src; // row 16 starts exactly on a word boundary, so there is no carry-over
+		src = *(in + 32 * 17 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 10U;
+		src = *(in + 32 * 18 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 20U;
+		src = *(in + 32 * 19 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 30U;
+		*(out + i) = tmp; // word 5 of lane i
+		out += 32;
+		src = *(in + 32 * 19 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp = src >> 2U; // high bits of row 19 left over from the previous word
+		src = *(in + 32 * 20 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 8U;
+		src = *(in + 32 * 21 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 18U;
+		src = *(in + 32 * 22 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp; // word 6 of lane i
+		out += 32;
+		src = *(in + 32 * 22 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp = src >> 4U; // high bits of row 22 left over from the previous word
+		src = *(in + 32 * 23 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 6U;
+		src = *(in + 32 * 24 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 16U;
+		src = *(in + 32 * 25 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 26U;
+		*(out + i) = tmp; // word 7 of lane i
+		out += 32;
+		src = *(in + 32 * 25 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp = src >> 6U; // high bits of row 25 left over from the previous word
+		src = *(in + 32 * 26 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 4U;
+		src = *(in + 32 * 27 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 14U;
+		src = *(in + 32 * 28 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp; // word 8 of lane i
+		out += 32;
+		src = *(in + 32 * 28 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp = src >> 8U; // high bits of row 28 left over from the previous word
+		src = *(in + 32 * 29 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 2U;
+		src = *(in + 32 * 30 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 12U;
+		src = *(in + 32 * 31 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 22U;
+		*(out + i) = tmp; // word 9 of lane i
+		out -= 288; // undo the 9 advances of 32 so out is back at its starting position for the next lane
+	}
+}
+void static pack_11bit_32ow(const uint32_t* __restrict in, uint32_t* __restrict out) { // Packs the low 11 bits of 1024 inputs (32 rows x 32 lanes) into 11 output words per lane.
+	uint32_t tmp = 0U; // output word under construction; this 0U is overwritten before it is read
+	uint32_t src; // current input value
+	for (int i = 0; i < 32; i++) { // i selects the lane; row k of lane i is read from in[32 * k + i]
+		src = *(in + 32 * 0 + i);
+		src = src & ((1ULL << 11) - 1); // keep only the low 11 bits
+		tmp = src;
+		src = *(in + 32 * 1 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 11U;
+		src = *(in + 32 * 2 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 22U; // bits shifted past bit 31 are dropped here and re-emitted in the next word
+		*(out + i) = tmp; // word 0 of lane i
+		out += 32; // step to the next group of 32 output words
+		src = *(in + 32 * 2 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp = src >> 10U; // high bits of row 2 left over from the previous word
+		src = *(in + 32 * 3 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 1U;
+		src = *(in + 32 * 4 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 12U;
+		src = *(in + 32 * 5 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 23U;
+		*(out + i) = tmp; // word 1 of lane i
+		out += 32;
+		src = *(in + 32 * 5 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp = src >> 9U; // high bits of row 5 left over from the previous word
+		src = *(in + 32 * 6 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 2U;
+		src = *(in + 32 * 7 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 13U;
+		src = *(in + 32 * 8 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp; // word 2 of lane i
+		out += 32;
+		src = *(in + 32 * 8 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp = src >> 8U; // high bits of row 8 left over from the previous word
+		src = *(in + 32 * 9 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 3U;
+		src = *(in + 32 * 10 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 14U;
+		src = *(in + 32 * 11 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 25U;
+		*(out + i) = tmp; // word 3 of lane i
+		out += 32;
+		src = *(in + 32 * 11 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp = src >> 7U; // high bits of row 11 left over from the previous word
+		src = *(in + 32 * 12 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 4U;
+		src = *(in + 32 * 13 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 15U;
+		src = *(in + 32 * 14 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 26U;
+		*(out + i) = tmp; // word 4 of lane i
+		out += 32;
+		src = *(in + 32 * 14 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp = src >> 6U; // high bits of row 14 left over from the previous word
+		src = *(in + 32 * 15 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 5U;
+		src = *(in + 32 * 16 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 16U;
+		src = *(in + 32 * 17 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 27U;
+		*(out + i) = tmp; // word 5 of lane i
+		out += 32;
+		src = *(in + 32 * 17 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp = src >> 5U; // high bits of row 17 left over from the previous word
+		src = *(in + 32 * 18 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 6U;
+		src = *(in + 32 * 19 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 17U;
+		src = *(in + 32 * 20 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp; // word 6 of lane i
+		out += 32;
+		src = *(in + 32 * 20 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp = src >> 4U; // high bits of row 20 left over from the previous word
+		src = *(in + 32 * 21 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 7U;
+		src = *(in + 32 * 22 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 18U;
+		src = *(in + 32 * 23 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 29U;
+		*(out + i) = tmp; // word 7 of lane i
+		out += 32;
+		src = *(in + 32 * 23 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp = src >> 3U; // high bits of row 23 left over from the previous word
+		src = *(in + 32 * 24 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 8U;
+		src = *(in + 32 * 25 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 19U;
+		src = *(in + 32 * 26 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 30U;
+		*(out + i) = tmp; // word 8 of lane i
+		out += 32;
+		src = *(in + 32 * 26 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp = src >> 2U; // high bits of row 26 left over from the previous word
+		src = *(in + 32 * 27 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 9U;
+		src = *(in + 32 * 28 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 20U;
+		src = *(in + 32 * 29 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 31U;
+		*(out + i) = tmp; // word 9 of lane i
+		out += 32;
+		src = *(in + 32 * 29 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp = src >> 1U; // high bits of row 29 left over from the previous word
+		src = *(in + 32 * 30 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 10U;
+		src = *(in + 32 * 31 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 21U;
+		*(out + i) = tmp; // word 10 of lane i
+		out -= 320; // undo the 10 advances of 32 so out is back at its starting position for the next lane
+	}
+}
+void static pack_12bit_32ow(const uint32_t* __restrict in, uint32_t* __restrict out) { // Packs the low 12 bits of 1024 inputs (32 rows x 32 lanes) into 12 output words per lane.
+	uint32_t tmp = 0U; // output word under construction; this 0U is overwritten before it is read
+	uint32_t src; // current input value
+	for (int i = 0; i < 32; i++) { // i selects the lane; row k of lane i is read from in[32 * k + i]
+		src = *(in + 32 * 0 + i);
+		src = src & ((1ULL << 12) - 1); // keep only the low 12 bits
+		tmp = src;
+		src = *(in + 32 * 1 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 12U;
+		src = *(in + 32 * 2 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 24U; // bits shifted past bit 31 are dropped here and re-emitted in the next word
+		*(out + i) = tmp; // word 0 of lane i
+		out += 32; // step to the next group of 32 output words
+		src = *(in + 32 * 2 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp = src >> 8U; // high bits of row 2 left over from the previous word
+		src = *(in + 32 * 3 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 4U;
+		src = *(in + 32 * 4 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 16U;
+		src = *(in + 32 * 5 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp; // word 1 of lane i
+		out += 32;
+		src = *(in + 32 * 5 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp = src >> 4U; // high bits of row 5 left over from the previous word
+		src = *(in + 32 * 6 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 8U;
+		src = *(in + 32 * 7 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp; // word 2 of lane i
+		out += 32;
+		src = *(in + 32 * 8 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp = src; // row 8 starts exactly on a word boundary, so there is no carry-over
+		src = *(in + 32 * 9 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 12U;
+		src = *(in + 32 * 10 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp; // word 3 of lane i
+		out += 32;
+		src = *(in + 32 * 10 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp = src >> 8U; // high bits of row 10 left over from the previous word
+		src = *(in + 32 * 11 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 4U;
+		src = *(in + 32 * 12 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 16U;
+		src = *(in + 32 * 13 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp; // word 4 of lane i
+		out += 32;
+		src = *(in + 32 * 13 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp = src >> 4U; // high bits of row 13 left over from the previous word
+		src = *(in + 32 * 14 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 8U;
+		src = *(in + 32 * 15 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp; // word 5 of lane i
+		out += 32;
+		src = *(in + 32 * 16 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp = src; // row 16 starts exactly on a word boundary, so there is no carry-over
+		src = *(in + 32 * 17 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 12U;
+		src = *(in + 32 * 18 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp; // word 6 of lane i
+		out += 32;
+		src = *(in + 32 * 18 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp = src >> 8U; // high bits of row 18 left over from the previous word
+		src = *(in + 32 * 19 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 4U;
+		src = *(in + 32 * 20 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 16U;
+		src = *(in + 32 * 21 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp; // word 7 of lane i
+		out += 32;
+		src = *(in + 32 * 21 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp = src >> 4U; // high bits of row 21 left over from the previous word
+		src = *(in + 32 * 22 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 8U;
+		src = *(in + 32 * 23 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp; // word 8 of lane i
+		out += 32;
+		src = *(in + 32 * 24 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp = src; // row 24 starts exactly on a word boundary, so there is no carry-over
+		src = *(in + 32 * 25 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 12U;
+		src = *(in + 32 * 26 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp; // word 9 of lane i
+		out += 32;
+		src = *(in + 32 * 26 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp = src >> 8U; // high bits of row 26 left over from the previous word
+		src = *(in + 32 * 27 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 4U;
+		src = *(in + 32 * 28 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 16U;
+		src = *(in + 32 * 29 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp; // word 10 of lane i
+		out += 32;
+		src = *(in + 32 * 29 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp = src >> 4U; // high bits of row 29 left over from the previous word
+		src = *(in + 32 * 30 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 8U;
+		src = *(in + 32 * 31 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp; // word 11 of lane i
+		out -= 352; // undo the 11 advances of 32 so out is back at its starting position for the next lane
+	}
+}
+void static pack_13bit_32ow(const uint32_t* __restrict in, uint32_t* __restrict out) { // Packs the low 13 bits of 1024 inputs (32 rows x 32 lanes) into 13 output words per lane.
+	uint32_t tmp = 0U; // output word under construction; this 0U is overwritten before it is read
+	uint32_t src; // current input value
+	for (int i = 0; i < 32; i++) { // i selects the lane; row k of lane i is read from in[32 * k + i]
+		src = *(in + 32 * 0 + i);
+		src = src & ((1ULL << 13) - 1); // keep only the low 13 bits
+		tmp = src;
+		src = *(in + 32 * 1 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 13U;
+		src = *(in + 32 * 2 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 26U; // bits shifted past bit 31 are dropped here and re-emitted in the next word
+		*(out + i) = tmp; // word 0 of lane i
+		out += 32; // step to the next group of 32 output words
+		src = *(in + 32 * 2 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp = src >> 6U; // high bits of row 2 left over from the previous word
+		src = *(in + 32 * 3 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 7U;
+		src = *(in + 32 * 4 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp; // word 1 of lane i
+		out += 32;
+		src = *(in + 32 * 4 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp = src >> 12U; // high bits of row 4 left over from the previous word
+		src = *(in + 32 * 5 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 1U;
+		src = *(in + 32 * 6 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 14U;
+		src = *(in + 32 * 7 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 27U;
+		*(out + i) = tmp; // word 2 of lane i
+		out += 32;
+		src = *(in + 32 * 7 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp = src >> 5U; // high bits of row 7 left over from the previous word
+		src = *(in + 32 * 8 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 8U;
+		src = *(in + 32 * 9 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 21U;
+		*(out + i) = tmp; // word 3 of lane i
+		out += 32;
+		src = *(in + 32 * 9 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp = src >> 11U; // high bits of row 9 left over from the previous word
+		src = *(in + 32 * 10 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 2U;
+		src = *(in + 32 * 11 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 15U;
+		src = *(in + 32 * 12 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp; // word 4 of lane i
+		out += 32;
+		src = *(in + 32 * 12 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp = src >> 4U; // high bits of row 12 left over from the previous word
+		src = *(in + 32 * 13 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 9U;
+		src = *(in + 32 * 14 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 22U;
+		*(out + i) = tmp; // word 5 of lane i
+		out += 32;
+		src = *(in + 32 * 14 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp = src >> 10U; // high bits of row 14 left over from the previous word
+		src = *(in + 32 * 15 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 3U;
+		src = *(in + 32 * 16 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 16U;
+		src = *(in + 32 * 17 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 29U;
+		*(out + i) = tmp; // word 6 of lane i
+		out += 32;
+		src = *(in + 32 * 17 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp = src >> 3U; // high bits of row 17 left over from the previous word
+		src = *(in + 32 * 18 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 10U;
+		src = *(in + 32 * 19 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 23U;
+		*(out + i) = tmp; // word 7 of lane i
+		out += 32;
+		src = *(in + 32 * 19 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp = src >> 9U; // high bits of row 19 left over from the previous word
+		src = *(in + 32 * 20 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 4U;
+		src = *(in + 32 * 21 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 17U;
+		src = *(in + 32 * 22 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 30U;
+		*(out + i) = tmp; // word 8 of lane i
+		out += 32;
+		src = *(in + 32 * 22 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp = src >> 2U; // high bits of row 22 left over from the previous word
+		src = *(in + 32 * 23 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 11U;
+		src = *(in + 32 * 24 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp; // word 9 of lane i
+		out += 32;
+		src = *(in + 32 * 24 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp = src >> 8U; // high bits of row 24 left over from the previous word
+		src = *(in + 32 * 25 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 5U;
+		src = *(in + 32 * 26 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 18U;
+		src = *(in + 32 * 27 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 31U;
+		*(out + i) = tmp; // word 10 of lane i
+		out += 32;
+		src = *(in + 32 * 27 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp = src >> 1U; // high bits of row 27 left over from the previous word
+		src = *(in + 32 * 28 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 12U;
+		src = *(in + 32 * 29 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 25U;
+		*(out + i) = tmp; // word 11 of lane i
+		out += 32;
+		src = *(in + 32 * 29 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp = src >> 7U; // high bits of row 29 left over from the previous word
+		src = *(in + 32 * 30 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 6U;
+		src = *(in + 32 * 31 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 19U;
+		*(out + i) = tmp; // word 12 of lane i
+		out -= 384; // undo the 12 advances of 32 so out is back at its starting position for the next lane
+	}
+}
+void static pack_14bit_32ow(const uint32_t* __restrict in, uint32_t* __restrict out) { // Packs the low 14 bits of 1024 inputs (32 rows x 32 lanes) into 14 output words per lane.
+	uint32_t tmp = 0U; // output word under construction; this 0U is overwritten before it is read
+	uint32_t src; // current input value
+	for (int i = 0; i < 32; i++) { // i selects the lane; row k of lane i is read from in[32 * k + i]
+		src = *(in + 32 * 0 + i);
+		src = src & ((1ULL << 14) - 1); // keep only the low 14 bits
+		tmp = src;
+		src = *(in + 32 * 1 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 14U;
+		src = *(in + 32 * 2 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 28U; // bits shifted past bit 31 are dropped here and re-emitted in the next word
+		*(out + i) = tmp; // word 0 of lane i
+		out += 32; // step to the next group of 32 output words
+		src = *(in + 32 * 2 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp = src >> 4U; // high bits of row 2 left over from the previous word
+		src = *(in + 32 * 3 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 10U;
+		src = *(in + 32 * 4 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp; // word 1 of lane i
+		out += 32;
+		src = *(in + 32 * 4 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp = src >> 8U; // high bits of row 4 left over from the previous word
+		src = *(in + 32 * 5 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 6U;
+		src = *(in + 32 * 6 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp; // word 2 of lane i
+		out += 32;
+		src = *(in + 32 * 6 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp = src >> 12U; // high bits of row 6 left over from the previous word
+		src = *(in + 32 * 7 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 2U;
+		src = *(in + 32 * 8 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 16U;
+		src = *(in + 32 * 9 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 30U;
+		*(out + i) = tmp; // word 3 of lane i
+		out += 32;
+		src = *(in + 32 * 9 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp = src >> 2U; // high bits of row 9 left over from the previous word
+		src = *(in + 32 * 10 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 12U;
+		src = *(in + 32 * 11 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 26U;
+		*(out + i) = tmp; // word 4 of lane i
+		out += 32;
+		src = *(in + 32 * 11 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp = src >> 6U; // high bits of row 11 left over from the previous word
+		src = *(in + 32 * 12 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 8U;
+		src = *(in + 32 * 13 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 22U;
+		*(out + i) = tmp; // word 5 of lane i
+		out += 32;
+		src = *(in + 32 * 13 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp = src >> 10U; // high bits of row 13 left over from the previous word
+		src = *(in + 32 * 14 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 4U;
+		src = *(in + 32 * 15 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 18U;
+		*(out + i) = tmp; // word 6 of lane i
+		out += 32;
+		src = *(in + 32 * 16 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp = src; // row 16 starts exactly on a word boundary, so there is no carry-over
+		src = *(in + 32 * 17 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 14U;
+		src = *(in + 32 * 18 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp; // word 7 of lane i
+		out += 32;
+		src = *(in + 32 * 18 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp = src >> 4U; // high bits of row 18 left over from the previous word
+		src = *(in + 32 * 19 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 10U;
+		src = *(in + 32 * 20 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp; // word 8 of lane i
+		out += 32;
+		src = *(in + 32 * 20 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp = src >> 8U; // high bits of row 20 left over from the previous word
+		src = *(in + 32 * 21 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 6U;
+		src = *(in + 32 * 22 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp; // word 9 of lane i
+		out += 32;
+		src = *(in + 32 * 22 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp = src >> 12U; // high bits of row 22 left over from the previous word
+		src = *(in + 32 * 23 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 2U;
+		src = *(in + 32 * 24 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 16U;
+		src = *(in + 32 * 25 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 30U;
+		*(out + i) = tmp; // word 10 of lane i
+		out += 32;
+		src = *(in + 32 * 25 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp = src >> 2U; // high bits of row 25 left over from the previous word
+		src = *(in + 32 * 26 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 12U;
+		src = *(in + 32 * 27 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 26U;
+		*(out + i) = tmp; // word 11 of lane i
+		out += 32;
+		src = *(in + 32 * 27 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp = src >> 6U; // high bits of row 27 left over from the previous word
+		src = *(in + 32 * 28 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 8U;
+		src = *(in + 32 * 29 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 22U;
+		*(out + i) = tmp; // word 12 of lane i
+		out += 32;
+		src = *(in + 32 * 29 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp = src >> 10U; // high bits of row 29 left over from the previous word
+		src = *(in + 32 * 30 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 4U;
+		src = *(in + 32 * 31 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 18U;
+		*(out + i) = tmp; // word 13 of lane i
+		out -= 416; // undo the 13 advances of 32 so out is back at its starting position for the next lane
+	}
+}
+void static pack_15bit_32ow(const uint32_t* __restrict in, uint32_t* __restrict out) { // Packs the low 15 bits of 1024 inputs (32 rows x 32 lanes) into 15 output words per lane.
+	uint32_t tmp = 0U; // output word under construction; this 0U is overwritten before it is read
+	uint32_t src; // current input value
+	for (int i = 0; i < 32; i++) { // i selects the lane; row k of lane i is read from in[32 * k + i]
+		src = *(in + 32 * 0 + i);
+		src = src & ((1ULL << 15) - 1); // keep only the low 15 bits
+		tmp = src;
+		src = *(in + 32 * 1 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 15U;
+		src = *(in + 32 * 2 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 30U; // bits shifted past bit 31 are dropped here and re-emitted in the next word
+		*(out + i) = tmp; // word 0 of lane i
+		out += 32; // step to the next group of 32 output words
+		src = *(in + 32 * 2 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp = src >> 2U; // high bits of row 2 left over from the previous word
+		src = *(in + 32 * 3 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 13U;
+		src = *(in + 32 * 4 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp; // word 1 of lane i
+		out += 32;
+		src = *(in + 32 * 4 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp = src >> 4U; // high bits of row 4 left over from the previous word
+		src = *(in + 32 * 5 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 11U;
+		src = *(in + 32 * 6 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 26U;
+		*(out + i) = tmp; // word 2 of lane i
+		out += 32;
+		src = *(in + 32 * 6 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp = src >> 6U; // high bits of row 6 left over from the previous word
+		src = *(in + 32 * 7 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 9U;
+		src = *(in + 32 * 8 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp; // word 3 of lane i
+		out += 32;
+		src = *(in + 32 * 8 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp = src >> 8U; // high bits of row 8 left over from the previous word
+		src = *(in + 32 * 9 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 7U;
+		src = *(in + 32 * 10 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 22U;
+		*(out + i) = tmp; // word 4 of lane i
+		out += 32;
+		src = *(in + 32 * 10 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp = src >> 10U; // high bits of row 10 left over from the previous word
+		src = *(in + 32 * 11 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 5U;
+		src = *(in + 32 * 12 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp; // word 5 of lane i
+		out += 32;
+		src = *(in + 32 * 12 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp = src >> 12U; // high bits of row 12 left over from the previous word
+		src = *(in + 32 * 13 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 3U;
+		src = *(in + 32 * 14 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 18U;
+		*(out + i) = tmp; // word 6 of lane i
+		out += 32;
+		src = *(in + 32 * 14 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp = src >> 14U; // high bits of row 14 left over from the previous word
+		src = *(in + 32 * 15 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 1U;
+		src = *(in + 32 * 16 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 16U;
+		src = *(in + 32 * 17 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 31U;
+		*(out + i) = tmp; // word 7 of lane i
+		out += 32;
+		src = *(in + 32 * 17 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp = src >> 1U; // high bits of row 17 left over from the previous word
+		src = *(in + 32 * 18 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 14U;
+		src = *(in + 32 * 19 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 29U;
+		*(out + i) = tmp; // word 8 of lane i
+		out += 32;
+		src = *(in + 32 * 19 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp = src >> 3U; // high bits of row 19 left over from the previous word
+		src = *(in + 32 * 20 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 12U;
+		src = *(in + 32 * 21 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 27U;
+		*(out + i) = tmp; // word 9 of lane i
+		out += 32;
+		src = *(in + 32 * 21 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp = src >> 5U; // high bits of row 21 left over from the previous word
+		src = *(in + 32 * 22 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 10U;
+		src = *(in + 32 * 23 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 25U;
+		*(out + i) = tmp; // word 10 of lane i
+		out += 32;
+		src = *(in + 32 * 23 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp = src >> 7U; // high bits of row 23 left over from the previous word
+		src = *(in + 32 * 24 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 8U;
+		src = *(in + 32 * 25 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 23U;
+		*(out + i) = tmp; // word 11 of lane i
+		out += 32;
+		src = *(in + 32 * 25 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp = src >> 9U; // high bits of row 25 left over from the previous word
+		src = *(in + 32 * 26 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 6U;
+		src = *(in + 32 * 27 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 21U;
+		*(out + i) = tmp; // word 12 of lane i
+		out += 32;
+		src = *(in + 32 * 27 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp = src >> 11U; // high bits of row 27 left over from the previous word
+		src = *(in + 32 * 28 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 4U;
+		src = *(in + 32 * 29 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 19U;
+		*(out + i) = tmp; // word 13 of lane i
+		out += 32;
+		src = *(in + 32 * 29 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp = src >> 13U; // high bits of row 29 left over from the previous word
+		src = *(in + 32 * 30 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 2U;
+		src = *(in + 32 * 31 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 17U;
+		*(out + i) = tmp; // word 14 of lane i
+		out -= 448; // undo the 14 advances of 32 so out is back at its starting position for the next lane
+	}
+}
+void static pack_16bit_32ow(const uint32_t* __restrict in, uint32_t* __restrict out) { // Packs the low 16 bits of 1024 inputs (32 rows x 32 lanes) into 16 output words per lane, two values per word.
+	uint32_t tmp = 0U; // output word under construction; this 0U is overwritten before it is read
+	uint32_t src; // current input value
+	for (int i = 0; i < 32; i++) { // i selects the lane; row k of lane i is read from in[32 * k + i]
+		src = *(in + 32 * 0 + i);
+		src = src & ((1ULL << 16) - 1); // keep only the low 16 bits
+		tmp = src; // even row goes in the low half of the word
+		src = *(in + 32 * 1 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 16U; // odd row goes in the high half; no value straddles two words at this width
+		*(out + i) = tmp; // word 0 of lane i
+		out += 32; // step to the next group of 32 output words
+		src = *(in + 32 * 2 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp = src;
+		src = *(in + 32 * 3 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp; // word 1 of lane i
+		out += 32;
+		src = *(in + 32 * 4 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp = src;
+		src = *(in + 32 * 5 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp; // word 2 of lane i
+		out += 32;
+		src = *(in + 32 * 6 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp = src;
+		src = *(in + 32 * 7 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp; // word 3 of lane i
+		out += 32;
+		src = *(in + 32 * 8 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp = src;
+		src = *(in + 32 * 9 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp; // word 4 of lane i
+		out += 32;
+		src = *(in + 32 * 10 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp = src;
+		src = *(in + 32 * 11 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp; // word 5 of lane i
+		out += 32;
+		src = *(in + 32 * 12 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp = src;
+		src = *(in + 32 * 13 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp; // word 6 of lane i
+		out += 32;
+		src = *(in + 32 * 14 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp = src;
+		src = *(in + 32 * 15 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp; // word 7 of lane i
+		out += 32;
+		src = *(in + 32 * 16 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp = src;
+		src = *(in + 32 * 17 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp; // word 8 of lane i
+		out += 32;
+		src = *(in + 32 * 18 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp = src;
+		src = *(in + 32 * 19 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp; // word 9 of lane i
+		out += 32;
+		src = *(in + 32 * 20 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp = src;
+		src = *(in + 32 * 21 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp; // word 10 of lane i
+		out += 32;
+		src = *(in + 32 * 22 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp = src;
+		src = *(in + 32 * 23 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp; // word 11 of lane i
+		out += 32;
+		src = *(in + 32 * 24 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp = src;
+		src = *(in + 32 * 25 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp; // word 12 of lane i
+		out += 32;
+		src = *(in + 32 * 26 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp = src;
+		src = *(in + 32 * 27 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp; // word 13 of lane i
+		out += 32;
+		src = *(in + 32 * 28 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp = src;
+		src = *(in + 32 * 29 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp; // word 14 of lane i
+		out += 32;
+		src = *(in + 32 * 30 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp = src;
+		src = *(in + 32 * 31 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp; // word 15 of lane i
+		out -= 480; // undo the 15 advances of 32 so out is back at its starting position for the next lane
+	}
+}
+void static pack_17bit_32ow(const uint32_t* __restrict in, uint32_t* __restrict out) { // Bit-packs the low 17 bits of in[0..1023] into out[0..543]. For each column i (0..31), "value k" below means in[32*k + i] (k = 0..31); the 32 values are concatenated LSB-first into the 17 words out[32*w + i] (w = 0..16).
+	uint32_t tmp = 0U; // output word being assembled; the initial 0 is always overwritten before it is stored
+	uint32_t src; // current input value (masked to 17 bits before use)
+	for (int i = 0; i < 32; i++) { // one iteration per column; each iteration reads 32 values and writes 17 words
+		src = *(in + 32 * 0 + i);
+		src = src & ((1ULL << 17) - 1); // keep only the low 17 bits; higher input bits are discarded
+		tmp = src;
+		src = *(in + 32 * 1 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 17U; // only the low 15 bits of value 1 fit; bits shifted past bit 31 are lost here
+		*(out + i) = tmp; // word 0
+		out += 32; // the next word of this column is 32 elements further on
+		src = *(in + 32 * 1 + i); // value 1 is re-read to recover the bits that did not fit in word 0
+		src = src & ((1ULL << 17) - 1);
+		tmp = src >> 15U; // carry: top 2 bits of value 1
+		src = *(in + 32 * 2 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 2U;
+		src = *(in + 32 * 3 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 19U;
+		*(out + i) = tmp; // word 1
+		out += 32;
+		src = *(in + 32 * 3 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp = src >> 13U; // carry: top 4 bits of value 3
+		src = *(in + 32 * 4 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 4U;
+		src = *(in + 32 * 5 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 21U;
+		*(out + i) = tmp; // word 2
+		out += 32;
+		src = *(in + 32 * 5 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp = src >> 11U; // carry: top 6 bits of value 5
+		src = *(in + 32 * 6 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 6U;
+		src = *(in + 32 * 7 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 23U;
+		*(out + i) = tmp; // word 3
+		out += 32;
+		src = *(in + 32 * 7 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp = src >> 9U; // carry: top 8 bits of value 7
+		src = *(in + 32 * 8 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 8U;
+		src = *(in + 32 * 9 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 25U;
+		*(out + i) = tmp; // word 4
+		out += 32;
+		src = *(in + 32 * 9 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp = src >> 7U; // carry: top 10 bits of value 9
+		src = *(in + 32 * 10 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 10U;
+		src = *(in + 32 * 11 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 27U;
+		*(out + i) = tmp; // word 5
+		out += 32;
+		src = *(in + 32 * 11 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp = src >> 5U; // carry: top 12 bits of value 11
+		src = *(in + 32 * 12 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 12U;
+		src = *(in + 32 * 13 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 29U;
+		*(out + i) = tmp; // word 6
+		out += 32;
+		src = *(in + 32 * 13 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp = src >> 3U; // carry: top 14 bits of value 13
+		src = *(in + 32 * 14 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 14U;
+		src = *(in + 32 * 15 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 31U;
+		*(out + i) = tmp; // word 7
+		out += 32;
+		src = *(in + 32 * 15 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp = src >> 1U; // carry: top 16 bits of value 15
+		src = *(in + 32 * 16 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp; // word 8
+		out += 32;
+		src = *(in + 32 * 16 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp = src >> 16U; // carry: top 1 bit of value 16
+		src = *(in + 32 * 17 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 1U;
+		src = *(in + 32 * 18 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 18U;
+		*(out + i) = tmp; // word 9
+		out += 32;
+		src = *(in + 32 * 18 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp = src >> 14U; // carry: top 3 bits of value 18
+		src = *(in + 32 * 19 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 3U;
+		src = *(in + 32 * 20 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp; // word 10
+		out += 32;
+		src = *(in + 32 * 20 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp = src >> 12U; // carry: top 5 bits of value 20
+		src = *(in + 32 * 21 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 5U;
+		src = *(in + 32 * 22 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 22U;
+		*(out + i) = tmp; // word 11
+		out += 32;
+		src = *(in + 32 * 22 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp = src >> 10U; // carry: top 7 bits of value 22
+		src = *(in + 32 * 23 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 7U;
+		src = *(in + 32 * 24 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp; // word 12
+		out += 32;
+		src = *(in + 32 * 24 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp = src >> 8U; // carry: top 9 bits of value 24
+		src = *(in + 32 * 25 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 9U;
+		src = *(in + 32 * 26 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 26U;
+		*(out + i) = tmp; // word 13
+		out += 32;
+		src = *(in + 32 * 26 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp = src >> 6U; // carry: top 11 bits of value 26
+		src = *(in + 32 * 27 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 11U;
+		src = *(in + 32 * 28 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp; // word 14
+		out += 32;
+		src = *(in + 32 * 28 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp = src >> 4U; // carry: top 13 bits of value 28
+		src = *(in + 32 * 29 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 13U;
+		src = *(in + 32 * 30 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 30U;
+		*(out + i) = tmp; // word 15
+		out += 32;
+		src = *(in + 32 * 30 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp = src >> 2U; // carry: top 15 bits of value 30
+		src = *(in + 32 * 31 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 15U; // value 31 fills bits 15..31 exactly
+		*(out + i) = tmp; // word 16 (last)
+		out -= 512; // undo the 16 advances of 32 so the next column starts again at word 0
+	}
+}
+void static pack_18bit_32ow(const uint32_t* __restrict in, uint32_t* __restrict out) { // Bit-packs the low 18 bits of in[0..1023] into out[0..575]. For each column i (0..31), "value k" below means in[32*k + i] (k = 0..31); the 32 values are concatenated LSB-first into the 18 words out[32*w + i] (w = 0..17).
+	uint32_t tmp = 0U; // output word being assembled; the initial 0 is always overwritten before it is stored
+	uint32_t src; // current input value (masked to 18 bits before use)
+	for (int i = 0; i < 32; i++) { // one iteration per column; each iteration reads 32 values and writes 18 words
+		src = *(in + 32 * 0 + i);
+		src = src & ((1ULL << 18) - 1); // keep only the low 18 bits; higher input bits are discarded
+		tmp = src;
+		src = *(in + 32 * 1 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 18U; // only the low 14 bits of value 1 fit; bits shifted past bit 31 are lost here
+		*(out + i) = tmp; // word 0
+		out += 32; // the next word of this column is 32 elements further on
+		src = *(in + 32 * 1 + i); // value 1 is re-read to recover the bits that did not fit in word 0
+		src = src & ((1ULL << 18) - 1);
+		tmp = src >> 14U; // carry: top 4 bits of value 1
+		src = *(in + 32 * 2 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 4U;
+		src = *(in + 32 * 3 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 22U;
+		*(out + i) = tmp; // word 1
+		out += 32;
+		src = *(in + 32 * 3 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp = src >> 10U; // carry: top 8 bits of value 3
+		src = *(in + 32 * 4 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 8U;
+		src = *(in + 32 * 5 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 26U;
+		*(out + i) = tmp; // word 2
+		out += 32;
+		src = *(in + 32 * 5 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp = src >> 6U; // carry: top 12 bits of value 5
+		src = *(in + 32 * 6 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 12U;
+		src = *(in + 32 * 7 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 30U;
+		*(out + i) = tmp; // word 3
+		out += 32;
+		src = *(in + 32 * 7 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp = src >> 2U; // carry: top 16 bits of value 7
+		src = *(in + 32 * 8 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp; // word 4
+		out += 32;
+		src = *(in + 32 * 8 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp = src >> 16U; // carry: top 2 bits of value 8
+		src = *(in + 32 * 9 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 2U;
+		src = *(in + 32 * 10 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp; // word 5
+		out += 32;
+		src = *(in + 32 * 10 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp = src >> 12U; // carry: top 6 bits of value 10
+		src = *(in + 32 * 11 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 6U;
+		src = *(in + 32 * 12 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp; // word 6
+		out += 32;
+		src = *(in + 32 * 12 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp = src >> 8U; // carry: top 10 bits of value 12
+		src = *(in + 32 * 13 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 10U;
+		src = *(in + 32 * 14 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp; // word 7
+		out += 32;
+		src = *(in + 32 * 14 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp = src >> 4U; // carry: top 14 bits of value 14
+		src = *(in + 32 * 15 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 14U; // value 15 fills bits 14..31 exactly, so nothing is carried
+		*(out + i) = tmp; // word 8
+		out += 32;
+		src = *(in + 32 * 16 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp = src; // value 16 starts at bit 0; the shift pattern of words 0..8 repeats for values 16..31
+		src = *(in + 32 * 17 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 18U;
+		*(out + i) = tmp; // word 9
+		out += 32;
+		src = *(in + 32 * 17 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp = src >> 14U; // carry: top 4 bits of value 17
+		src = *(in + 32 * 18 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 4U;
+		src = *(in + 32 * 19 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 22U;
+		*(out + i) = tmp; // word 10
+		out += 32;
+		src = *(in + 32 * 19 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp = src >> 10U; // carry: top 8 bits of value 19
+		src = *(in + 32 * 20 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 8U;
+		src = *(in + 32 * 21 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 26U;
+		*(out + i) = tmp; // word 11
+		out += 32;
+		src = *(in + 32 * 21 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp = src >> 6U; // carry: top 12 bits of value 21
+		src = *(in + 32 * 22 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 12U;
+		src = *(in + 32 * 23 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 30U;
+		*(out + i) = tmp; // word 12
+		out += 32;
+		src = *(in + 32 * 23 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp = src >> 2U; // carry: top 16 bits of value 23
+		src = *(in + 32 * 24 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp; // word 13
+		out += 32;
+		src = *(in + 32 * 24 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp = src >> 16U; // carry: top 2 bits of value 24
+		src = *(in + 32 * 25 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 2U;
+		src = *(in + 32 * 26 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp; // word 14
+		out += 32;
+		src = *(in + 32 * 26 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp = src >> 12U; // carry: top 6 bits of value 26
+		src = *(in + 32 * 27 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 6U;
+		src = *(in + 32 * 28 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp; // word 15
+		out += 32;
+		src = *(in + 32 * 28 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp = src >> 8U; // carry: top 10 bits of value 28
+		src = *(in + 32 * 29 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 10U;
+		src = *(in + 32 * 30 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp; // word 16
+		out += 32;
+		src = *(in + 32 * 30 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp = src >> 4U; // carry: top 14 bits of value 30
+		src = *(in + 32 * 31 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 14U; // value 31 fills bits 14..31 exactly
+		*(out + i) = tmp; // word 17 (last)
+		out -= 544; // undo the 17 advances of 32 so the next column starts again at word 0
+	}
+}
+void static pack_19bit_32ow(const uint32_t* __restrict in, uint32_t* __restrict out) { // Bit-packs the low 19 bits of in[0..1023] into out[0..607]. For each column i (0..31), "value k" below means in[32*k + i] (k = 0..31); the 32 values are concatenated LSB-first into the 19 words out[32*w + i] (w = 0..18).
+	uint32_t tmp = 0U; // output word being assembled; the initial 0 is always overwritten before it is stored
+	uint32_t src; // current input value (masked to 19 bits before use)
+	for (int i = 0; i < 32; i++) { // one iteration per column; each iteration reads 32 values and writes 19 words
+		src = *(in + 32 * 0 + i);
+		src = src & ((1ULL << 19) - 1); // keep only the low 19 bits; higher input bits are discarded
+		tmp = src;
+		src = *(in + 32 * 1 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 19U; // only the low 13 bits of value 1 fit; bits shifted past bit 31 are lost here
+		*(out + i) = tmp; // word 0
+		out += 32; // the next word of this column is 32 elements further on
+		src = *(in + 32 * 1 + i); // value 1 is re-read to recover the bits that did not fit in word 0
+		src = src & ((1ULL << 19) - 1);
+		tmp = src >> 13U; // carry: top 6 bits of value 1
+		src = *(in + 32 * 2 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 6U;
+		src = *(in + 32 * 3 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 25U;
+		*(out + i) = tmp; // word 1
+		out += 32;
+		src = *(in + 32 * 3 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp = src >> 7U; // carry: top 12 bits of value 3
+		src = *(in + 32 * 4 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 12U;
+		src = *(in + 32 * 5 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 31U;
+		*(out + i) = tmp; // word 2
+		out += 32;
+		src = *(in + 32 * 5 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp = src >> 1U; // carry: top 18 bits of value 5
+		src = *(in + 32 * 6 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 18U;
+		*(out + i) = tmp; // word 3
+		out += 32;
+		src = *(in + 32 * 6 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp = src >> 14U; // carry: top 5 bits of value 6
+		src = *(in + 32 * 7 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 5U;
+		src = *(in + 32 * 8 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp; // word 4
+		out += 32;
+		src = *(in + 32 * 8 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp = src >> 8U; // carry: top 11 bits of value 8
+		src = *(in + 32 * 9 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 11U;
+		src = *(in + 32 * 10 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 30U;
+		*(out + i) = tmp; // word 5
+		out += 32;
+		src = *(in + 32 * 10 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp = src >> 2U; // carry: top 17 bits of value 10
+		src = *(in + 32 * 11 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 17U;
+		*(out + i) = tmp; // word 6
+		out += 32;
+		src = *(in + 32 * 11 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp = src >> 15U; // carry: top 4 bits of value 11
+		src = *(in + 32 * 12 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 4U;
+		src = *(in + 32 * 13 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 23U;
+		*(out + i) = tmp; // word 7
+		out += 32;
+		src = *(in + 32 * 13 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp = src >> 9U; // carry: top 10 bits of value 13
+		src = *(in + 32 * 14 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 10U;
+		src = *(in + 32 * 15 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 29U;
+		*(out + i) = tmp; // word 8
+		out += 32;
+		src = *(in + 32 * 15 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp = src >> 3U; // carry: top 16 bits of value 15
+		src = *(in + 32 * 16 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp; // word 9
+		out += 32;
+		src = *(in + 32 * 16 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp = src >> 16U; // carry: top 3 bits of value 16
+		src = *(in + 32 * 17 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 3U;
+		src = *(in + 32 * 18 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 22U;
+		*(out + i) = tmp; // word 10
+		out += 32;
+		src = *(in + 32 * 18 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp = src >> 10U; // carry: top 9 bits of value 18
+		src = *(in + 32 * 19 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 9U;
+		src = *(in + 32 * 20 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp; // word 11
+		out += 32;
+		src = *(in + 32 * 20 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp = src >> 4U; // carry: top 15 bits of value 20
+		src = *(in + 32 * 21 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 15U;
+		*(out + i) = tmp; // word 12
+		out += 32;
+		src = *(in + 32 * 21 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp = src >> 17U; // carry: top 2 bits of value 21
+		src = *(in + 32 * 22 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 2U;
+		src = *(in + 32 * 23 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 21U;
+		*(out + i) = tmp; // word 13
+		out += 32;
+		src = *(in + 32 * 23 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp = src >> 11U; // carry: top 8 bits of value 23
+		src = *(in + 32 * 24 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 8U;
+		src = *(in + 32 * 25 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 27U;
+		*(out + i) = tmp; // word 14
+		out += 32;
+		src = *(in + 32 * 25 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp = src >> 5U; // carry: top 14 bits of value 25
+		src = *(in + 32 * 26 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 14U;
+		*(out + i) = tmp; // word 15
+		out += 32;
+		src = *(in + 32 * 26 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp = src >> 18U; // carry: top 1 bit of value 26
+		src = *(in + 32 * 27 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 1U;
+		src = *(in + 32 * 28 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp; // word 16
+		out += 32;
+		src = *(in + 32 * 28 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp = src >> 12U; // carry: top 7 bits of value 28
+		src = *(in + 32 * 29 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 7U;
+		src = *(in + 32 * 30 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 26U;
+		*(out + i) = tmp; // word 17
+		out += 32;
+		src = *(in + 32 * 30 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp = src >> 6U; // carry: top 13 bits of value 30
+		src = *(in + 32 * 31 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 13U; // value 31 fills bits 13..31 exactly
+		*(out + i) = tmp; // word 18 (last)
+		out -= 576; // undo the 18 advances of 32 so the next column starts again at word 0
+	}
+}
+void static pack_20bit_32ow(const uint32_t* __restrict in, uint32_t* __restrict out) { // Bit-packs the low 20 bits of in[0..1023] into out[0..639]. For each column i (0..31), "value k" below means in[32*k + i] (k = 0..31); the 32 values are concatenated LSB-first into the 20 words out[32*w + i] (w = 0..19).
+	uint32_t tmp = 0U; // output word being assembled; the initial 0 is always overwritten before it is stored
+	uint32_t src; // current input value (masked to 20 bits before use)
+	for (int i = 0; i < 32; i++) { // one iteration per column; each iteration reads 32 values and writes 20 words
+		src = *(in + 32 * 0 + i);
+		src = src & ((1ULL << 20) - 1); // keep only the low 20 bits; higher input bits are discarded
+		tmp = src;
+		src = *(in + 32 * 1 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 20U; // only the low 12 bits of value 1 fit; bits shifted past bit 31 are lost here
+		*(out + i) = tmp; // word 0
+		out += 32; // the next word of this column is 32 elements further on
+		src = *(in + 32 * 1 + i); // value 1 is re-read to recover the bits that did not fit in word 0
+		src = src & ((1ULL << 20) - 1);
+		tmp = src >> 12U; // carry: top 8 bits of value 1
+		src = *(in + 32 * 2 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 8U;
+		src = *(in + 32 * 3 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp; // word 1
+		out += 32;
+		src = *(in + 32 * 3 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp = src >> 4U; // carry: top 16 bits of value 3
+		src = *(in + 32 * 4 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp; // word 2
+		out += 32;
+		src = *(in + 32 * 4 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp = src >> 16U; // carry: top 4 bits of value 4
+		src = *(in + 32 * 5 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 4U;
+		src = *(in + 32 * 6 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp; // word 3
+		out += 32;
+		src = *(in + 32 * 6 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp = src >> 8U; // carry: top 12 bits of value 6
+		src = *(in + 32 * 7 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 12U; // value 7 fills bits 12..31 exactly, so nothing is carried
+		*(out + i) = tmp; // word 4
+		out += 32;
+		src = *(in + 32 * 8 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp = src; // value 8 starts at bit 0; the 5-word shift pattern repeats for every group of 8 values
+		src = *(in + 32 * 9 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp; // word 5
+		out += 32;
+		src = *(in + 32 * 9 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp = src >> 12U; // carry: top 8 bits of value 9
+		src = *(in + 32 * 10 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 8U;
+		src = *(in + 32 * 11 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp; // word 6
+		out += 32;
+		src = *(in + 32 * 11 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp = src >> 4U; // carry: top 16 bits of value 11
+		src = *(in + 32 * 12 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp; // word 7
+		out += 32;
+		src = *(in + 32 * 12 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp = src >> 16U; // carry: top 4 bits of value 12
+		src = *(in + 32 * 13 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 4U;
+		src = *(in + 32 * 14 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp; // word 8
+		out += 32;
+		src = *(in + 32 * 14 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp = src >> 8U; // carry: top 12 bits of value 14
+		src = *(in + 32 * 15 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 12U;
+		*(out + i) = tmp; // word 9
+		out += 32;
+		src = *(in + 32 * 16 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp = src; // value 16 starts at bit 0
+		src = *(in + 32 * 17 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp; // word 10
+		out += 32;
+		src = *(in + 32 * 17 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp = src >> 12U; // carry: top 8 bits of value 17
+		src = *(in + 32 * 18 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 8U;
+		src = *(in + 32 * 19 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp; // word 11
+		out += 32;
+		src = *(in + 32 * 19 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp = src >> 4U; // carry: top 16 bits of value 19
+		src = *(in + 32 * 20 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp; // word 12
+		out += 32;
+		src = *(in + 32 * 20 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp = src >> 16U; // carry: top 4 bits of value 20
+		src = *(in + 32 * 21 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 4U;
+		src = *(in + 32 * 22 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp; // word 13
+		out += 32;
+		src = *(in + 32 * 22 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp = src >> 8U; // carry: top 12 bits of value 22
+		src = *(in + 32 * 23 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 12U;
+		*(out + i) = tmp; // word 14
+		out += 32;
+		src = *(in + 32 * 24 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp = src; // value 24 starts at bit 0
+		src = *(in + 32 * 25 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp; // word 15
+		out += 32;
+		src = *(in + 32 * 25 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp = src >> 12U; // carry: top 8 bits of value 25
+		src = *(in + 32 * 26 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 8U;
+		src = *(in + 32 * 27 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp; // word 16
+		out += 32;
+		src = *(in + 32 * 27 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp = src >> 4U; // carry: top 16 bits of value 27
+		src = *(in + 32 * 28 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp; // word 17
+		out += 32;
+		src = *(in + 32 * 28 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp = src >> 16U; // carry: top 4 bits of value 28
+		src = *(in + 32 * 29 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 4U;
+		src = *(in + 32 * 30 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp; // word 18
+		out += 32;
+		src = *(in + 32 * 30 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp = src >> 8U; // carry: top 12 bits of value 30
+		src = *(in + 32 * 31 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 12U; // value 31 fills bits 12..31 exactly
+		*(out + i) = tmp; // word 19 (last)
+		out -= 608; // undo the 19 advances of 32 so the next column starts again at word 0
+	}
+}
+void static pack_21bit_32ow(const uint32_t* __restrict in, uint32_t* __restrict out) { // Bit-packs the low 21 bits of in[0..1023] into out[0..671]. For each column i (0..31), "value k" below means in[32*k + i] (k = 0..31); the 32 values are concatenated LSB-first into the 21 words out[32*w + i] (w = 0..20).
+	uint32_t tmp = 0U; // output word being assembled; the initial 0 is always overwritten before it is stored
+	uint32_t src; // current input value (masked to 21 bits before use)
+	for (int i = 0; i < 32; i++) { // one iteration per column; each iteration reads 32 values and writes 21 words
+		src = *(in + 32 * 0 + i);
+		src = src & ((1ULL << 21) - 1); // keep only the low 21 bits; higher input bits are discarded
+		tmp = src;
+		src = *(in + 32 * 1 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 21U; // only the low 11 bits of value 1 fit; bits shifted past bit 31 are lost here
+		*(out + i) = tmp; // word 0
+		out += 32; // the next word of this column is 32 elements further on
+		src = *(in + 32 * 1 + i); // value 1 is re-read to recover the bits that did not fit in word 0
+		src = src & ((1ULL << 21) - 1);
+		tmp = src >> 11U; // carry: top 10 bits of value 1
+		src = *(in + 32 * 2 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 10U;
+		src = *(in + 32 * 3 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 31U;
+		*(out + i) = tmp; // word 1
+		out += 32;
+		src = *(in + 32 * 3 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp = src >> 1U; // carry: top 20 bits of value 3
+		src = *(in + 32 * 4 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp; // word 2
+		out += 32;
+		src = *(in + 32 * 4 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp = src >> 12U; // carry: top 9 bits of value 4
+		src = *(in + 32 * 5 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 9U;
+		src = *(in + 32 * 6 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 30U;
+		*(out + i) = tmp; // word 3
+		out += 32;
+		src = *(in + 32 * 6 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp = src >> 2U; // carry: top 19 bits of value 6
+		src = *(in + 32 * 7 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 19U;
+		*(out + i) = tmp; // word 4
+		out += 32;
+		src = *(in + 32 * 7 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp = src >> 13U; // carry: top 8 bits of value 7
+		src = *(in + 32 * 8 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 8U;
+		src = *(in + 32 * 9 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 29U;
+		*(out + i) = tmp; // word 5
+		out += 32;
+		src = *(in + 32 * 9 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp = src >> 3U; // carry: top 18 bits of value 9
+		src = *(in + 32 * 10 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 18U;
+		*(out + i) = tmp; // word 6
+		out += 32;
+		src = *(in + 32 * 10 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp = src >> 14U; // carry: top 7 bits of value 10
+		src = *(in + 32 * 11 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 7U;
+		src = *(in + 32 * 12 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp; // word 7
+		out += 32;
+		src = *(in + 32 * 12 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp = src >> 4U; // carry: top 17 bits of value 12
+		src = *(in + 32 * 13 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 17U;
+		*(out + i) = tmp; // word 8
+		out += 32;
+		src = *(in + 32 * 13 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp = src >> 15U; // carry: top 6 bits of value 13
+		src = *(in + 32 * 14 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 6U;
+		src = *(in + 32 * 15 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 27U;
+		*(out + i) = tmp; // word 9
+		out += 32;
+		src = *(in + 32 * 15 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp = src >> 5U; // carry: top 16 bits of value 15
+		src = *(in + 32 * 16 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp; // word 10
+		out += 32;
+		src = *(in + 32 * 16 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp = src >> 16U; // carry: top 5 bits of value 16
+		src = *(in + 32 * 17 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 5U;
+		src = *(in + 32 * 18 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 26U;
+		*(out + i) = tmp; // word 11
+		out += 32;
+		src = *(in + 32 * 18 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp = src >> 6U; // carry: top 15 bits of value 18
+		src = *(in + 32 * 19 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 15U;
+		*(out + i) = tmp; // word 12
+		out += 32;
+		src = *(in + 32 * 19 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp = src >> 17U; // carry: top 4 bits of value 19
+		src = *(in + 32 * 20 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 4U;
+		src = *(in + 32 * 21 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 25U;
+		*(out + i) = tmp; // word 13
+		out += 32;
+		src = *(in + 32 * 21 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp = src >> 7U; // carry: top 14 bits of value 21
+		src = *(in + 32 * 22 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 14U;
+		*(out + i) = tmp; // word 14
+		out += 32;
+		src = *(in + 32 * 22 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp = src >> 18U; // carry: top 3 bits of value 22
+		src = *(in + 32 * 23 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 3U;
+		src = *(in + 32 * 24 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp; // word 15
+		out += 32;
+		src = *(in + 32 * 24 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp = src >> 8U; // carry: top 13 bits of value 24
+		src = *(in + 32 * 25 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 13U;
+		*(out + i) = tmp; // word 16
+		out += 32;
+		src = *(in + 32 * 25 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp = src >> 19U; // carry: top 2 bits of value 25
+		src = *(in + 32 * 26 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 2U;
+		src = *(in + 32 * 27 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 23U;
+		*(out + i) = tmp; // word 17
+		out += 32;
+		src = *(in + 32 * 27 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp = src >> 9U; // carry: top 12 bits of value 27
+		src = *(in + 32 * 28 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 12U;
+		*(out + i) = tmp; // word 18
+		out += 32;
+		src = *(in + 32 * 28 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp = src >> 20U; // carry: top 1 bit of value 28
+		src = *(in + 32 * 29 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 1U;
+		src = *(in + 32 * 30 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 22U;
+		*(out + i) = tmp; // word 19
+		out += 32;
+		src = *(in + 32 * 30 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp = src >> 10U; // carry: top 11 bits of value 30
+		src = *(in + 32 * 31 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 11U; // value 31 fills bits 11..31 exactly
+		*(out + i) = tmp; // word 20 (last)
+		out -= 640; // undo the 20 advances of 32 so the next column starts again at word 0
+	}
+}
+void static pack_22bit_32ow(const uint32_t* __restrict in, uint32_t* __restrict out) { // Bit-packs the low 22 bits of in[0..1023] into out[0..703]. For each column i (0..31), "value k" below means in[32*k + i] (k = 0..31); the 32 values are concatenated LSB-first into the 22 words out[32*w + i] (w = 0..21).
+	uint32_t tmp = 0U; // output word being assembled; the initial 0 is always overwritten before it is stored
+	uint32_t src; // current input value (masked to 22 bits before use)
+	for (int i = 0; i < 32; i++) { // one iteration per column; each iteration reads 32 values and writes 22 words
+		src = *(in + 32 * 0 + i);
+		src = src & ((1ULL << 22) - 1); // keep only the low 22 bits; higher input bits are discarded
+		tmp = src;
+		src = *(in + 32 * 1 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 22U; // only the low 10 bits of value 1 fit; bits shifted past bit 31 are lost here
+		*(out + i) = tmp; // word 0
+		out += 32; // the next word of this column is 32 elements further on
+		src = *(in + 32 * 1 + i); // value 1 is re-read to recover the bits that did not fit in word 0
+		src = src & ((1ULL << 22) - 1);
+		tmp = src >> 10U; // carry: top 12 bits of value 1
+		src = *(in + 32 * 2 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 12U;
+		*(out + i) = tmp; // word 1
+		out += 32;
+		src = *(in + 32 * 2 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp = src >> 20U; // carry: top 2 bits of value 2
+		src = *(in + 32 * 3 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 2U;
+		src = *(in + 32 * 4 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp; // word 2
+		out += 32;
+		src = *(in + 32 * 4 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp = src >> 8U; // carry: top 14 bits of value 4
+		src = *(in + 32 * 5 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 14U;
+		*(out + i) = tmp; // word 3
+		out += 32;
+		src = *(in + 32 * 5 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp = src >> 18U; // carry: top 4 bits of value 5
+		src = *(in + 32 * 6 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 4U;
+		src = *(in + 32 * 7 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 26U;
+		*(out + i) = tmp; // word 4
+		out += 32;
+		src = *(in + 32 * 7 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp = src >> 6U; // carry: top 16 bits of value 7
+		src = *(in + 32 * 8 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp; // word 5
+		out += 32;
+		src = *(in + 32 * 8 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp = src >> 16U; // carry: top 6 bits of value 8
+		src = *(in + 32 * 9 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 6U;
+		src = *(in + 32 * 10 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp; // word 6
+		out += 32;
+		src = *(in + 32 * 10 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp = src >> 4U; // carry: top 18 bits of value 10
+		src = *(in + 32 * 11 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 18U;
+		*(out + i) = tmp; // word 7
+		out += 32;
+		src = *(in + 32 * 11 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp = src >> 14U; // carry: top 8 bits of value 11
+		src = *(in + 32 * 12 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 8U;
+		src = *(in + 32 * 13 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 30U;
+		*(out + i) = tmp; // word 8
+		out += 32;
+		src = *(in + 32 * 13 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp = src >> 2U; // carry: top 20 bits of value 13
+		src = *(in + 32 * 14 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp; // word 9
+		out += 32;
+		src = *(in + 32 * 14 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp = src >> 12U; // carry: top 10 bits of value 14
+		src = *(in + 32 * 15 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 10U; // value 15 fills bits 10..31 exactly, so nothing is carried
+		*(out + i) = tmp; // word 10
+		out += 32;
+		src = *(in + 32 * 16 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp = src; // value 16 starts at bit 0; the shift pattern of words 0..10 repeats for values 16..31
+		src = *(in + 32 * 17 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 22U;
+		*(out + i) = tmp; // word 11
+		out += 32;
+		src = *(in + 32 * 17 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp = src >> 10U; // carry: top 12 bits of value 17
+		src = *(in + 32 * 18 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 12U;
+		*(out + i) = tmp; // word 12
+		out += 32;
+		src = *(in + 32 * 18 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp = src >> 20U; // carry: top 2 bits of value 18
+		src = *(in + 32 * 19 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 2U;
+		src = *(in + 32 * 20 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp; // word 13
+		out += 32;
+		src = *(in + 32 * 20 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp = src >> 8U; // carry: top 14 bits of value 20
+		src = *(in + 32 * 21 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 14U;
+		*(out + i) = tmp; // word 14
+		out += 32;
+		src = *(in + 32 * 21 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp = src >> 18U; // carry: top 4 bits of value 21
+		src = *(in + 32 * 22 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 4U;
+		src = *(in + 32 * 23 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 26U;
+		*(out + i) = tmp; // word 15
+		out += 32;
+		src = *(in + 32 * 23 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp = src >> 6U; // carry: top 16 bits of value 23
+		src = *(in + 32 * 24 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp; // word 16
+		out += 32;
+		src = *(in + 32 * 24 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp = src >> 16U; // carry: top 6 bits of value 24
+		src = *(in + 32 * 25 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 6U;
+		src = *(in + 32 * 26 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp; // word 17
+		out += 32;
+		src = *(in + 32 * 26 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp = src >> 4U; // carry: top 18 bits of value 26
+		src = *(in + 32 * 27 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 18U;
+		*(out + i) = tmp; // word 18
+		out += 32;
+		src = *(in + 32 * 27 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp = src >> 14U; // carry: top 8 bits of value 27
+		src = *(in + 32 * 28 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 8U;
+		src = *(in + 32 * 29 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 30U;
+		*(out + i) = tmp; // word 19
+		out += 32;
+		src = *(in + 32 * 29 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp = src >> 2U; // carry: top 20 bits of value 29
+		src = *(in + 32 * 30 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp; // word 20
+		out += 32;
+		src = *(in + 32 * 30 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp = src >> 12U; // carry: top 10 bits of value 30
+		src = *(in + 32 * 31 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 10U; // value 31 fills bits 10..31 exactly
+		*(out + i) = tmp; // word 21 (last)
+		out -= 672; // undo the 21 advances of 32 so the next column starts again at word 0
+	}
+}
+void static pack_23bit_32ow(const uint32_t* __restrict in, uint32_t* __restrict out) { // Bit-packs 32 rows x 32 columns of uint32_t (read as in[32*row + i]) to 23 bits per value, writing 23 words per column at out[32*w + i].
+	uint32_t tmp = 0U; // output word being assembled; always reassigned before it is stored
+	uint32_t src; // current input value, masked to its low 23 bits
+	for (int i = 0; i < 32; i++) { // i = column; the 32 rows of column i are concatenated LSB-first into one 32*23-bit stream
+		src = *(in + 32 * 0 + i); // row 0 of column i
+		src = src & ((1ULL << 23) - 1); // keep only the low 23 bits
+		tmp = src; // row 0 occupies bits 0..22 of word 0
+		src = *(in + 32 * 1 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 23U; // low 9 bits of row 1 fill bits 23..31; the 32-bit shift drops the rest
+		*(out + i) = tmp; // store word 0 of column i
+		out += 32; // step to the next output word of this column (words of one column are 32 apart)
+		src = *(in + 32 * 1 + i); // reload row 1: its upper bits still have to be emitted
+		src = src & ((1ULL << 23) - 1);
+		tmp = src >> 9U; // the remaining 14 bits of row 1 start word 1
+		src = *(in + 32 * 2 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 14U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 2 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp = src >> 18U;
+		src = *(in + 32 * 3 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 5U;
+		src = *(in + 32 * 4 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 28U; // 5 + 23 = 28 < 32, so a third value (row 4) also starts in this word
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 4 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp = src >> 4U;
+		src = *(in + 32 * 5 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 19U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 5 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp = src >> 13U;
+		src = *(in + 32 * 6 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 10U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 6 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp = src >> 22U;
+		src = *(in + 32 * 7 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 1U;
+		src = *(in + 32 * 8 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 8 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp = src >> 8U;
+		src = *(in + 32 * 9 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 15U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 9 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp = src >> 17U;
+		src = *(in + 32 * 10 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 6U;
+		src = *(in + 32 * 11 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 29U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 11 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp = src >> 3U;
+		src = *(in + 32 * 12 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 12 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp = src >> 12U;
+		src = *(in + 32 * 13 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 11U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 13 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp = src >> 21U;
+		src = *(in + 32 * 14 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 2U;
+		src = *(in + 32 * 15 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 25U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 15 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp = src >> 7U;
+		src = *(in + 32 * 16 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 16 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp = src >> 16U;
+		src = *(in + 32 * 17 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 7U;
+		src = *(in + 32 * 18 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 30U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 18 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp = src >> 2U;
+		src = *(in + 32 * 19 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 21U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 19 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp = src >> 11U;
+		src = *(in + 32 * 20 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 12U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 20 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp = src >> 20U;
+		src = *(in + 32 * 21 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 3U;
+		src = *(in + 32 * 22 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 26U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 22 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp = src >> 6U;
+		src = *(in + 32 * 23 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 17U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 23 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp = src >> 15U;
+		src = *(in + 32 * 24 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 8U;
+		src = *(in + 32 * 25 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 31U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 25 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp = src >> 1U;
+		src = *(in + 32 * 26 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 22U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 26 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp = src >> 10U;
+		src = *(in + 32 * 27 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 13U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 27 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp = src >> 19U;
+		src = *(in + 32 * 28 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 4U;
+		src = *(in + 32 * 29 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 27U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 29 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp = src >> 5U;
+		src = *(in + 32 * 30 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 18U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 30 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp = src >> 14U;
+		src = *(in + 32 * 31 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 9U; // 9 + 23 = 32: row 31 ends exactly at the end of word 22, the last word
+		*(out + i) = tmp;
+		out -= 704; // undo the 22 advances of 32 so the next column starts from the original out
+	}
+}
+void static pack_24bit_32ow(const uint32_t* __restrict in, uint32_t* __restrict out) { // Bit-packs 32 rows x 32 columns of uint32_t (read as in[32*row + i]) to 24 bits per value, writing 24 words per column at out[32*w + i].
+	uint32_t tmp = 0U; // output word being assembled; always reassigned before it is stored
+	uint32_t src; // current input value, masked to its low 24 bits
+	for (int i = 0; i < 32; i++) { // i = column; every 4 rows fill exactly 3 output words, so the shift pattern below repeats 8 times
+		src = *(in + 32 * 0 + i); // row 0 of column i
+		src = src & ((1ULL << 24) - 1); // keep only the low 24 bits
+		tmp = src; // row 0 occupies bits 0..23 of word 0
+		src = *(in + 32 * 1 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 24U; // low 8 bits of row 1 fill bits 24..31; the 32-bit shift drops the rest
+		*(out + i) = tmp; // store word 0 of column i
+		out += 32; // step to the next output word of this column (words of one column are 32 apart)
+		src = *(in + 32 * 1 + i); // reload row 1: its upper bits still have to be emitted
+		src = src & ((1ULL << 24) - 1);
+		tmp = src >> 8U; // the remaining 16 bits of row 1 start word 1
+		src = *(in + 32 * 2 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 2 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp = src >> 16U;
+		src = *(in + 32 * 3 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 8U; // 8 + 24 = 32: row 3 ends exactly on the word boundary
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 4 + i); // row 4 starts a fresh word, so nothing is carried over from row 3
+		src = src & ((1ULL << 24) - 1);
+		tmp = src;
+		src = *(in + 32 * 5 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 5 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp = src >> 8U;
+		src = *(in + 32 * 6 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 6 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp = src >> 16U;
+		src = *(in + 32 * 7 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 8U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 8 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp = src;
+		src = *(in + 32 * 9 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 9 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp = src >> 8U;
+		src = *(in + 32 * 10 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 10 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp = src >> 16U;
+		src = *(in + 32 * 11 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 8U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 12 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp = src;
+		src = *(in + 32 * 13 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 13 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp = src >> 8U;
+		src = *(in + 32 * 14 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 14 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp = src >> 16U;
+		src = *(in + 32 * 15 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 8U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 16 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp = src;
+		src = *(in + 32 * 17 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 17 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp = src >> 8U;
+		src = *(in + 32 * 18 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 18 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp = src >> 16U;
+		src = *(in + 32 * 19 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 8U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 20 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp = src;
+		src = *(in + 32 * 21 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 21 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp = src >> 8U;
+		src = *(in + 32 * 22 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 22 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp = src >> 16U;
+		src = *(in + 32 * 23 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 8U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 24 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp = src;
+		src = *(in + 32 * 25 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 25 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp = src >> 8U;
+		src = *(in + 32 * 26 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 26 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp = src >> 16U;
+		src = *(in + 32 * 27 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 8U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 28 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp = src;
+		src = *(in + 32 * 29 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 29 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp = src >> 8U;
+		src = *(in + 32 * 30 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 30 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp = src >> 16U;
+		src = *(in + 32 * 31 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 8U;
+		*(out + i) = tmp;
+		out -= 736; // undo the 23 advances of 32 so the next column starts from the original out
+	}
+}
+void static pack_25bit_32ow(const uint32_t* __restrict in, uint32_t* __restrict out) { // Bit-packs 32 rows x 32 columns of uint32_t (read as in[32*row + i]) to 25 bits per value, writing 25 words per column at out[32*w + i].
+	uint32_t tmp = 0U; // output word being assembled; always reassigned before it is stored
+	uint32_t src; // current input value, masked to its low 25 bits
+	for (int i = 0; i < 32; i++) { // i = column; the 32 rows of column i are concatenated LSB-first into one 32*25-bit stream
+		src = *(in + 32 * 0 + i); // row 0 of column i
+		src = src & ((1ULL << 25) - 1); // keep only the low 25 bits
+		tmp = src; // row 0 occupies bits 0..24 of word 0
+		src = *(in + 32 * 1 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 25U; // low 7 bits of row 1 fill bits 25..31; the 32-bit shift drops the rest
+		*(out + i) = tmp; // store word 0 of column i
+		out += 32; // step to the next output word of this column (words of one column are 32 apart)
+		src = *(in + 32 * 1 + i); // reload row 1: its upper bits still have to be emitted
+		src = src & ((1ULL << 25) - 1);
+		tmp = src >> 7U; // the remaining 18 bits of row 1 start word 1
+		src = *(in + 32 * 2 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 18U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 2 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp = src >> 14U;
+		src = *(in + 32 * 3 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 11U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 3 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp = src >> 21U;
+		src = *(in + 32 * 4 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 4U;
+		src = *(in + 32 * 5 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 29U; // 4 + 25 = 29 < 32, so a third value (row 5) also starts in this word
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 5 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp = src >> 3U;
+		src = *(in + 32 * 6 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 22U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 6 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp = src >> 10U;
+		src = *(in + 32 * 7 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 15U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 7 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp = src >> 17U;
+		src = *(in + 32 * 8 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 8U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 8 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp = src >> 24U;
+		src = *(in + 32 * 9 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 1U;
+		src = *(in + 32 * 10 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 26U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 10 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp = src >> 6U;
+		src = *(in + 32 * 11 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 19U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 11 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp = src >> 13U;
+		src = *(in + 32 * 12 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 12U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 12 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp = src >> 20U;
+		src = *(in + 32 * 13 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 5U;
+		src = *(in + 32 * 14 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 30U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 14 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp = src >> 2U;
+		src = *(in + 32 * 15 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 23U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 15 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp = src >> 9U;
+		src = *(in + 32 * 16 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 16 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp = src >> 16U;
+		src = *(in + 32 * 17 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 9U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 17 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp = src >> 23U;
+		src = *(in + 32 * 18 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 2U;
+		src = *(in + 32 * 19 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 27U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 19 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp = src >> 5U;
+		src = *(in + 32 * 20 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 20 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp = src >> 12U;
+		src = *(in + 32 * 21 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 13U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 21 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp = src >> 19U;
+		src = *(in + 32 * 22 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 6U;
+		src = *(in + 32 * 23 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 31U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 23 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp = src >> 1U;
+		src = *(in + 32 * 24 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 24 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp = src >> 8U;
+		src = *(in + 32 * 25 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 17U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 25 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp = src >> 15U;
+		src = *(in + 32 * 26 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 10U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 26 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp = src >> 22U;
+		src = *(in + 32 * 27 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 3U;
+		src = *(in + 32 * 28 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 28 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp = src >> 4U;
+		src = *(in + 32 * 29 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 21U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 29 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp = src >> 11U;
+		src = *(in + 32 * 30 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 14U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 30 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp = src >> 18U;
+		src = *(in + 32 * 31 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 7U; // 7 + 25 = 32: row 31 ends exactly at the end of word 24, the last word
+		*(out + i) = tmp;
+		out -= 768; // undo the 24 advances of 32 so the next column starts from the original out
+	}
+}
+void static pack_26bit_32ow(const uint32_t* __restrict in, uint32_t* __restrict out) { // Bit-packs 32 rows x 32 columns of uint32_t (read as in[32*row + i]) to 26 bits per value, writing 26 words per column at out[32*w + i].
+	uint32_t tmp = 0U; // output word being assembled; always reassigned before it is stored
+	uint32_t src; // current input value, masked to its low 26 bits
+	for (int i = 0; i < 32; i++) { // i = column; every 16 rows fill exactly 13 output words, so the shift pattern below occurs twice
+		src = *(in + 32 * 0 + i); // row 0 of column i
+		src = src & ((1ULL << 26) - 1); // keep only the low 26 bits
+		tmp = src; // row 0 occupies bits 0..25 of word 0
+		src = *(in + 32 * 1 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 26U; // low 6 bits of row 1 fill bits 26..31; the 32-bit shift drops the rest
+		*(out + i) = tmp; // store word 0 of column i
+		out += 32; // step to the next output word of this column (words of one column are 32 apart)
+		src = *(in + 32 * 1 + i); // reload row 1: its upper bits still have to be emitted
+		src = src & ((1ULL << 26) - 1);
+		tmp = src >> 6U; // the remaining 20 bits of row 1 start word 1
+		src = *(in + 32 * 2 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 2 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp = src >> 12U;
+		src = *(in + 32 * 3 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 14U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 3 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp = src >> 18U;
+		src = *(in + 32 * 4 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 8U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 4 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp = src >> 24U;
+		src = *(in + 32 * 5 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 2U;
+		src = *(in + 32 * 6 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 28U; // 2 + 26 = 28 < 32, so a third value (row 6) also starts in this word
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 6 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp = src >> 4U;
+		src = *(in + 32 * 7 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 22U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 7 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp = src >> 10U;
+		src = *(in + 32 * 8 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 8 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp = src >> 16U;
+		src = *(in + 32 * 9 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 10U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 9 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp = src >> 22U;
+		src = *(in + 32 * 10 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 4U;
+		src = *(in + 32 * 11 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 30U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 11 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp = src >> 2U;
+		src = *(in + 32 * 12 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 12 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp = src >> 8U;
+		src = *(in + 32 * 13 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 18U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 13 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp = src >> 14U;
+		src = *(in + 32 * 14 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 12U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 14 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp = src >> 20U;
+		src = *(in + 32 * 15 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 6U; // 6 + 26 = 32: row 15 ends exactly on the word boundary
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 16 + i); // row 16 starts a fresh word, so nothing is carried over from row 15
+		src = src & ((1ULL << 26) - 1);
+		tmp = src;
+		src = *(in + 32 * 17 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 26U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 17 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp = src >> 6U;
+		src = *(in + 32 * 18 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 18 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp = src >> 12U;
+		src = *(in + 32 * 19 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 14U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 19 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp = src >> 18U;
+		src = *(in + 32 * 20 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 8U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 20 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp = src >> 24U;
+		src = *(in + 32 * 21 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 2U;
+		src = *(in + 32 * 22 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 22 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp = src >> 4U;
+		src = *(in + 32 * 23 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 22U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 23 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp = src >> 10U;
+		src = *(in + 32 * 24 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 24 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp = src >> 16U;
+		src = *(in + 32 * 25 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 10U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 25 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp = src >> 22U;
+		src = *(in + 32 * 26 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 4U;
+		src = *(in + 32 * 27 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 30U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 27 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp = src >> 2U;
+		src = *(in + 32 * 28 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 28 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp = src >> 8U;
+		src = *(in + 32 * 29 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 18U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 29 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp = src >> 14U;
+		src = *(in + 32 * 30 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 12U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 30 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp = src >> 20U;
+		src = *(in + 32 * 31 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 6U;
+		*(out + i) = tmp;
+		out -= 800; // undo the 25 advances of 32 so the next column starts from the original out
+	}
+}
+void static pack_27bit_32ow(const uint32_t* __restrict in, uint32_t* __restrict out) { // Bit-packs 32 rows x 32 columns of uint32_t (read as in[32*row + i]) to 27 bits per value, writing 27 words per column at out[32*w + i].
+	uint32_t tmp = 0U; // output word being assembled; always reassigned before it is stored
+	uint32_t src; // current input value, masked to its low 27 bits
+	for (int i = 0; i < 32; i++) { // i = column; the 32 rows of column i are concatenated LSB-first into one 32*27-bit stream
+		src = *(in + 32 * 0 + i); // row 0 of column i
+		src = src & ((1ULL << 27) - 1); // keep only the low 27 bits
+		tmp = src; // row 0 occupies bits 0..26 of word 0
+		src = *(in + 32 * 1 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 27U; // low 5 bits of row 1 fill bits 27..31; the 32-bit shift drops the rest
+		*(out + i) = tmp; // store word 0 of column i
+		out += 32; // step to the next output word of this column (words of one column are 32 apart)
+		src = *(in + 32 * 1 + i); // reload row 1: its upper bits still have to be emitted
+		src = src & ((1ULL << 27) - 1);
+		tmp = src >> 5U; // the remaining 22 bits of row 1 start word 1
+		src = *(in + 32 * 2 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 22U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 2 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp = src >> 10U;
+		src = *(in + 32 * 3 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 17U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 3 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp = src >> 15U;
+		src = *(in + 32 * 4 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 12U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 4 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp = src >> 20U;
+		src = *(in + 32 * 5 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 7U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 5 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp = src >> 25U;
+		src = *(in + 32 * 6 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 2U;
+		src = *(in + 32 * 7 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 29U; // 2 + 27 = 29 < 32, so a third value (row 7) also starts in this word
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 7 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp = src >> 3U;
+		src = *(in + 32 * 8 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 8 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp = src >> 8U;
+		src = *(in + 32 * 9 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 19U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 9 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp = src >> 13U;
+		src = *(in + 32 * 10 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 14U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 10 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp = src >> 18U;
+		src = *(in + 32 * 11 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 9U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 11 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp = src >> 23U;
+		src = *(in + 32 * 12 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 4U;
+		src = *(in + 32 * 13 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 31U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 13 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp = src >> 1U;
+		src = *(in + 32 * 14 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 26U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 14 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp = src >> 6U;
+		src = *(in + 32 * 15 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 21U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 15 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp = src >> 11U;
+		src = *(in + 32 * 16 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 16 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp = src >> 16U;
+		src = *(in + 32 * 17 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 11U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 17 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp = src >> 21U;
+		src = *(in + 32 * 18 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 6U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 18 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp = src >> 26U;
+		src = *(in + 32 * 19 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 1U;
+		src = *(in + 32 * 20 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 20 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp = src >> 4U;
+		src = *(in + 32 * 21 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 23U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 21 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp = src >> 9U;
+		src = *(in + 32 * 22 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 18U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 22 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp = src >> 14U;
+		src = *(in + 32 * 23 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 13U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 23 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp = src >> 19U;
+		src = *(in + 32 * 24 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 8U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 24 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp = src >> 24U;
+		src = *(in + 32 * 25 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 3U;
+		src = *(in + 32 * 26 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 30U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 26 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp = src >> 2U;
+		src = *(in + 32 * 27 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 25U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 27 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp = src >> 7U;
+		src = *(in + 32 * 28 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 28 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp = src >> 12U;
+		src = *(in + 32 * 29 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 15U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 29 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp = src >> 17U;
+		src = *(in + 32 * 30 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 10U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 30 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp = src >> 22U;
+		src = *(in + 32 * 31 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 5U; // 5 + 27 = 32: row 31 ends exactly at the end of word 26, the last word
+		*(out + i) = tmp;
+		out -= 832; // undo the 26 advances of 32 so the next column starts from the original out
+	}
+}
+void static pack_28bit_32ow(const uint32_t* __restrict in, uint32_t* __restrict out) { // Bit-packs 32 rows x 32 columns of uint32_t (read as in[32*row + i]) to 28 bits per value, writing 28 words per column at out[32*w + i].
+	uint32_t tmp = 0U; // output word being assembled; always reassigned before it is stored
+	uint32_t src; // current input value, masked to its low 28 bits
+	for (int i = 0; i < 32; i++) { // i = column; every 8 rows fill exactly 7 output words, so the shift pattern below repeats 4 times
+		src = *(in + 32 * 0 + i); // row 0 of column i
+		src = src & ((1ULL << 28) - 1); // keep only the low 28 bits
+		tmp = src; // row 0 occupies bits 0..27 of word 0
+		src = *(in + 32 * 1 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 28U; // low 4 bits of row 1 fill bits 28..31; the 32-bit shift drops the rest
+		*(out + i) = tmp; // store word 0 of column i
+		out += 32; // step to the next output word of this column (words of one column are 32 apart)
+		src = *(in + 32 * 1 + i); // reload row 1: its upper bits still have to be emitted
+		src = src & ((1ULL << 28) - 1);
+		tmp = src >> 4U; // the remaining 24 bits of row 1 start word 1
+		src = *(in + 32 * 2 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 2 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp = src >> 8U;
+		src = *(in + 32 * 3 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 3 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp = src >> 12U;
+		src = *(in + 32 * 4 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 4 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp = src >> 16U;
+		src = *(in + 32 * 5 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 12U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 5 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp = src >> 20U;
+		src = *(in + 32 * 6 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 8U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 6 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp = src >> 24U;
+		src = *(in + 32 * 7 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 4U; // 4 + 28 = 32: row 7 ends exactly on the word boundary
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 8 + i); // row 8 starts a fresh word, so nothing is carried over from row 7
+		src = src & ((1ULL << 28) - 1);
+		tmp = src;
+		src = *(in + 32 * 9 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 9 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp = src >> 4U;
+		src = *(in + 32 * 10 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 10 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp = src >> 8U;
+		src = *(in + 32 * 11 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 11 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp = src >> 12U;
+		src = *(in + 32 * 12 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 12 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp = src >> 16U;
+		src = *(in + 32 * 13 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 12U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 13 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp = src >> 20U;
+		src = *(in + 32 * 14 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 8U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 14 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp = src >> 24U;
+		src = *(in + 32 * 15 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 4U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 16 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp = src;
+		src = *(in + 32 * 17 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 17 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp = src >> 4U;
+		src = *(in + 32 * 18 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 18 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp = src >> 8U;
+		src = *(in + 32 * 19 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 19 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp = src >> 12U;
+		src = *(in + 32 * 20 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 20 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp = src >> 16U;
+		src = *(in + 32 * 21 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 12U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 21 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp = src >> 20U;
+		src = *(in + 32 * 22 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 8U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 22 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp = src >> 24U;
+		src = *(in + 32 * 23 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 4U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 24 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp = src;
+		src = *(in + 32 * 25 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 25 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp = src >> 4U;
+		src = *(in + 32 * 26 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 26 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp = src >> 8U;
+		src = *(in + 32 * 27 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 27 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp = src >> 12U;
+		src = *(in + 32 * 28 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 28 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp = src >> 16U;
+		src = *(in + 32 * 29 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 12U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 29 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp = src >> 20U;
+		src = *(in + 32 * 30 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 8U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 30 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp = src >> 24U;
+		src = *(in + 32 * 31 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 4U;
+		*(out + i) = tmp;
+		out -= 864; // undo the 27 advances of 32 so the next column starts from the original out
+	}
+}
+void static pack_29bit_32ow(const uint32_t* __restrict in, uint32_t* __restrict out) { // Packs the low 29 bits of 32 values per lane (32 lanes) into 29 consecutive 32-bit words per lane.
+	uint32_t tmp = 0U; // accumulator for the output word being assembled; always overwritten before its first use
+	uint32_t src;
+	for (int i = 0; i < 32; i++) { // one iteration per lane i; value k of the lane is read from in[32 * k + i]
+		src = *(in + 32 * 0 + i);
+		src = src & ((1ULL << 29) - 1); // keep only the low 29 bits of the input value
+		tmp = src;
+		src = *(in + 32 * 1 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 29U; // only the low 3 bits of value 1 fit here; the uint32_t shift discards the rest
+		*(out + i) = tmp;
+		out += 32; // step to the next output word of this lane (words of one lane are 32 elements apart)
+		src = *(in + 32 * 1 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp = src >> 3U; // start the next word with the 26 bits of value 1 that did not fit above
+		src = *(in + 32 * 2 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 26U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 2 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp = src >> 6U;
+		src = *(in + 32 * 3 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 23U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 3 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp = src >> 9U;
+		src = *(in + 32 * 4 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 4 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp = src >> 12U;
+		src = *(in + 32 * 5 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 17U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 5 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp = src >> 15U;
+		src = *(in + 32 * 6 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 14U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 6 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp = src >> 18U;
+		src = *(in + 32 * 7 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 11U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 7 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp = src >> 21U;
+		src = *(in + 32 * 8 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 8U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 8 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp = src >> 24U;
+		src = *(in + 32 * 9 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 5U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 9 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp = src >> 27U; // 2 leftover bits of value 9
+		src = *(in + 32 * 10 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 2U; // value 10 fits entirely (bits 2..30)
+		src = *(in + 32 * 11 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 31U; // lowest bit of value 11 fills the last free bit of this word
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 11 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp = src >> 1U;
+		src = *(in + 32 * 12 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 12 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp = src >> 4U;
+		src = *(in + 32 * 13 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 25U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 13 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp = src >> 7U;
+		src = *(in + 32 * 14 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 22U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 14 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp = src >> 10U;
+		src = *(in + 32 * 15 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 19U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 15 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp = src >> 13U;
+		src = *(in + 32 * 16 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 16 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp = src >> 16U;
+		src = *(in + 32 * 17 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 13U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 17 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp = src >> 19U;
+		src = *(in + 32 * 18 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 10U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 18 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp = src >> 22U;
+		src = *(in + 32 * 19 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 7U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 19 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp = src >> 25U;
+		src = *(in + 32 * 20 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 4U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 20 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp = src >> 28U; // 1 leftover bit of value 20
+		src = *(in + 32 * 21 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 1U; // value 21 fits entirely (bits 1..29)
+		src = *(in + 32 * 22 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 30U; // low 2 bits of value 22 fill the rest of this word
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 22 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp = src >> 2U;
+		src = *(in + 32 * 23 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 27U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 23 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp = src >> 5U;
+		src = *(in + 32 * 24 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 24 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp = src >> 8U;
+		src = *(in + 32 * 25 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 21U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 25 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp = src >> 11U;
+		src = *(in + 32 * 26 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 18U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 26 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp = src >> 14U;
+		src = *(in + 32 * 27 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 15U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 27 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp = src >> 17U;
+		src = *(in + 32 * 28 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 12U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 28 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp = src >> 20U;
+		src = *(in + 32 * 29 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 9U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 29 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp = src >> 23U;
+		src = *(in + 32 * 30 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 6U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 30 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp = src >> 26U;
+		src = *(in + 32 * 31 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 3U; // value 31 ends exactly at bit 31 of the 29th output word
+		*(out + i) = tmp;
+		out -= 896; // undo the 28 advances of 32 (28 * 32 = 896) so the next lane starts at the first output word again
+	}
+}
+void static pack_30bit_32ow(const uint32_t* __restrict in, uint32_t* __restrict out) { // Packs the low 30 bits of 32 values per lane (32 lanes) into 30 consecutive 32-bit words per lane.
+	uint32_t tmp = 0U; // accumulator for the output word being assembled; always overwritten before its first use
+	uint32_t src;
+	for (int i = 0; i < 32; i++) { // one iteration per lane i; value k of the lane is read from in[32 * k + i]
+		src = *(in + 32 * 0 + i);
+		src = src & ((1ULL << 30) - 1); // keep only the low 30 bits of the input value
+		tmp = src;
+		src = *(in + 32 * 1 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 30U; // only the low 2 bits of value 1 fit here; the uint32_t shift discards the rest
+		*(out + i) = tmp;
+		out += 32; // step to the next output word of this lane (words of one lane are 32 elements apart)
+		src = *(in + 32 * 1 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp = src >> 2U; // start the next word with the 28 bits of value 1 that did not fit above
+		src = *(in + 32 * 2 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 2 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp = src >> 4U;
+		src = *(in + 32 * 3 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 26U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 3 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp = src >> 6U;
+		src = *(in + 32 * 4 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 4 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp = src >> 8U;
+		src = *(in + 32 * 5 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 22U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 5 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp = src >> 10U;
+		src = *(in + 32 * 6 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 6 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp = src >> 12U;
+		src = *(in + 32 * 7 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 18U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 7 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp = src >> 14U;
+		src = *(in + 32 * 8 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 8 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp = src >> 16U;
+		src = *(in + 32 * 9 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 14U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 9 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp = src >> 18U;
+		src = *(in + 32 * 10 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 12U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 10 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp = src >> 20U;
+		src = *(in + 32 * 11 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 10U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 11 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp = src >> 22U;
+		src = *(in + 32 * 12 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 8U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 12 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp = src >> 24U;
+		src = *(in + 32 * 13 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 6U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 13 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp = src >> 26U;
+		src = *(in + 32 * 14 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 4U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 14 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp = src >> 28U;
+		src = *(in + 32 * 15 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 2U; // value 15 ends exactly at bit 31, so nothing carries into the next word
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 16 + i); // value 16 starts a fresh word at bit 0; the shift pattern of values 0..15 repeats for 16..31
+		src = src & ((1ULL << 30) - 1);
+		tmp = src;
+		src = *(in + 32 * 17 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 30U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 17 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp = src >> 2U;
+		src = *(in + 32 * 18 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 18 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp = src >> 4U;
+		src = *(in + 32 * 19 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 26U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 19 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp = src >> 6U;
+		src = *(in + 32 * 20 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 20 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp = src >> 8U;
+		src = *(in + 32 * 21 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 22U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 21 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp = src >> 10U;
+		src = *(in + 32 * 22 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 22 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp = src >> 12U;
+		src = *(in + 32 * 23 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 18U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 23 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp = src >> 14U;
+		src = *(in + 32 * 24 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 24 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp = src >> 16U;
+		src = *(in + 32 * 25 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 14U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 25 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp = src >> 18U;
+		src = *(in + 32 * 26 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 12U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 26 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp = src >> 20U;
+		src = *(in + 32 * 27 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 10U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 27 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp = src >> 22U;
+		src = *(in + 32 * 28 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 8U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 28 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp = src >> 24U;
+		src = *(in + 32 * 29 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 6U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 29 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp = src >> 26U;
+		src = *(in + 32 * 30 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 4U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 30 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp = src >> 28U;
+		src = *(in + 32 * 31 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 2U; // value 31 ends exactly at bit 31 of the 30th output word
+		*(out + i) = tmp;
+		out -= 928; // undo the 29 advances of 32 (29 * 32 = 928) so the next lane starts at the first output word again
+	}
+}
+void static pack_31bit_32ow(const uint32_t* __restrict in, uint32_t* __restrict out) { // Packs the low 31 bits of 32 values per lane (32 lanes) into 31 consecutive 32-bit words per lane.
+	uint32_t tmp = 0U; // accumulator for the output word being assembled; always overwritten before its first use
+	uint32_t src;
+	for (int i = 0; i < 32; i++) { // one iteration per lane i; value k of the lane is read from in[32 * k + i]
+		src = *(in + 32 * 0 + i);
+		src = src & ((1ULL << 31) - 1); // keep only the low 31 bits of the input value
+		tmp = src;
+		src = *(in + 32 * 1 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 31U; // only the lowest bit of value 1 fits here; the uint32_t shift discards the rest
+		*(out + i) = tmp;
+		out += 32; // step to the next output word of this lane (words of one lane are 32 elements apart)
+		src = *(in + 32 * 1 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 1U; // start the next word with the 30 bits of value 1 that did not fit above
+		src = *(in + 32 * 2 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 30U; // from here on each word takes one more low bit of the next value than the previous word did
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 2 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 2U;
+		src = *(in + 32 * 3 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 29U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 3 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 3U;
+		src = *(in + 32 * 4 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 4 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 4U;
+		src = *(in + 32 * 5 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 27U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 5 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 5U;
+		src = *(in + 32 * 6 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 26U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 6 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 6U;
+		src = *(in + 32 * 7 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 25U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 7 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 7U;
+		src = *(in + 32 * 8 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 8 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 8U;
+		src = *(in + 32 * 9 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 23U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 9 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 9U;
+		src = *(in + 32 * 10 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 22U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 10 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 10U;
+		src = *(in + 32 * 11 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 21U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 11 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 11U;
+		src = *(in + 32 * 12 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 12 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 12U;
+		src = *(in + 32 * 13 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 19U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 13 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 13U;
+		src = *(in + 32 * 14 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 18U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 14 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 14U;
+		src = *(in + 32 * 15 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 17U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 15 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 15U;
+		src = *(in + 32 * 16 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 16 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 16U;
+		src = *(in + 32 * 17 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 15U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 17 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 17U;
+		src = *(in + 32 * 18 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 14U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 18 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 18U;
+		src = *(in + 32 * 19 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 13U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 19 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 19U;
+		src = *(in + 32 * 20 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 12U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 20 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 20U;
+		src = *(in + 32 * 21 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 11U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 21 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 21U;
+		src = *(in + 32 * 22 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 10U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 22 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 22U;
+		src = *(in + 32 * 23 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 9U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 23 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 23U;
+		src = *(in + 32 * 24 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 8U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 24 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 24U;
+		src = *(in + 32 * 25 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 7U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 25 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 25U;
+		src = *(in + 32 * 26 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 6U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 26 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 26U;
+		src = *(in + 32 * 27 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 5U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 27 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 27U;
+		src = *(in + 32 * 28 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 4U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 28 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 28U;
+		src = *(in + 32 * 29 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 3U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 29 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 29U;
+		src = *(in + 32 * 30 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 2U;
+		*(out + i) = tmp;
+		out += 32;
+		src = *(in + 32 * 30 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 30U;
+		src = *(in + 32 * 31 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 1U; // value 31 ends exactly at bit 31 of the 31st output word
+		*(out + i) = tmp;
+		out -= 960; // undo the 30 advances of 32 (30 * 32 = 960) so the next lane starts at the first output word again
+	}
+}
+void static pack_32bit_32ow(const uint32_t* __restrict in, uint32_t* __restrict out) { // Full-width case: copies 32 values per lane (32 lanes) unchanged, no masking or shifting.
+	uint32_t tmp = 0U; // pass-through temporary; always overwritten before its first use
+	uint32_t src;
+	for (int i = 0; i < 32; i++) { // one iteration per lane i; value k of the lane is read from in[32 * k + i]
+		src        = *(in + 32 * 0 + i);
+		tmp        = src; // all 32 bits are kept, so the value is stored as-is
+		*(out + i) = tmp;
+		out += 32; // step to the next output word of this lane (words of one lane are 32 elements apart)
+		src        = *(in + 32 * 1 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 32;
+		src        = *(in + 32 * 2 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 32;
+		src        = *(in + 32 * 3 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 32;
+		src        = *(in + 32 * 4 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 32;
+		src        = *(in + 32 * 5 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 32;
+		src        = *(in + 32 * 6 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 32;
+		src        = *(in + 32 * 7 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 32;
+		src        = *(in + 32 * 8 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 32;
+		src        = *(in + 32 * 9 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 32;
+		src        = *(in + 32 * 10 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 32;
+		src        = *(in + 32 * 11 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 32;
+		src        = *(in + 32 * 12 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 32;
+		src        = *(in + 32 * 13 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 32;
+		src        = *(in + 32 * 14 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 32;
+		src        = *(in + 32 * 15 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 32;
+		src        = *(in + 32 * 16 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 32;
+		src        = *(in + 32 * 17 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 32;
+		src        = *(in + 32 * 18 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 32;
+		src        = *(in + 32 * 19 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 32;
+		src        = *(in + 32 * 20 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 32;
+		src        = *(in + 32 * 21 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 32;
+		src        = *(in + 32 * 22 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 32;
+		src        = *(in + 32 * 23 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 32;
+		src        = *(in + 32 * 24 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 32;
+		src        = *(in + 32 * 25 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 32;
+		src        = *(in + 32 * 26 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 32;
+		src        = *(in + 32 * 27 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 32;
+		src        = *(in + 32 * 28 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 32;
+		src        = *(in + 32 * 29 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 32;
+		src        = *(in + 32 * 30 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 32;
+		src        = *(in + 32 * 31 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out -= 992; // undo the 31 advances of 32 (31 * 32 = 992) so the next lane starts at the first output word again
+	}
+}
+void static pack_0bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) {} // 0-bit case: empty body, neither `in` nor `out` is touched
+void static pack_1bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) { // Packs bit 0 of 64 values per lane (16 lanes) into a single 64-bit word per lane, out[i].
+	uint64_t tmp = 0U; // accumulator for the output word being assembled; always overwritten before its first use
+	uint64_t src;
+	for (int i = 0; i < 16; i++) { // one iteration per lane i; value k of the lane is read from in[16 * k + i]
+		src = *(in + 16 * 0 + i);
+		src = src & ((1ULL << 1) - 1); // keep only bit 0 of the input value
+		tmp = src;
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 1U; // value k lands at bit position k of the output word
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 3U;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 5U;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 6U;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 7U;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 9U;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 10U;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 11U;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 13U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 14U;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 15U;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 17U;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 18U;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 19U;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 21U;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 22U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 23U;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 25U;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 26U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 27U;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 28U;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 29U;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 30U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 31U;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 32U; // src is uint64_t, so shifts of 32..63 are performed at 64-bit width
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 33U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 34U;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 35U;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 36U;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 37U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 38U;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 39U;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 40U;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 41U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 42U;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 43U;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 44U;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 45U;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 46U;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 47U;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 48U;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 49U;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 50U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 51U;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 52U;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 53U;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 54U;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 55U;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 56U;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 57U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 58U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 59U;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 60U;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 61U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 62U;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 63U;
+		*(out + i) = tmp; // the only store for this lane: one 64-bit word holds all 64 one-bit values
+		out -= 0; // no-op: out was never advanced because each lane produces a single word
+	}
+}
+void static pack_2bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) {
+	uint64_t tmp = 0U;
+	uint64_t src;
+	for (int i = 0; i < 16; i++) {
+		src = *(in + 16 * 0 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp = src;
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 6U;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 10U;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 14U;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 18U;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 22U;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 26U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 28U;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 30U;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 34U;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 36U;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 38U;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 40U;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 42U;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 44U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 46U;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 48U;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 50U;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 52U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 54U;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 56U;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 58U;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 60U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 62U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp = src;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 6U;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 10U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 14U;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 18U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 22U;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 26U;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 28U;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 30U;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 34U;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 36U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 38U;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 40U;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 42U;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 44U;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 46U;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 48U;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 50U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 52U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 54U;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 56U;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 58U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 60U;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 62U;
+		*(out + i) = tmp;
+		out -= 16;
+	}
+}
+void static pack_3bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) { // Packs the low 3 bits of 1024 input words (64 values for each of 16 columns) into 48 output words (3 per column).
+	uint64_t tmp = 0U; // output word currently being assembled
+	uint64_t src; // input value currently being placed
+	for (int i = 0; i < 16; i++) { // i = column; value k of column i is in[16 * k + i], its packed words are out[16 * w + i] for w = 0..2
+		src = *(in + 16 * 0 + i);
+		src = src & ((1ULL << 3) - 1); // keep only the low 3 bits; higher bits of the input are discarded
+		tmp = src; // value 0 occupies bits 0..2 of word 0
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 3U; // value k lands at bit 3 * k of the column's 192-bit stream (least significant bits first)
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 6U;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 9U;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 15U;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 18U;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 21U;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 27U;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 30U;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 33U;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 36U;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 39U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 42U;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 45U;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 48U;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 51U;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 54U;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 57U;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 60U;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 63U; // value 21 straddles words 0 and 1: only its lowest bit fits here
+		*(out + i) = tmp; // word 0 of column i is complete
+		out += 16; // move to the next row of 16 output words
+		src = *(in + 16 * 21 + i); // reload value 21 to emit the bits that did not fit
+		src = src & ((1ULL << 3) - 1);
+		tmp = src >> 1U; // its upper 2 bits start word 1
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 5U;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 11U;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 14U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 17U;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 23U;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 26U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 29U;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 35U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 38U;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 41U;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 44U;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 47U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 50U;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 53U;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 56U;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 59U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 62U; // value 42 straddles words 1 and 2: its low 2 bits go here
+		*(out + i) = tmp; // word 1 of column i is complete
+		out += 16;
+		src = *(in + 16 * 42 + i); // reload value 42
+		src = src & ((1ULL << 3) - 1);
+		tmp = src >> 2U; // its top bit starts word 2
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 1U;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 7U;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 10U;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 13U;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 19U;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 22U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 25U;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 28U;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 31U;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 34U;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 37U;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 40U;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 43U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 46U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 49U;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 52U;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 55U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 58U;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 61U; // value 63 fills bits 61..63, ending the column's stream
+		*(out + i) = tmp; // word 2 of column i is complete
+		out -= 32; // undo both "out += 16" steps so the next column starts from the original out
+	}
+}
+void static pack_4bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) { // Packs the low 4 bits of 1024 input words (64 values for each of 16 columns) into 64 output words (4 per column).
+	uint64_t tmp = 0U; // output word currently being assembled
+	uint64_t src; // input value currently being placed
+	for (int i = 0; i < 16; i++) { // i = column; value k of column i is in[16 * k + i], its packed words are out[16 * w + i] for w = 0..3
+		src = *(in + 16 * 0 + i);
+		src = src & ((1ULL << 4) - 1); // keep only the low 4 bits; higher bits of the input are discarded
+		tmp = src; // value 0 occupies bits 0..3 of word 0
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 4U; // 16 values fill each 64-bit word exactly, so no value is split across words
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 28U;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 36U;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 40U;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 44U;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 48U;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 52U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 56U;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp; // word 0 of column i holds values 0..15
+		out += 16; // move to the next row of 16 output words
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp = src; // value 16 starts word 1 at bit 0
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 28U;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 36U;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 40U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 44U;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 48U;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 52U;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 56U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp; // word 1 of column i holds values 16..31
+		out += 16;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp = src; // value 32 starts word 2 at bit 0
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 28U;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 36U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 40U;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 44U;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 48U;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 52U;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 56U;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp; // word 2 of column i holds values 32..47
+		out += 16;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp = src; // value 48 starts word 3 at bit 0
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 28U;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 36U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 40U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 44U;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 48U;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 52U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 56U;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp; // word 3 of column i holds values 48..63
+		out -= 48; // undo the three "out += 16" steps so the next column starts from the original out
+	}
+}
+void static pack_5bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) { // Packs the low 5 bits of 1024 input words (64 values for each of 16 columns) into 80 output words (5 per column).
+	uint64_t tmp = 0U; // output word currently being assembled
+	uint64_t src; // input value currently being placed
+	for (int i = 0; i < 16; i++) { // i = column; value k of column i is in[16 * k + i], its packed words are out[16 * w + i] for w = 0..4
+		src = *(in + 16 * 0 + i);
+		src = src & ((1ULL << 5) - 1); // keep only the low 5 bits; higher bits of the input are discarded
+		tmp = src; // value 0 occupies bits 0..4 of word 0
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 5U; // value k lands at bit 5 * k of the column's 320-bit stream (least significant bits first)
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 10U;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 15U;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 25U;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 30U;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 35U;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 40U;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 45U;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 50U;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 55U;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 60U; // value 12 straddles words 0 and 1: its low 4 bits go here
+		*(out + i) = tmp; // word 0 of column i is complete
+		out += 16; // move to the next row of 16 output words
+		src = *(in + 16 * 12 + i); // reload value 12 to emit the bits that did not fit
+		src = src & ((1ULL << 5) - 1);
+		tmp = src >> 4U; // its top bit starts word 1
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 1U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 6U;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 11U;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 21U;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 26U;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 31U;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 36U;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 41U;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 46U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 51U;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 56U;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 61U; // value 25 straddles words 1 and 2: its low 3 bits go here
+		*(out + i) = tmp; // word 1 of column i is complete
+		out += 16;
+		src = *(in + 16 * 25 + i); // reload value 25
+		src = src & ((1ULL << 5) - 1);
+		tmp = src >> 3U; // its upper 2 bits start word 2
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 7U;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 17U;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 22U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 27U;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 37U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 42U;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 47U;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 52U;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 57U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 62U; // value 38 straddles words 2 and 3: its low 2 bits go here
+		*(out + i) = tmp; // word 2 of column i is complete
+		out += 16;
+		src = *(in + 16 * 38 + i); // reload value 38
+		src = src & ((1ULL << 5) - 1);
+		tmp = src >> 2U; // its upper 3 bits start word 3
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 3U;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 13U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 18U;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 23U;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 28U;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 33U;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 38U;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 43U;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 48U;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 53U;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 58U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 63U; // value 51 straddles words 3 and 4: only its lowest bit fits here
+		*(out + i) = tmp; // word 3 of column i is complete
+		out += 16;
+		src = *(in + 16 * 51 + i); // reload value 51
+		src = src & ((1ULL << 5) - 1);
+		tmp = src >> 1U; // its upper 4 bits start word 4
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 9U;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 14U;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 19U;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 29U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 34U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 39U;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 44U;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 49U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 54U;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 59U; // value 63 fills bits 59..63, ending the column's stream
+		*(out + i) = tmp; // word 4 of column i is complete
+		out -= 64; // undo the four "out += 16" steps so the next column starts from the original out
+	}
+}
+void static pack_6bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) { // Packs the low 6 bits of 1024 input words (64 values for each of 16 columns) into 96 output words (6 per column).
+	uint64_t tmp = 0U; // output word currently being assembled
+	uint64_t src; // input value currently being placed
+	for (int i = 0; i < 16; i++) { // i = column; value k of column i is in[16 * k + i], its packed words are out[16 * w + i] for w = 0..5
+		src = *(in + 16 * 0 + i);
+		src = src & ((1ULL << 6) - 1); // keep only the low 6 bits; higher bits of the input are discarded
+		tmp = src; // value 0 occupies bits 0..5 of word 0
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 6U; // value k lands at bit 6 * k of the column's 384-bit stream (least significant bits first)
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 18U;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 30U;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 36U;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 42U;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 48U;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 54U;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 60U; // value 10 straddles words 0 and 1: its low 4 bits go here
+		*(out + i) = tmp; // word 0 of column i is complete
+		out += 16; // move to the next row of 16 output words
+		src = *(in + 16 * 10 + i); // reload value 10 to emit the bits that did not fit
+		src = src & ((1ULL << 6) - 1);
+		tmp = src >> 4U; // its upper 2 bits start word 1
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 14U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 26U;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 38U;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 44U;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 50U;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 56U;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 62U; // value 21 straddles words 1 and 2: its low 2 bits go here
+		*(out + i) = tmp; // word 1 of column i is complete
+		out += 16;
+		src = *(in + 16 * 21 + i); // reload value 21
+		src = src & ((1ULL << 6) - 1);
+		tmp = src >> 2U; // its upper 4 bits start word 2
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 10U;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 22U;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 28U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 34U;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 40U;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 46U;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 52U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 58U; // value 31 ends exactly at bit 63, so word 3 starts on a value boundary
+		*(out + i) = tmp; // word 2 of column i is complete
+		out += 16;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp = src; // value 32 starts word 3 at bit 0; the shift pattern of words 0..2 repeats from here
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 6U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 18U;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 30U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 36U;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 42U;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 48U;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 54U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 60U; // value 42 straddles words 3 and 4: its low 4 bits go here
+		*(out + i) = tmp; // word 3 of column i is complete
+		out += 16;
+		src = *(in + 16 * 42 + i); // reload value 42
+		src = src & ((1ULL << 6) - 1);
+		tmp = src >> 4U; // its upper 2 bits start word 4
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 14U;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 26U;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 38U;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 44U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 50U;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 56U;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 62U; // value 53 straddles words 4 and 5: its low 2 bits go here
+		*(out + i) = tmp; // word 4 of column i is complete
+		out += 16;
+		src = *(in + 16 * 53 + i); // reload value 53
+		src = src & ((1ULL << 6) - 1);
+		tmp = src >> 2U; // its upper 4 bits start word 5
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 10U;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 22U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 28U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 34U;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 40U;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 46U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 52U;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 6) - 1);
+		tmp |= src << 58U; // value 63 fills bits 58..63, ending the column's stream
+		*(out + i) = tmp; // word 5 of column i is complete
+		out -= 80; // undo the five "out += 16" steps so the next column starts from the original out
+	}
+}
+void static pack_7bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) { // Packs the low 7 bits of 1024 input words (64 values for each of 16 columns) into 112 output words (7 per column).
+	uint64_t tmp = 0U; // output word currently being assembled
+	uint64_t src; // input value currently being placed
+	for (int i = 0; i < 16; i++) { // i = column; value k of column i is in[16 * k + i], its packed words are out[16 * w + i] for w = 0..6
+		src = *(in + 16 * 0 + i);
+		src = src & ((1ULL << 7) - 1); // keep only the low 7 bits; higher bits of the input are discarded
+		tmp = src; // value 0 occupies bits 0..6 of word 0
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 7U; // value k lands at bit 7 * k of the column's 448-bit stream (least significant bits first)
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 14U;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 21U;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 28U;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 35U;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 42U;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 49U;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 56U;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 63U; // value 9 straddles words 0 and 1: only its lowest bit fits here
+		*(out + i) = tmp; // word 0 of column i is complete
+		out += 16; // move to the next row of 16 output words
+		src = *(in + 16 * 9 + i); // reload value 9 to emit the bits that did not fit
+		src = src & ((1ULL << 7) - 1);
+		tmp = src >> 1U; // its upper 6 bits start word 1
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 6U;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 13U;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 27U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 34U;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 41U;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 48U;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 55U;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 62U; // value 18 straddles words 1 and 2: its low 2 bits go here
+		*(out + i) = tmp; // word 1 of column i is complete
+		out += 16;
+		src = *(in + 16 * 18 + i); // reload value 18
+		src = src & ((1ULL << 7) - 1);
+		tmp = src >> 2U; // its upper 5 bits start word 2
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 5U;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 19U;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 26U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 33U;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 40U;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 47U;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 54U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 61U; // value 27 straddles words 2 and 3: its low 3 bits go here
+		*(out + i) = tmp; // word 2 of column i is complete
+		out += 16;
+		src = *(in + 16 * 27 + i); // reload value 27
+		src = src & ((1ULL << 7) - 1);
+		tmp = src >> 3U; // its upper 4 bits start word 3
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 11U;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 18U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 25U;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 39U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 46U;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 53U;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 60U; // value 36 straddles words 3 and 4: its low 4 bits go here
+		*(out + i) = tmp; // word 3 of column i is complete
+		out += 16;
+		src = *(in + 16 * 36 + i); // reload value 36
+		src = src & ((1ULL << 7) - 1);
+		tmp = src >> 4U; // its upper 3 bits start word 4
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 3U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 10U;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 17U;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 31U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 38U;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 45U;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 52U;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 59U; // value 45 straddles words 4 and 5: its low 5 bits go here
+		*(out + i) = tmp; // word 4 of column i is complete
+		out += 16;
+		src = *(in + 16 * 45 + i); // reload value 45
+		src = src & ((1ULL << 7) - 1);
+		tmp = src >> 5U; // its upper 2 bits start word 5
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 9U;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 23U;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 30U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 37U;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 44U;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 51U;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 58U; // value 54 straddles words 5 and 6: its low 6 bits go here
+		*(out + i) = tmp; // word 5 of column i is complete
+		out += 16;
+		src = *(in + 16 * 54 + i); // reload value 54
+		src = src & ((1ULL << 7) - 1);
+		tmp = src >> 6U; // its top bit starts word 6
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 1U;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 15U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 22U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 29U;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 36U;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 43U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 50U;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 7) - 1);
+		tmp |= src << 57U; // value 63 fills bits 57..63, ending the column's stream
+		*(out + i) = tmp; // word 6 of column i is complete
+		out -= 96; // undo the six "out += 16" steps so the next column starts from the original out
+	}
+}
+void static pack_8bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) { // For each i in 0..15: packs the low 8 bits of in[16*k + i] (k = 0..63) into 8 words stored at out[16*j + i] (j = 0..7).
+	uint64_t tmp = 0U; // output word under construction; always reassigned before it is stored
+	uint64_t src; // input value currently being packed
+	for (int i = 0; i < 16; i++) { // inputs and outputs that belong to the same i sit 16 elements apart
+		src = *(in + 16 * 0 + i);
+		src = src & ((1ULL << 8) - 1); // keep only the low 8 bits of the input
+		tmp = src; // first value of a word occupies bits 0..7
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 8U; // every following value lands 8 bits higher than the previous one
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 40U;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 48U;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 56U; // eighth value fills bits 56..63, so the word is complete
+		*(out + i) = tmp; // store output word 0 for this i
+		out += 16; // step to the next output word for this i
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp = src; // 8 values fill a word exactly, so each new word starts at bit 0
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 40U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 48U;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp = src;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 40U;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 48U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp = src;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 40U;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 48U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp = src;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 40U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 48U;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp = src;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 40U;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 48U;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp = src;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 40U;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 48U;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp = src;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 40U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 48U;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 8) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp; // eighth and last output word for this i
+		out -= 112; // undo the seven += 16 steps so the next i starts from the original out
+	}
+}
+void static pack_9bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) { // For each i in 0..15: packs the low 9 bits of in[16*k + i] (k = 0..63) back-to-back into 9 words stored at out[16*j + i] (j = 0..8).
+	uint64_t tmp = 0U; // output word under construction; always reassigned before it is stored
+	uint64_t src; // input value currently being packed
+	for (int i = 0; i < 16; i++) { // inputs and outputs that belong to the same i sit 16 elements apart
+		src = *(in + 16 * 0 + i);
+		src = src & ((1ULL << 9) - 1); // keep only the low 9 bits of the input
+		tmp = src; // first value occupies bits 0..8
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 9U; // every following value lands 9 bits higher than the previous one
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 18U;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 27U;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 36U;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 45U;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 54U;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 63U; // value 7 straddles the word boundary: only its lowest bit fits, the rest is shifted out
+		*(out + i) = tmp; // store output word 0 for this i
+		out += 16; // step to the next output word for this i
+		src = *(in + 16 * 7 + i); // re-read the straddling value
+		src = src & ((1ULL << 9) - 1);
+		tmp = src >> 1U; // its remaining 8 bits open the next word at bit 0
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 17U;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 26U;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 35U;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 44U;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 53U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 62U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp = src >> 2U;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 7U;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 25U;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 34U;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 43U;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 52U;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 61U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp = src >> 3U;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 6U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 15U;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 33U;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 42U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 51U;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 5U;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 14U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 23U;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 41U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 50U;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 59U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp = src >> 5U;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 13U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 22U;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 31U;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 40U;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 49U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 58U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp = src >> 6U;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 3U;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 21U;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 30U;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 39U;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 48U;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 57U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp = src >> 7U;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 11U;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 29U;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 38U;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 47U;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 1U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 10U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 19U;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 28U;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 37U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 46U;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 9) - 1);
+		tmp |= src << 55U; // last value ends exactly at bit 63 (55 + 9 = 64)
+		*(out + i) = tmp; // ninth and last output word for this i
+		out -= 128; // undo the eight += 16 steps so the next i starts from the original out
+	}
+}
+void static pack_10bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) { // For each i in 0..15: packs the low 10 bits of in[16*k + i] (k = 0..63) back-to-back into 10 words stored at out[16*j + i] (j = 0..9).
+	uint64_t tmp = 0U; // output word under construction; always reassigned before it is stored
+	uint64_t src; // input value currently being packed
+	for (int i = 0; i < 16; i++) { // inputs and outputs that belong to the same i sit 16 elements apart
+		src = *(in + 16 * 0 + i);
+		src = src & ((1ULL << 10) - 1); // keep only the low 10 bits of the input
+		tmp = src; // first value occupies bits 0..9
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 10U; // every following value lands 10 bits higher than the previous one
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 30U;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 40U;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 50U;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 60U; // value 6 straddles the word boundary: only its low 4 bits fit, the rest is shifted out
+		*(out + i) = tmp; // store output word 0 for this i
+		out += 16; // step to the next output word for this i
+		src = *(in + 16 * 6 + i); // re-read the straddling value
+		src = src & ((1ULL << 10) - 1);
+		tmp = src >> 4U; // its remaining 6 bits open the next word at bit 0
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 6U;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 26U;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 36U;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 46U;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 22U;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 42U;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 52U;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 62U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp = src >> 2U;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 18U;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 28U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 38U;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 48U;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 58U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp = src >> 6U;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 14U;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 34U;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 44U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 54U; // value 31 ends exactly at bit 63 (54 + 10 = 64)
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp = src; // 32 values filled exactly 5 words, so this value starts a word at bit 0 again
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 10U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 30U;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 40U;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 50U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 6U;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 26U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 36U;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 46U;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 22U;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 42U;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 52U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 62U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp = src >> 2U;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 18U;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 28U;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 38U;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 48U;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 58U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp = src >> 6U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 14U;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 34U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 44U;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 10) - 1);
+		tmp |= src << 54U; // last value ends exactly at bit 63 (54 + 10 = 64)
+		*(out + i) = tmp; // tenth and last output word for this i
+		out -= 144; // undo the nine += 16 steps so the next i starts from the original out
+	}
+}
+void static pack_11bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) { // For each i in 0..15: packs the low 11 bits of in[16*k + i] (k = 0..63) back-to-back into 11 words stored at out[16*j + i] (j = 0..10).
+	uint64_t tmp = 0U; // output word under construction; always reassigned before it is stored
+	uint64_t src; // input value currently being packed
+	for (int i = 0; i < 16; i++) { // inputs and outputs that belong to the same i sit 16 elements apart
+		src = *(in + 16 * 0 + i);
+		src = src & ((1ULL << 11) - 1); // keep only the low 11 bits of the input
+		tmp = src; // first value occupies bits 0..10
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 11U; // every following value lands 11 bits higher than the previous one
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 22U;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 33U;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 44U;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 55U; // value 5 straddles the word boundary: only its low 9 bits fit, the rest is shifted out
+		*(out + i) = tmp; // store output word 0 for this i
+		out += 16; // step to the next output word for this i
+		src = *(in + 16 * 5 + i); // re-read the straddling value
+		src = src & ((1ULL << 11) - 1);
+		tmp = src >> 9U; // its remaining 2 bits open the next word at bit 0
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 13U;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 35U;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 46U;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 57U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp = src >> 7U;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 15U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 26U;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 37U;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 48U;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 59U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp = src >> 5U;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 6U;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 17U;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 28U;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 39U;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 50U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 61U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp = src >> 3U;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 19U;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 30U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 41U;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 52U;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 63U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp = src >> 1U;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 10U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 21U;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 43U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 54U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp = src >> 10U;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 1U;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 23U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 34U;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 45U;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 3U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 14U;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 25U;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 36U;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 47U;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 58U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp = src >> 6U;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 5U;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 27U;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 38U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 49U;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 7U;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 18U;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 29U;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 40U;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 51U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 62U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp = src >> 2U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 9U;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 31U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 42U;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 11) - 1);
+		tmp |= src << 53U; // last value ends exactly at bit 63 (53 + 11 = 64)
+		*(out + i) = tmp; // eleventh and last output word for this i
+		out -= 160; // undo the ten += 16 steps so the next i starts from the original out
+	}
+}
+void static pack_12bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) { // For each i in 0..15: packs the low 12 bits of in[16*k + i] (k = 0..63) back-to-back into 12 words stored at out[16*j + i] (j = 0..11).
+	uint64_t tmp = 0U; // output word under construction; always reassigned before it is stored
+	uint64_t src; // input value currently being packed
+	for (int i = 0; i < 16; i++) { // inputs and outputs that belong to the same i sit 16 elements apart
+		src = *(in + 16 * 0 + i);
+		src = src & ((1ULL << 12) - 1); // keep only the low 12 bits of the input
+		tmp = src; // first value occupies bits 0..11
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 12U; // every following value lands 12 bits higher than the previous one
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 36U;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 48U;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 60U; // value 5 straddles the word boundary: only its low 4 bits fit, the rest is shifted out
+		*(out + i) = tmp; // store output word 0 for this i
+		out += 16; // step to the next output word for this i
+		src = *(in + 16 * 5 + i); // re-read the straddling value
+		src = src & ((1ULL << 12) - 1);
+		tmp = src >> 4U; // its remaining 8 bits open the next word at bit 0
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 44U;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 28U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 40U;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 52U; // value 15 ends exactly at bit 63 (52 + 12 = 64)
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp = src; // 16 values filled exactly 3 words, so this value starts a word at bit 0 again
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 36U;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 48U;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 44U;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 28U;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 40U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp = src;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 36U;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 48U;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 44U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 28U;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 40U;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp = src;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 36U;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 48U;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 44U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 28U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 40U;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 12) - 1);
+		tmp |= src << 52U; // last value ends exactly at bit 63 (52 + 12 = 64)
+		*(out + i) = tmp; // twelfth and last output word for this i
+		out -= 176; // undo the eleven += 16 steps so the next i starts from the original out
+	}
+}
+void static pack_13bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) { // Packs the low 13 bits of 64 values per lane into 13 output words per lane, for 16 interleaved lanes (reads in[0..1023], writes out[0..207]).
+	uint64_t tmp = 0U; // output word currently being assembled
+	uint64_t src; // current input value, reduced to its low 13 bits
+	for (int i = 0; i < 16; i++) { // i selects the lane: value k is read from in[16 * k + i], word w is written to out[16 * w + i]
+		src = *(in + 16 * 0 + i);
+		src = src & ((1ULL << 13) - 1); // keep only the low 13 bits of the value
+		tmp = src; // value 0 occupies bits 0..12 of word 0
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 13U;
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 26U;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 39U;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 52U; // bits 52..63 take the low 12 bits of value 4; its top bit is shifted out and re-added to the next word below
+		*(out + i) = tmp; // word 0 of lane i
+		out += 16; // step to the next output word of this lane (consecutive words of one lane are 16 elements apart)
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp = src >> 12U; // value 4 again: skip the 12 low bits already stored in the previous word
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 1U;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 14U;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 27U;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 40U;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 53U;
+		*(out + i) = tmp; // word 1 of lane i
+		out += 16;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp = src >> 11U; // value 9 again: skip the 11 low bits already stored in the previous word
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 15U;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 28U;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 41U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 54U;
+		*(out + i) = tmp; // word 2 of lane i
+		out += 16;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp = src >> 10U; // value 14 again: skip the 10 low bits already stored in the previous word
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 3U;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 29U;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 42U;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 55U;
+		*(out + i) = tmp; // word 3 of lane i
+		out += 16;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp = src >> 9U; // value 19 again: skip the 9 low bits already stored in the previous word
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 17U;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 30U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 43U;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp; // word 4 of lane i
+		out += 16;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp = src >> 8U; // value 24 again: skip the 8 low bits already stored in the previous word
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 5U;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 18U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 31U;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 44U;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 57U;
+		*(out + i) = tmp; // word 5 of lane i
+		out += 16;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp = src >> 7U; // value 29 again: skip the 7 low bits already stored in the previous word
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 6U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 19U;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 45U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 58U;
+		*(out + i) = tmp; // word 6 of lane i
+		out += 16;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp = src >> 6U; // value 34 again: skip the 6 low bits already stored in the previous word
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 7U;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 33U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 46U;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 59U;
+		*(out + i) = tmp; // word 7 of lane i
+		out += 16;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp = src >> 5U; // value 39 again: skip the 5 low bits already stored in the previous word
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 21U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 34U;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 47U;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp; // word 8 of lane i
+		out += 16;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp = src >> 4U; // value 44 again: skip the 4 low bits already stored in the previous word
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 9U;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 22U;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 35U;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 48U;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 61U;
+		*(out + i) = tmp; // word 9 of lane i
+		out += 16;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp = src >> 3U; // value 49 again: skip the 3 low bits already stored in the previous word
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 10U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 23U;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 36U;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 49U;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 62U;
+		*(out + i) = tmp; // word 10 of lane i
+		out += 16;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp = src >> 2U; // value 54 again: skip the 2 low bits already stored in the previous word
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 11U;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 37U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 50U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 63U;
+		*(out + i) = tmp; // word 11 of lane i
+		out += 16;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp = src >> 1U; // value 59 again: skip the 1 low bit already stored in the previous word
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 25U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 38U;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 13) - 1);
+		tmp |= src << 51U; // value 63 ends exactly at bit 63 (64 * 13 bits = 13 full words)
+		*(out + i) = tmp; // word 12 (last) of lane i
+		out -= 192; // undo the 12 steps of 16 so the next lane starts from the original out
+	}
+}
+void static pack_14bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) { // Packs the low 14 bits of 64 values per lane into 14 output words per lane, for 16 interleaved lanes (reads in[0..1023], writes out[0..223]).
+	uint64_t tmp = 0U; // output word currently being assembled
+	uint64_t src; // current input value, reduced to its low 14 bits
+	for (int i = 0; i < 16; i++) { // i selects the lane: value k is read from in[16 * k + i], word w is written to out[16 * w + i]
+		src = *(in + 16 * 0 + i);
+		src = src & ((1ULL << 14) - 1); // keep only the low 14 bits of the value
+		tmp = src; // value 0 occupies bits 0..13 of word 0
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 14U;
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 28U;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 42U;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 56U; // bits 56..63 take the low 8 bits of value 4; its upper 6 bits are shifted out and re-added to the next word below
+		*(out + i) = tmp; // word 0 of lane i
+		out += 16; // step to the next output word of this lane (consecutive words of one lane are 16 elements apart)
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp = src >> 8U; // value 4 again: skip the 8 low bits already stored in the previous word
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 6U;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 34U;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 48U;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 62U;
+		*(out + i) = tmp; // word 1 of lane i
+		out += 16;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp = src >> 2U; // value 9 again: skip the 2 low bits already stored in the previous word
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 26U;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 40U;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 54U;
+		*(out + i) = tmp; // word 2 of lane i
+		out += 16;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp = src >> 10U; // value 13 again: skip the 10 low bits already stored in the previous word
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 18U;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 46U;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp; // word 3 of lane i
+		out += 16;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp = src >> 4U; // value 18 again: skip the 4 low bits already stored in the previous word
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 10U;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 38U;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp; // word 4 of lane i
+		out += 16;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp = src >> 12U; // value 22 again: skip the 12 low bits already stored in the previous word
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 30U;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 44U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 58U;
+		*(out + i) = tmp; // word 5 of lane i
+		out += 16;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp = src >> 6U; // value 27 again: skip the 6 low bits already stored in the previous word
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 22U;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 36U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 50U; // value 31 ends exactly at bit 63 (32 * 14 bits = 7 full words)
+		*(out + i) = tmp; // word 6 of lane i
+		out += 16;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp = src; // value 32 starts on a word boundary, so the shift pattern of words 0..6 repeats for words 7..13
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 14U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 28U;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 42U;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp; // word 7 of lane i
+		out += 16;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp = src >> 8U; // value 36 again: skip the 8 low bits already stored in the previous word
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 6U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 34U;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 48U;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 62U;
+		*(out + i) = tmp; // word 8 of lane i
+		out += 16;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp = src >> 2U; // value 41 again: skip the 2 low bits already stored in the previous word
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 26U;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 40U;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 54U;
+		*(out + i) = tmp; // word 9 of lane i
+		out += 16;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp = src >> 10U; // value 45 again: skip the 10 low bits already stored in the previous word
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 18U;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 46U;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp; // word 10 of lane i
+		out += 16;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp = src >> 4U; // value 50 again: skip the 4 low bits already stored in the previous word
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 10U;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 38U;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp; // word 11 of lane i
+		out += 16;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp = src >> 12U; // value 54 again: skip the 12 low bits already stored in the previous word
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 30U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 44U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 58U;
+		*(out + i) = tmp; // word 12 of lane i
+		out += 16;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp = src >> 6U; // value 59 again: skip the 6 low bits already stored in the previous word
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 22U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 36U;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 14) - 1);
+		tmp |= src << 50U; // value 63 ends exactly at bit 63 (64 * 14 bits = 14 full words)
+		*(out + i) = tmp; // word 13 (last) of lane i
+		out -= 208; // undo the 13 steps of 16 so the next lane starts from the original out
+	}
+}
+void static pack_15bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) { // Packs the low 15 bits of 64 values per lane into 15 output words per lane, for 16 interleaved lanes (reads in[0..1023], writes out[0..239]).
+	uint64_t tmp = 0U; // output word currently being assembled
+	uint64_t src; // current input value, reduced to its low 15 bits
+	for (int i = 0; i < 16; i++) { // i selects the lane: value k is read from in[16 * k + i], word w is written to out[16 * w + i]
+		src = *(in + 16 * 0 + i);
+		src = src & ((1ULL << 15) - 1); // keep only the low 15 bits of the value
+		tmp = src; // value 0 occupies bits 0..14 of word 0
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 15U;
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 30U;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 45U;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 60U; // bits 60..63 take the low 4 bits of value 4; its upper 11 bits are shifted out and re-added to the next word below
+		*(out + i) = tmp; // word 0 of lane i
+		out += 16; // step to the next output word of this lane (consecutive words of one lane are 16 elements apart)
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp = src >> 4U; // value 4 again: skip the 4 low bits already stored in the previous word
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 11U;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 26U;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 41U;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp; // word 1 of lane i
+		out += 16;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp = src >> 8U; // value 8 again: skip the 8 low bits already stored in the previous word
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 7U;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 22U;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 37U;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp; // word 2 of lane i
+		out += 16;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp = src >> 12U; // value 12 again: skip the 12 low bits already stored in the previous word
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 3U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 18U;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 33U;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 48U;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 63U;
+		*(out + i) = tmp; // word 3 of lane i
+		out += 16;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp = src >> 1U; // value 17 again: skip the 1 low bit already stored in the previous word
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 14U;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 29U;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 44U;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 59U;
+		*(out + i) = tmp; // word 4 of lane i
+		out += 16;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp = src >> 5U; // value 21 again: skip the 5 low bits already stored in the previous word
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 10U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 25U;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 40U;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 55U;
+		*(out + i) = tmp; // word 5 of lane i
+		out += 16;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp = src >> 9U; // value 25 again: skip the 9 low bits already stored in the previous word
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 6U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 21U;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 36U;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 51U;
+		*(out + i) = tmp; // word 6 of lane i
+		out += 16;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp = src >> 13U; // value 29 again: skip the 13 low bits already stored in the previous word
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 17U;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 47U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 62U;
+		*(out + i) = tmp; // word 7 of lane i
+		out += 16;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp = src >> 2U; // value 34 again: skip the 2 low bits already stored in the previous word
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 13U;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 28U;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 43U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 58U;
+		*(out + i) = tmp; // word 8 of lane i
+		out += 16;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp = src >> 6U; // value 38 again: skip the 6 low bits already stored in the previous word
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 9U;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 39U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 54U;
+		*(out + i) = tmp; // word 9 of lane i
+		out += 16;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp = src >> 10U; // value 42 again: skip the 10 low bits already stored in the previous word
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 5U;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 35U;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 50U;
+		*(out + i) = tmp; // word 10 of lane i
+		out += 16;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp = src >> 14U; // value 46 again: skip the 14 low bits already stored in the previous word
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 1U;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 31U;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 46U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 61U;
+		*(out + i) = tmp; // word 11 of lane i
+		out += 16;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp = src >> 3U; // value 51 again: skip the 3 low bits already stored in the previous word
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 27U;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 42U;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 57U;
+		*(out + i) = tmp; // word 12 of lane i
+		out += 16;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp = src >> 7U; // value 55 again: skip the 7 low bits already stored in the previous word
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 23U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 38U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 53U;
+		*(out + i) = tmp; // word 13 of lane i
+		out += 16;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp = src >> 11U; // value 59 again: skip the 11 low bits already stored in the previous word
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 19U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 34U;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 15) - 1);
+		tmp |= src << 49U; // value 63 ends exactly at bit 63 (64 * 15 bits = 15 full words)
+		*(out + i) = tmp; // word 14 (last) of lane i
+		out -= 224; // undo the 14 steps of 16 so the next lane starts from the original out
+	}
+}
+void static pack_16bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) { // Packs the low 16 bits of 64 values per lane into 16 output words per lane, for 16 interleaved lanes (reads in[0..1023], writes out[0..255]).
+	uint64_t tmp = 0U; // output word currently being assembled
+	uint64_t src; // current input value, reduced to its low 16 bits
+	for (int i = 0; i < 16; i++) { // i selects the lane: value k is read from in[16 * k + i], word w is written to out[16 * w + i]
+		src = *(in + 16 * 0 + i);
+		src = src & ((1ULL << 16) - 1); // keep only the low 16 bits of the value
+		tmp = src; // four 16-bit values fill one word exactly, so no value straddles two words
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp; // word 0 of lane i (values 0..3)
+		out += 16; // step to the next output word of this lane (consecutive words of one lane are 16 elements apart)
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp = src;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp; // word 1 of lane i (values 4..7)
+		out += 16;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp = src;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp; // word 2 of lane i (values 8..11)
+		out += 16;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp = src;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp; // word 3 of lane i (values 12..15)
+		out += 16;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp = src;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp; // word 4 of lane i (values 16..19)
+		out += 16;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp = src;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp; // word 5 of lane i (values 20..23)
+		out += 16;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp = src;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp; // word 6 of lane i (values 24..27)
+		out += 16;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp = src;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp; // word 7 of lane i (values 28..31)
+		out += 16;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp = src;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp; // word 8 of lane i (values 32..35)
+		out += 16;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp = src;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp; // word 9 of lane i (values 36..39)
+		out += 16;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp = src;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp; // word 10 of lane i (values 40..43)
+		out += 16;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp = src;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp; // word 11 of lane i (values 44..47)
+		out += 16;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp = src;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp; // word 12 of lane i (values 48..51)
+		out += 16;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp = src;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp; // word 13 of lane i (values 52..55)
+		out += 16;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp = src;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp; // word 14 of lane i (values 56..59)
+		out += 16;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp = src;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 16) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp; // word 15 (last) of lane i (values 60..63)
+		out -= 240; // undo the 15 steps of 16 so the next lane starts from the original out
+	}
+}
+void static pack_17bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) {
+	uint64_t tmp = 0U;
+	uint64_t src;
+	for (int i = 0; i < 16; i++) {
+		src = *(in + 16 * 0 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp = src;
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 17U;
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 34U;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 51U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp = src >> 13U;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 21U;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 38U;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 55U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp = src >> 9U;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 25U;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 42U;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 59U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp = src >> 5U;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 29U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 46U;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 63U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp = src >> 1U;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 33U;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 50U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp = src >> 14U;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 3U;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 37U;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 54U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp = src >> 10U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 7U;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 41U;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 58U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp = src >> 6U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 11U;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 28U;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 45U;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 62U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp = src >> 2U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 15U;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 49U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp = src >> 15U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 19U;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 36U;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 53U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp = src >> 11U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 6U;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 23U;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 40U;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 57U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp = src >> 7U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 10U;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 27U;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 44U;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 61U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp = src >> 3U;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 14U;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 31U;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 1U;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 18U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 35U;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 5U;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 22U;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 39U;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 9U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 26U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 43U;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 13U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 30U;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 17) - 1);
+		tmp |= src << 47U;
+		*(out + i) = tmp;
+		out -= 256;
+	}
+}
+void static pack_18bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) { // Packs 64 values per lane, 18 bits each: for lane i, in[16*k + i] (k = 0..63) are concatenated LSB-first into out[16*w + i] (w = 0..17).
+	uint64_t tmp = 0U; // output word being assembled; this 0U is overwritten before it is ever read
+	uint64_t src; // current input value after masking to its low 18 bits
+	for (int i = 0; i < 16; i++) { // one iteration per lane i (16 lanes, stride 16 in both buffers)
+		src = *(in + 16 * 0 + i);
+		src = src & ((1ULL << 18) - 1); // keep the low 18 bits only; higher input bits are discarded
+		tmp = src; // value 0 opens output word 0 at bit 0
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 18U;
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 36U;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 54U; // only the low 10 bits of value 3 fit; the shift pushes the other 8 out of the word
+		*(out + i) = tmp; // store the finished word for lane i
+		out += 16; // advance to this lane's next output word
+		src = *(in + 16 * 3 + i); // value 3 is re-read to emit the bits that did not fit
+		src = src & ((1ULL << 18) - 1);
+		tmp = src >> 10U; // remaining high 8 bits of value 3 open word 1
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 26U;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 44U;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 62U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp = src >> 2U;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 34U;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 6U;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 42U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 14U;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 50U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp = src >> 14U;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 22U;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 40U;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 58U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp = src >> 6U;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 30U;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 38U;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 10U;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 28U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 46U; // 46 + 18 == 64: value 31 ends exactly on the word boundary (32 values fill 9 words)
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 32 + i); // nothing carried over: value 32 opens word 9 at bit 0 and the shift pattern above repeats
+		src = src & ((1ULL << 18) - 1);
+		tmp = src;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 18U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 36U;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 54U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp = src >> 10U;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 26U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 44U;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 62U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp = src >> 2U;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 34U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 6U;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 42U;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 14U;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 50U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp = src >> 14U;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 22U;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 40U;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 58U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp = src >> 6U;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 30U;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 38U;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 10U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 28U;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 18) - 1);
+		tmp |= src << 46U;
+		*(out + i) = tmp; // 18th and last word for lane i
+		out -= 272; // undo the 17 advances above (17 * 16) so the next lane starts from the original out
+	}
+}
+void static pack_19bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) { // Packs 64 values per lane, 19 bits each: for lane i, in[16*k + i] (k = 0..63) are concatenated LSB-first into out[16*w + i] (w = 0..18).
+	uint64_t tmp = 0U; // output word being assembled; this 0U is overwritten before it is ever read
+	uint64_t src; // current input value after masking to its low 19 bits
+	for (int i = 0; i < 16; i++) { // one iteration per lane i (16 lanes, stride 16 in both buffers)
+		src = *(in + 16 * 0 + i);
+		src = src & ((1ULL << 19) - 1); // keep the low 19 bits only; higher input bits are discarded
+		tmp = src; // value 0 opens output word 0 at bit 0
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 19U;
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 38U;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 57U; // only the low 7 bits of value 3 fit; the shift pushes the other 12 out of the word
+		*(out + i) = tmp; // store the finished word for lane i
+		out += 16; // advance to this lane's next output word
+		src = *(in + 16 * 3 + i); // value 3 is re-read to emit the bits that did not fit
+		src = src & ((1ULL << 19) - 1);
+		tmp = src >> 7U; // remaining high 12 bits of value 3 open word 1
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 31U;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 50U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp = src >> 14U;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 5U;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 43U;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 62U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp = src >> 2U;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 17U;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 36U;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 55U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp = src >> 9U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 10U;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 29U;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 3U;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 22U;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 41U;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 15U;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 34U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 53U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp = src >> 11U;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 27U;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 46U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp = src >> 18U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 1U;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 39U;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 58U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp = src >> 6U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 13U;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 51U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp = src >> 13U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 6U;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 25U;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 44U;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 63U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp = src >> 1U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 18U;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 37U;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 11U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 30U;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 49U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp = src >> 15U;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 23U;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 42U;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 61U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp = src >> 3U;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 35U;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 54U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp = src >> 10U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 9U;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 28U;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 47U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp = src >> 17U;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 21U;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 40U;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 59U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp = src >> 5U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 14U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 33U;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 7U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 26U;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 19) - 1);
+		tmp |= src << 45U; // 45 + 19 == 64: value 63 ends exactly at the top of the last word
+		*(out + i) = tmp; // 19th and last word for lane i
+		out -= 288; // undo the 18 advances above (18 * 16) so the next lane starts from the original out
+	}
+}
+void static pack_20bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) { // Packs 64 values per lane, 20 bits each: for lane i, in[16*k + i] (k = 0..63) are concatenated LSB-first into out[16*w + i] (w = 0..19).
+	uint64_t tmp = 0U; // output word being assembled; this 0U is overwritten before it is ever read
+	uint64_t src; // current input value after masking to its low 20 bits
+	for (int i = 0; i < 16; i++) { // one iteration per lane i (16 lanes, stride 16 in both buffers)
+		src = *(in + 16 * 0 + i);
+		src = src & ((1ULL << 20) - 1); // keep the low 20 bits only; higher input bits are discarded
+		tmp = src; // value 0 opens output word 0 at bit 0
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 40U;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 60U; // only the low 4 bits of value 3 fit; the shift pushes the other 16 out of the word
+		*(out + i) = tmp; // store the finished word for lane i
+		out += 16; // advance to this lane's next output word
+		src = *(in + 16 * 3 + i); // value 3 is re-read to emit the bits that did not fit
+		src = src & ((1ULL << 20) - 1);
+		tmp = src >> 4U; // remaining high 16 bits of value 3 open word 1
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 36U;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 28U;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 44U; // 44 + 20 == 64: value 15 ends exactly on the word boundary (16 values fill 5 words)
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 16 + i); // nothing carried over: value 16 opens word 5 at bit 0 and the shift pattern above repeats
+		src = src & ((1ULL << 20) - 1);
+		tmp = src;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 40U;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 36U;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 28U;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp = src;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 40U;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 36U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 28U;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp = src;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 40U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 36U;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 28U;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 20) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp; // 20th and last word for lane i
+		out -= 304; // undo the 19 advances above (19 * 16) so the next lane starts from the original out
+	}
+}
+void static pack_21bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) {
+	uint64_t tmp = 0U;
+	uint64_t src;
+	for (int i = 0; i < 16; i++) {
+		src = *(in + 16 * 0 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp = src;
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 21U;
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 42U;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 63U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp = src >> 1U;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 41U;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 62U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp = src >> 2U;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 19U;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 40U;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 61U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp = src >> 3U;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 18U;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 39U;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 17U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 38U;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 59U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp = src >> 5U;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 37U;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 58U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp = src >> 6U;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 15U;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 36U;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 57U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp = src >> 7U;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 14U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 35U;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 13U;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 34U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 55U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp = src >> 9U;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 33U;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 54U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp = src >> 10U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 11U;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 53U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp = src >> 11U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 10U;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 31U;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 9U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 30U;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 51U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp = src >> 13U;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 29U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 50U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp = src >> 14U;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 7U;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 28U;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 49U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp = src >> 15U;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 6U;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 27U;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 5U;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 26U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 47U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp = src >> 17U;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 25U;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 46U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp = src >> 18U;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 3U;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 45U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp = src >> 19U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 23U;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp = src >> 20U;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 1U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 22U;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 21) - 1);
+		tmp |= src << 43U;
+		*(out + i) = tmp;
+		out -= 320;
+	}
+}
+void static pack_22bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) { // Packs the low 22 bits of each of in[0..1023] into the 352 words out[0..351].
+	uint64_t tmp = 0U; // output word currently being assembled (this initial 0 is overwritten before it is read)
+	uint64_t src; // most recently loaded input value
+	for (int i = 0; i < 16; i++) { // each i consumes in[i], in[16 + i], ..., in[16 * 63 + i] and produces out[i], out[16 + i], ..., out[16 * 21 + i]
+		src = *(in + 16 * 0 + i); // value 0 of this stride-16 sequence
+		src = src & ((1ULL << 22) - 1); // keep only the low 22 bits
+		tmp = src; // value 0 occupies bits 0..21
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 22U; // value 1 occupies bits 22..43
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 44U; // low 20 bits of value 2 fill bits 44..63; its top 2 bits are shifted out here
+		*(out + i) = tmp; // store word 0 for this i
+		out += 16; // advance to the next output word for this i
+		src = *(in + 16 * 2 + i); // reload value 2 to recover the bits that did not fit
+		src = src & ((1ULL << 22) - 1);
+		tmp = src >> 20U; // top 2 bits of value 2 start word 1
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 46U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp = src >> 18U;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 26U;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 6U;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 28U;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 50U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp = src >> 14U;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 30U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 10U;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 54U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp = src >> 10U;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 34U;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 14U;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 36U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 58U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp = src >> 6U;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 38U;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 18U;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 40U;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 62U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp = src >> 2U;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 42U; // value 31 occupies bits 42..63, ending exactly on the word boundary
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 32 + i); // value 32 starts a fresh word; values 32..63 reuse the shift pattern of values 0..31
+		src = src & ((1ULL << 22) - 1);
+		tmp = src;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 22U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp = src >> 20U;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 46U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp = src >> 18U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 26U;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 6U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 28U;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 50U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp = src >> 14U;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 30U;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 10U;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 54U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp = src >> 10U;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 34U;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 14U;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 36U;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 58U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp = src >> 6U;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 38U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 18U;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 40U;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 62U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp = src >> 2U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 22) - 1);
+		tmp |= src << 42U;
+		*(out + i) = tmp; // store word 21, the last word for this i
+		out -= 336; // undo the 21 increments of 16 so the next i starts from the original out
+	}
+}
+void static pack_23bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) { // Packs the low 23 bits of each of in[0..1023] into the 368 words out[0..367].
+	uint64_t tmp = 0U; // output word currently being assembled (this initial 0 is overwritten before it is read)
+	uint64_t src; // most recently loaded input value
+	for (int i = 0; i < 16; i++) { // each i consumes in[i], in[16 + i], ..., in[16 * 63 + i] and produces out[i], out[16 + i], ..., out[16 * 22 + i]
+		src = *(in + 16 * 0 + i); // value 0 of this stride-16 sequence
+		src = src & ((1ULL << 23) - 1); // keep only the low 23 bits
+		tmp = src; // value 0 occupies bits 0..22
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 23U; // value 1 occupies bits 23..45
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 46U; // low 18 bits of value 2 fill bits 46..63; its top 5 bits are shifted out here
+		*(out + i) = tmp; // store word 0 for this i
+		out += 16; // advance to the next output word for this i
+		src = *(in + 16 * 2 + i); // reload value 2 to recover the bits that did not fit
+		src = src & ((1ULL << 23) - 1);
+		tmp = src >> 18U; // top 5 bits of value 2 start word 1
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 5U;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 28U;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 51U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp = src >> 13U;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 10U;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 33U;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 15U;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 38U;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 61U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp = src >> 3U;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 43U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp = src >> 21U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 25U;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 7U;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 30U;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 53U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp = src >> 11U;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 35U;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 58U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp = src >> 6U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 17U;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 40U;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 63U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp = src >> 1U;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 22U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 45U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp = src >> 19U;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 27U;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 50U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp = src >> 14U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 9U;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 55U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp = src >> 9U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 14U;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 37U;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 19U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 42U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp = src >> 22U;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 1U;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 47U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp = src >> 17U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 6U;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 29U;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 11U;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 34U;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 57U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp = src >> 7U;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 39U;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 62U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp = src >> 2U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 21U;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp = src >> 20U;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 3U;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 26U;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 49U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp = src >> 15U;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 31U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 54U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp = src >> 10U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 13U;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 36U;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 59U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp = src >> 5U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 18U;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 23) - 1);
+		tmp |= src << 41U; // value 63 occupies bits 41..63, ending exactly on the word boundary
+		*(out + i) = tmp; // store word 22, the last word for this i
+		out -= 352; // undo the 22 increments of 16 so the next i starts from the original out
+	}
+}
+void static pack_24bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) { // Packs the low 24 bits of each of in[0..1023] into the 384 words out[0..383].
+	uint64_t tmp = 0U; // output word currently being assembled (this initial 0 is overwritten before it is read)
+	uint64_t src; // most recently loaded input value
+	for (int i = 0; i < 16; i++) { // each i consumes in[i], in[16 + i], ..., in[16 * 63 + i] and produces out[i], out[16 + i], ..., out[16 * 23 + i]
+		src = *(in + 16 * 0 + i); // value 0 of this stride-16 sequence
+		src = src & ((1ULL << 24) - 1); // keep only the low 24 bits
+		tmp = src; // value 0 occupies bits 0..23
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 24U; // value 1 occupies bits 24..47
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 48U; // low 16 bits of value 2 fill bits 48..63; its top 8 bits are shifted out here
+		*(out + i) = tmp; // store word 0 for this i
+		out += 16; // advance to the next output word for this i
+		src = *(in + 16 * 2 + i); // reload value 2 to recover the bits that did not fit
+		src = src & ((1ULL << 24) - 1);
+		tmp = src >> 16U; // top 8 bits of value 2 start word 1
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 56U; // low 8 bits of value 5 fill bits 56..63; its top 16 bits are shifted out here
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp = src >> 8U; // top 16 bits of value 5 start word 2
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 40U; // value 7 occupies bits 40..63, ending exactly on the word boundary
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 8 + i); // value 8 starts a fresh word; every further group of 8 values reuses the same 3-word shift pattern
+		src = src & ((1ULL << 24) - 1);
+		tmp = src;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp = src;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp = src;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp = src;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp = src;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp = src;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp = src;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 24) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp; // store word 23, the last word for this i
+		out -= 368; // undo the 23 increments of 16 so the next i starts from the original out
+	}
+}
+void static pack_25bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) {
+	uint64_t tmp = 0U;
+	uint64_t src;
+	for (int i = 0; i < 16; i++) {
+		src = *(in + 16 * 0 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp = src;
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 25U;
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 50U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp = src >> 14U;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 11U;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 36U;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 61U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp = src >> 3U;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 22U;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 47U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp = src >> 17U;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 33U;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 58U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp = src >> 6U;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 19U;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp = src >> 20U;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 5U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 30U;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 55U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp = src >> 9U;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 41U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp = src >> 23U;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 27U;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 13U;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 38U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 63U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp = src >> 1U;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 49U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp = src >> 15U;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 10U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 35U;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 21U;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 46U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp = src >> 18U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 7U;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 57U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp = src >> 7U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 18U;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 43U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp = src >> 21U;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 29U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 54U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp = src >> 10U;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 15U;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 1U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 26U;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 51U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp = src >> 13U;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 37U;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 62U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp = src >> 2U;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 23U;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 9U;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 34U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 59U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp = src >> 5U;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 45U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp = src >> 19U;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 6U;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 31U;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 17U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 42U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp = src >> 22U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 3U;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 28U;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 53U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp = src >> 11U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 14U;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 25) - 1);
+		tmp |= src << 39U;
+		*(out + i) = tmp;
+		out -= 384;
+	}
+}
+void static pack_26bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) { // Bit-packs the low 26 bits of in[16 * k + i] (k = 0..63, i = 0..15) into out[16 * w + i] (w = 0..25); input bits above bit 25 are dropped.
+	uint64_t tmp = 0U; // output word under construction; reassigned at the start of every word, so this 0 is never read
+	uint64_t src; // current input value, masked to 26 bits after every load
+	for (int i = 0; i < 16; i++) { // one pass per column i: 64 inputs are packed into 26 output words
+		src = *(in + 16 * 0 + i); // value k of this column is read from in[16 * k + i]
+		src = src & ((1ULL << 26) - 1); // keep only the low 26 bits
+		tmp = src; // value 0 occupies bits 0..25
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 26U; // value 1 occupies bits 26..51
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 52U; // only the low 12 bits of value 2 fit (bits 52..63); the rest is shifted out of the word
+		*(out + i) = tmp; // first output word of this column is complete
+		out += 16; // advance to the next output row, same column i
+		src = *(in + 16 * 2 + i); // reload value 2 to place the bits that did not fit
+		src = src & ((1ULL << 26) - 1);
+		tmp = src >> 12U; // upper 14 bits of value 2 start the next word
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 14U;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 28U;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 54U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp = src >> 10U;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 42U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp = src >> 22U;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 30U;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 18U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp = src >> 20U;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 6U;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 58U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp = src >> 6U;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 46U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp = src >> 18U;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 34U;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 22U;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 10U;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 36U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 62U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp = src >> 2U;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 50U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp = src >> 14U;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 38U; // value 31 ends exactly at bit 63 (38 + 26 = 64), so nothing carries over
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 32 + i); // 32 values * 26 bits = 13 full words: value 32 starts word-aligned and the shift pattern repeats
+		src = src & ((1ULL << 26) - 1);
+		tmp = src;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 26U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 14U;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 28U;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 54U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp = src >> 10U;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 42U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp = src >> 22U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 30U;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 18U;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp = src >> 20U;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 6U;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 58U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp = src >> 6U;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 46U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp = src >> 18U;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 34U;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 22U;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 10U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 36U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 62U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp = src >> 2U;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 50U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp = src >> 14U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 26) - 1);
+		tmp |= src << 38U; // value 63 ends exactly at bit 63 of the last word
+		*(out + i) = tmp; // last of the 26 output words for this column
+		out -= 400; // undo the 25 "out += 16" steps (25 * 16) so the next column starts from the original out
+	}
+}
+void static pack_27bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) { // Bit-packs the low 27 bits of in[16 * k + i] (k = 0..63, i = 0..15) into out[16 * w + i] (w = 0..26); input bits above bit 26 are dropped.
+	uint64_t tmp = 0U; // output word under construction; reassigned at the start of every word, so this 0 is never read
+	uint64_t src; // current input value, masked to 27 bits after every load
+	for (int i = 0; i < 16; i++) { // one pass per column i: 64 inputs are packed into 27 output words
+		src = *(in + 16 * 0 + i); // value k of this column is read from in[16 * k + i]
+		src = src & ((1ULL << 27) - 1); // keep only the low 27 bits
+		tmp = src; // value 0 occupies bits 0..26
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 27U; // value 1 occupies bits 27..53
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 54U; // only the low 10 bits of value 2 fit (bits 54..63); the rest is shifted out of the word
+		*(out + i) = tmp; // first output word of this column is complete
+		out += 16; // advance to the next output row, same column i
+		src = *(in + 16 * 2 + i); // reload value 2 to place the bits that did not fit
+		src = src & ((1ULL << 27) - 1);
+		tmp = src >> 10U; // upper 17 bits of value 2 start the next word
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 17U;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp = src >> 20U;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 7U;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 34U;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 61U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp = src >> 3U;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 51U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp = src >> 13U;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 14U;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 41U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp = src >> 23U;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 31U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 58U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp = src >> 6U;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 21U;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 11U;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 38U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp = src >> 26U;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 1U;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 28U;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 55U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp = src >> 9U;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 18U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 45U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp = src >> 19U;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 35U;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 62U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp = src >> 2U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 25U;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 15U;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 42U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp = src >> 22U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 5U;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 32U; // unlike the even widths, value 32 is not word-aligned here (32 * 27 = 864 = 13 * 64 + 32)
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 59U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp = src >> 5U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 22U;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 49U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp = src >> 15U;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 39U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp = src >> 25U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 29U;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 19U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 46U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp = src >> 18U;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 9U;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 36U;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 63U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp = src >> 1U;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 26U;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 53U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp = src >> 11U;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 43U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp = src >> 21U;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 6U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 33U;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 23U;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 50U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp = src >> 14U;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 13U;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 3U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 30U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 57U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp = src >> 7U;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 47U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp = src >> 17U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 10U;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 27) - 1);
+		tmp |= src << 37U; // value 63 ends exactly at bit 63 of the last word (37 + 27 = 64)
+		*(out + i) = tmp; // last of the 27 output words for this column
+		out -= 416; // undo the 26 "out += 16" steps (26 * 16) so the next column starts from the original out
+	}
+}
+void static pack_28bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) { // Bit-packs the low 28 bits of in[16 * k + i] (k = 0..63, i = 0..15) into out[16 * w + i] (w = 0..27); input bits above bit 27 are dropped.
+	uint64_t tmp = 0U; // output word under construction; reassigned at the start of every word, so this 0 is never read
+	uint64_t src; // current input value, masked to 28 bits after every load
+	for (int i = 0; i < 16; i++) { // one pass per column i: 64 inputs are packed into 28 output words
+		src = *(in + 16 * 0 + i); // value k of this column is read from in[16 * k + i]
+		src = src & ((1ULL << 28) - 1); // keep only the low 28 bits
+		tmp = src; // value 0 occupies bits 0..27
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 28U; // value 1 occupies bits 28..55
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 56U; // only the low 8 bits of value 2 fit (bits 56..63); the rest is shifted out of the word
+		*(out + i) = tmp; // first output word of this column is complete
+		out += 16; // advance to the next output row, same column i
+		src = *(in + 16 * 2 + i); // reload value 2 to place the bits that did not fit
+		src = src & ((1ULL << 28) - 1);
+		tmp = src >> 8U; // upper 20 bits of value 2 start the next word
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp = src >> 20U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 36U; // value 15 ends exactly at bit 63 (36 + 28 = 64), so nothing carries over
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 16 + i); // 16 values * 28 bits = 7 full words: value 16 starts word-aligned and the shift pattern repeats
+		src = src & ((1ULL << 28) - 1);
+		tmp = src;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 28U;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp = src >> 20U;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 36U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 32 + i); // word-aligned again: third repetition of the 16-value / 7-word pattern
+		src = src & ((1ULL << 28) - 1);
+		tmp = src;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 28U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp = src >> 20U;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 36U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 48 + i); // word-aligned again: fourth and last repetition of the pattern
+		src = src & ((1ULL << 28) - 1);
+		tmp = src;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 28U;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp = src >> 20U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 28) - 1);
+		tmp |= src << 36U; // value 63 ends exactly at bit 63 of the last word
+		*(out + i) = tmp; // last of the 28 output words for this column
+		out -= 432; // undo the 27 "out += 16" steps (27 * 16) so the next column starts from the original out
+	}
+}
+void static pack_29bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) { // Bit-packs 64 rows x 16 lanes of inputs (read at in[16 * row + lane], low 29 bits kept) into 29 words per lane, written at out[16 * word + lane].
+	uint64_t tmp = 0U; // output word being assembled; always reassigned before it is stored
+	uint64_t src; // current input value after masking
+	for (int i = 0; i < 16; i++) { // i selects the lane; each iteration packs that lane's 64 rows
+		src = *(in + 16 * 0 + i);
+		src = src & ((1ULL << 29) - 1); // keep only the low 29 bits of the input
+		tmp = src; // row 0 starts output word 0 at bit 0
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 29U;
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 58U; // only the low 6 bits of row 2 land in this word; the rest shift out of the 64-bit value
+		*(out + i) = tmp; // output word 0
+		out += 16; // step to this lane's next output word (consecutive words of a lane are 16 elements apart)
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp = src >> 6U; // row 2 is re-read: its remaining 23 high bits open the next word
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 23U;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp; // output word 1
+		out += 16;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 17U;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 46U;
+		*(out + i) = tmp; // output word 2
+		out += 16;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp = src >> 18U;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 11U;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp; // output word 3
+		out += 16;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 5U;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 34U;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 63U;
+		*(out + i) = tmp; // output word 4
+		out += 16;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp = src >> 1U;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 28U;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 57U;
+		*(out + i) = tmp; // output word 5
+		out += 16;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp = src >> 7U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 22U;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 51U;
+		*(out + i) = tmp; // output word 6
+		out += 16;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp = src >> 13U;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 45U;
+		*(out + i) = tmp; // output word 7
+		out += 16;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp = src >> 19U;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 10U;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 39U;
+		*(out + i) = tmp; // output word 8
+		out += 16;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp = src >> 25U;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 33U;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 62U;
+		*(out + i) = tmp; // output word 9
+		out += 16;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp = src >> 2U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 27U;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp; // output word 10
+		out += 16;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 21U;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 50U;
+		*(out + i) = tmp; // output word 11
+		out += 16;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp = src >> 14U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 15U;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp; // output word 12
+		out += 16;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp = src >> 20U;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 9U;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 38U;
+		*(out + i) = tmp; // output word 13
+		out += 16;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp = src >> 26U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 3U;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 61U;
+		*(out + i) = tmp; // output word 14
+		out += 16;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp = src >> 3U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 26U;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 55U;
+		*(out + i) = tmp; // output word 15
+		out += 16;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp = src >> 9U;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 49U;
+		*(out + i) = tmp; // output word 16
+		out += 16;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp = src >> 15U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 14U;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 43U;
+		*(out + i) = tmp; // output word 17
+		out += 16;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp = src >> 21U;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 37U;
+		*(out + i) = tmp; // output word 18
+		out += 16;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp = src >> 27U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 31U;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp; // output word 19
+		out += 16;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 25U;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 54U;
+		*(out + i) = tmp; // output word 20
+		out += 16;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp = src >> 10U;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 19U;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp; // output word 21
+		out += 16;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 13U;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 42U;
+		*(out + i) = tmp; // output word 22
+		out += 16;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp = src >> 22U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 7U;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 36U;
+		*(out + i) = tmp; // output word 23
+		out += 16;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp = src >> 28U;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 1U;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 30U;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 59U;
+		*(out + i) = tmp; // output word 24
+		out += 16;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp = src >> 5U;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 53U;
+		*(out + i) = tmp; // output word 25
+		out += 16;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp = src >> 11U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 18U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 47U;
+		*(out + i) = tmp; // output word 26
+		out += 16;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp = src >> 17U;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 41U;
+		*(out + i) = tmp; // output word 27
+		out += 16;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp = src >> 23U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 6U;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 29) - 1);
+		tmp |= src << 35U; // row 63 occupies bits 35..63, filling the last word exactly
+		*(out + i) = tmp; // output word 28 (last)
+		out -= 448; // undo the 28 advances of 16 so the next lane starts from the original out
+	}
+}
+void static pack_30bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) { // Bit-packs 64 rows x 16 lanes of inputs (read at in[16 * row + lane], low 30 bits kept) into 30 words per lane, written at out[16 * word + lane].
+	uint64_t tmp = 0U; // output word being assembled; always reassigned before it is stored
+	uint64_t src; // current input value after masking
+	for (int i = 0; i < 16; i++) { // i selects the lane; each iteration packs that lane's 64 rows
+		src = *(in + 16 * 0 + i);
+		src = src & ((1ULL << 30) - 1); // keep only the low 30 bits of the input
+		tmp = src; // row 0 starts output word 0 at bit 0
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 30U;
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 60U; // only the low 4 bits of row 2 land in this word; the rest shift out of the 64-bit value
+		*(out + i) = tmp; // output word 0
+		out += 16; // step to this lane's next output word (consecutive words of a lane are 16 elements apart)
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp = src >> 4U; // row 2 is re-read: its remaining 26 high bits open the next word
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 26U;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp; // output word 1
+		out += 16;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 22U;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp; // output word 2
+		out += 16;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 18U;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp; // output word 3
+		out += 16;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 14U;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp; // output word 4
+		out += 16;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp = src >> 20U;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 10U;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp; // output word 5
+		out += 16;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 6U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 36U;
+		*(out + i) = tmp; // output word 6
+		out += 16;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp = src >> 28U;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 62U;
+		*(out + i) = tmp; // output word 7
+		out += 16;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp = src >> 2U;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 28U;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 58U;
+		*(out + i) = tmp; // output word 8
+		out += 16;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp = src >> 6U;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 54U;
+		*(out + i) = tmp; // output word 9
+		out += 16;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp = src >> 10U;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 50U;
+		*(out + i) = tmp; // output word 10
+		out += 16;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp = src >> 14U;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 46U;
+		*(out + i) = tmp; // output word 11
+		out += 16;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp = src >> 18U;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 42U;
+		*(out + i) = tmp; // output word 12
+		out += 16;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp = src >> 22U;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 38U;
+		*(out + i) = tmp; // output word 13
+		out += 16;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp = src >> 26U;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 34U; // row 31 occupies bits 34..63, ending exactly on the word boundary
+		*(out + i) = tmp; // output word 14
+		out += 16;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp = src; // nothing carried over: row 32 starts word 15 at bit 0 and the shift sequence of rows 0..31 repeats
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 30U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp; // output word 15
+		out += 16;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 26U;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp; // output word 16
+		out += 16;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 22U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp; // output word 17
+		out += 16;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 18U;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp; // output word 18
+		out += 16;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 14U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp; // output word 19
+		out += 16;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp = src >> 20U;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 10U;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp; // output word 20
+		out += 16;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 6U;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 36U;
+		*(out + i) = tmp; // output word 21
+		out += 16;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp = src >> 28U;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 62U;
+		*(out + i) = tmp; // output word 22
+		out += 16;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp = src >> 2U;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 28U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 58U;
+		*(out + i) = tmp; // output word 23
+		out += 16;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp = src >> 6U;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 54U;
+		*(out + i) = tmp; // output word 24
+		out += 16;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp = src >> 10U;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 50U;
+		*(out + i) = tmp; // output word 25
+		out += 16;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp = src >> 14U;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 46U;
+		*(out + i) = tmp; // output word 26
+		out += 16;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp = src >> 18U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 42U;
+		*(out + i) = tmp; // output word 27
+		out += 16;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp = src >> 22U;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 38U;
+		*(out + i) = tmp; // output word 28
+		out += 16;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp = src >> 26U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 30) - 1);
+		tmp |= src << 34U; // row 63 occupies bits 34..63, filling the last word exactly
+		*(out + i) = tmp; // output word 29 (last)
+		out -= 464; // undo the 29 advances of 16 so the next lane starts from the original out
+	}
+}
+void static pack_31bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) { // Bit-packs 64 rows x 16 lanes of inputs (read at in[16 * row + lane], low 31 bits kept) into 31 words per lane, written at out[16 * word + lane].
+	uint64_t tmp = 0U; // output word being assembled; always reassigned before it is stored
+	uint64_t src; // current input value after masking
+	for (int i = 0; i < 16; i++) { // i selects the lane; each iteration packs that lane's 64 rows
+		src = *(in + 16 * 0 + i);
+		src = src & ((1ULL << 31) - 1); // keep only the low 31 bits of the input
+		tmp = src; // row 0 starts output word 0 at bit 0
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 31U;
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 62U; // only the low 2 bits of row 2 land in this word; the rest shift out of the 64-bit value
+		*(out + i) = tmp; // output word 0
+		out += 16; // step to this lane's next output word (consecutive words of a lane are 16 elements apart)
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 2U; // row 2 is re-read: its remaining 29 high bits open the next word
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 29U;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp; // output word 1
+		out += 16;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 27U;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 58U;
+		*(out + i) = tmp; // output word 2
+		out += 16;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 6U;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 25U;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp; // output word 3
+		out += 16;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 23U;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 54U;
+		*(out + i) = tmp; // output word 4
+		out += 16;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 10U;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 21U;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp; // output word 5
+		out += 16;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 19U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 50U;
+		*(out + i) = tmp; // output word 6
+		out += 16;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 14U;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 17U;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp; // output word 7
+		out += 16;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 15U;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 46U;
+		*(out + i) = tmp; // output word 8
+		out += 16;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 18U;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 13U;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp; // output word 9
+		out += 16;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 20U;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 11U;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 42U;
+		*(out + i) = tmp; // output word 10
+		out += 16;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 22U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 9U;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp; // output word 11
+		out += 16;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 7U;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 38U;
+		*(out + i) = tmp; // output word 12
+		out += 16;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 26U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 5U;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 36U;
+		*(out + i) = tmp; // output word 13
+		out += 16;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 28U;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 3U;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 34U;
+		*(out + i) = tmp; // output word 14
+		out += 16;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 30U; // only the top bit of row 30 is carried into this word
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 1U;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 32U;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 63U;
+		*(out + i) = tmp; // output word 15
+		out += 16;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 1U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 30U;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 61U;
+		*(out + i) = tmp; // output word 16
+		out += 16;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 3U;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 28U;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 59U;
+		*(out + i) = tmp; // output word 17
+		out += 16;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 5U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 26U;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 57U;
+		*(out + i) = tmp; // output word 18
+		out += 16;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 7U;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 55U;
+		*(out + i) = tmp; // output word 19
+		out += 16;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 9U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 22U;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 53U;
+		*(out + i) = tmp; // output word 20
+		out += 16;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 11U;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 51U;
+		*(out + i) = tmp; // output word 21
+		out += 16;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 13U;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 18U;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 49U;
+		*(out + i) = tmp; // output word 22
+		out += 16;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 15U;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 47U;
+		*(out + i) = tmp; // output word 23
+		out += 16;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 17U;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 14U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 45U;
+		*(out + i) = tmp; // output word 24
+		out += 16;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 19U;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 43U;
+		*(out + i) = tmp; // output word 25
+		out += 16;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 21U;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 10U;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 41U;
+		*(out + i) = tmp; // output word 26
+		out += 16;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 23U;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 39U;
+		*(out + i) = tmp; // output word 27
+		out += 16;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 25U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 6U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 37U;
+		*(out + i) = tmp; // output word 28
+		out += 16;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 27U;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 35U;
+		*(out + i) = tmp; // output word 29
+		out += 16;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp = src >> 29U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 31) - 1);
+		tmp |= src << 33U; // row 63 occupies bits 33..63, filling the last word exactly
+		*(out + i) = tmp; // output word 30 (last)
+		out -= 480; // undo the 30 advances of 16 so the next lane starts from the original out
+	}
+}
+void static pack_32bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) {
+	uint64_t tmp = 0U;
+	uint64_t src;
+	for (int i = 0; i < 16; i++) {
+		src = *(in + 16 * 0 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp = src;
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp = src;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp = src;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp = src;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp = src;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp = src;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp = src;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp = src;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp = src;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp = src;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp = src;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp = src;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp = src;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp = src;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp = src;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp = src;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp = src;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp = src;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp = src;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp = src;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp = src;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp = src;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp = src;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp = src;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp = src;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp = src;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp = src;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp = src;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp = src;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp = src;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp = src;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp = src;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 32) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out -= 496;
+	}
+}
+void static pack_33bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) { // Bit-packs the low 33 bits of in[16 * r + i] (r = 0..63, i = 0..15) into 33 words per i, stored at out[16 * w + i] (w = 0..32).
+	uint64_t tmp = 0U; // output word currently being assembled
+	uint64_t src; // current input value, masked to 33 bits
+	for (int i = 0; i < 16; i++) { // i selects one of 16 independent columns (lanes)
+		src = *(in + 16 * 0 + i); // value r = 0 of lane i
+		src = src & ((1ULL << 33) - 1); // keep only the low 33 bits
+		tmp = src; // value 0 occupies bits 0..32 of word 0
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 33U; // value 1 starts at bit 33; its top 2 bits fall off the 64-bit word
+		*(out + i) = tmp; // word 0
+		out += 16; // step to the next word of this lane (words are 16 apart)
+		src = *(in + 16 * 1 + i); // reload value 1 to recover the bits that did not fit
+		src = src & ((1ULL << 33) - 1);
+		tmp = src >> 31U; // the 2 leftover high bits of value 1 open word 1
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 35U;
+		*(out + i) = tmp; // word 1
+		out += 16;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp = src >> 29U;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 37U;
+		*(out + i) = tmp; // word 2
+		out += 16;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp = src >> 27U;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 6U;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 39U;
+		*(out + i) = tmp; // word 3
+		out += 16;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp = src >> 25U;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 41U;
+		*(out + i) = tmp; // word 4
+		out += 16;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp = src >> 23U;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 10U;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 43U;
+		*(out + i) = tmp; // word 5
+		out += 16;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp = src >> 21U;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 45U;
+		*(out + i) = tmp; // word 6
+		out += 16;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp = src >> 19U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 14U;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 47U;
+		*(out + i) = tmp; // word 7
+		out += 16;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp = src >> 17U;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 49U;
+		*(out + i) = tmp; // word 8
+		out += 16;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp = src >> 15U;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 18U;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 51U;
+		*(out + i) = tmp; // word 9
+		out += 16;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp = src >> 13U;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 53U;
+		*(out + i) = tmp; // word 10
+		out += 16;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp = src >> 11U;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 22U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 55U;
+		*(out + i) = tmp; // word 11
+		out += 16;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp = src >> 9U;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 57U;
+		*(out + i) = tmp; // word 12
+		out += 16;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp = src >> 7U;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 26U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 59U;
+		*(out + i) = tmp; // word 13
+		out += 16;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp = src >> 5U;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 28U;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 61U;
+		*(out + i) = tmp; // word 14
+		out += 16;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp = src >> 3U;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 30U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 63U;
+		*(out + i) = tmp; // word 15
+		out += 16;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp = src >> 1U;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp; // word 16
+		out += 16;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 1U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 34U;
+		*(out + i) = tmp; // word 17
+		out += 16;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp = src >> 30U;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 3U;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 36U;
+		*(out + i) = tmp; // word 18
+		out += 16;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp = src >> 28U;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 5U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 38U;
+		*(out + i) = tmp; // word 19
+		out += 16;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp = src >> 26U;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 7U;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp; // word 20
+		out += 16;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 9U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 42U;
+		*(out + i) = tmp; // word 21
+		out += 16;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp = src >> 22U;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 11U;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp; // word 22
+		out += 16;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp = src >> 20U;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 13U;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 46U;
+		*(out + i) = tmp; // word 23
+		out += 16;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp = src >> 18U;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 15U;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp; // word 24
+		out += 16;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 17U;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 50U;
+		*(out + i) = tmp; // word 25
+		out += 16;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp = src >> 14U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 19U;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp; // word 26
+		out += 16;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 21U;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 54U;
+		*(out + i) = tmp; // word 27
+		out += 16;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp = src >> 10U;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 23U;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp; // word 28
+		out += 16;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 25U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 58U;
+		*(out + i) = tmp; // word 29
+		out += 16;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp = src >> 6U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 27U;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp; // word 30
+		out += 16;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 29U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 62U;
+		*(out + i) = tmp; // word 31
+		out += 16;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp = src >> 2U;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 33) - 1);
+		tmp |= src << 31U; // value 63 fills bits 31..63, ending exactly on the word boundary
+		*(out + i) = tmp; // word 32 (last)
+		out -= 512; // rewind the 32 steps of 16 taken above so the next lane starts from the original out
+	}
+}
+void static pack_34bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) { // Bit-packs the low 34 bits of in[16 * r + i] (r = 0..63, i = 0..15) into 34 words per i, stored at out[16 * w + i] (w = 0..33).
+	uint64_t tmp = 0U; // output word currently being assembled
+	uint64_t src; // current input value, masked to 34 bits
+	for (int i = 0; i < 16; i++) { // i selects one of 16 independent columns (lanes)
+		src = *(in + 16 * 0 + i); // value r = 0 of lane i
+		src = src & ((1ULL << 34) - 1); // keep only the low 34 bits
+		tmp = src; // value 0 occupies bits 0..33 of word 0
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 34U; // value 1 starts at bit 34; its top 4 bits fall off the 64-bit word
+		*(out + i) = tmp; // word 0
+		out += 16; // step to the next word of this lane (words are 16 apart)
+		src = *(in + 16 * 1 + i); // reload value 1 to recover the bits that did not fit
+		src = src & ((1ULL << 34) - 1);
+		tmp = src >> 30U; // the 4 leftover high bits of value 1 open word 1
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 38U;
+		*(out + i) = tmp; // word 1
+		out += 16;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp = src >> 26U;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 42U;
+		*(out + i) = tmp; // word 2
+		out += 16;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp = src >> 22U;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 46U;
+		*(out + i) = tmp; // word 3
+		out += 16;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp = src >> 18U;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 50U;
+		*(out + i) = tmp; // word 4
+		out += 16;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp = src >> 14U;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 54U;
+		*(out + i) = tmp; // word 5
+		out += 16;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp = src >> 10U;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 58U;
+		*(out + i) = tmp; // word 6
+		out += 16;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp = src >> 6U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 28U;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 62U;
+		*(out + i) = tmp; // word 7
+		out += 16;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp = src >> 2U;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp; // word 8
+		out += 16;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 36U;
+		*(out + i) = tmp; // word 9
+		out += 16;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp = src >> 28U;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 6U;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp; // word 10
+		out += 16;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 10U;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp; // word 11
+		out += 16;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp = src >> 20U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 14U;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp; // word 12
+		out += 16;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 18U;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp; // word 13
+		out += 16;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 22U;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp; // word 14
+		out += 16;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 26U;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp; // word 15
+		out += 16;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 30U; // value 31 fills bits 30..63, ending exactly on the word boundary
+		*(out + i) = tmp; // word 16
+		out += 16;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp = src; // nothing carried over: value 32 starts a fresh word at bit 0, and the shift pattern of words 0..16 repeats
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 34U;
+		*(out + i) = tmp; // word 17
+		out += 16;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp = src >> 30U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 38U;
+		*(out + i) = tmp; // word 18
+		out += 16;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp = src >> 26U;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 42U;
+		*(out + i) = tmp; // word 19
+		out += 16;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp = src >> 22U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 46U;
+		*(out + i) = tmp; // word 20
+		out += 16;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp = src >> 18U;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 50U;
+		*(out + i) = tmp; // word 21
+		out += 16;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp = src >> 14U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 54U;
+		*(out + i) = tmp; // word 22
+		out += 16;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp = src >> 10U;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 58U;
+		*(out + i) = tmp; // word 23
+		out += 16;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp = src >> 6U;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 28U;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 62U;
+		*(out + i) = tmp; // word 24
+		out += 16;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp = src >> 2U;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp; // word 25
+		out += 16;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 36U;
+		*(out + i) = tmp; // word 26
+		out += 16;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp = src >> 28U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 6U;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp; // word 27
+		out += 16;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 10U;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp; // word 28
+		out += 16;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp = src >> 20U;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 14U;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp; // word 29
+		out += 16;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 18U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp; // word 30
+		out += 16;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 22U;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp; // word 31
+		out += 16;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 26U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp; // word 32
+		out += 16;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 34) - 1);
+		tmp |= src << 30U; // value 63 fills bits 30..63, ending exactly on the word boundary
+		*(out + i) = tmp; // word 33 (last)
+		out -= 528; // rewind the 33 steps of 16 taken above so the next lane starts from the original out
+	}
+}
+void static pack_35bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) { // Bit-packs the low 35 bits of in[16 * r + i] (r = 0..63, i = 0..15) into 35 words per i, stored at out[16 * w + i] (w = 0..34).
+	uint64_t tmp = 0U; // output word currently being assembled
+	uint64_t src; // current input value, masked to 35 bits
+	for (int i = 0; i < 16; i++) { // i selects one of 16 independent columns (lanes)
+		src = *(in + 16 * 0 + i); // value r = 0 of lane i
+		src = src & ((1ULL << 35) - 1); // keep only the low 35 bits
+		tmp = src; // value 0 occupies bits 0..34 of word 0
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 35U; // value 1 starts at bit 35; its top 6 bits fall off the 64-bit word
+		*(out + i) = tmp; // word 0
+		out += 16; // step to the next word of this lane (words are 16 apart)
+		src = *(in + 16 * 1 + i); // reload value 1 to recover the bits that did not fit
+		src = src & ((1ULL << 35) - 1);
+		tmp = src >> 29U; // the 6 leftover high bits of value 1 open word 1
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 6U;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 41U;
+		*(out + i) = tmp; // word 1
+		out += 16;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp = src >> 23U;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 47U;
+		*(out + i) = tmp; // word 2
+		out += 16;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp = src >> 17U;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 18U;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 53U;
+		*(out + i) = tmp; // word 3
+		out += 16;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp = src >> 11U;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 59U;
+		*(out + i) = tmp; // word 4
+		out += 16;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp = src >> 5U;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 30U;
+		*(out + i) = tmp; // word 5
+		out += 16;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp = src >> 34U;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 1U;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 36U;
+		*(out + i) = tmp; // word 6
+		out += 16;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp = src >> 28U;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 7U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 42U;
+		*(out + i) = tmp; // word 7
+		out += 16;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp = src >> 22U;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 13U;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp; // word 8
+		out += 16;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 19U;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 54U;
+		*(out + i) = tmp; // word 9
+		out += 16;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp = src >> 10U;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 25U;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp; // word 10
+		out += 16;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 31U;
+		*(out + i) = tmp; // word 11
+		out += 16;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp = src >> 33U;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 37U;
+		*(out + i) = tmp; // word 12
+		out += 16;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp = src >> 27U;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 43U;
+		*(out + i) = tmp; // word 13
+		out += 16;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp = src >> 21U;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 14U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 49U;
+		*(out + i) = tmp; // word 14
+		out += 16;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp = src >> 15U;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 55U;
+		*(out + i) = tmp; // word 15
+		out += 16;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp = src >> 9U;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 26U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 61U;
+		*(out + i) = tmp; // word 16
+		out += 16;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp = src >> 3U;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp; // word 17
+		out += 16;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 3U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 38U;
+		*(out + i) = tmp; // word 18
+		out += 16;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp = src >> 26U;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 9U;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp; // word 19
+		out += 16;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp = src >> 20U;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 15U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 50U;
+		*(out + i) = tmp; // word 20
+		out += 16;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp = src >> 14U;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 21U;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp; // word 21
+		out += 16;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 27U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 62U;
+		*(out + i) = tmp; // word 22
+		out += 16;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp = src >> 2U;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 33U;
+		*(out + i) = tmp; // word 23
+		out += 16;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp = src >> 31U;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 39U;
+		*(out + i) = tmp; // word 24
+		out += 16;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp = src >> 25U;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 10U;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 45U;
+		*(out + i) = tmp; // word 25
+		out += 16;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp = src >> 19U;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 51U;
+		*(out + i) = tmp; // word 26
+		out += 16;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp = src >> 13U;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 22U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 57U;
+		*(out + i) = tmp; // word 27
+		out += 16;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp = src >> 7U;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 28U;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 63U;
+		*(out + i) = tmp; // word 28
+		out += 16;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp = src >> 1U;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 34U;
+		*(out + i) = tmp; // word 29
+		out += 16;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp = src >> 30U;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 5U;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp; // word 30
+		out += 16;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 11U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 46U;
+		*(out + i) = tmp; // word 31
+		out += 16;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp = src >> 18U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 17U;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp; // word 32
+		out += 16;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 23U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 58U;
+		*(out + i) = tmp; // word 33
+		out += 16;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp = src >> 6U;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 35) - 1);
+		tmp |= src << 29U; // value 63 fills bits 29..63, ending exactly on the word boundary
+		*(out + i) = tmp; // word 34 (last)
+		out -= 544; // rewind the 34 steps of 16 taken above so the next lane starts from the original out
+	}
+}
+void static pack_36bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) { // Bit-packs the low 36 bits of in[16 * k + i] (k = 0..63) into 36 words out[16 * w + i] (w = 0..35), LSB-first, for each of 16 lanes i.
+	uint64_t tmp = 0U; // output word currently being assembled
+	uint64_t src; // current input value; masked to its low 36 bits before use
+	for (int i = 0; i < 16; i++) { // one pass per lane; lane i only touches in[16 * k + i] and out[16 * w + i]
+		src = *(in + 16 * 0 + i); // value 0 of this lane
+		src = src & ((1ULL << 36) - 1); // keep only the low 36 bits
+		tmp = src; // value 0 occupies bits 0..35 of word 0
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 36U; // low 28 bits of value 1 fill bits 36..63; the shift drops its upper 8 bits
+		*(out + i) = tmp; // output word 0
+		out += 16; // step to the next output row (16 lanes per row)
+		src = *(in + 16 * 1 + i); // re-read value 1: it straddles words 0 and 1
+		src = src & ((1ULL << 36) - 1);
+		tmp = src >> 28U; // remaining upper 8 bits of value 1 start word 1
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp; // output word 1
+		out += 16;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp = src >> 20U;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp; // output word 2
+		out += 16;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp; // output word 3
+		out += 16;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp; // output word 4
+		out += 16;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp; // output word 5
+		out += 16;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp; // output word 6
+		out += 16;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp; // output word 7
+		out += 16;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 28U; // value 15 ends exactly at bit 63 (28 + 36 = 64)
+		*(out + i) = tmp; // output word 8
+		out += 16;
+		src = *(in + 16 * 16 + i); // 16 values * 36 bits = 9 full words, so value 16 starts on a word boundary
+		src = src & ((1ULL << 36) - 1);
+		tmp = src;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 36U;
+		*(out + i) = tmp; // output word 9
+		out += 16;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp = src >> 28U;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp; // output word 10
+		out += 16;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp = src >> 20U;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp; // output word 11
+		out += 16;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp; // output word 12
+		out += 16;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp; // output word 13
+		out += 16;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp; // output word 14
+		out += 16;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp; // output word 15
+		out += 16;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp; // output word 16
+		out += 16;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp; // output word 17
+		out += 16;
+		src = *(in + 16 * 32 + i); // value 32 starts on a word boundary again
+		src = src & ((1ULL << 36) - 1);
+		tmp = src;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 36U;
+		*(out + i) = tmp; // output word 18
+		out += 16;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp = src >> 28U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp; // output word 19
+		out += 16;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp = src >> 20U;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp; // output word 20
+		out += 16;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp; // output word 21
+		out += 16;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp; // output word 22
+		out += 16;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp; // output word 23
+		out += 16;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp; // output word 24
+		out += 16;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp; // output word 25
+		out += 16;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp; // output word 26
+		out += 16;
+		src = *(in + 16 * 48 + i); // value 48 starts on a word boundary again
+		src = src & ((1ULL << 36) - 1);
+		tmp = src;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 36U;
+		*(out + i) = tmp; // output word 27
+		out += 16;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp = src >> 28U;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp; // output word 28
+		out += 16;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp = src >> 20U;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp; // output word 29
+		out += 16;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp; // output word 30
+		out += 16;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp; // output word 31
+		out += 16;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp; // output word 32
+		out += 16;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp; // output word 33
+		out += 16;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp; // output word 34
+		out += 16;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 36) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp; // output word 35 (last)
+		out -= 560; // undo the 35 row advances (35 * 16) so out is back at its base for the next lane
+	}
+}
+void static pack_37bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) { // Bit-packs the low 37 bits of in[16 * k + i] (k = 0..63) into 37 words out[16 * w + i] (w = 0..36), LSB-first, for each of 16 lanes i.
+	uint64_t tmp = 0U; // output word currently being assembled
+	uint64_t src; // current input value; masked to its low 37 bits before use
+	for (int i = 0; i < 16; i++) { // one pass per lane; lane i only touches in[16 * k + i] and out[16 * w + i]
+		src = *(in + 16 * 0 + i); // value 0 of this lane
+		src = src & ((1ULL << 37) - 1); // keep only the low 37 bits
+		tmp = src; // value 0 occupies bits 0..36 of word 0
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 37U; // low 27 bits of value 1 fill bits 37..63; the shift drops its upper 10 bits
+		*(out + i) = tmp; // output word 0
+		out += 16; // step to the next output row (16 lanes per row)
+		src = *(in + 16 * 1 + i); // re-read value 1: it straddles words 0 and 1
+		src = src & ((1ULL << 37) - 1);
+		tmp = src >> 27U; // remaining upper 10 bits of value 1 start word 1
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 10U;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 47U;
+		*(out + i) = tmp; // output word 1
+		out += 16;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp = src >> 17U;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 57U;
+		*(out + i) = tmp; // output word 2
+		out += 16;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp = src >> 7U;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 30U;
+		*(out + i) = tmp; // output word 3
+		out += 16;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp = src >> 34U;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 3U;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp; // output word 4
+		out += 16;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 13U;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 50U;
+		*(out + i) = tmp; // output word 5
+		out += 16;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp = src >> 14U;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 23U;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp; // output word 6
+		out += 16;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 33U;
+		*(out + i) = tmp; // output word 7
+		out += 16;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp = src >> 31U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 6U;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 43U;
+		*(out + i) = tmp; // output word 8
+		out += 16;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp = src >> 21U;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 53U;
+		*(out + i) = tmp; // output word 9
+		out += 16;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp = src >> 11U;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 26U;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 63U; // only bit 0 of value 19 fits in this word
+		*(out + i) = tmp; // output word 10
+		out += 16;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp = src >> 1U;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 36U;
+		*(out + i) = tmp; // output word 11
+		out += 16;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp = src >> 28U;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 9U;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 46U;
+		*(out + i) = tmp; // output word 12
+		out += 16;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp = src >> 18U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 19U;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp; // output word 13
+		out += 16;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 29U;
+		*(out + i) = tmp; // output word 14
+		out += 16;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp = src >> 35U;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 39U;
+		*(out + i) = tmp; // output word 15
+		out += 16;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp = src >> 25U;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 49U;
+		*(out + i) = tmp; // output word 16
+		out += 16;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp = src >> 15U;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 22U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 59U;
+		*(out + i) = tmp; // output word 17
+		out += 16;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp = src >> 5U;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp; // output word 18
+		out += 16;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 5U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 42U;
+		*(out + i) = tmp; // output word 19
+		out += 16;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp = src >> 22U;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 15U;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp; // output word 20
+		out += 16;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 25U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 62U;
+		*(out + i) = tmp; // output word 21
+		out += 16;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp = src >> 2U;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 35U;
+		*(out + i) = tmp; // output word 22
+		out += 16;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp = src >> 29U;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 45U;
+		*(out + i) = tmp; // output word 23
+		out += 16;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp = src >> 19U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 18U;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 55U;
+		*(out + i) = tmp; // output word 24
+		out += 16;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp = src >> 9U;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp; // output word 25
+		out += 16;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp = src >> 36U;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 1U;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 38U;
+		*(out + i) = tmp; // output word 26
+		out += 16;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp = src >> 26U;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 11U;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp; // output word 27
+		out += 16;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 21U;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 58U;
+		*(out + i) = tmp; // output word 28
+		out += 16;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp = src >> 6U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 31U;
+		*(out + i) = tmp; // output word 29
+		out += 16;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp = src >> 33U;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 41U;
+		*(out + i) = tmp; // output word 30
+		out += 16;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp = src >> 23U;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 14U;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 51U;
+		*(out + i) = tmp; // output word 31
+		out += 16;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp = src >> 13U;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 61U;
+		*(out + i) = tmp; // output word 32
+		out += 16;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp = src >> 3U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 34U;
+		*(out + i) = tmp; // output word 33
+		out += 16;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp = src >> 30U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 7U;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp; // output word 34
+		out += 16;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp = src >> 20U;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 17U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 54U;
+		*(out + i) = tmp; // output word 35
+		out += 16;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp = src >> 10U;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 37) - 1);
+		tmp |= src << 27U; // value 63 ends exactly at bit 63 (27 + 37 = 64)
+		*(out + i) = tmp; // output word 36 (last)
+		out -= 576; // undo the 36 row advances (36 * 16) so out is back at its base for the next lane
+	}
+}
+void static pack_38bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) { // Bit-packs the low 38 bits of in[16 * k + i] (k = 0..63) into 38 words out[16 * w + i] (w = 0..37), LSB-first, for each of 16 lanes i.
+	uint64_t tmp = 0U; // output word currently being assembled
+	uint64_t src; // current input value; masked to its low 38 bits before use
+	for (int i = 0; i < 16; i++) { // one pass per lane; lane i only touches in[16 * k + i] and out[16 * w + i]
+		src = *(in + 16 * 0 + i); // value 0 of this lane
+		src = src & ((1ULL << 38) - 1); // keep only the low 38 bits
+		tmp = src; // value 0 occupies bits 0..37 of word 0
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 38U; // low 26 bits of value 1 fill bits 38..63; the shift drops its upper 12 bits
+		*(out + i) = tmp; // output word 0
+		out += 16; // step to the next output row (16 lanes per row)
+		src = *(in + 16 * 1 + i); // re-read value 1: it straddles words 0 and 1
+		src = src & ((1ULL << 38) - 1);
+		tmp = src >> 26U; // remaining upper 12 bits of value 1 start word 1
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 50U;
+		*(out + i) = tmp; // output word 1
+		out += 16;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp = src >> 14U;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 62U;
+		*(out + i) = tmp; // output word 2
+		out += 16;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp = src >> 2U;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 36U;
+		*(out + i) = tmp; // output word 3
+		out += 16;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp = src >> 28U;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 10U;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp; // output word 4
+		out += 16;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 22U;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp; // output word 5
+		out += 16;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 34U;
+		*(out + i) = tmp; // output word 6
+		out += 16;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp = src >> 30U;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 46U;
+		*(out + i) = tmp; // output word 7
+		out += 16;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp = src >> 18U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 58U;
+		*(out + i) = tmp; // output word 8
+		out += 16;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp = src >> 6U;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp; // output word 9
+		out += 16;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 6U;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp; // output word 10
+		out += 16;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp = src >> 20U;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 18U;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp; // output word 11
+		out += 16;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 30U;
+		*(out + i) = tmp; // output word 12
+		out += 16;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp = src >> 34U;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 42U;
+		*(out + i) = tmp; // output word 13
+		out += 16;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp = src >> 22U;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 54U;
+		*(out + i) = tmp; // output word 14
+		out += 16;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp = src >> 10U;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp; // output word 15
+		out += 16;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp = src >> 36U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp; // output word 16
+		out += 16;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 14U;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp; // output word 17
+		out += 16;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 26U; // value 31 ends exactly at bit 63 (26 + 38 = 64)
+		*(out + i) = tmp; // output word 18
+		out += 16;
+		src = *(in + 16 * 32 + i); // 32 values * 38 bits = 19 full words, so value 32 starts on a word boundary
+		src = src & ((1ULL << 38) - 1);
+		tmp = src;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 38U;
+		*(out + i) = tmp; // output word 19
+		out += 16;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp = src >> 26U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 50U;
+		*(out + i) = tmp; // output word 20
+		out += 16;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp = src >> 14U;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 62U;
+		*(out + i) = tmp; // output word 21
+		out += 16;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp = src >> 2U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 36U;
+		*(out + i) = tmp; // output word 22
+		out += 16;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp = src >> 28U;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 10U;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp; // output word 23
+		out += 16;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 22U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp; // output word 24
+		out += 16;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 34U;
+		*(out + i) = tmp; // output word 25
+		out += 16;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp = src >> 30U;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 46U;
+		*(out + i) = tmp; // output word 26
+		out += 16;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp = src >> 18U;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 58U;
+		*(out + i) = tmp; // output word 27
+		out += 16;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp = src >> 6U;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp; // output word 28
+		out += 16;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 6U;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp; // output word 29
+		out += 16;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp = src >> 20U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 18U;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp; // output word 30
+		out += 16;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 30U;
+		*(out + i) = tmp; // output word 31
+		out += 16;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp = src >> 34U;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 42U;
+		*(out + i) = tmp; // output word 32
+		out += 16;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp = src >> 22U;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 54U;
+		*(out + i) = tmp; // output word 33
+		out += 16;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp = src >> 10U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp; // output word 34
+		out += 16;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp = src >> 36U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp; // output word 35
+		out += 16;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 14U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp; // output word 36
+		out += 16;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 38) - 1);
+		tmp |= src << 26U;
+		*(out + i) = tmp; // output word 37 (last)
+		out -= 592; // undo the 37 row advances (37 * 16) so out is back at its base for the next lane
+	}
+}
+void static pack_39bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) {
+	uint64_t tmp = 0U;
+	uint64_t src;
+	for (int i = 0; i < 16; i++) {
+		src = *(in + 16 * 0 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp = src;
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 39U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp = src >> 25U;
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 14U;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 53U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp = src >> 11U;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp = src >> 36U;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 3U;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 42U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp = src >> 22U;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 17U;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 31U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp = src >> 33U;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 6U;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 45U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp = src >> 19U;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 59U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp = src >> 5U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 34U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp = src >> 30U;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 9U;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 23U;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 62U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp = src >> 2U;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 37U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp = src >> 27U;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 51U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp = src >> 13U;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 26U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp = src >> 38U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 1U;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 15U;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 54U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp = src >> 10U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 29U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp = src >> 35U;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 43U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp = src >> 21U;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 18U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 57U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp = src >> 7U;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 7U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 46U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp = src >> 18U;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 21U;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 35U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp = src >> 29U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 10U;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 49U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp = src >> 15U;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 24U;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 63U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp = src >> 1U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 38U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp = src >> 26U;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 13U;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 27U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp = src >> 37U;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 41U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp = src >> 23U;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 55U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp = src >> 9U;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 30U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp = src >> 34U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 5U;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp = src >> 20U;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 19U;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 58U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp = src >> 6U;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 33U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp = src >> 31U;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 47U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp = src >> 17U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 22U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 61U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp = src >> 3U;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 36U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp = src >> 28U;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 11U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 50U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp = src >> 14U;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 39) - 1);
+		tmp |= src << 25U;
+		*(out + i) = tmp;
+		out -= 608;
+	}
+}
+void static pack_40bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) { // Packs the low 40 bits of 64 values per stream, for 16 interleaved streams, into 40 words per stream; reads in[0..1023], writes out[0..639].
+	uint64_t tmp = 0U; // output word under construction; this initial 0 is overwritten before it is ever read
+	uint64_t src; // current input value, masked to its low 40 bits before use
+	for (int i = 0; i < 16; i++) { // i selects one of 16 interleaved streams: value k of stream i is in[16 * k + i]
+		src = *(in + 16 * 0 + i); // values 0..7 of this stream fill exactly 5 output words (8 * 40 = 5 * 64 bits)
+		src = src & ((1ULL << 40) - 1); // keep only the low 40 bits; higher input bits are discarded
+		tmp = src; // value 0 occupies bits 0..39
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp |= src << 40U; // low 24 bits of value 1 occupy bits 40..63; its upper bits are shifted out here
+		*(out + i) = tmp; // emit the finished word into this stream's slot
+		out += 16; // step to this stream's next output word (consecutive words of one stream are 16 apart)
+		src = *(in + 16 * 1 + i); // reload value 1 to pick up the bits that did not fit in the previous word
+		src = src & ((1ULL << 40) - 1);
+		tmp = src >> 24U; // high 16 bits of value 1 start the next word
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp |= src << 16U; // value 2 occupies bits 16..55
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp |= src << 56U; // low 8 bits of value 3 occupy bits 56..63
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp = src >> 8U; // high 32 bits of value 3
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp |= src << 32U; // low 32 bits of value 4
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp = src >> 32U; // high 8 bits of value 4
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp |= src << 8U; // value 5 occupies bits 8..47
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp |= src << 48U; // low 16 bits of value 6
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp = src >> 16U; // high 24 bits of value 6
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp |= src << 24U; // value 7 occupies bits 24..63, ending exactly on the word boundary
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 8 + i); // values 8..15: same 5-word layout as values 0..7
+		src = src & ((1ULL << 40) - 1);
+		tmp = src;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 16 + i); // values 16..23: same 5-word layout as values 0..7
+		src = src & ((1ULL << 40) - 1);
+		tmp = src;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 24 + i); // values 24..31: same 5-word layout as values 0..7
+		src = src & ((1ULL << 40) - 1);
+		tmp = src;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 32 + i); // values 32..39: same 5-word layout as values 0..7
+		src = src & ((1ULL << 40) - 1);
+		tmp = src;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 40 + i); // values 40..47: same 5-word layout as values 0..7
+		src = src & ((1ULL << 40) - 1);
+		tmp = src;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 48 + i); // values 48..55: same 5-word layout as values 0..7
+		src = src & ((1ULL << 40) - 1);
+		tmp = src;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 56 + i); // values 56..63: same 5-word layout as values 0..7
+		src = src & ((1ULL << 40) - 1);
+		tmp = src;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 40) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp; // 40th and last word of this stream
+		out -= 624; // undo the 39 advances of 16 made above (39 * 16) so the next stream starts from the original out
+	}
+}
+void static pack_41bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) { // Packs the low 41 bits of 64 values per stream, for 16 interleaved streams, into 41 words per stream; reads in[0..1023], writes out[0..655].
+	uint64_t tmp = 0U; // output word under construction; this initial 0 is overwritten before it is ever read
+	uint64_t src; // current input value, masked to its low 41 bits before use
+	for (int i = 0; i < 16; i++) { // i selects one of 16 interleaved streams: value k of stream i is in[16 * k + i]
+		src = *(in + 16 * 0 + i);
+		src = src & ((1ULL << 41) - 1); // keep only the low 41 bits; higher input bits are discarded
+		tmp = src; // word 0: value 0 occupies bits 0..40
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 41U; // low 23 bits of value 1 occupy bits 41..63; its upper bits are shifted out here
+		*(out + i) = tmp; // emit the finished word into this stream's slot
+		out += 16; // step to this stream's next output word (consecutive words of one stream are 16 apart)
+		src = *(in + 16 * 1 + i); // reload value 1 to pick up the bits that did not fit in the previous word
+		src = src & ((1ULL << 41) - 1);
+		tmp = src >> 23U; // word 1 begins with the 18 high bits of value 1
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 18U; // value 2 occupies bits 18..58
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 59U; // low 5 bits of value 3 occupy bits 59..63
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp = src >> 5U; // word 2 begins with the 36 high bits of value 3
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 36U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp = src >> 28U; // word 3 begins with the 13 high bits of value 4
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 13U;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 54U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp = src >> 10U; // word 4 begins with the 31 high bits of value 6
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 31U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp = src >> 33U; // word 5 begins with the 8 high bits of value 7
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 49U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp = src >> 15U; // word 6 begins with the 26 high bits of value 9
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 26U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp = src >> 38U; // word 7 begins with the 3 high bits of value 10
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 3U;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp = src >> 20U; // word 8 begins with the 21 high bits of value 12
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 21U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 62U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp = src >> 2U; // word 9 begins with the 39 high bits of value 14
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 39U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp = src >> 25U; // word 10 begins with the 16 high bits of value 15
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 57U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp = src >> 7U; // word 11 begins with the 34 high bits of value 17
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 34U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp = src >> 30U; // word 12 begins with the 11 high bits of value 18
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 11U;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp = src >> 12U; // word 13 begins with the 29 high bits of value 20
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 29U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp = src >> 35U; // word 14 begins with the 6 high bits of value 21
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 6U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 47U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp = src >> 17U; // word 15 begins with the 24 high bits of value 23
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp = src >> 40U; // word 16 begins with the single high bit of value 24
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 1U;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 42U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp = src >> 22U; // word 17 begins with the 19 high bits of value 26
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 19U;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp = src >> 4U; // word 18 begins with the 37 high bits of value 28
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 37U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp = src >> 27U; // word 19 begins with the 14 high bits of value 29
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 14U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 55U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp = src >> 9U; // word 20 begins with the 32 high bits of value 31
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp = src >> 32U; // word 21 begins with the 9 high bits of value 32
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 9U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 50U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp = src >> 14U; // word 22 begins with the 27 high bits of value 34
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 27U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp = src >> 37U; // word 23 begins with the 4 high bits of value 35
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 45U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp = src >> 19U; // word 24 begins with the 22 high bits of value 37
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 22U;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 63U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp = src >> 1U; // word 25 begins with the 40 high bits of value 39
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp = src >> 24U; // word 26 begins with the 17 high bits of value 40
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 17U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 58U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp = src >> 6U; // word 27 begins with the 35 high bits of value 42
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 35U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp = src >> 29U; // word 28 begins with the 12 high bits of value 43
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 53U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp = src >> 11U; // word 29 begins with the 30 high bits of value 45
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 30U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp = src >> 34U; // word 30 begins with the 7 high bits of value 46
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 7U;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp = src >> 16U; // word 31 begins with the 25 high bits of value 48
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 25U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp = src >> 39U; // word 32 begins with the 2 high bits of value 49
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 43U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp = src >> 21U; // word 33 begins with the 20 high bits of value 51
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 61U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp = src >> 3U; // word 34 begins with the 38 high bits of value 53
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 38U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp = src >> 26U; // word 35 begins with the 15 high bits of value 54
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 15U;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp = src >> 8U; // word 36 begins with the 33 high bits of value 56
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 33U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp = src >> 31U; // word 37 begins with the 10 high bits of value 57
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 10U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 51U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp = src >> 13U; // word 38 begins with the 28 high bits of value 59
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp = src >> 36U; // word 39 begins with the 5 high bits of value 60
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 5U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 46U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp = src >> 18U; // word 40 begins with the 23 high bits of value 62
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 41) - 1);
+		tmp |= src << 23U; // value 63 occupies bits 23..63, ending exactly on the word boundary
+		*(out + i) = tmp; // 41st and last word of this stream
+		out -= 640; // undo the 40 advances of 16 made above (40 * 16) so the next stream starts from the original out
+	}
+}
+void static pack_42bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) {
+	uint64_t tmp = 0U;
+	uint64_t src;
+	for (int i = 0; i < 16; i++) {
+		src = *(in + 16 * 0 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp = src;
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 42U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp = src >> 22U;
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 62U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp = src >> 2U;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 18U;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 38U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp = src >> 26U;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 58U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp = src >> 6U;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 36U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp = src >> 28U;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 14U;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 34U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp = src >> 30U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 54U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp = src >> 10U;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 10U;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 30U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp = src >> 34U;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 50U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp = src >> 14U;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp = src >> 36U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 6U;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 26U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp = src >> 38U;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 46U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp = src >> 18U;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp = src >> 40U;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp = src >> 20U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 22U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp = src;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 42U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp = src >> 22U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 20U;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 62U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp = src >> 2U;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 18U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 38U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp = src >> 26U;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 58U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp = src >> 6U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 36U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp = src >> 28U;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 14U;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 34U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp = src >> 30U;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 54U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp = src >> 10U;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 10U;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 30U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp = src >> 34U;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 50U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp = src >> 14U;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp = src >> 36U;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 6U;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 26U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp = src >> 38U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 46U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp = src >> 18U;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp = src >> 40U;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp = src >> 20U;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 42) - 1);
+		tmp |= src << 22U;
+		*(out + i) = tmp;
+		out -= 656;
+	}
+}
+/*
+ * Bit-packs 1024 unsigned values down to 43 bits each.
+ *
+ * The input is read as 64 rows of 16 lanes (element `16 * row + lane`).
+ * For each lane, the low 43 bits of its 64 values are concatenated
+ * least-significant-bit first into 43 consecutive 64-bit words, which are
+ * stored at `out[16 * word + lane]`. Bits above bit 42 of every input value
+ * are discarded.
+ *
+ * in:  1024 readable uint64_t values.
+ * out: 43 * 16 = 688 writable uint64_t words; declared __restrict, so it
+ *      must not overlap `in`.
+ */
+static void pack_43bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) {
+	const unsigned bit_width = 43U;
+	const unsigned lane_count = 16U;
+	const unsigned row_count = 64U;
+	const uint64_t value_mask = (1ULL << bit_width) - 1;
+
+	for (unsigned lane = 0; lane < lane_count; ++lane) {
+		uint64_t word = 0;       /* output word currently being assembled */
+		unsigned filled = 0;     /* number of low bits of `word` already occupied */
+		unsigned word_index = 0; /* next output word to write for this lane */
+
+		for (unsigned row = 0; row < row_count; ++row) {
+			const uint64_t value = in[lane_count * row + lane] & value_mask;
+
+			/* Bits shifted past bit 63 are dropped here and re-added to the next word below. */
+			word |= value << filled;
+			filled += bit_width;
+			if (filled < 64U) {
+				continue;
+			}
+
+			/* The word is complete: store it and carry over the bits of `value` that did not fit. */
+			out[lane_count * word_index + lane] = word;
+			++word_index;
+			filled -= 64U;
+			/* When filled == 0 the shift equals bit_width and yields 0, i.e. a fresh empty word. */
+			word = value >> (bit_width - filled);
+		}
+	}
+}
+/*
+ * Bit-packs 1024 unsigned values down to 44 bits each.
+ *
+ * The input is read as 64 rows of 16 lanes (element `16 * row + lane`).
+ * For each lane, the low 44 bits of its 64 values are concatenated
+ * least-significant-bit first into 44 consecutive 64-bit words, which are
+ * stored at `out[16 * word + lane]`. Bits above bit 43 of every input value
+ * are discarded.
+ *
+ * in:  1024 readable uint64_t values.
+ * out: 44 * 16 = 704 writable uint64_t words; declared __restrict, so it
+ *      must not overlap `in`.
+ */
+static void pack_44bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) {
+	const unsigned bit_width = 44U;
+	const unsigned lane_count = 16U;
+	const unsigned row_count = 64U;
+	const uint64_t value_mask = (1ULL << bit_width) - 1;
+
+	for (unsigned lane = 0; lane < lane_count; ++lane) {
+		uint64_t word = 0;       /* output word currently being assembled */
+		unsigned filled = 0;     /* number of low bits of `word` already occupied */
+		unsigned word_index = 0; /* next output word to write for this lane */
+
+		for (unsigned row = 0; row < row_count; ++row) {
+			const uint64_t value = in[lane_count * row + lane] & value_mask;
+
+			/* Bits shifted past bit 63 are dropped here and re-added to the next word below. */
+			word |= value << filled;
+			filled += bit_width;
+			if (filled < 64U) {
+				continue;
+			}
+
+			/* The word is complete: store it and carry over the bits of `value` that did not fit. */
+			out[lane_count * word_index + lane] = word;
+			++word_index;
+			filled -= 64U;
+			/* Every 16 rows filled returns to 0; the shift then equals bit_width and yields 0. */
+			word = value >> (bit_width - filled);
+		}
+	}
+}
+void static pack_45bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) {
+	uint64_t tmp = 0U;
+	uint64_t src;
+	for (int i = 0; i < 16; i++) {
+		src = *(in + 16 * 0 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp = src;
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 45U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp = src >> 19U;
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 26U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp = src >> 38U;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 7U;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 33U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp = src >> 31U;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 14U;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 59U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp = src >> 5U;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 21U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp = src >> 43U;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 47U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp = src >> 17U;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp = src >> 36U;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 9U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 54U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp = src >> 10U;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 35U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp = src >> 29U;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 61U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp = src >> 3U;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 42U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp = src >> 22U;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 23U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp = src >> 41U;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 49U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp = src >> 15U;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 30U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp = src >> 34U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 11U;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 37U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp = src >> 27U;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 18U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 63U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp = src >> 1U;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp = src >> 20U;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 25U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp = src >> 39U;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 6U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 51U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp = src >> 13U;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 13U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 58U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp = src >> 6U;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 39U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp = src >> 25U;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp = src >> 44U;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 1U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 46U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp = src >> 18U;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 27U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp = src >> 37U;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 53U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp = src >> 11U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 34U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp = src >> 30U;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 15U;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 41U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp = src >> 23U;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 22U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp = src >> 42U;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 3U;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 29U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp = src >> 35U;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 10U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 55U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp = src >> 9U;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 36U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp = src >> 28U;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 17U;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 62U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp = src >> 2U;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 43U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp = src >> 21U;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp = src >> 40U;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 5U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 50U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp = src >> 14U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 31U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp = src >> 33U;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 57U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp = src >> 7U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 38U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp = src >> 26U;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 45) - 1);
+		tmp |= src << 19U;
+		*(out + i) = tmp;
+		out -= 704;
+	}
+}
+void static pack_46bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) { // Bit-packs the low 46 bits of 64 values per lane, for 16 interleaved lanes: value k of lane i is read from in[16 * k + i] and packed word j of lane i is written to out[16 * j + i]; reads in[0..1023] and writes out[0..735] (46 words per lane).
+	uint64_t tmp = 0U; // output word being assembled; this initial 0 is overwritten before it is ever read
+	uint64_t src; // current input value, masked down to 46 bits before use
+	for (int i = 0; i < 16; i++) { // one pass per lane i
+		src = *(in + 16 * 0 + i);
+		src = src & ((1ULL << 46) - 1); // keep only the low 46 bits of the value
+		tmp = src;
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 46U; // value 1 is placed at bit 46; only its low 18 bits fit, the upper 28 are shifted out of the 64-bit word
+		*(out + i) = tmp; // store the finished word for lane i
+		out += 16; // advance to the next output word of this lane (consecutive words of a lane are 16 elements apart)
+		src = *(in + 16 * 1 + i); // reload value 1: its remaining high bits open the next word
+		src = src & ((1ULL << 46) - 1);
+		tmp = src >> 18U; // drop the 18 low bits already stored in the previous word, keeping the remaining 28
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp = src >> 36U;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 10U;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 38U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp = src >> 26U;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp = src >> 44U;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 30U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp = src >> 34U;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 58U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp = src >> 6U;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 22U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp = src >> 42U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 50U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp = src >> 14U;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 14U;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 42U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp = src >> 22U;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp = src >> 40U;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 6U;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 34U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp = src >> 30U;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 62U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp = src >> 2U;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp = src >> 20U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 26U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp = src >> 38U;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 54U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp = src >> 10U;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 36U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp = src >> 28U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 18U; // value 31 ends exactly at bit 64 (18 + 46), so nothing carries into the next word
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp = src; // value 32 starts a fresh word at bit 0; the shift pattern used for values 0..31 repeats for values 32..63
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 46U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp = src >> 18U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp = src >> 36U;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 10U;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 38U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp = src >> 26U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp = src >> 44U;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 30U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp = src >> 34U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 58U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp = src >> 6U;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 22U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp = src >> 42U;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 50U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp = src >> 14U;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 14U;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 42U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp = src >> 22U;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp = src >> 40U;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 6U;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 34U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp = src >> 30U;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 62U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp = src >> 2U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp = src >> 20U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 26U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp = src >> 38U;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 54U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp = src >> 10U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 36U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp = src >> 28U;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 46) - 1);
+		tmp |= src << 18U;
+		*(out + i) = tmp; // 46th and last packed word of lane i
+		out -= 720; // undo the 45 advances of 16 (45 * 16 = 720) so the next lane starts from the original out
+	}
+}
+void static pack_47bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) { // Bit-packs the low 47 bits of 64 values per lane, for 16 interleaved lanes: value k of lane i is read from in[16 * k + i] and packed word j of lane i is written to out[16 * j + i]; reads in[0..1023] and writes out[0..751] (47 words per lane).
+	uint64_t tmp = 0U; // output word being assembled; this initial 0 is overwritten before it is ever read
+	uint64_t src; // current input value, masked down to 47 bits before use
+	for (int i = 0; i < 16; i++) { // one pass per lane i
+		src = *(in + 16 * 0 + i);
+		src = src & ((1ULL << 47) - 1); // keep only the low 47 bits of the value
+		tmp = src;
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 47U; // value 1 is placed at bit 47; only its low 17 bits fit, the upper 30 are shifted out of the 64-bit word
+		*(out + i) = tmp; // store the finished word for lane i
+		out += 16; // advance to the next output word of this lane (consecutive words of a lane are 16 elements apart)
+		src = *(in + 16 * 1 + i); // reload value 1: its remaining high bits open the next word
+		src = src & ((1ULL << 47) - 1);
+		tmp = src >> 17U; // drop the 17 low bits already stored in the previous word, keeping the remaining 30
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 30U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp = src >> 34U;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 13U;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 43U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp = src >> 21U;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 26U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp = src >> 38U;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 9U;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 39U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp = src >> 25U;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 22U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp = src >> 42U;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 5U;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 35U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp = src >> 29U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 18U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp = src >> 46U;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 1U;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 31U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp = src >> 33U;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 14U;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 61U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp = src >> 3U;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp = src >> 20U;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 27U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp = src >> 37U;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 10U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 57U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp = src >> 7U;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 23U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp = src >> 41U;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 6U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 53U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp = src >> 11U;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 36U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp = src >> 28U;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 19U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp = src >> 45U;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 49U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp = src >> 15U;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 15U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 62U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp = src >> 2U;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 45U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp = src >> 19U;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp = src >> 36U;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 11U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 58U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp = src >> 6U;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 41U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp = src >> 23U;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp = src >> 40U;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 7U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 54U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp = src >> 10U;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 37U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp = src >> 27U;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp = src >> 44U;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 3U;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 50U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp = src >> 14U;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 33U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp = src >> 31U;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 16U;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 63U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp = src >> 1U;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 46U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp = src >> 18U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 29U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp = src >> 35U;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 59U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp = src >> 5U;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 42U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp = src >> 22U;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 25U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp = src >> 39U;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 55U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp = src >> 9U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 38U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp = src >> 26U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 21U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp = src >> 43U;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 51U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp = src >> 13U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 34U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp = src >> 30U;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 47) - 1);
+		tmp |= src << 17U; // value 63 ends exactly at bit 64 (17 + 47), completing the lane
+		*(out + i) = tmp; // 47th and last packed word of lane i
+		out -= 736; // undo the 46 advances of 16 (46 * 16 = 736) so the next lane starts from the original out
+	}
+}
+void static pack_48bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) {
+	uint64_t tmp = 0U;
+	uint64_t src;
+	for (int i = 0; i < 16; i++) {
+		src = *(in + 16 * 0 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp = src;
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp = src;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp = src;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp = src;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp = src;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp = src;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp = src;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp = src;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp = src;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp = src;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp = src;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp = src;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp = src;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp = src;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp = src;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp = src;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 48) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out -= 752;
+	}
+}
+void static pack_49bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) { // Bit-packs 64 values per lane at 49 bits each, for 16 interleaved lanes: reads in[16*k + i] (k = 0..63) and writes 49 words out[16*w + i] (w = 0..48).
+	uint64_t tmp = 0U; // output word under construction (the initial 0 is overwritten before it is ever read)
+	uint64_t src; // current input value after masking to its low 49 bits
+	for (int i = 0; i < 16; i++) { // i = lane index; every iteration packs one lane independently
+		src = *(in + 16 * 0 + i); // value 0 of lane i
+		src = src & ((1ULL << 49) - 1); // keep only the low 49 bits
+		tmp = src; // value 0 occupies bits 0..48 of word 0
+		src = *(in + 16 * 1 + i); // value 1
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 49U; // only its low 15 bits fit (bits 49..63); the rest is shifted out
+		*(out + i) = tmp; // word 0
+		out += 16; // advance to the next row of 16 output words
+		src = *(in + 16 * 1 + i); // re-read value 1 to pick up the bits that did not fit
+		src = src & ((1ULL << 49) - 1);
+		tmp = src >> 15U; // remaining 34 high bits of value 1 land in bits 0..33
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 34U; // value 2 starts at bit 34
+		*(out + i) = tmp; // word 1
+		out += 16;
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp = src >> 30U;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 19U;
+		*(out + i) = tmp; // word 2
+		out += 16;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp = src >> 45U; // last 4 bits of value 3
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 4U; // value 4 fits entirely (bits 4..52), so this word takes a third value
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 53U; // low 11 bits of value 5 in bits 53..63
+		*(out + i) = tmp; // word 3
+		out += 16;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp = src >> 11U;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 38U;
+		*(out + i) = tmp; // word 4
+		out += 16;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp = src >> 26U;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 23U;
+		*(out + i) = tmp; // word 5
+		out += 16;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp = src >> 41U;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 57U;
+		*(out + i) = tmp; // word 6
+		out += 16;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp = src >> 7U;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 42U;
+		*(out + i) = tmp; // word 7
+		out += 16;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp = src >> 22U;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 27U;
+		*(out + i) = tmp; // word 8
+		out += 16;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp = src >> 37U;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 61U;
+		*(out + i) = tmp; // word 9
+		out += 16;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp = src >> 3U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 46U;
+		*(out + i) = tmp; // word 10
+		out += 16;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp = src >> 18U;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 31U;
+		*(out + i) = tmp; // word 11
+		out += 16;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp = src >> 33U;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp; // word 12
+		out += 16;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp = src >> 48U;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 1U;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 50U;
+		*(out + i) = tmp; // word 13
+		out += 16;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp = src >> 14U;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 35U;
+		*(out + i) = tmp; // word 14
+		out += 16;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp = src >> 29U;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp; // word 15
+		out += 16;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp = src >> 44U;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 5U;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 54U;
+		*(out + i) = tmp; // word 16
+		out += 16;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp = src >> 10U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 39U;
+		*(out + i) = tmp; // word 17
+		out += 16;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp = src >> 25U;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp; // word 18
+		out += 16;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp = src >> 40U;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 9U;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 58U;
+		*(out + i) = tmp; // word 19
+		out += 16;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp = src >> 6U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 43U;
+		*(out + i) = tmp; // word 20
+		out += 16;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp = src >> 21U;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp; // word 21
+		out += 16;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp = src >> 36U;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 13U;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 62U;
+		*(out + i) = tmp; // word 22
+		out += 16;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp = src >> 2U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 47U;
+		*(out + i) = tmp; // word 23
+		out += 16;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp = src >> 17U;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp; // word 24
+		out += 16;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 17U;
+		*(out + i) = tmp; // word 25
+		out += 16;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp = src >> 47U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 51U;
+		*(out + i) = tmp; // word 26
+		out += 16;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp = src >> 13U;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 36U;
+		*(out + i) = tmp; // word 27
+		out += 16;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp = src >> 28U;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 21U;
+		*(out + i) = tmp; // word 28
+		out += 16;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp = src >> 43U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 6U;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 55U;
+		*(out + i) = tmp; // word 29
+		out += 16;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp = src >> 9U;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp; // word 30
+		out += 16;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 25U;
+		*(out + i) = tmp; // word 31
+		out += 16;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp = src >> 39U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 10U;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 59U;
+		*(out + i) = tmp; // word 32
+		out += 16;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp = src >> 5U;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp; // word 33
+		out += 16;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp = src >> 20U;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 29U;
+		*(out + i) = tmp; // word 34
+		out += 16;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp = src >> 35U;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 14U;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 63U; // only the lowest bit of value 47 fits here
+		*(out + i) = tmp; // word 35
+		out += 16;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp = src >> 1U;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp; // word 36
+		out += 16;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 33U;
+		*(out + i) = tmp; // word 37
+		out += 16;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp = src >> 31U;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 18U;
+		*(out + i) = tmp; // word 38
+		out += 16;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp = src >> 46U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 3U;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp; // word 39
+		out += 16;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 37U;
+		*(out + i) = tmp; // word 40
+		out += 16;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp = src >> 27U;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 22U;
+		*(out + i) = tmp; // word 41
+		out += 16;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp = src >> 42U;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 7U;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp; // word 42
+		out += 16;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 41U;
+		*(out + i) = tmp; // word 43
+		out += 16;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp = src >> 23U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 26U;
+		*(out + i) = tmp; // word 44
+		out += 16;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp = src >> 38U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 11U;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp; // word 45
+		out += 16;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 45U;
+		*(out + i) = tmp; // word 46
+		out += 16;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp = src >> 19U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 30U;
+		*(out + i) = tmp; // word 47
+		out += 16;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp = src >> 34U;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 49) - 1);
+		tmp |= src << 15U; // value 63 ends exactly at bit 63 (15 + 49 = 64), so nothing carries over
+		*(out + i) = tmp; // word 48 (last word of this lane)
+		out -= 768; // undo the 48 row advances (48 * 16) so the next lane starts from the base pointer
+	}
+}
+void static pack_50bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) { // Bit-packs 64 values per lane at 50 bits each, for 16 interleaved lanes: reads in[16*k + i] (k = 0..63) and writes 50 words out[16*w + i] (w = 0..49).
+	uint64_t tmp = 0U; // output word under construction (the initial 0 is overwritten before it is ever read)
+	uint64_t src; // current input value after masking to its low 50 bits
+	for (int i = 0; i < 16; i++) { // i = lane index; every iteration packs one lane independently
+		src = *(in + 16 * 0 + i); // value 0 of lane i
+		src = src & ((1ULL << 50) - 1); // keep only the low 50 bits
+		tmp = src; // value 0 occupies bits 0..49 of word 0
+		src = *(in + 16 * 1 + i); // value 1
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 50U; // only its low 14 bits fit (bits 50..63); the rest is shifted out
+		*(out + i) = tmp; // word 0
+		out += 16; // advance to the next row of 16 output words
+		src = *(in + 16 * 1 + i); // re-read value 1 to pick up the bits that did not fit
+		src = src & ((1ULL << 50) - 1);
+		tmp = src >> 14U; // remaining 36 high bits of value 1 land in bits 0..35
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 36U; // value 2 starts at bit 36
+		*(out + i) = tmp; // word 1
+		out += 16;
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp = src >> 28U;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 22U;
+		*(out + i) = tmp; // word 2
+		out += 16;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp = src >> 42U; // last 8 bits of value 3
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 8U; // value 4 fits entirely (bits 8..57), so this word takes a third value
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 58U; // low 6 bits of value 5 in bits 58..63
+		*(out + i) = tmp; // word 3
+		out += 16;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp = src >> 6U;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp; // word 4
+		out += 16;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp = src >> 20U;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 30U;
+		*(out + i) = tmp; // word 5
+		out += 16;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp = src >> 34U;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp; // word 6
+		out += 16;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp = src >> 48U;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp; // word 7
+		out += 16;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 38U;
+		*(out + i) = tmp; // word 8
+		out += 16;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp = src >> 26U;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp; // word 9
+		out += 16;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp = src >> 40U;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 10U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp; // word 10
+		out += 16;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 46U;
+		*(out + i) = tmp; // word 11
+		out += 16;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp = src >> 18U;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp; // word 12
+		out += 16;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 18U;
+		*(out + i) = tmp; // word 13
+		out += 16;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp = src >> 46U;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 54U;
+		*(out + i) = tmp; // word 14
+		out += 16;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp = src >> 10U;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp; // word 15
+		out += 16;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 26U;
+		*(out + i) = tmp; // word 16
+		out += 16;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp = src >> 38U;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 62U;
+		*(out + i) = tmp; // word 17
+		out += 16;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp = src >> 2U;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp; // word 18
+		out += 16;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 34U;
+		*(out + i) = tmp; // word 19
+		out += 16;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp = src >> 30U;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp; // word 20
+		out += 16;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp = src >> 44U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 6U;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp; // word 21
+		out += 16;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 42U;
+		*(out + i) = tmp; // word 22
+		out += 16;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp = src >> 22U;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp; // word 23
+		out += 16;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp = src >> 36U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 14U; // value 31 ends exactly at bit 63 (14 + 50 = 64): 32 values fill 25 words with nothing carried over
+		*(out + i) = tmp; // word 24
+		out += 16;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp = src; // value 32 starts at bit 0 again; the shift sequence of values 0..31 repeats for values 32..63
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 50U;
+		*(out + i) = tmp; // word 25
+		out += 16;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp = src >> 14U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 36U;
+		*(out + i) = tmp; // word 26
+		out += 16;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp = src >> 28U;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 22U;
+		*(out + i) = tmp; // word 27
+		out += 16;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp = src >> 42U;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 58U;
+		*(out + i) = tmp; // word 28
+		out += 16;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp = src >> 6U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp; // word 29
+		out += 16;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp = src >> 20U;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 30U;
+		*(out + i) = tmp; // word 30
+		out += 16;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp = src >> 34U;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp; // word 31
+		out += 16;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp = src >> 48U;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp; // word 32
+		out += 16;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 38U;
+		*(out + i) = tmp; // word 33
+		out += 16;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp = src >> 26U;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp; // word 34
+		out += 16;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp = src >> 40U;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 10U;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp; // word 35
+		out += 16;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 46U;
+		*(out + i) = tmp; // word 36
+		out += 16;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp = src >> 18U;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp; // word 37
+		out += 16;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 18U;
+		*(out + i) = tmp; // word 38
+		out += 16;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp = src >> 46U;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 54U;
+		*(out + i) = tmp; // word 39
+		out += 16;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp = src >> 10U;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp; // word 40
+		out += 16;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 26U;
+		*(out + i) = tmp; // word 41
+		out += 16;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp = src >> 38U;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 62U;
+		*(out + i) = tmp; // word 42
+		out += 16;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp = src >> 2U;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp; // word 43
+		out += 16;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 34U;
+		*(out + i) = tmp; // word 44
+		out += 16;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp = src >> 30U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp; // word 45
+		out += 16;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp = src >> 44U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 6U;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp; // word 46
+		out += 16;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 42U;
+		*(out + i) = tmp; // word 47
+		out += 16;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp = src >> 22U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp; // word 48
+		out += 16;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp = src >> 36U;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 50) - 1);
+		tmp |= src << 14U; // value 63 ends exactly at bit 63 (14 + 50 = 64), so nothing carries over
+		*(out + i) = tmp; // word 49 (last word of this lane)
+		out -= 784; // undo the 49 row advances (49 * 16) so the next lane starts from the base pointer
+	}
+}
+void static pack_51bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) { // Packs the low 51 bits of 64 values per lane, for 16 lanes: reads in[16*k + i] (k = 0..63), writes 51 words at out[16*w + i] (w = 0..50).
+	uint64_t tmp = 0U; // output word being assembled; this initial 0 is overwritten before any use
+	uint64_t src; // current input value, reloaded and masked before every use
+	for (int i = 0; i < 16; i++) { // i selects the lane (column) inside each row of 16 elements
+		src = *(in + 16 * 0 + i);
+		src = src & ((1ULL << 51) - 1); // keep only the low 51 bits; higher input bits are discarded
+		tmp = src; // value 0 occupies bits 0..50 of output word 0
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 51U; // only the low 13 bits of value 1 land in bits 51..63; the shift drops the rest
+		*(out + i) = tmp; // store the completed word for this lane
+		out += 16; // step to the next row of 16 output words
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp = src >> 13U; // the remaining 38 bits of value 1 start the next word
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 38U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp = src >> 26U;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 25U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp = src >> 39U;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 12U;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 63U; // value 4 ends at bit 62, so the lowest bit of value 5 also goes into this word
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp = src >> 1U;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 50U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp = src >> 14U;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 37U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp = src >> 27U;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp = src >> 40U;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 11U;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 62U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp = src >> 2U;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 49U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp = src >> 15U;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 36U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp = src >> 28U;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 23U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp = src >> 41U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 10U;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 61U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp = src >> 3U;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 35U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp = src >> 29U;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 22U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp = src >> 42U;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 9U;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 47U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp = src >> 17U;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 34U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp = src >> 30U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 21U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp = src >> 43U;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 59U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp = src >> 5U;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 46U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp = src >> 18U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 33U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp = src >> 31U;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp = src >> 44U;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 7U;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 58U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp = src >> 6U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 45U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp = src >> 19U;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 19U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp = src >> 45U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 6U;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 57U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp = src >> 7U;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp = src >> 20U;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 31U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp = src >> 33U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 18U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp = src >> 46U;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 5U;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 43U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp = src >> 21U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 30U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp = src >> 34U;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 17U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp = src >> 47U;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 55U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp = src >> 9U;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 42U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp = src >> 22U;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 29U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp = src >> 35U;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp = src >> 48U;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 3U;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 54U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp = src >> 10U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 41U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp = src >> 23U;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp = src >> 36U;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 15U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp = src >> 49U;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 53U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp = src >> 11U;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 27U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp = src >> 37U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 14U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp = src >> 50U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 1U;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 39U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp = src >> 25U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 26U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp = src >> 38U;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 51) - 1);
+		tmp |= src << 13U; // value 63 fills bits 13..63, completing the 51st and last word of this lane
+		*(out + i) = tmp;
+		out -= 800; // undo the 50 steps of 16 taken above so the next lane starts from the same base
+	}
+}
+void static pack_52bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) { // Packs the low 52 bits of 64 values per lane, for 16 lanes: reads in[16*k + i] (k = 0..63), writes 52 words at out[16*w + i] (w = 0..51).
+	uint64_t tmp = 0U; // output word being assembled; this initial 0 is overwritten before any use
+	uint64_t src; // current input value, reloaded and masked before every use
+	for (int i = 0; i < 16; i++) { // i selects the lane (column) inside each row of 16 elements
+		src = *(in + 16 * 0 + i);
+		src = src & ((1ULL << 52) - 1); // keep only the low 52 bits; higher input bits are discarded
+		tmp = src; // value 0 occupies bits 0..51 of output word 0
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 52U; // only the low 12 bits of value 1 land in bits 52..63; the shift drops the rest
+		*(out + i) = tmp; // store the completed word for this lane
+		out += 16; // step to the next row of 16 output words
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp = src >> 12U; // the remaining 40 bits of value 1 start the next word
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp = src >> 36U;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp = src >> 48U;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 56U; // value 5 ends at bit 55, so the low 8 bits of value 6 also go into this word
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp = src >> 20U;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp = src >> 44U;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 36U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp = src >> 28U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp = src >> 40U;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 12U; // value 15 fills bits 12..63 exactly, so 16 values end on a word boundary after 13 words
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp = src; // value 16 starts a fresh word; the same 13-word shift pattern repeats for values 16..31
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp = src >> 36U;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp = src >> 48U;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp = src >> 20U;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp = src >> 44U;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 36U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp = src >> 28U;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp = src >> 40U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 12U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp = src; // word boundary again: third repetition of the pattern, values 32..47
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp = src >> 36U;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp = src >> 48U;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp = src >> 20U;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp = src >> 44U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 36U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp = src >> 28U;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp = src >> 40U;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 12U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp = src; // word boundary again: fourth and final repetition, values 48..63
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp = src >> 36U;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp = src >> 48U;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp = src >> 20U;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp = src >> 44U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 36U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp = src >> 28U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp = src >> 40U;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 52) - 1);
+		tmp |= src << 12U; // value 63 fills bits 12..63, completing the 52nd and last word of this lane
+		*(out + i) = tmp;
+		out -= 816; // undo the 51 steps of 16 taken above so the next lane starts from the same base
+	}
+}
+void static pack_53bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) { // Packs the low 53 bits of 64 values per lane, for 16 lanes: reads in[16*k + i] (k = 0..63), writes 53 words at out[16*w + i] (w = 0..52).
+	uint64_t tmp = 0U; // output word being assembled; this initial 0 is overwritten before any use
+	uint64_t src; // current input value, reloaded and masked before every use
+	for (int i = 0; i < 16; i++) { // i selects the lane (column) inside each row of 16 elements
+		src = *(in + 16 * 0 + i);
+		src = src & ((1ULL << 53) - 1); // keep only the low 53 bits; higher input bits are discarded
+		tmp = src; // value 0 occupies bits 0..52 of output word 0
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 53U; // only the low 11 bits of value 1 land in bits 53..63; the shift drops the rest
+		*(out + i) = tmp; // store the completed word for this lane
+		out += 16; // step to the next row of 16 output words
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp = src >> 11U; // the remaining 42 bits of value 1 start the next word
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 42U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp = src >> 22U;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 31U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp = src >> 33U;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp = src >> 44U;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 9U;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 62U; // value 5 ends at bit 61, so the low 2 bits of value 6 also go into this word
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp = src >> 2U;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 51U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp = src >> 13U;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 29U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp = src >> 35U;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 18U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp = src >> 46U;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 7U;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 49U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp = src >> 15U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 38U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp = src >> 26U;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 27U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp = src >> 37U;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp = src >> 48U;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 5U;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 58U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp = src >> 6U;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 47U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp = src >> 17U;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 36U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp = src >> 28U;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 25U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp = src >> 39U;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 14U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp = src >> 50U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 3U;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 45U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp = src >> 19U;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 34U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp = src >> 30U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 23U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp = src >> 41U;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 12U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp = src >> 52U;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 1U;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 54U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp = src >> 10U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 43U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp = src >> 21U;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 21U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp = src >> 43U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 10U;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 63U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp = src >> 1U;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 41U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp = src >> 23U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 30U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp = src >> 34U;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 19U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp = src >> 45U;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 61U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp = src >> 3U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 50U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp = src >> 14U;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 39U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp = src >> 25U;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp = src >> 36U;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 17U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp = src >> 47U;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 6U;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 59U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp = src >> 5U;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 37U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp = src >> 27U;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 26U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp = src >> 38U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 15U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp = src >> 49U;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 57U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp = src >> 7U;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 46U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp = src >> 18U;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 35U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp = src >> 29U;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp = src >> 40U;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 13U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp = src >> 51U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 55U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp = src >> 9U;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp = src >> 20U;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 33U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp = src >> 31U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 22U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp = src >> 42U;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 53) - 1);
+		tmp |= src << 11U; // value 63 fills bits 11..63, completing the 53rd and last word of this lane
+		*(out + i) = tmp;
+		out -= 832; // undo the 52 steps of 16 taken above so the next lane starts from the same base
+	}
+}
+void static pack_54bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) { // Bit-packs the low 54 bits of in[16*k + i] (k = 0..63, i = 0..15) back-to-back into out[16*w + i] (w = 0..53).
+	uint64_t tmp = 0U; // output word currently being assembled
+	uint64_t src; // current input value, masked to its low 54 bits before use
+	for (int i = 0; i < 16; i++) { // i selects one of 16 interleaved lanes; each lane is packed independently
+		src = *(in + 16 * 0 + i); // value 0 of lane i
+		src = src & ((1ULL << 54) - 1); // keep only the low 54 bits
+		tmp = src; // value 0 occupies bits 0..53 of word 0
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 54U; // low 10 bits of value 1 fill bits 54..63 of word 0
+		*(out + i) = tmp; // word 0
+		out += 16; // advance to the next row of 16 output words (one per lane)
+		src = *(in + 16 * 1 + i); // value 1 straddles words 0 and 1, so it is loaded and masked again
+		src = src & ((1ULL << 54) - 1);
+		tmp = src >> 10U; // remaining high 44 bits of value 1 start word 1
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp; // word 1
+		out += 16;
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp = src >> 20U;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 34U;
+		*(out + i) = tmp; // word 2
+		out += 16;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp = src >> 30U;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp; // word 3
+		out += 16;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp = src >> 40U;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 14U;
+		*(out + i) = tmp; // word 4
+		out += 16;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp = src >> 50U; // top 4 bits of value 5
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 4U; // all 54 bits of value 6 fit in bits 4..57
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 58U; // low 6 bits of value 7; this word holds pieces of three values
+		*(out + i) = tmp; // word 5
+		out += 16;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp = src >> 6U;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp; // word 6
+		out += 16;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 38U;
+		*(out + i) = tmp; // word 7
+		out += 16;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp = src >> 26U;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp; // word 8
+		out += 16;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp = src >> 36U;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 18U;
+		*(out + i) = tmp; // word 9
+		out += 16;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp = src >> 46U;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 62U;
+		*(out + i) = tmp; // word 10
+		out += 16;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp = src >> 2U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp; // word 11
+		out += 16;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 42U;
+		*(out + i) = tmp; // word 12
+		out += 16;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp = src >> 22U;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp; // word 13
+		out += 16;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 22U;
+		*(out + i) = tmp; // word 14
+		out += 16;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp = src >> 42U;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 12U;
+		*(out + i) = tmp; // word 15
+		out += 16;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp = src >> 52U;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp; // word 16
+		out += 16;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 46U;
+		*(out + i) = tmp; // word 17
+		out += 16;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp = src >> 18U;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 36U;
+		*(out + i) = tmp; // word 18
+		out += 16;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp = src >> 28U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 26U;
+		*(out + i) = tmp; // word 19
+		out += 16;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp = src >> 38U;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp; // word 20
+		out += 16;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp = src >> 48U;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 6U;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp; // word 21
+		out += 16;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 50U;
+		*(out + i) = tmp; // word 22
+		out += 16;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp = src >> 14U;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp; // word 23
+		out += 16;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 30U;
+		*(out + i) = tmp; // word 24
+		out += 16;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp = src >> 34U;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp; // word 25
+		out += 16;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp = src >> 44U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 10U; // value 31 ends exactly at bit 63 of this word
+		*(out + i) = tmp; // word 26
+		out += 16;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp = src; // value 32 starts at bit 32 * 54 = 27 * 64, i.e. on a word boundary, so the shift pattern of words 0..26 repeats
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 54U;
+		*(out + i) = tmp; // word 27
+		out += 16;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp = src >> 10U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp; // word 28
+		out += 16;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp = src >> 20U;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 34U;
+		*(out + i) = tmp; // word 29
+		out += 16;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp = src >> 30U;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp; // word 30
+		out += 16;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp = src >> 40U;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 14U;
+		*(out + i) = tmp; // word 31
+		out += 16;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp = src >> 50U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 58U;
+		*(out + i) = tmp; // word 32
+		out += 16;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp = src >> 6U;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp; // word 33
+		out += 16;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 38U;
+		*(out + i) = tmp; // word 34
+		out += 16;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp = src >> 26U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp; // word 35
+		out += 16;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp = src >> 36U;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 18U;
+		*(out + i) = tmp; // word 36
+		out += 16;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp = src >> 46U;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 62U;
+		*(out + i) = tmp; // word 37
+		out += 16;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp = src >> 2U;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp; // word 38
+		out += 16;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 42U;
+		*(out + i) = tmp; // word 39
+		out += 16;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp = src >> 22U;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp; // word 40
+		out += 16;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 22U;
+		*(out + i) = tmp; // word 41
+		out += 16;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp = src >> 42U;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 12U;
+		*(out + i) = tmp; // word 42
+		out += 16;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp = src >> 52U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp; // word 43
+		out += 16;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 46U;
+		*(out + i) = tmp; // word 44
+		out += 16;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp = src >> 18U;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 36U;
+		*(out + i) = tmp; // word 45
+		out += 16;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp = src >> 28U;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 26U;
+		*(out + i) = tmp; // word 46
+		out += 16;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp = src >> 38U;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp; // word 47
+		out += 16;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp = src >> 48U;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 6U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp; // word 48
+		out += 16;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 50U;
+		*(out + i) = tmp; // word 49
+		out += 16;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp = src >> 14U;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp; // word 50
+		out += 16;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 30U;
+		*(out + i) = tmp; // word 51
+		out += 16;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp = src >> 34U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp; // word 52
+		out += 16;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp = src >> 44U;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 54) - 1);
+		tmp |= src << 10U; // value 63 ends at bit 63: 64 values * 54 bits = 54 full words
+		*(out + i) = tmp; // word 53 (last word of lane i)
+		out -= 848; // undo the 53 row advances (53 * 16) so the next lane starts from the original out
+	}
+}
+void static pack_55bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) { // Bit-packs the low 55 bits of in[16*k + i] (k = 0..63, i = 0..15) back-to-back into out[16*w + i] (w = 0..54).
+	uint64_t tmp = 0U; // output word currently being assembled
+	uint64_t src; // current input value, masked to its low 55 bits before use
+	for (int i = 0; i < 16; i++) { // i selects one of 16 interleaved lanes; each lane is packed independently
+		src = *(in + 16 * 0 + i); // value 0 of lane i
+		src = src & ((1ULL << 55) - 1); // keep only the low 55 bits
+		tmp = src; // value 0 occupies bits 0..54 of word 0
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 55U; // low 9 bits of value 1 fill bits 55..63 of word 0
+		*(out + i) = tmp; // word 0
+		out += 16; // advance to the next row of 16 output words (one per lane)
+		src = *(in + 16 * 1 + i); // value 1 straddles words 0 and 1, so it is loaded and masked again
+		src = src & ((1ULL << 55) - 1);
+		tmp = src >> 9U; // remaining high 46 bits of value 1 start word 1
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 46U;
+		*(out + i) = tmp; // word 1
+		out += 16;
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp = src >> 18U;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 37U;
+		*(out + i) = tmp; // word 2
+		out += 16;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp = src >> 27U;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp; // word 3
+		out += 16;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp = src >> 36U;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 19U;
+		*(out + i) = tmp; // word 4
+		out += 16;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp = src >> 45U;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 10U;
+		*(out + i) = tmp; // word 5
+		out += 16;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp = src >> 54U; // top bit of value 6
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 1U; // all 55 bits of value 7 fit in bits 1..55
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 56U; // low 8 bits of value 8; this word holds pieces of three values
+		*(out + i) = tmp; // word 6
+		out += 16;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 47U;
+		*(out + i) = tmp; // word 7
+		out += 16;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp = src >> 17U;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 38U;
+		*(out + i) = tmp; // word 8
+		out += 16;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp = src >> 26U;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 29U;
+		*(out + i) = tmp; // word 9
+		out += 16;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp = src >> 35U;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp; // word 10
+		out += 16;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp = src >> 44U;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 11U;
+		*(out + i) = tmp; // word 11
+		out += 16;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp = src >> 53U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 57U;
+		*(out + i) = tmp; // word 12
+		out += 16;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp = src >> 7U;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp; // word 13
+		out += 16;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 39U;
+		*(out + i) = tmp; // word 14
+		out += 16;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp = src >> 25U;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 30U;
+		*(out + i) = tmp; // word 15
+		out += 16;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp = src >> 34U;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 21U;
+		*(out + i) = tmp; // word 16
+		out += 16;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp = src >> 43U;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 12U;
+		*(out + i) = tmp; // word 17
+		out += 16;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp = src >> 52U;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 3U;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 58U;
+		*(out + i) = tmp; // word 18
+		out += 16;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp = src >> 6U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 49U;
+		*(out + i) = tmp; // word 19
+		out += 16;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp = src >> 15U;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp; // word 20
+		out += 16;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 31U;
+		*(out + i) = tmp; // word 21
+		out += 16;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp = src >> 33U;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 22U;
+		*(out + i) = tmp; // word 22
+		out += 16;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp = src >> 42U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 13U;
+		*(out + i) = tmp; // word 23
+		out += 16;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp = src >> 51U;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 59U;
+		*(out + i) = tmp; // word 24
+		out += 16;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp = src >> 5U;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 50U;
+		*(out + i) = tmp; // word 25
+		out += 16;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp = src >> 14U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 41U;
+		*(out + i) = tmp; // word 26
+		out += 16;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp = src >> 23U;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 32U; // low 32 bits of value 32 go in the upper half of this word
+		*(out + i) = tmp; // word 27
+		out += 16;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp = src >> 32U; // remaining high 23 bits of value 32 start word 28
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 23U;
+		*(out + i) = tmp; // word 28
+		out += 16;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp = src >> 41U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 14U;
+		*(out + i) = tmp; // word 29
+		out += 16;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp = src >> 50U;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 5U;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp; // word 30
+		out += 16;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 51U;
+		*(out + i) = tmp; // word 31
+		out += 16;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp = src >> 13U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 42U;
+		*(out + i) = tmp; // word 32
+		out += 16;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp = src >> 22U;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 33U;
+		*(out + i) = tmp; // word 33
+		out += 16;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp = src >> 31U;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp; // word 34
+		out += 16;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp = src >> 40U;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 15U;
+		*(out + i) = tmp; // word 35
+		out += 16;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp = src >> 49U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 6U;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 61U;
+		*(out + i) = tmp; // word 36
+		out += 16;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp = src >> 3U;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp; // word 37
+		out += 16;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 43U;
+		*(out + i) = tmp; // word 38
+		out += 16;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp = src >> 21U;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 34U;
+		*(out + i) = tmp; // word 39
+		out += 16;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp = src >> 30U;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 25U;
+		*(out + i) = tmp; // word 40
+		out += 16;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp = src >> 39U;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp; // word 41
+		out += 16;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp = src >> 48U;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 7U;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 62U;
+		*(out + i) = tmp; // word 42
+		out += 16;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp = src >> 2U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 53U;
+		*(out + i) = tmp; // word 43
+		out += 16;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp = src >> 11U;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp; // word 44
+		out += 16;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp = src >> 20U;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 35U;
+		*(out + i) = tmp; // word 45
+		out += 16;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp = src >> 29U;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 26U;
+		*(out + i) = tmp; // word 46
+		out += 16;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp = src >> 38U;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 17U;
+		*(out + i) = tmp; // word 47
+		out += 16;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp = src >> 47U;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 8U;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 63U; // only the lowest bit of value 57 fits in this word
+		*(out + i) = tmp; // word 48
+		out += 16;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp = src >> 1U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 54U;
+		*(out + i) = tmp; // word 49
+		out += 16;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp = src >> 10U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 45U;
+		*(out + i) = tmp; // word 50
+		out += 16;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp = src >> 19U;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 36U;
+		*(out + i) = tmp; // word 51
+		out += 16;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp = src >> 28U;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 27U;
+		*(out + i) = tmp; // word 52
+		out += 16;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp = src >> 37U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 18U;
+		*(out + i) = tmp; // word 53
+		out += 16;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp = src >> 46U;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 55) - 1);
+		tmp |= src << 9U; // value 63 ends at bit 63: 64 values * 55 bits = 55 full words
+		*(out + i) = tmp; // word 54 (last word of lane i)
+		out -= 864; // undo the 54 row advances (54 * 16) so the next lane starts from the original out
+	}
+}
+void static pack_56bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) {
+	uint64_t tmp = 0U;
+	uint64_t src;
+	for (int i = 0; i < 16; i++) {
+		src = *(in + 16 * 0 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp = src;
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp = src >> 40U;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp = src >> 48U;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp |= src << 8U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp = src;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp = src >> 40U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp = src >> 48U;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp |= src << 8U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp = src;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp = src >> 40U;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp = src >> 48U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp |= src << 8U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp = src;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp = src >> 40U;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp = src >> 48U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp |= src << 8U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp = src;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp = src >> 40U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp = src >> 48U;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp |= src << 8U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp = src;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp = src >> 40U;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp = src >> 48U;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp |= src << 8U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp = src;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp = src >> 40U;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp = src >> 48U;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp |= src << 8U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp = src;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp = src >> 40U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp = src >> 48U;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 56) - 1);
+		tmp |= src << 8U;
+		*(out + i) = tmp;
+		out -= 880;
+	}
+}
+void static pack_57bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) { // Packs the low 57 bits of 64 inputs per lane into 57 output words per lane, for 16 lanes: lane i reads in[16*k + i] (k = 0..63) and writes out[16*w + i] (w = 0..56).
+	uint64_t tmp = 0U; // output word currently being assembled
+	uint64_t src; // input value just read; masked to its low 57 bits before use
+	for (int i = 0; i < 16; i++) { // i selects the lane (one of 16 interleaved streams, stride 16 in both in and out)
+		src = *(in + 16 * 0 + i);
+		src = src & ((1ULL << 57) - 1); // keep only the low 57 bits; higher input bits are discarded
+		tmp = src; // value 0 occupies bits 0..56 of word 0
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 57U; // low 7 bits of value 1 fill bits 57..63
+		*(out + i) = tmp; // store word 0 of lane i
+		out += 16; // step to the next output word of this lane (consecutive words are 16 elements apart)
+		src = *(in + 16 * 1 + i); // re-read value 1: its remaining high bits start the next word
+		src = src & ((1ULL << 57) - 1);
+		tmp = src >> 7U; // drop the 7 bits already stored in the previous word
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 50U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp = src >> 14U;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 43U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp = src >> 21U;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 36U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp = src >> 28U;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 29U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp = src >> 35U;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 22U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp = src >> 42U;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 15U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp = src >> 49U;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 8U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp = src >> 56U; // only the top bit of value 8 is left over
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 1U; // value 9 fits entirely inside this word (bits 1..57)
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 58U; // low 6 bits of value 10 fill bits 58..63, so this word takes three inputs
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp = src >> 6U;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 51U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp = src >> 13U;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp = src >> 20U;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 37U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp = src >> 27U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 30U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp = src >> 34U;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 23U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp = src >> 41U;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp = src >> 48U;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 9U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp = src >> 55U;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 59U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp = src >> 5U;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 45U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp = src >> 19U;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 38U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp = src >> 26U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 31U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp = src >> 33U;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp = src >> 40U;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 17U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp = src >> 47U;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 10U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp = src >> 54U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 3U;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 53U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp = src >> 11U;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 46U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp = src >> 18U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 39U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp = src >> 25U;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 25U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp = src >> 39U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 18U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp = src >> 46U;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 11U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp = src >> 53U;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 61U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp = src >> 3U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 54U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp = src >> 10U;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 47U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp = src >> 17U;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 33U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp = src >> 31U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 26U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp = src >> 38U;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 19U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp = src >> 45U;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 12U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp = src >> 52U;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 5U;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 62U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp = src >> 2U;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 55U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp = src >> 9U;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 41U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp = src >> 23U;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 34U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp = src >> 30U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 27U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp = src >> 37U;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp = src >> 44U;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 13U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp = src >> 51U;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 6U;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 63U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp = src >> 1U;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 49U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp = src >> 15U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 42U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp = src >> 22U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 35U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp = src >> 29U;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp = src >> 36U;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 21U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp = src >> 43U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 14U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp = src >> 50U;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 57) - 1);
+		tmp |= src << 7U;
+		*(out + i) = tmp; // word 56, the last of the 57 words written for this lane
+		out -= 896; // undo the 56 advances of 16 (56 * 16 = 896) so the next lane starts from the original out
+	}
+}
+void static pack_58bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) { // Packs the low 58 bits of 64 inputs per lane into 58 output words per lane, for 16 lanes: lane i reads in[16*k + i] (k = 0..63) and writes out[16*w + i] (w = 0..57).
+	uint64_t tmp = 0U; // output word currently being assembled
+	uint64_t src; // input value just read; masked to its low 58 bits before use
+	for (int i = 0; i < 16; i++) { // i selects the lane (one of 16 interleaved streams, stride 16 in both in and out)
+		src = *(in + 16 * 0 + i);
+		src = src & ((1ULL << 58) - 1); // keep only the low 58 bits; higher input bits are discarded
+		tmp = src; // value 0 occupies bits 0..57 of word 0
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 58U; // low 6 bits of value 1 fill bits 58..63
+		*(out + i) = tmp; // store word 0 of lane i
+		out += 16; // step to the next output word of this lane (consecutive words are 16 elements apart)
+		src = *(in + 16 * 1 + i); // re-read value 1: its remaining high bits start the next word
+		src = src & ((1ULL << 58) - 1);
+		tmp = src >> 6U; // drop the 6 bits already stored in the previous word
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 46U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp = src >> 18U;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 34U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp = src >> 30U;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp = src >> 36U;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 22U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp = src >> 42U;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp = src >> 48U;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 10U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp = src >> 54U; // only the top 4 bits of value 9 are left over
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 4U; // value 10 fits entirely inside this word (bits 4..61)
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 62U; // low 2 bits of value 11 fill bits 62..63, so this word takes three inputs
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp = src >> 2U;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 50U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp = src >> 14U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp = src >> 20U;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 38U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp = src >> 26U;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 26U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp = src >> 38U;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp = src >> 44U;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 14U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp = src >> 50U;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 8U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp = src >> 56U;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 54U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp = src >> 10U;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 42U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp = src >> 22U;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 36U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp = src >> 28U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 30U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp = src >> 34U;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp = src >> 40U;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 18U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp = src >> 46U;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 12U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp = src >> 52U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 6U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp = src; // value 32 starts a fresh word: values 0..31 filled the previous 29 words exactly (32 * 58 = 29 * 64)
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 58U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp = src >> 6U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 46U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp = src >> 18U;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 34U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp = src >> 30U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp = src >> 36U;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 22U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp = src >> 42U;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp = src >> 48U;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 10U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp = src >> 54U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 62U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp = src >> 2U;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 50U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp = src >> 14U;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp = src >> 20U;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 38U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp = src >> 26U;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 26U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp = src >> 38U;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp = src >> 44U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 14U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp = src >> 50U;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 8U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp = src >> 56U;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 54U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp = src >> 10U;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 42U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp = src >> 22U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 36U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp = src >> 28U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 30U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp = src >> 34U;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp = src >> 40U;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 18U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp = src >> 46U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 12U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp = src >> 52U;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 58) - 1);
+		tmp |= src << 6U;
+		*(out + i) = tmp; // word 57, the last of the 58 words written for this lane
+		out -= 912; // undo the 57 advances of 16 (57 * 16 = 912) so the next lane starts from the original out
+	}
+}
+void static pack_59bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) { // Bit-packs the low 59 bits of 64 values for each of 16 interleaved columns: reads in[0..1023], writes out[0..943].
+	uint64_t tmp = 0U; // output word under construction; this initial 0 is overwritten before it is ever read
+	uint64_t src; // current input value after masking
+	for (int i = 0; i < 16; i++) { // i selects one of the 16 interleaved columns
+		src = *(in + 16 * 0 + i); // value k of column i is read from in[16 * k + i]
+		src = src & ((1ULL << 59) - 1); // keep only the low 59 bits
+		tmp = src;
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 59U; // only the low 5 bits of value 1 fit here; the rest is shifted out of the word
+		*(out + i) = tmp; // word w of column i is stored at out[16 * w + i] (out advances by 16 per word)
+		out += 16; // step to the next row of output words
+		src = *(in + 16 * 1 + i); // value 1 is reloaded to emit its remaining 54 bits
+		src = src & ((1ULL << 59) - 1);
+		tmp = src >> 5U;
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 54U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp = src >> 10U;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 49U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp = src >> 15U;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp = src >> 20U;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 39U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp = src >> 25U;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 34U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp = src >> 30U;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 29U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp = src >> 35U;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp = src >> 40U;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 19U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp = src >> 45U;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 14U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp = src >> 50U;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 9U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp = src >> 55U;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 4U;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 63U; // three values share this word: top 4 bits of value 11, all of value 12, low 1 bit of value 13
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp = src >> 1U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 58U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp = src >> 6U;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 53U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp = src >> 11U;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 43U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp = src >> 21U;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 38U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp = src >> 26U;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 33U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp = src >> 31U;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp = src >> 36U;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 23U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp = src >> 41U;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 18U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp = src >> 46U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 13U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp = src >> 51U;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 8U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp = src >> 56U;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 3U;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 62U; // three values share this word: top 3 bits of value 24, all of value 25, low 2 bits of value 26
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp = src >> 2U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 57U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp = src >> 7U;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 47U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp = src >> 17U;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 42U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp = src >> 22U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 37U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp = src >> 27U;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 27U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp = src >> 37U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 22U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp = src >> 42U;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 17U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp = src >> 47U;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 12U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp = src >> 52U;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 7U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp = src >> 57U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 61U; // three values share this word: top 2 bits of value 37, all of value 38, low 3 bits of value 39
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp = src >> 3U;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 51U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp = src >> 13U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 46U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp = src >> 18U;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 41U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp = src >> 23U;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 36U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp = src >> 28U;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 31U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp = src >> 33U;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 26U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp = src >> 38U;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 21U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp = src >> 43U;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp = src >> 48U;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 11U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp = src >> 53U;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 6U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp = src >> 58U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 1U;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 60U; // three values share this word: top 1 bit of value 50, all of value 51, low 4 bits of value 52
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 55U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp = src >> 9U;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 50U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp = src >> 14U;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 45U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp = src >> 19U;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 35U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp = src >> 29U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 30U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp = src >> 34U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 25U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp = src >> 39U;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp = src >> 44U;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 15U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp = src >> 49U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 10U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp = src >> 54U;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 59) - 1);
+		tmp |= src << 5U; // value 63 ends exactly at bit 63: 64 * 59 bits fill 59 words
+		*(out + i) = tmp;
+		out -= 928; // undo the 58 row advances (58 * 16) so the next column starts from the original out
+	}
+}
+void static pack_60bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) { // Bit-packs the low 60 bits of 64 values for each of 16 interleaved columns: reads in[0..1023], writes out[0..959].
+	uint64_t tmp = 0U; // output word under construction; this initial 0 is overwritten before it is ever read
+	uint64_t src; // current input value after masking
+	for (int i = 0; i < 16; i++) { // i selects one of the 16 interleaved columns
+		src = *(in + 16 * 0 + i); // value k of column i is read from in[16 * k + i]
+		src = src & ((1ULL << 60) - 1); // keep only the low 60 bits
+		tmp = src;
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 60U; // only the low 4 bits of value 1 fit here; the rest is shifted out of the word
+		*(out + i) = tmp; // word w of column i is stored at out[16 * w + i] (out advances by 16 per word)
+		out += 16; // step to the next row of output words
+		src = *(in + 16 * 1 + i); // value 1 is reloaded to emit its remaining 56 bits
+		src = src & ((1ULL << 60) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp = src >> 20U;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 36U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp = src >> 28U;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp = src >> 36U;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp = src >> 40U;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp = src >> 44U;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp = src >> 48U;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 12U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp = src >> 52U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 8U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp = src >> 56U;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 4U; // value 15 ends exactly at bit 63: 16 values * 60 bits fill 15 words
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp = src; // values 16..31 restart at bit 0 and repeat the same 15-word shift pattern
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp = src >> 20U;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 36U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp = src >> 28U;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp = src >> 36U;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp = src >> 40U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp = src >> 44U;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp = src >> 48U;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 12U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp = src >> 52U;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 8U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp = src >> 56U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 4U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp = src; // values 32..47 restart at bit 0 and repeat the same 15-word shift pattern
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp = src >> 20U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 36U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp = src >> 28U;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp = src >> 36U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp = src >> 40U;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp = src >> 44U;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp = src >> 48U;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 12U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp = src >> 52U;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 8U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp = src >> 56U;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 4U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp = src; // values 48..63 restart at bit 0 and repeat the same 15-word shift pattern
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp = src >> 20U;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 36U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp = src >> 28U;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp = src >> 36U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp = src >> 40U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp = src >> 44U;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp = src >> 48U;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 12U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp = src >> 52U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 8U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp = src >> 56U;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 60) - 1);
+		tmp |= src << 4U;
+		*(out + i) = tmp;
+		out -= 944; // undo the 59 row advances (59 * 16) so the next column starts from the original out
+	}
+}
+void static pack_61bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) {
+	uint64_t tmp = 0U;
+	uint64_t src;
+	for (int i = 0; i < 16; i++) {
+		src = *(in + 16 * 0 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src;
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 61U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 3U;
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 58U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 6U;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 55U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 9U;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 49U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 15U;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 46U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 18U;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 43U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 21U;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 37U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 27U;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 34U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 30U;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 31U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 33U;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 36U;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 25U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 39U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 22U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 42U;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 19U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 45U;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 48U;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 13U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 51U;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 10U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 54U;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 7U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 57U;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 4U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 60U;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 1U;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 62U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 2U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 59U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 5U;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 53U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 11U;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 50U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 14U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 47U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 17U;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 20U;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 41U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 23U;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 38U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 26U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 35U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 29U;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 29U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 35U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 26U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 38U;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 23U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 41U;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 44U;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 17U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 47U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 14U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 50U;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 11U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 53U;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 8U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 56U;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 5U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 59U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 2U;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 63U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 1U;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 57U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 7U;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 54U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 10U;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 51U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 13U;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 45U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 19U;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 42U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 22U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 39U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 25U;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 36U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 28U;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 33U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 31U;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 30U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 34U;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 27U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 37U;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 40U;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 21U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 43U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 18U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 46U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 15U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 49U;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 12U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 52U;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 9U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 55U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 6U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp = src >> 58U;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 61) - 1);
+		tmp |= src << 3U;
+		*(out + i) = tmp;
+		out -= 960;
+	}
+}
+void static pack_62bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) { // Packs the low 62 bits of 64 rows x 16 lanes of `in` (1024 values) into 62 rows x 16 lanes of `out` (992 words); lane i uses element i of every 16-element row.
+	uint64_t tmp = 0U; // output word currently being assembled
+	uint64_t src; // current input value, masked to its low 62 bits
+	for (int i = 0; i < 16; i++) { // one pass per lane: reads 64 inputs, writes 62 words
+		src = *(in + 16 * 0 + i); // row 0 of lane i
+		src = src & ((1ULL << 62) - 1); // keep only the low 62 bits
+		tmp = src; // value 0 occupies bits 0..61 of the first word
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 62U; // low 2 bits of value 1 fill bits 62..63
+		*(out + i) = tmp; // store the finished word in the current output row
+		out += 16; // step to the next output row (16 lanes per row)
+		src = *(in + 16 * 1 + i); // re-read value 1 to pick up the bits that did not fit
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 2U; // drop the 2 bits already stored; the remaining 60 start this word
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 58U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 6U;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 54U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 10U;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 50U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 14U;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 46U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 18U;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 20U;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 42U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 22U;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 38U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 26U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 36U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 28U;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 34U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 30U;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 30U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 34U;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 36U;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 26U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 38U;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 40U;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 22U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 42U;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 44U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 18U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 46U;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 48U;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 14U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 50U;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 12U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 52U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 10U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 54U;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 8U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 56U;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 6U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 58U;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 4U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 60U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 2U; // value 31 fills bits 2..63, completing the 31st word
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 32 + i); // 32 values x 62 bits = 31 full words, so value 32 starts on a word boundary
+		src = src & ((1ULL << 62) - 1);
+		tmp = src; // same shift sequence as the first half, now for rows 32..63
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 62U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 2U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 58U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 6U;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 54U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 10U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 50U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 14U;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 46U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 18U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 20U;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 42U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 22U;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 38U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 26U;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 36U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 28U;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 34U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 30U;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 30U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 34U;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 36U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 26U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 38U;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 40U;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 22U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 42U;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 44U;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 18U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 46U;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 48U;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 14U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 50U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 12U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 52U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 10U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 54U;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 8U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 56U;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 6U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 58U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 4U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp = src >> 60U;
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 62) - 1);
+		tmp |= src << 2U; // value 63 fills bits 2..63 of the 62nd (last) word
+		*(out + i) = tmp;
+		out -= 976; // undo the 61 row steps (61 * 16) so the next lane starts from the original base
+	}
+}
+void static pack_63bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) { // Packs the low 63 bits of 64 rows x 16 lanes of `in` (1024 values) into 63 rows x 16 lanes of `out` (1008 words); lane i uses element i of every 16-element row.
+	uint64_t tmp = 0U; // output word currently being assembled
+	uint64_t src; // current input value, masked to its low 63 bits
+	for (int i = 0; i < 16; i++) { // one pass per lane: reads 64 inputs, writes 63 words
+		src = *(in + 16 * 0 + i); // row 0 of lane i
+		src = src & ((1ULL << 63) - 1); // keep only the low 63 bits
+		tmp = src; // value 0 occupies bits 0..62 of the first word
+		src = *(in + 16 * 1 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 63U; // lowest bit of value 1 fills bit 63
+		*(out + i) = tmp; // store the finished word in the current output row
+		out += 16; // step to the next output row (16 lanes per row)
+		src = *(in + 16 * 1 + i); // re-read value 1 to pick up the bits that did not fit
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 1U; // drop the bit already stored; the remaining 62 start this word
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 62U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 2 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 2U;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 61U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 3 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 3U;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 60U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 4 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 4U;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 59U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 5 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 5U;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 58U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 6 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 6U;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 57U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 7 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 7U;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 56U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 8 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 8U;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 55U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 9 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 9U;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 54U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 10 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 10U;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 53U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 11 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 11U;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 52U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 12 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 12U;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 51U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 13 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 13U;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 50U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 14 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 14U;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 49U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 15 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 15U;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 48U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 16 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 16U;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 47U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 17 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 17U;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 46U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 18 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 18U;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 45U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 19 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 19U;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 44U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 20 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 20U;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 43U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 21 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 21U;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 42U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 22 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 22U;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 41U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 23 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 23U;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 40U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 24 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 24U;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 39U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 25 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 25U;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 38U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 26 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 26U;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 37U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 27 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 27U;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 36U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 28 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 28U;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 35U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 29 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 29U;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 34U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 30 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 30U;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 33U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 31 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 31U;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 32U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 32 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 32U;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 31U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 33 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 33U;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 30U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 34 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 34U;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 29U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 35 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 35U;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 28U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 36 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 36U;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 27U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 37 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 37U;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 26U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 38 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 38U;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 25U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 39 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 39U;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 24U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 40 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 40U;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 23U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 41 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 41U;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 22U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 42 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 42U;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 21U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 43 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 43U;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 20U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 44 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 44U;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 19U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 45 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 45U;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 18U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 46 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 46U;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 17U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 47 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 47U;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 16U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 48 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 48U;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 15U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 49 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 49U;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 14U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 50 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 50U;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 13U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 51 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 51U;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 12U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 52 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 52U;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 11U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 53 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 53U;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 10U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 54 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 54U;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 9U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 55 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 55U;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 8U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 56 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 56U;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 7U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 57 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 57U;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 6U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 58 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 58U;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 5U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 59 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 59U;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 4U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 60 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 60U;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 3U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 61 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 61U;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 2U;
+		*(out + i) = tmp;
+		out += 16;
+		src = *(in + 16 * 62 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp = src >> 62U; // only the top bit (bit 62) of value 62 is still unstored
+		src = *(in + 16 * 63 + i);
+		src = src & ((1ULL << 63) - 1);
+		tmp |= src << 1U; // value 63 fills bits 1..63 of the 63rd (last) word
+		*(out + i) = tmp;
+		out -= 992; // undo the 62 row steps (62 * 16) so the next lane starts from the original base
+	}
+}
+void static pack_64bit_64ow(const uint64_t* __restrict in, uint64_t* __restrict out) { // width-64 case: plain copy of 16 * 64 = 1024 words, out[16 * k + i] = in[16 * k + i]
+	uint64_t tmp = 0U; // overwritten before its first use
+	uint64_t src;
+	for (int i = 0; i < 16; i++) { // i is the column; the body is the unrolled loop over k = 0..63
+		src        = *(in + 16 * 0 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16; // advance the output by one row of 16 words
+		src        = *(in + 16 * 1 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 2 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 3 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 4 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 5 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 6 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 7 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 8 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 9 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 10 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 11 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 12 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 13 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 14 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 15 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 16 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 17 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 18 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 19 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 20 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 21 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 22 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 23 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 24 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 25 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 26 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 27 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 28 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 29 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 30 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 31 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 32 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 33 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 34 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 35 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 36 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 37 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 38 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 39 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 40 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 41 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 42 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 43 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 44 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 45 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 46 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 47 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 48 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 49 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 50 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 51 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 52 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 53 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 54 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 55 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 56 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 57 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 58 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 59 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 60 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 61 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 62 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out += 16;
+		src        = *(in + 16 * 63 + i);
+		tmp        = src;
+		*(out + i) = tmp;
+		out -= 1008; // undo the 63 steps of 16 so the next column starts from the original out
+	}
+}
+void pack(const uint8_t* __restrict in, uint8_t* __restrict out, uint8_t bw) { // 8-bit overload: forwards in/out to pack_<bw>bit_8ow
+	switch (bw) { // bw selects the routine; cases cover 0..8
+	case 0:
+		pack_0bit_8ow(in, out);
+		return;
+	case 1:
+		pack_1bit_8ow(in, out);
+		return;
+	case 2:
+		pack_2bit_8ow(in, out);
+		return;
+	case 3:
+		pack_3bit_8ow(in, out);
+		return;
+	case 4:
+		pack_4bit_8ow(in, out);
+		return;
+	case 5:
+		pack_5bit_8ow(in, out);
+		return;
+	case 6:
+		pack_6bit_8ow(in, out);
+		return;
+	case 7:
+		pack_7bit_8ow(in, out);
+		return;
+	case 8:
+		pack_8bit_8ow(in, out);
+		return;
+	} // no default: for bw > 8 no routine is called and out is left untouched
+}
+void pack(const uint16_t* __restrict in, uint16_t* __restrict out, uint8_t bw) { // 16-bit overload: forwards in/out to pack_<bw>bit_16ow
+	switch (bw) { // bw selects the routine; cases cover 0..16
+	case 0:
+		pack_0bit_16ow(in, out);
+		return;
+	case 1:
+		pack_1bit_16ow(in, out);
+		return;
+	case 2:
+		pack_2bit_16ow(in, out);
+		return;
+	case 3:
+		pack_3bit_16ow(in, out);
+		return;
+	case 4:
+		pack_4bit_16ow(in, out);
+		return;
+	case 5:
+		pack_5bit_16ow(in, out);
+		return;
+	case 6:
+		pack_6bit_16ow(in, out);
+		return;
+	case 7:
+		pack_7bit_16ow(in, out);
+		return;
+	case 8:
+		pack_8bit_16ow(in, out);
+		return;
+	case 9:
+		pack_9bit_16ow(in, out);
+		return;
+	case 10:
+		pack_10bit_16ow(in, out);
+		return;
+	case 11:
+		pack_11bit_16ow(in, out);
+		return;
+	case 12:
+		pack_12bit_16ow(in, out);
+		return;
+	case 13:
+		pack_13bit_16ow(in, out);
+		return;
+	case 14:
+		pack_14bit_16ow(in, out);
+		return;
+	case 15:
+		pack_15bit_16ow(in, out);
+		return;
+	case 16:
+		pack_16bit_16ow(in, out);
+		return;
+	} // no default: for bw > 16 no routine is called and out is left untouched
+}
+void pack(const uint32_t* __restrict in, uint32_t* __restrict out, uint8_t bw) { // 32-bit overload: forwards in/out to pack_<bw>bit_32ow
+	switch (bw) { // bw selects the routine; cases cover 0..32
+	case 0:
+		pack_0bit_32ow(in, out);
+		return;
+	case 1:
+		pack_1bit_32ow(in, out);
+		return;
+	case 2:
+		pack_2bit_32ow(in, out);
+		return;
+	case 3:
+		pack_3bit_32ow(in, out);
+		return;
+	case 4:
+		pack_4bit_32ow(in, out);
+		return;
+	case 5:
+		pack_5bit_32ow(in, out);
+		return;
+	case 6:
+		pack_6bit_32ow(in, out);
+		return;
+	case 7:
+		pack_7bit_32ow(in, out);
+		return;
+	case 8:
+		pack_8bit_32ow(in, out);
+		return;
+	case 9:
+		pack_9bit_32ow(in, out);
+		return;
+	case 10:
+		pack_10bit_32ow(in, out);
+		return;
+	case 11:
+		pack_11bit_32ow(in, out);
+		return;
+	case 12:
+		pack_12bit_32ow(in, out);
+		return;
+	case 13:
+		pack_13bit_32ow(in, out);
+		return;
+	case 14:
+		pack_14bit_32ow(in, out);
+		return;
+	case 15:
+		pack_15bit_32ow(in, out);
+		return;
+	case 16:
+		pack_16bit_32ow(in, out);
+		return;
+	case 17:
+		pack_17bit_32ow(in, out);
+		return;
+	case 18:
+		pack_18bit_32ow(in, out);
+		return;
+	case 19:
+		pack_19bit_32ow(in, out);
+		return;
+	case 20:
+		pack_20bit_32ow(in, out);
+		return;
+	case 21:
+		pack_21bit_32ow(in, out);
+		return;
+	case 22:
+		pack_22bit_32ow(in, out);
+		return;
+	case 23:
+		pack_23bit_32ow(in, out);
+		return;
+	case 24:
+		pack_24bit_32ow(in, out);
+		return;
+	case 25:
+		pack_25bit_32ow(in, out);
+		return;
+	case 26:
+		pack_26bit_32ow(in, out);
+		return;
+	case 27:
+		pack_27bit_32ow(in, out);
+		return;
+	case 28:
+		pack_28bit_32ow(in, out);
+		return;
+	case 29:
+		pack_29bit_32ow(in, out);
+		return;
+	case 30:
+		pack_30bit_32ow(in, out);
+		return;
+	case 31:
+		pack_31bit_32ow(in, out);
+		return;
+	case 32:
+		pack_32bit_32ow(in, out);
+		return;
+	} // no default: for bw > 32 no routine is called and out is left untouched
+}
+void pack(const uint64_t* __restrict in, uint64_t* __restrict out, uint8_t bw) { // 64-bit overload: forwards in/out to pack_<bw>bit_64ow
+	switch (bw) { // bw selects the routine; cases cover 0..64
+	case 0:
+		pack_0bit_64ow(in, out);
+		return;
+	case 1:
+		pack_1bit_64ow(in, out);
+		return;
+	case 2:
+		pack_2bit_64ow(in, out);
+		return;
+	case 3:
+		pack_3bit_64ow(in, out);
+		return;
+	case 4:
+		pack_4bit_64ow(in, out);
+		return;
+	case 5:
+		pack_5bit_64ow(in, out);
+		return;
+	case 6:
+		pack_6bit_64ow(in, out);
+		return;
+	case 7:
+		pack_7bit_64ow(in, out);
+		return;
+	case 8:
+		pack_8bit_64ow(in, out);
+		return;
+	case 9:
+		pack_9bit_64ow(in, out);
+		return;
+	case 10:
+		pack_10bit_64ow(in, out);
+		return;
+	case 11:
+		pack_11bit_64ow(in, out);
+		return;
+	case 12:
+		pack_12bit_64ow(in, out);
+		return;
+	case 13:
+		pack_13bit_64ow(in, out);
+		return;
+	case 14:
+		pack_14bit_64ow(in, out);
+		return;
+	case 15:
+		pack_15bit_64ow(in, out);
+		return;
+	case 16:
+		pack_16bit_64ow(in, out);
+		return;
+	case 17:
+		pack_17bit_64ow(in, out);
+		return;
+	case 18:
+		pack_18bit_64ow(in, out);
+		return;
+	case 19:
+		pack_19bit_64ow(in, out);
+		return;
+	case 20:
+		pack_20bit_64ow(in, out);
+		return;
+	case 21:
+		pack_21bit_64ow(in, out);
+		return;
+	case 22:
+		pack_22bit_64ow(in, out);
+		return;
+	case 23:
+		pack_23bit_64ow(in, out);
+		return;
+	case 24:
+		pack_24bit_64ow(in, out);
+		return;
+	case 25:
+		pack_25bit_64ow(in, out);
+		return;
+	case 26:
+		pack_26bit_64ow(in, out);
+		return;
+	case 27:
+		pack_27bit_64ow(in, out);
+		return;
+	case 28:
+		pack_28bit_64ow(in, out);
+		return;
+	case 29:
+		pack_29bit_64ow(in, out);
+		return;
+	case 30:
+		pack_30bit_64ow(in, out);
+		return;
+	case 31:
+		pack_31bit_64ow(in, out);
+		return;
+	case 32:
+		pack_32bit_64ow(in, out);
+		return;
+	case 33:
+		pack_33bit_64ow(in, out);
+		return;
+	case 34:
+		pack_34bit_64ow(in, out);
+		return;
+	case 35:
+		pack_35bit_64ow(in, out);
+		return;
+	case 36:
+		pack_36bit_64ow(in, out);
+		return;
+	case 37:
+		pack_37bit_64ow(in, out);
+		return;
+	case 38:
+		pack_38bit_64ow(in, out);
+		return;
+	case 39:
+		pack_39bit_64ow(in, out);
+		return;
+	case 40:
+		pack_40bit_64ow(in, out);
+		return;
+	case 41:
+		pack_41bit_64ow(in, out);
+		return;
+	case 42:
+		pack_42bit_64ow(in, out);
+		return;
+	case 43:
+		pack_43bit_64ow(in, out);
+		return;
+	case 44:
+		pack_44bit_64ow(in, out);
+		return;
+	case 45:
+		pack_45bit_64ow(in, out);
+		return;
+	case 46:
+		pack_46bit_64ow(in, out);
+		return;
+	case 47:
+		pack_47bit_64ow(in, out);
+		return;
+	case 48:
+		pack_48bit_64ow(in, out);
+		return;
+	case 49:
+		pack_49bit_64ow(in, out);
+		return;
+	case 50:
+		pack_50bit_64ow(in, out);
+		return;
+	case 51:
+		pack_51bit_64ow(in, out);
+		return;
+	case 52:
+		pack_52bit_64ow(in, out);
+		return;
+	case 53:
+		pack_53bit_64ow(in, out);
+		return;
+	case 54:
+		pack_54bit_64ow(in, out);
+		return;
+	case 55:
+		pack_55bit_64ow(in, out);
+		return;
+	case 56:
+		pack_56bit_64ow(in, out);
+		return;
+	case 57:
+		pack_57bit_64ow(in, out);
+		return;
+	case 58:
+		pack_58bit_64ow(in, out);
+		return;
+	case 59:
+		pack_59bit_64ow(in, out);
+		return;
+	case 60:
+		pack_60bit_64ow(in, out);
+		return;
+	case 61:
+		pack_61bit_64ow(in, out);
+		return;
+	case 62:
+		pack_62bit_64ow(in, out);
+		return;
+	case 63:
+		pack_63bit_64ow(in, out);
+		return;
+	case 64:
+		pack_64bit_64ow(in, out);
+		return;
+	} // no default: for bw > 64 no routine is called and out is left untouched
+}
+}}}; // namespace generated::pack::fallback::scalar
+// NOLINTEND
diff --git a/fastlanes/src/ssb/READMe.md b/fastlanes/src/ssb/READMe.md
new file mode 100644
index 0000000..2ec66ba
--- /dev/null
+++ b/fastlanes/src/ssb/READMe.md
@@ -0,0 +1,33 @@
+# Queries
+
+- q11:
+    - v1: the best version; used for the paper
+    - v2: SIMDIZED **does not work**
+    - v3: MULTIPLE CHECK
+    - v4: v1 with 8 values at a time **not complete yet**
+
+- q21:
+    - v1
+    - v2: 8 values at a time
+    - v3: + predicate load on uncompressed data
+    - v4: sorted data **not complete yet**
+
+- q31:
+    - v1
+    - v2: combination of shared + register
+    - v3: 8 values at a time
+    - v4: v3 + predicate load on uncompressed data
+    - v5: v4 + sorted data
+
+- q41:
+    - v3: SORTED + FOR ON ORDERDATE
+    - v4: SORTED + FOR ON ORDERDATE and CUSTKEY
+
+---
+
+# Optimizations
+
+- **predicate load on uncompressed data**
+- **8 values at a time**
+- **SIMDIZED**
+- **MULTIPLE CHECK**
\ No newline at end of file
diff --git a/fastlanes/src/ssb/compress_ssb.cu b/fastlanes/src/ssb/compress_ssb.cu
new file mode 100644
index 0000000..1bf4870
--- /dev/null
+++ b/fastlanes/src/ssb/compress_ssb.cu
@@ -0,0 +1,530 @@
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include "crystal/crystal.cuh"
+#include "crystal_ssb_utils.h"
+#include "cub/test/test_util.h"
+#include "fls_gen/unpack/unpack.cuh"
+#include "gpu_utils.h"
+#include "ssb_utils.h"
+#include "gtest/gtest.h"
+#include <cub/util_allocator.cuh>
+#include <cuda.h>
+#include <fls_gen/pack/pack.hpp>
+#include <fls_gen/unpack/hardcoded_16.cuh>
+#include <iostream>
+#include <query/query_41.hpp>
+#include <stdio.h>
+
+using namespace std;
+using namespace fastlanes::gpu;
+using namespace fastlanes;
+
+// Hard-coded metadata for the query under test: fixed column bit widths (via .ssb) and the expected result.
+// NOTE(review): query_mtd.ssb is read inside the probe kernel — confirm this object is usable from device code.
+auto query_mtd = ssb::ssb_q41_10;
+
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void probe(int* lo_orderdate,        // packed orderdate words, unpacked per tile below
+                      int* lo_orderdate_bw,     // bit width per group, indexed by blockIdx.x / 4
+                      int* lo_orderdate_base,   // per-group value added to every unpacked orderdate
+                      int* lo_orderdate_offset, // per-group start offset (in ints) into lo_orderdate
+                      int* lo_partkey,          // packed with the fixed width query_mtd.ssb.lo_partkey_bw
+                      int* lo_custkey,          // packed custkey words, unpacked per tile below
+                      int* lo_custkey_bw,       // bit width per group, indexed by blockIdx.x / 4
+                      int* lo_custkey_base,     // per-group value added to every unpacked custkey
+                      int* lo_custkey_offset,   // per-group start offset (in ints) into lo_custkey
+                      int* lo_suppkey,          // packed with the fixed width query_mtd.ssb.lo_chosen_suppkey_bw
+                      int* lo_revenue,          // not packed: read with BlockPredLoad at tile_offset
+                      int* lo_supplycost,       // packed with the fixed width query_mtd.ssb.lo_chosen_supplycost_bw
+                      int  lo_len,              // row count; determines num_tiles and the size of the last tile
+                      int* ht_p,                // hash tables and their lengths, handed to BlockProbeAndPHT_*
+                      int  p_len,
+                      int* ht_s,
+                      int  s_len,
+                      int* ht_c,
+                      int  c_len,
+                      int* ht_d,
+                      int  d_len,
+                      int* res) {               // 4 ints per group: year, c_nation, 64-bit sum in slots 2..3
+	// Probes one lineorder tile per block against the four hash tables and adds revenue - supplycost per group.
+	int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
+
+	int items[ITEMS_PER_THREAD];           // scratch: holds whichever column was unpacked last
+	int selection_flags[ITEMS_PER_THREAD]; // threaded through every probe/load helper below
+	int c_nation[ITEMS_PER_THREAD];        // output argument of the ht_c probe
+	// int s_nation[ITEMS_PER_THREAD];
+	int year[ITEMS_PER_THREAD];            // output argument of the ht_d probe
+	int revenue[ITEMS_PER_THREAD];         // output argument of BlockPredLoad
+
+	int tile_offset    = blockIdx.x * TILE_SIZE;
+	int num_tiles      = (lo_len + TILE_SIZE - 1) / TILE_SIZE;
+	int num_tile_items = TILE_SIZE;
+
+	int mtd_offset = blockIdx.x / 4; // one bw/base/offset entry is shared by 4 consecutive blocks
+
+	if (blockIdx.x == num_tiles - 1) { num_tile_items = lo_len - tile_offset; } // last tile may be partial
+
+	InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags);
+	// suppkey: fixed bit width, so this block's packed data starts at blockIdx.x * bw * 8 ints.
+	int suppkey_tile_offset = blockIdx.x * query_mtd.ssb.lo_chosen_suppkey_bw * 8;
+	unpack_8_at_a_time::unpack_device(lo_suppkey + suppkey_tile_offset, items, query_mtd.ssb.lo_chosen_suppkey_bw);
+	BlockProbeAndPHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, selection_flags, ht_s, s_len, num_tile_items);
+	// custkey: per-group bit width and base; (blockIdx.x % 4) picks this block's tile inside the group.
+	int bw                  = lo_custkey_bw[mtd_offset];
+	int base                = lo_custkey_base[mtd_offset];
+	int custkey_tile_offset = lo_custkey_offset[mtd_offset] + (blockIdx.x % 4) * bw * 8;
+	unpack_8_at_a_time::unpack_device(lo_custkey + custkey_tile_offset, items, bw);
+#pragma unroll
+	for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
+		items[ITEM] = items[ITEM] + base; // add the group's base back onto the unpacked value
+	}
+	BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    items, c_nation, selection_flags, ht_c, c_len, num_tile_items);
+	// partkey: fixed bit width, addressed the same way as suppkey.
+	int partkey_tile_offset = blockIdx.x * query_mtd.ssb.lo_partkey_bw * 8;
+	unpack_8_at_a_time::unpack_device(lo_partkey + partkey_tile_offset, items, query_mtd.ssb.lo_partkey_bw);
+	BlockProbeAndPHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, selection_flags, ht_p, p_len, num_tile_items);
+	// orderdate: per-group bit width and base, addressed the same way as custkey.
+	bw                        = lo_orderdate_bw[mtd_offset];
+	base                      = lo_orderdate_base[mtd_offset];
+	int orderdate_tile_offset = lo_orderdate_offset[mtd_offset] + (blockIdx.x % 4) * bw * 8;
+
+	unpack_8_at_a_time::unpack_device(lo_orderdate + orderdate_tile_offset, items, bw);
+#pragma unroll
+	for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
+		items[ITEM] = items[ITEM] + base; // add the group's base back onto the unpacked value
+	}
+	BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    items, year, selection_flags, ht_d, d_len, 0, num_tile_items);
+	// revenue is stored uncompressed; selection_flags is passed to the predicated load.
+	BlockPredLoad<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    lo_revenue + tile_offset, revenue, num_tile_items, selection_flags);
+	// supplycost: fixed bit width; after this call items holds the unpacked supplycost values.
+	int supplycost_tile_offset = blockIdx.x * query_mtd.ssb.lo_chosen_supplycost_bw * 8;
+	unpack_8_at_a_time::unpack_device(
+	    lo_supplycost + supplycost_tile_offset, items, query_mtd.ssb.lo_chosen_supplycost_bw);
+	// Aggregate the selected rows into res, one 4-int slot group per (c_nation, year) hash.
+#pragma unroll
+	for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
+		if (threadIdx.x + (BLOCK_THREADS * ITEM) < num_tile_items) { // skip positions past the end of a partial tile
+			if (selection_flags[ITEM]) {
+				int hash          = (c_nation[ITEM] * 7 + (year[ITEM] - 1992)) % ((1998 - 1992 + 1) * 25);
+				res[hash * 4]     = year[ITEM];
+				res[hash * 4 + 1] = c_nation[ITEM];
+				/*atomicAdd(&res[hash * 4 + 2], (1));*/
+				/*atomicAdd(reinterpret_cast<unsigned long long*>(&res[hash * 4 + 2]), (long long)(1));*/
+				atomicAdd(reinterpret_cast<unsigned long long*>(&res[hash * 4 + 2]), // slots 2..3 form one 64-bit sum
+				          (long long)(revenue[ITEM] - items[ITEM]));
+			}
+		}
+	}
+}
+
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_s(int* filter_col, int* dim_key, int num_tuples, int* hash_table, int num_slots) {
+	// Flags the rows of this block's tile with filter_col == 1 and passes their dim_key to BlockBuildSelectivePHT_1.
+	constexpr int tile_len = BLOCK_THREADS * ITEMS_PER_THREAD;
+	const int     first    = blockIdx.x * tile_len;
+	const int     last_blk = (num_tuples + tile_len - 1) / tile_len - 1;
+	// Only the last block can own a partial tile.
+	int valid = (blockIdx.x == last_blk) ? num_tuples - first : tile_len;
+
+	int column[ITEMS_PER_THREAD];
+	int keep[ITEMS_PER_THREAD];
+
+	// Predicate pass over the filter column.
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + first, column, valid);
+	BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(column, 1, keep, valid);
+
+	// The same buffer is then reloaded with the key column.
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + first, column, valid);
+	BlockBuildSelectivePHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(column, keep, hash_table, num_slots, valid);
+}
+
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_p(int* filter_col, int* dim_key, int num_tuples, int* hash_table, int num_slots) {
+	// Flags the rows of this block's tile with filter_col == 0 or 1 and passes their dim_key to the PHT builder.
+	constexpr int tile_len = BLOCK_THREADS * ITEMS_PER_THREAD;
+	const int     first    = blockIdx.x * tile_len;
+	const int     last_blk = (num_tuples + tile_len - 1) / tile_len - 1;
+	// Only the last block can own a partial tile.
+	int valid = (blockIdx.x == last_blk) ? num_tuples - first : tile_len;
+
+	int column[ITEMS_PER_THREAD];
+	int keep[ITEMS_PER_THREAD];
+
+	// Predicate pass over the filter column: EQ 0 first, then OR-ed with EQ 1.
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + first, column, valid);
+	BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(column, 0, keep, valid);
+	BlockPredOrEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(column, 1, keep, valid);
+
+	// The same buffer is then reloaded with the key column.
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + first, column, valid);
+	BlockBuildSelectivePHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(column, keep, hash_table, num_slots, valid);
+}
+
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void
+build_hashtable_c(int* filter_col, int* dim_key, int* dim_val, int num_tuples, int* hash_table, int num_slots) {
+	// Flags rows with filter_col == 1 and passes their (dim_key, dim_val) pairs to BlockBuildSelectivePHT_2.
+	constexpr int tile_len = BLOCK_THREADS * ITEMS_PER_THREAD;
+	const int     first    = blockIdx.x * tile_len;
+	const int     last_blk = (num_tuples + tile_len - 1) / tile_len - 1;
+	// Only the last block can own a partial tile.
+	int valid = (blockIdx.x == last_blk) ? num_tuples - first : tile_len;
+
+	int keys[ITEMS_PER_THREAD];
+	int vals[ITEMS_PER_THREAD];
+	int keep[ITEMS_PER_THREAD];
+
+	// keys temporarily holds the filter column for the predicate pass.
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + first, keys, valid);
+	BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, 1, keep, valid);
+
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + first, keys, valid);
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + first, vals, valid);
+	BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    keys, vals, keep, hash_table, num_slots, valid);
+}
+
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void
+build_hashtable_d(int* dim_key, int* dim_val, int num_tuples, int* hash_table, int num_slots, int val_min) {
+	// Passes every (dim_key, dim_val) pair of this block's tile to BlockBuildSelectivePHT_2, forwarding val_min.
+	constexpr int tile_len = BLOCK_THREADS * ITEMS_PER_THREAD;
+	const int     first    = blockIdx.x * tile_len;
+	const int     last_blk = (num_tuples + tile_len - 1) / tile_len - 1;
+	// Only the last block can own a partial tile.
+	int valid = (blockIdx.x == last_blk) ? num_tuples - first : tile_len;
+
+	int keys[ITEMS_PER_THREAD];
+	int vals[ITEMS_PER_THREAD];
+	int keep[ITEMS_PER_THREAD];
+
+	// No predicate here: the selection flags come from InitFlags instead of a filter column.
+	InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(keep);
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + first, keys, valid);
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + first, vals, valid);
+	BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    keys, vals, keep, hash_table, num_slots, val_min, valid);
+}
+
+// Runs SSB Q4.1 over the encoded lineorder columns and checks the answer against ssb::ssb_q41_10.
+//
+// Steps: build the supplier, customer, part and date hash tables with the build_hashtable_* kernels,
+// launch probe() with one block per lineorder tile, copy the aggregate back to the host and compare
+// it with the expected table using gtest ASSERT_EQ.
+//
+// The lo_* pointers (including the *_bw / *_base / *_offset metadata) and the dimension columns are
+// handed to kernels unchanged, so they must be device pointers. g_allocator provides the scratch
+// buffers; every buffer allocated here is released again before the function returns.
+// time_query receives the elapsed time between the start and stop events; it is not reported here.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+void runQuery(int*                         lo_orderdate,
+              int*                         lo_orderdate_bw,
+              int*                         lo_orderdate_base,
+              int*                         lo_orderdate_offset,
+              int*                         lo_custkey,
+              int*                         lo_custkey_bw,
+              int*                         lo_custkey_base,
+              int*                         lo_custkey_offset,
+              int*                         lo_partkey,
+              int*                         lo_suppkey,
+              int*                         lo_revenue,
+              int*                         lo_supplycost,
+              int                          lo_len,
+              int*                         d_datekey,
+              int*                         d_year,
+              int                          d_len,
+              int*                         p_partkey,
+              int*                         p_mfgr,
+              int                          p_len,
+              int*                         s_suppkey,
+              int*                         s_region,
+              int                          s_len,
+              int*                         c_custkey,
+              int*                         c_region,
+              int*                         c_nation,
+              int                          c_len,
+              cub::CachingDeviceAllocator& g_allocator) {
+	SETUP_TIMING();
+
+	float time_query;
+
+	cudaEventRecord(start, 0);
+
+	// The date table gets 2 ints per datekey value in the range [d_val_min, 19981230].
+	int d_val_min = 19920101;
+	int d_val_len = 19981230 - d_val_min + 1;
+
+	int *ht_d, *ht_c, *ht_s, *ht_p;
+	CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_d, 2 * d_val_len * sizeof(int)));
+	CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_s, 2 * s_len * sizeof(int)));
+	CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_c, 2 * c_len * sizeof(int)));
+	CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_p, 2 * p_len * sizeof(int)));
+
+	CubDebugExit(cudaMemset(ht_d, 0, 2 * d_val_len * sizeof(int)));
+	CubDebugExit(cudaMemset(ht_s, 0, 2 * s_len * sizeof(int)));
+	CubDebugExit(cudaMemset(ht_c, 0, 2 * c_len * sizeof(int)));
+	CubDebugExit(cudaMemset(ht_p, 0, 2 * p_len * sizeof(int)));
+
+	// Each build kernel runs one block per tile of tile_items dimension rows.
+	int tile_items = BLOCK_THREADS * ITEMS_PER_THREAD;
+	build_hashtable_s<BLOCK_THREADS, ITEMS_PER_THREAD>
+	    <<<(s_len + tile_items - 1) / tile_items, BLOCK_THREADS>>>(s_region, s_suppkey, s_len, ht_s, s_len);
+
+	build_hashtable_c<BLOCK_THREADS, ITEMS_PER_THREAD>
+	    <<<(c_len + tile_items - 1) / tile_items, BLOCK_THREADS>>>(c_region, c_custkey, c_nation, c_len, ht_c, c_len);
+
+	build_hashtable_p<BLOCK_THREADS, ITEMS_PER_THREAD>
+	    <<<(p_len + tile_items - 1) / tile_items, BLOCK_THREADS>>>(p_mfgr, p_partkey, p_len, ht_p, p_len);
+
+	build_hashtable_d<BLOCK_THREADS, ITEMS_PER_THREAD><<<(d_len + tile_items - 1) / tile_items, BLOCK_THREADS>>>(
+	    d_datekey, d_year, d_len, ht_d, d_val_len, d_val_min);
+
+	// Result table: res_size groups of ht_entries ints each (year, c_nation, 64-bit sum).
+	int* res;
+	int  res_size       = ((1998 - 1992 + 1) * 25);
+	int  ht_entries     = 4; // int,int,long long
+	int  res_array_size = res_size * ht_entries;
+	CubDebugExit(g_allocator.DeviceAllocate((void**)&res, res_array_size * sizeof(int)));
+
+	CubDebugExit(cudaMemset(res, 0, res_array_size * sizeof(int)));
+
+	// Run
+	probe<BLOCK_THREADS, ITEMS_PER_THREAD>
+	    <<<(lo_len + tile_items - 1) / tile_items, BLOCK_THREADS>>>(lo_orderdate,
+	                                                                lo_orderdate_bw,
+	                                                                lo_orderdate_base,
+	                                                                lo_orderdate_offset,
+	                                                                lo_partkey,
+	                                                                lo_custkey,
+	                                                                lo_custkey_bw,
+	                                                                lo_custkey_base,
+	                                                                lo_custkey_offset,
+	                                                                lo_suppkey,
+	                                                                lo_revenue,
+	                                                                lo_supplycost,
+	                                                                lo_len,
+	                                                                ht_p,
+	                                                                p_len,
+	                                                                ht_s,
+	                                                                s_len,
+	                                                                ht_c,
+	                                                                c_len,
+	                                                                ht_d,
+	                                                                d_val_len,
+	                                                                res);
+
+	cudaEventRecord(stop, 0);
+	cudaEventSynchronize(stop);
+	cudaEventElapsedTime(&time_query, start, stop);
+
+	// Copy the aggregate back to the host.
+	int* h_res = new int[res_array_size];
+	CubDebugExit(cudaMemcpy(h_res, res, res_array_size * sizeof(int), cudaMemcpyDeviceToHost));
+
+	// The device side is finished: return all device buffers to the allocator.
+	CubDebugExit(g_allocator.DeviceFree(res));
+	CubDebugExit(g_allocator.DeviceFree(ht_p));
+	CubDebugExit(g_allocator.DeviceFree(ht_c));
+	CubDebugExit(g_allocator.DeviceFree(ht_s));
+	CubDebugExit(g_allocator.DeviceFree(ht_d));
+
+	// A group is present when its year slot is non-zero (res was zero-filled before the probe).
+	ssb::SSBQuery4ResultTable result_of_query;
+	for (int i = 0; i < res_size; i++) {
+		if (h_res[4 * i] != 0) {
+			result_of_query.emplace_back(
+			    h_res[4 * i], h_res[4 * i + 1], reinterpret_cast<unsigned long long*>(&h_res[4 * i + 2])[0]);
+		}
+	}
+
+	// result_of_query owns copies of the rows, and a failing ASSERT_EQ returns immediately,
+	// so h_res has to be released before the assertions.
+	delete[] h_res;
+
+	ASSERT_EQ(result_of_query.size(), ssb::ssb_q41_10.reuslt.size());
+	ASSERT_EQ(result_of_query, ssb::ssb_q41_10.reuslt);
+}
+
+/**
+ * Main
+ */
+int main() {
+	auto hard_coded      = query_mtd.ssb;
+	int* h_lo_orderdate  = loadColumn<int>("lo_orderdate", LO_LEN);
+	int* h_lo_suppkey    = loadColumn<int>("lo_suppkey", LO_LEN);
+	int* h_lo_custkey    = loadColumn<int>("lo_custkey", LO_LEN);
+	int* h_lo_partkey    = loadColumn<int>("lo_partkey", LO_LEN);
+	int* h_lo_revenue    = loadColumn<int>("lo_revenue", LO_LEN);
+	int* h_lo_supplycost = loadColumn<int>("lo_supplycost", LO_LEN);
+
+	auto n_vec = hard_coded.n_vec;
+
+	int* tmp = new int[n_vec * 1024];
+	for (size_t i {0}; i < LO_LEN; ++i) {
+		tmp[i] = h_lo_orderdate[i] - hard_coded.lo_orderdate_min;
+	}
+
+
+	int* h_lo_orderdate_base   = new int[n_vec];
+	int* h_lo_orderdate_bw     = new int[n_vec];
+	int* h_lo_orderdate_offset = new int[n_vec];
+
+	const int* h_enc_lo_orderdate  = new int[n_vec * 1024];
+	int*       h_lo_custkey_base   = new int[n_vec];
+	int*       h_lo_custkey_bw     = new int[n_vec];
+	int*       h_lo_custkey_offset = new int[n_vec];
+
+	const int* h_enc_lo_custkey    = new int[n_vec * 1024];
+	const int* h_enc_lo_suppkey    = new int[n_vec * 1024];
+	const int* h_enc_lo_revenue    = new int[n_vec * 1024];
+	const int* h_enc_lo_partkey    = new int[n_vec * 1024];
+	const int* h_enc_lo_supplycost = new int[n_vec * 1024];
+
+	auto* orderdate_in  = const_cast<int32_t*>(tmp);
+	auto* custkey_in    = const_cast<int32_t*>(h_lo_custkey);
+	auto* suppkey_in    = const_cast<int32_t*>(h_lo_suppkey);
+	auto* revenue_in    = const_cast<int32_t*>(h_lo_revenue);
+	auto* partkey_in    = const_cast<int32_t*>(h_lo_partkey);
+	auto* supplycost_in = const_cast<int32_t*>(h_lo_supplycost);
+
+	auto* orderdate_out  = const_cast<int32_t*>(h_enc_lo_orderdate);
+	auto* custkey_out    = const_cast<int32_t*>(h_enc_lo_custkey);
+	auto* suppkey_out    = const_cast<int32_t*>(h_enc_lo_suppkey);
+	auto* revenue_out    = const_cast<int32_t*>(h_enc_lo_revenue);
+	auto* partkey_out    = const_cast<int32_t*>(h_enc_lo_partkey);
+	auto* supplycost_out = const_cast<int32_t*>(h_enc_lo_supplycost);
+
+	constexpr int SF10_LAST_VECTOR_IDX = 58580;
+	constexpr int LAST_VECTOR_SIZE     = 294;
+
+	h_lo_orderdate_offset[0] = 0;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		if (vec_idx == SF10_LAST_VECTOR_IDX) { set_zero_after<1024>(custkey_in, LAST_VECTOR_SIZE); }
+		if (vec_idx == SF10_LAST_VECTOR_IDX) { set_zero_after<1024>(orderdate_in, LAST_VECTOR_SIZE); }
+
+		h_lo_orderdate_base[vec_idx] = find_base<1024>(orderdate_in);
+		subtract_base<1024>(orderdate_in, h_lo_orderdate_base[vec_idx]);
+		h_lo_orderdate_bw[vec_idx] = find_bw<1024>(orderdate_in);
+
+		if (vec_idx + 1 < n_vec) {
+			h_lo_orderdate_offset[vec_idx + 1] = h_lo_orderdate_offset[vec_idx] + (h_lo_orderdate_bw[vec_idx] * 32);
+		}
+
+		if (h_lo_orderdate_bw[vec_idx] > 16) {
+			throw std::runtime_error(" bigger than 16 is not possible in orderdate!");
+		}
+
+		generated::pack::fallback::scalar::pack(orderdate_in, orderdate_out, h_lo_orderdate_bw[vec_idx]);
+		orderdate_in  = orderdate_in + 1024;
+		orderdate_out = orderdate_out + (h_lo_orderdate_bw[vec_idx] * 32);
+
+		generated::pack::fallback::scalar::pack(partkey_in, partkey_out, hard_coded.lo_partkey_bw);
+		partkey_in  = partkey_in + 1024;
+		partkey_out = partkey_out + (hard_coded.lo_partkey_bw * 32);
+
+		generated::pack::fallback::scalar::pack(supplycost_in, supplycost_out, hard_coded.lo_chosen_supplycost_bw);
+		supplycost_in  = supplycost_in + 1024;
+		supplycost_out = supplycost_out + (hard_coded.lo_chosen_supplycost_bw * 32);
+
+		h_lo_custkey_base[vec_idx] = find_base<1024>(custkey_in);
+		subtract_base<1024>(custkey_in, h_lo_custkey_base[vec_idx]);
+		h_lo_custkey_bw[vec_idx] = find_bw<1024>(custkey_in);
+
+		if (vec_idx + 1 < n_vec) {
+			h_lo_custkey_offset[vec_idx + 1] = h_lo_custkey_offset[vec_idx] + (h_lo_custkey_bw[vec_idx] * 32);
+		}
+
+		if (h_lo_custkey_bw[vec_idx] > 20) { throw std::runtime_error("bigger than 20 is not possible in custkey!"); }
+
+		generated::pack::fallback::scalar::pack(custkey_in, custkey_out, h_lo_custkey_bw[vec_idx]);
+		custkey_in  = custkey_in + 1024;
+		custkey_out = custkey_out + (h_lo_custkey_bw[vec_idx] * 32);
+
+		generated::pack::fallback::scalar::pack(suppkey_in, suppkey_out, hard_coded.lo_chosen_suppkey_bw);
+		suppkey_in  = suppkey_in + 1024;
+		suppkey_out = suppkey_out + (hard_coded.lo_chosen_suppkey_bw * 32);
+
+		generated::pack::fallback::scalar::pack(revenue_in, revenue_out, hard_coded.lo_revenue_bw);
+		revenue_in  = revenue_in + 1024;
+		revenue_out = revenue_out + (hard_coded.lo_revenue_bw * 32);
+	}
+
+	std::cout << "average of  orderdate bw | " << average(h_lo_orderdate_bw, n_vec) << std::endl;
+	std::cout << "average of  custkey bw | " << average(h_lo_custkey_bw, n_vec) << std::endl;
+
+	int* d_lo_orderdate  = loadToGPU<int32_t>(h_enc_lo_orderdate, hard_coded.n_tup_line_order, g_allocator);
+	int* d_lo_custkey    = loadToGPU<int32_t>(h_enc_lo_custkey, hard_coded.n_tup_line_order, g_allocator);
+	int* d_lo_suppkey    = loadToGPU<int32_t>(h_enc_lo_suppkey, hard_coded.n_tup_line_order, g_allocator);
+	int* d_lo_revenue    = loadToGPU<int32_t>(h_lo_revenue, hard_coded.n_tup_line_order, g_allocator);
+	int* d_lo_partkey    = loadToGPU<int32_t>(h_enc_lo_partkey, hard_coded.n_tup_line_order, g_allocator);
+	int* d_lo_supplycost = loadToGPU<int32_t>(h_enc_lo_supplycost, hard_coded.n_tup_line_order, g_allocator);
+
+	int* h_d_datekey      = loadColumn<int>("d_datekey", D_LEN);
+	int* h_d_year         = loadColumn<int>("d_year", D_LEN);
+	int* h_d_yearmonthnum = loadColumn<int>("d_yearmonthnum", D_LEN);
+
+	int* h_s_suppkey = loadColumn<int>("s_suppkey", S_LEN);
+	int* h_s_region  = loadColumn<int>("s_region", S_LEN);
+
+	int* h_p_partkey = loadColumn<int>("p_partkey", P_LEN);
+	int* h_p_mfgr    = loadColumn<int>("p_mfgr", P_LEN);
+
+	int* h_c_custkey = loadColumn<int>("c_custkey", C_LEN);
+	int* h_c_region  = loadColumn<int>("c_region", C_LEN);
+	int* h_c_nation  = loadColumn<int>("c_nation", C_LEN);
+
+	int* d_lo_orderdate_base   = loadToGPU<int32_t>(h_lo_orderdate_base, n_vec, g_allocator);
+	int* d_lo_orderdate_bw     = loadToGPU<int32_t>(h_lo_orderdate_bw, n_vec, g_allocator);
+	int* d_lo_orderdate_offset = loadToGPU<int32_t>(h_lo_orderdate_offset, n_vec, g_allocator);
+
+	int* d_lo_custkey_base   = loadToGPU<int32_t>(h_lo_custkey_base, n_vec, g_allocator);
+	int* d_lo_custkey_bw     = loadToGPU<int32_t>(h_lo_custkey_bw, n_vec, g_allocator);
+	int* d_lo_custkey_offset = loadToGPU<int32_t>(h_lo_custkey_offset, n_vec, g_allocator);
+
+	int* d_d_datekey = loadToGPU<int>(h_d_datekey, D_LEN, g_allocator);
+	int* d_d_year    = loadToGPU<int>(h_d_year, D_LEN, g_allocator);
+
+	int* d_p_partkey = loadToGPU<int>(h_p_partkey, P_LEN, g_allocator);
+	int* d_p_mfgr    = loadToGPU<int>(h_p_mfgr, P_LEN, g_allocator);
+
+	int* d_s_suppkey = loadToGPU<int>(h_s_suppkey, S_LEN, g_allocator);
+	int* d_s_region  = loadToGPU<int>(h_s_region, S_LEN, g_allocator);
+
+	int* d_c_custkey = loadToGPU<int>(h_c_custkey, C_LEN, g_allocator);
+	int* d_c_region  = loadToGPU<int>(h_c_region, C_LEN, g_allocator);
+	int* d_c_nation  = loadToGPU<int>(h_c_nation, C_LEN, g_allocator);
+
+	runQuery<32, 8>(d_lo_orderdate,
+	                d_lo_orderdate_bw,
+	                d_lo_orderdate_base,
+	                d_lo_orderdate_offset,
+	                d_lo_custkey,
+	                d_lo_custkey_bw,
+	                d_lo_custkey_base,
+	                d_lo_custkey_offset,
+	                d_lo_partkey,
+	                d_lo_suppkey,
+	                d_lo_revenue,
+	                d_lo_supplycost,
+	                LO_LEN,
+	                d_d_datekey,
+	                d_d_year,
+	                D_LEN,
+	                d_p_partkey,
+	                d_p_mfgr,
+	                P_LEN,
+	                d_s_suppkey,
+	                d_s_region,
+	                S_LEN,
+	                d_c_custkey,
+	                d_c_region,
+	                d_c_nation,
+	                C_LEN,
+	                g_allocator);
+
+	return 0;
+}
diff --git a/fastlanes/src/ssb/compress_ssb_sorted.cu b/fastlanes/src/ssb/compress_ssb_sorted.cu
new file mode 100644
index 0000000..d6fa818
--- /dev/null
+++ b/fastlanes/src/ssb/compress_ssb_sorted.cu
@@ -0,0 +1,533 @@
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+#define SORTED
+
+#include "crystal/crystal.cuh"
+#include "crystal_ssb_utils.h"
+#include "cub/test/test_util.h"
+#include "fls_gen/unpack/unpack.cuh"
+#include "gpu_utils.h"
+#include "ssb_utils.h"
+#include "gtest/gtest.h"
+#include <cub/util_allocator.cuh>
+#include <cuda.h>
+#include <fls_gen/pack/pack.hpp>
+#include <fls_gen/unpack/hardcoded_16.cuh>
+#include <iostream>
+#include <query/query_41.hpp>
+#include <stdio.h>
+
+using namespace std;
+using namespace fastlanes::gpu;
+using namespace fastlanes;
+
+using namespace std;
+
+// Hard-coded metadata for SSB Q4.1: the code below reads the per-column bit widths, vector count
+// and tuple count from query_mtd.ssb, and runQuery compares its output against ssb::ssb_q41_10.reuslt.
+// NOTE(review): the "_10" suffix presumably denotes scale factor 10 — confirm.
+auto query_mtd = ssb::ssb_q41_10;
+
+/**
+ * Probe kernel of SSB Q4.1 running directly on bit-packed lineorder columns.
+ *
+ * Each thread block handles one tile of BLOCK_THREADS * ITEMS_PER_THREAD rows. For its tile it
+ *   1. unpacks lo_suppkey (fixed bit width from query_mtd.ssb) and probes ht_s,
+ *   2. unpacks lo_custkey (per-vector bit width / base / offset) and probes ht_c, fetching c_nation,
+ *   3. unpacks lo_partkey (fixed bit width) and probes ht_p,
+ *   4. unpacks lo_orderdate (per-vector bit width / base / offset) and probes ht_d, fetching year,
+ *   5. loads lo_revenue (passed uncompressed) through BlockPredLoad,
+ *   6. unpacks lo_supplycost (fixed bit width) and accumulates revenue - supplycost per group.
+ *
+ * Output layout: res holds 4 ints per group -> [year, c_nation, 64-bit sum occupying two int slots].
+ *
+ * NOTE(review): the constants 4 (tiles sharing one metadata entry) and 8 (multiplier applied to the
+ * bit width to get a tile's offset) look tied to 1024-value vectors split into 256-row tiles, i.e.
+ * the <32, 8> instantiation — confirm before instantiating the kernel with other template arguments.
+ */
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void probe(int* lo_orderdate,
+                      int* lo_orderdate_bw,
+                      int* lo_orderdate_base,
+                      int* lo_orderdate_offset,
+                      int* lo_partkey,
+                      int* lo_custkey,
+                      int* lo_custkey_bw,
+                      int* lo_custkey_base,
+                      int* lo_custkey_offset,
+                      int* lo_suppkey,
+                      int* lo_revenue,
+                      int* lo_supplycost,
+                      int  lo_len,
+                      int* ht_p,
+                      int  p_len,
+                      int* ht_s,
+                      int  s_len,
+                      int* ht_c,
+                      int  c_len,
+                      int* ht_d,
+                      int  d_len,
+                      int* res) {
+	const int tile_size = BLOCK_THREADS * ITEMS_PER_THREAD;
+
+	// Per-thread scratch arrays; `items` is reused for every unpacked column.
+	int items[ITEMS_PER_THREAD];
+	int selection_flags[ITEMS_PER_THREAD];
+	int c_nation[ITEMS_PER_THREAD];
+	int year[ITEMS_PER_THREAD];
+	int revenue[ITEMS_PER_THREAD];
+
+	// Kept unsigned so every derived offset is computed with the same arithmetic as blockIdx.x itself.
+	const unsigned int tile_idx     = blockIdx.x;
+	const unsigned int sub_tile_idx = tile_idx % 4;
+	const int          vec_idx      = tile_idx / 4; // index into the per-vector bw / base / offset arrays
+
+	const int tile_offset    = tile_idx * tile_size;
+	const int last_tile      = (lo_len + tile_size - 1) / tile_size - 1;
+	const int num_tile_items = (tile_idx == last_tile) ? lo_len - tile_offset : tile_size;
+
+	InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags);
+
+	// --- lo_suppkey: fixed bit width ---
+	const int suppkey_tile_offset = tile_idx * query_mtd.ssb.lo_chosen_suppkey_bw * 8;
+	unpack_8_at_a_time::unpack_device(lo_suppkey + suppkey_tile_offset, items, query_mtd.ssb.lo_chosen_suppkey_bw);
+	BlockProbeAndPHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, selection_flags, ht_s, s_len, num_tile_items);
+
+	// --- lo_custkey: per-vector bit width, base added back after unpacking ---
+	const int ck_bw               = lo_custkey_bw[vec_idx];
+	const int ck_base             = lo_custkey_base[vec_idx];
+	const int custkey_tile_offset = lo_custkey_offset[vec_idx] + sub_tile_idx * ck_bw * 8;
+	unpack_8_at_a_time::unpack_device(lo_custkey + custkey_tile_offset, items, ck_bw);
+#pragma unroll
+	for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
+		items[ITEM] += ck_base;
+	}
+	BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    items, c_nation, selection_flags, ht_c, c_len, num_tile_items);
+
+	// --- lo_partkey: fixed bit width ---
+	const int partkey_tile_offset = tile_idx * query_mtd.ssb.lo_partkey_bw * 8;
+	unpack_8_at_a_time::unpack_device(lo_partkey + partkey_tile_offset, items, query_mtd.ssb.lo_partkey_bw);
+	BlockProbeAndPHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, selection_flags, ht_p, p_len, num_tile_items);
+
+	// --- lo_orderdate: per-vector bit width, base added back after unpacking ---
+	const int od_bw                 = lo_orderdate_bw[vec_idx];
+	const int od_base               = lo_orderdate_base[vec_idx];
+	const int orderdate_tile_offset = lo_orderdate_offset[vec_idx] + sub_tile_idx * od_bw * 8;
+	unpack_8_at_a_time::unpack_device(lo_orderdate + orderdate_tile_offset, items, od_bw);
+#pragma unroll
+	for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
+		items[ITEM] += od_base;
+	}
+	// The extra `0` argument mirrors the val_min argument used when ht_d is built.
+	// NOTE(review): presumably 0 because the host already subtracted lo_orderdate_min — confirm.
+	BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    items, year, selection_flags, ht_d, d_len, 0, num_tile_items);
+
+	// --- lo_revenue: read straight from the uncompressed column ---
+	BlockPredLoad<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    lo_revenue + tile_offset, revenue, num_tile_items, selection_flags);
+
+	// --- lo_supplycost: fixed bit width; stays in `items` for the aggregation below ---
+	const int supplycost_tile_offset = tile_idx * query_mtd.ssb.lo_chosen_supplycost_bw * 8;
+	unpack_8_at_a_time::unpack_device(
+	    lo_supplycost + supplycost_tile_offset, items, query_mtd.ssb.lo_chosen_supplycost_bw);
+
+	// Aggregate (revenue - supplycost) into the slot selected by (c_nation, year).
+	const int num_groups = (1998 - 1992 + 1) * 25;
+#pragma unroll
+	for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
+		const bool in_tile = threadIdx.x + (BLOCK_THREADS * ITEM) < num_tile_items;
+		if (in_tile && selection_flags[ITEM]) {
+			const int hash = (c_nation[ITEM] * 7 + (year[ITEM] - 1992)) % num_groups;
+			int*      slot = res + hash * 4;
+			slot[0]        = year[ITEM];
+			slot[1]        = c_nation[ITEM];
+			atomicAdd(reinterpret_cast<unsigned long long*>(slot + 2),
+			          (long long)(revenue[ITEM] - items[ITEM]));
+		}
+	}
+}
+
+/**
+ * Builds a key-only hash table from one dimension tile per thread block.
+ * runQuery launches it with (s_region, s_suppkey), i.e. for the supplier dimension.
+ *
+ * @param filter_col  column handed to BlockPredEQ with the constant 1 to compute the selection flags
+ * @param dim_key     key column inserted through BlockBuildSelectivePHT_1
+ * @param num_tuples  number of rows in both columns
+ * @param hash_table  output table
+ * @param num_slots   slot count forwarded to BlockBuildSelectivePHT_1
+ */
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_s(int* filter_col, int* dim_key, int num_tuples, int* hash_table, int num_slots) {
+	const int tile_size   = BLOCK_THREADS * ITEMS_PER_THREAD;
+	const int tile_offset = blockIdx.x * tile_size;
+	const int last_tile   = (num_tuples + tile_size - 1) / tile_size - 1;
+
+	// Only the last tile can be partially filled.
+	const int num_tile_items = (blockIdx.x == last_tile) ? num_tuples - tile_offset : tile_size;
+
+	int keys_or_filter[ITEMS_PER_THREAD];
+	int flags[ITEMS_PER_THREAD];
+
+	// Selection flags from the filter column (compared against 1).
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + tile_offset, keys_or_filter, num_tile_items);
+	BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys_or_filter, 1, flags, num_tile_items);
+
+	// Reuse the same buffer for the keys, then insert under the flags.
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_offset, keys_or_filter, num_tile_items);
+	BlockBuildSelectivePHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    keys_or_filter, flags, hash_table, num_slots, num_tile_items);
+}
+
+/**
+ * Builds a key-only hash table from one dimension tile per thread block.
+ * runQuery launches it with (p_mfgr, p_partkey), i.e. for the part dimension.
+ *
+ * The selection flags come from BlockPredEQ with the constant 0 followed by BlockPredOrEQ with the
+ * constant 1 on the filter column.
+ *
+ * @param filter_col  column used to compute the selection flags
+ * @param dim_key     key column inserted through BlockBuildSelectivePHT_1
+ * @param num_tuples  number of rows in both columns
+ * @param hash_table  output table
+ * @param num_slots   slot count forwarded to BlockBuildSelectivePHT_1
+ */
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_p(int* filter_col, int* dim_key, int num_tuples, int* hash_table, int num_slots) {
+	const int tile_size   = BLOCK_THREADS * ITEMS_PER_THREAD;
+	const int tile_offset = blockIdx.x * tile_size;
+	const int last_tile   = (num_tuples + tile_size - 1) / tile_size - 1;
+
+	// Only the last tile can be partially filled.
+	const int num_tile_items = (blockIdx.x == last_tile) ? num_tuples - tile_offset : tile_size;
+
+	int keys_or_filter[ITEMS_PER_THREAD];
+	int flags[ITEMS_PER_THREAD];
+
+	// Selection flags: filter value 0, OR-ed with filter value 1.
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + tile_offset, keys_or_filter, num_tile_items);
+	BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys_or_filter, 0, flags, num_tile_items);
+	BlockPredOrEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys_or_filter, 1, flags, num_tile_items);
+
+	// Reuse the same buffer for the keys, then insert under the flags.
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_offset, keys_or_filter, num_tile_items);
+	BlockBuildSelectivePHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    keys_or_filter, flags, hash_table, num_slots, num_tile_items);
+}
+
+/**
+ * Builds a key/value hash table from one dimension tile per thread block.
+ * runQuery launches it with (c_region, c_custkey, c_nation), i.e. for the customer dimension.
+ *
+ * @param filter_col  column handed to BlockPredEQ with the constant 1 to compute the selection flags
+ * @param dim_key     key column
+ * @param dim_val     value column stored next to the key
+ * @param num_tuples  number of rows in the three columns
+ * @param hash_table  output table
+ * @param num_slots   slot count forwarded to BlockBuildSelectivePHT_2
+ */
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void
+build_hashtable_c(int* filter_col, int* dim_key, int* dim_val, int num_tuples, int* hash_table, int num_slots) {
+	const int tile_size   = BLOCK_THREADS * ITEMS_PER_THREAD;
+	const int tile_offset = blockIdx.x * tile_size;
+	const int last_tile   = (num_tuples + tile_size - 1) / tile_size - 1;
+
+	// Only the last tile can be partially filled.
+	const int num_tile_items = (blockIdx.x == last_tile) ? num_tuples - tile_offset : tile_size;
+
+	int keys_or_filter[ITEMS_PER_THREAD];
+	int values[ITEMS_PER_THREAD];
+	int flags[ITEMS_PER_THREAD];
+
+	// Selection flags from the filter column (compared against 1).
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + tile_offset, keys_or_filter, num_tile_items);
+	BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys_or_filter, 1, flags, num_tile_items);
+
+	// Load keys (reusing the filter buffer) and values, then insert under the flags.
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_offset, keys_or_filter, num_tile_items);
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_offset, values, num_tile_items);
+	BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    keys_or_filter, values, flags, hash_table, num_slots, num_tile_items);
+}
+
+/**
+ * Builds a key/value hash table from one dimension tile per thread block, without any filter.
+ * runQuery launches it with (d_datekey, d_year), i.e. for the date dimension.
+ *
+ * @param dim_key     key column
+ * @param dim_val     value column stored next to the key
+ * @param num_tuples  number of rows in both columns
+ * @param hash_table  output table
+ * @param num_slots   slot count forwarded to BlockBuildSelectivePHT_2
+ * @param val_min     forwarded to BlockBuildSelectivePHT_2 (runQuery passes the smallest date key)
+ */
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void
+build_hashtable_d(int* dim_key, int* dim_val, int num_tuples, int* hash_table, int num_slots, int val_min) {
+	const int tile_size   = BLOCK_THREADS * ITEMS_PER_THREAD;
+	const int tile_offset = blockIdx.x * tile_size;
+	const int last_tile   = (num_tuples + tile_size - 1) / tile_size - 1;
+
+	// Only the last tile can be partially filled.
+	const int num_tile_items = (blockIdx.x == last_tile) ? num_tuples - tile_offset : tile_size;
+
+	int keys[ITEMS_PER_THREAD];
+	int values[ITEMS_PER_THREAD];
+	int flags[ITEMS_PER_THREAD];
+
+	// No predicate on this dimension: the flags keep whatever InitFlags sets.
+	InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(flags);
+
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_offset, keys, num_tile_items);
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_offset, values, num_tile_items);
+	BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    keys, values, flags, hash_table, num_slots, val_min, num_tile_items);
+}
+
+/**
+ * Runs SSB Q4.1 on the GPU and checks the result against the hard-coded expectation.
+ *
+ * Steps:
+ *   1. allocate and zero one hash table per dimension (date, supplier, customer, part),
+ *   2. build the four hash tables with the build_hashtable_* kernels,
+ *   3. launch the probe kernel over the bit-packed lineorder columns,
+ *   4. copy the result groups back to the host and compare them with ssb::ssb_q41_10.reuslt.
+ *
+ * All lo_* / d_* / p_* / s_* / c_* pointers are forwarded to device kernels; the *_len arguments are
+ * the row counts of the corresponding tables. The elapsed time between the two CUDA events is
+ * measured into time_query but not reported.
+ *
+ * Every buffer allocated here (hash tables, result array, host copy of the result) is released
+ * before the function returns, including when one of the ASSERT_EQ checks fails.
+ */
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+void runQuery(int*                         lo_orderdate,
+              int*                         lo_orderdate_bw,
+              int*                         lo_orderdate_base,
+              int*                         lo_orderdate_offset,
+              int*                         lo_custkey,
+              int*                         lo_custkey_bw,
+              int*                         lo_custkey_base,
+              int*                         lo_custkey_offset,
+              int*                         lo_partkey,
+              int*                         lo_suppkey,
+              int*                         lo_revenue,
+              int*                         lo_supplycost,
+              int                          lo_len,
+              int*                         d_datekey,
+              int*                         d_year,
+              int                          d_len,
+              int*                         p_partkey,
+              int*                         p_mfgr,
+              int                          p_len,
+              int*                         s_suppkey,
+              int*                         s_region,
+              int                          s_len,
+              int*                         c_custkey,
+              int*                         c_region,
+              int*                         c_nation,
+              int                          c_len,
+              cub::CachingDeviceAllocator& g_allocator) {
+	SETUP_TIMING();
+
+	float time_query;
+
+	cudaEventRecord(start, 0);
+
+	// One hash table per dimension, two ints per slot. The date table is sized by the key range.
+	int *ht_d, *ht_c, *ht_s, *ht_p;
+	int  d_val_len = 19981230 - 19920101 + 1;
+	CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_d, 2 * d_val_len * sizeof(int)));
+	CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_s, 2 * s_len * sizeof(int)));
+	CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_c, 2 * c_len * sizeof(int)));
+	CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_p, 2 * p_len * sizeof(int)));
+
+	CubDebugExit(cudaMemset(ht_d, 0, 2 * d_val_len * sizeof(int)));
+	CubDebugExit(cudaMemset(ht_s, 0, 2 * s_len * sizeof(int)));
+	CubDebugExit(cudaMemset(ht_c, 0, 2 * c_len * sizeof(int)));
+	CubDebugExit(cudaMemset(ht_p, 0, 2 * p_len * sizeof(int)));
+
+	// Build phase. The former host-side copies of ht_s / ht_c / ht_p were never read and never
+	// freed, and their transfers ran inside the timed region, so they are gone.
+	int tile_items = BLOCK_THREADS * ITEMS_PER_THREAD;
+	build_hashtable_s<BLOCK_THREADS, ITEMS_PER_THREAD>
+	    <<<(s_len + tile_items - 1) / tile_items, BLOCK_THREADS>>>(s_region, s_suppkey, s_len, ht_s, s_len);
+
+	build_hashtable_c<BLOCK_THREADS, ITEMS_PER_THREAD>
+	    <<<(c_len + tile_items - 1) / tile_items, BLOCK_THREADS>>>(c_region, c_custkey, c_nation, c_len, ht_c, c_len);
+
+	build_hashtable_p<BLOCK_THREADS, ITEMS_PER_THREAD>
+	    <<<(p_len + tile_items - 1) / tile_items, BLOCK_THREADS>>>(p_mfgr, p_partkey, p_len, ht_p, p_len);
+
+	int d_val_min = 19920101;
+	build_hashtable_d<BLOCK_THREADS, ITEMS_PER_THREAD><<<(d_len + tile_items - 1) / tile_items, BLOCK_THREADS>>>(
+	    d_datekey, d_year, d_len, ht_d, d_val_len, d_val_min);
+
+	// Result array: one slot of 4 ints (int, int, long long) per (year, nation) group.
+	int* res;
+	int  res_size       = ((1998 - 1992 + 1) * 25);
+	int  ht_entries     = 4; // int,int,long long
+	int  res_array_size = res_size * ht_entries;
+	CubDebugExit(g_allocator.DeviceAllocate((void**)&res, res_array_size * sizeof(int)));
+
+	CubDebugExit(cudaMemset(res, 0, res_array_size * sizeof(int)));
+
+	// Run
+	probe<BLOCK_THREADS, ITEMS_PER_THREAD>
+	    <<<(lo_len + tile_items - 1) / tile_items, BLOCK_THREADS>>>(lo_orderdate,
+	                                                                lo_orderdate_bw,
+	                                                                lo_orderdate_base,
+	                                                                lo_orderdate_offset,
+	                                                                lo_partkey,
+	                                                                lo_custkey,
+	                                                                lo_custkey_bw,
+	                                                                lo_custkey_base,
+	                                                                lo_custkey_offset,
+	                                                                lo_suppkey,
+	                                                                lo_revenue,
+	                                                                lo_supplycost,
+	                                                                lo_len,
+	                                                                ht_p,
+	                                                                p_len,
+	                                                                ht_s,
+	                                                                s_len,
+	                                                                ht_c,
+	                                                                c_len,
+	                                                                ht_d,
+	                                                                d_val_len,
+	                                                                res);
+
+	cudaEventRecord(stop, 0);
+	cudaEventSynchronize(stop);
+	cudaEventElapsedTime(&time_query, start, stop);
+
+	int* h_res = new int[res_array_size];
+	CubDebugExit(cudaMemcpy(h_res, res, res_array_size * sizeof(int), cudaMemcpyDeviceToHost));
+
+	// The blocking copy above has completed, so the device buffers can go back to the allocator.
+	CubDebugExit(g_allocator.DeviceFree(res));
+	CubDebugExit(g_allocator.DeviceFree(ht_d));
+	CubDebugExit(g_allocator.DeviceFree(ht_s));
+	CubDebugExit(g_allocator.DeviceFree(ht_c));
+	CubDebugExit(g_allocator.DeviceFree(ht_p));
+
+	// Collect the non-empty groups: a slot whose first int (the year) is still 0 was never written.
+	ssb::SSBQuery4ResultTable result_of_query;
+	for (int i = 0; i < res_size; i++) {
+		if (h_res[4 * i] != 0) {
+			result_of_query.emplace_back(
+			    h_res[4 * i], h_res[4 * i + 1], reinterpret_cast<unsigned long long*>(&h_res[4 * i + 2])[0]);
+		}
+	}
+
+	// Release the host copy before the assertions: ASSERT_EQ returns from this function on
+	// failure, which would otherwise leak h_res.
+	delete[] h_res;
+
+	ASSERT_EQ(result_of_query.size(), ssb::ssb_q41_10.reuslt.size());
+	ASSERT_EQ(result_of_query, ssb::ssb_q41_10.reuslt);
+}
+
+/**
+ * Main
+ *
+ * Loads the SSB columns, bit-packs the lineorder columns vector by vector (1024 values per
+ * vector), uploads data and per-vector metadata to the GPU and runs Q4.1 through runQuery.
+ *
+ * lo_orderdate and lo_custkey get a per-vector base, bit width and offset; lo_partkey,
+ * lo_supplycost, lo_suppkey and lo_revenue are packed with the fixed bit widths from query_mtd.ssb.
+ * Throws std::runtime_error if lo_orderdate is not sorted or a per-vector bit width exceeds the
+ * limit checked below.
+ */
+int main() {
+	auto hard_coded      = query_mtd.ssb;
+	int* h_lo_orderdate  = loadColumn<int>("lo_orderdate", LO_LEN);
+	int* h_lo_suppkey    = loadColumn<int>("lo_suppkey", LO_LEN);
+	int* h_lo_custkey    = loadColumn<int>("lo_custkey", LO_LEN);
+	int* h_lo_partkey    = loadColumn<int>("lo_partkey", LO_LEN);
+	int* h_lo_revenue    = loadColumn<int>("lo_revenue", LO_LEN);
+	int* h_lo_supplycost = loadColumn<int>("lo_supplycost", LO_LEN);
+
+	auto n_vec = hard_coded.n_vec;
+
+	// Work on a copy of lo_orderdate shifted by the column minimum; it is modified in place below.
+	int* tmp = new int[n_vec * 1024];
+	for (size_t i {0}; i < LO_LEN; ++i) {
+		tmp[i] = h_lo_orderdate[i] - hard_coded.lo_orderdate_min;
+	}
+
+	if (!is_sorted(h_lo_orderdate, LO_LEN)) { throw std::runtime_error("it is not sorted"); }
+
+	int* h_lo_orderdate_base   = new int[n_vec];
+	int* h_lo_orderdate_bw     = new int[n_vec];
+	int* h_lo_orderdate_offset = new int[n_vec];
+
+	const int* h_enc_lo_orderdate  = new int[n_vec * 1024];
+	int*       h_lo_custkey_base   = new int[n_vec];
+	int*       h_lo_custkey_bw     = new int[n_vec];
+	int*       h_lo_custkey_offset = new int[n_vec];
+
+	const int* h_enc_lo_custkey    = new int[n_vec * 1024];
+	const int* h_enc_lo_suppkey    = new int[n_vec * 1024];
+	const int* h_enc_lo_revenue    = new int[n_vec * 1024];
+	const int* h_enc_lo_partkey    = new int[n_vec * 1024];
+	const int* h_enc_lo_supplycost = new int[n_vec * 1024];
+
+	// Moving read cursors over the plain columns ...
+	auto* orderdate_in  = const_cast<int32_t*>(tmp);
+	auto* custkey_in    = const_cast<int32_t*>(h_lo_custkey);
+	auto* suppkey_in    = const_cast<int32_t*>(h_lo_suppkey);
+	auto* revenue_in    = const_cast<int32_t*>(h_lo_revenue);
+	auto* partkey_in    = const_cast<int32_t*>(h_lo_partkey);
+	auto* supplycost_in = const_cast<int32_t*>(h_lo_supplycost);
+
+	// ... and moving write cursors over the packed outputs.
+	auto* orderdate_out  = const_cast<int32_t*>(h_enc_lo_orderdate);
+	auto* custkey_out    = const_cast<int32_t*>(h_enc_lo_custkey);
+	auto* suppkey_out    = const_cast<int32_t*>(h_enc_lo_suppkey);
+	auto* revenue_out    = const_cast<int32_t*>(h_enc_lo_revenue);
+	auto* partkey_out    = const_cast<int32_t*>(h_enc_lo_partkey);
+	auto* supplycost_out = const_cast<int32_t*>(h_enc_lo_supplycost);
+
+	constexpr int SF10_LAST_VECTOR_IDX = 58580;
+	constexpr int LAST_VECTOR_SIZE     = 294;
+
+	// Both offset arrays are prefix sums and need an explicit 0 seed: new int[] leaves the
+	// elements uninitialised. The custkey seed was missing, which made every custkey offset
+	// depend on an indeterminate value.
+	h_lo_orderdate_offset[0] = 0;
+	h_lo_custkey_offset[0]   = 0;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		// Zero the unused tail of the last (partial) vector before computing its base and width.
+		// NOTE(review): index and size are hard-coded for one data set, and this touches
+		// custkey_in up to a full 1024-value vector — confirm loadColumn's buffer is large enough.
+		if (vec_idx == SF10_LAST_VECTOR_IDX) {
+			set_zero_after<1024>(custkey_in, LAST_VECTOR_SIZE);
+			set_zero_after<1024>(orderdate_in, LAST_VECTOR_SIZE);
+		}
+
+		// lo_orderdate: per-vector base and bit width.
+		h_lo_orderdate_base[vec_idx] = find_base<1024>(orderdate_in);
+		subtract_base<1024>(orderdate_in, h_lo_orderdate_base[vec_idx]);
+		h_lo_orderdate_bw[vec_idx] = find_bw<1024>(orderdate_in);
+
+		// A vector packed at bw bits advances the output by bw * 32 ints (see the cursor updates).
+		if (vec_idx + 1 < n_vec) {
+			h_lo_orderdate_offset[vec_idx + 1] = h_lo_orderdate_offset[vec_idx] + (h_lo_orderdate_bw[vec_idx] * 32);
+		}
+
+		if (h_lo_orderdate_bw[vec_idx] > 16) {
+			throw std::runtime_error(" bigger than 16 is not possible in orderdate!");
+		}
+
+		generated::pack::fallback::scalar::pack(orderdate_in, orderdate_out, h_lo_orderdate_bw[vec_idx]);
+		orderdate_in  = orderdate_in + 1024;
+		orderdate_out = orderdate_out + (h_lo_orderdate_bw[vec_idx] * 32);
+
+		generated::pack::fallback::scalar::pack(partkey_in, partkey_out, hard_coded.lo_partkey_bw);
+		partkey_in  = partkey_in + 1024;
+		partkey_out = partkey_out + (hard_coded.lo_partkey_bw * 32);
+
+		generated::pack::fallback::scalar::pack(supplycost_in, supplycost_out, hard_coded.lo_chosen_supplycost_bw);
+		supplycost_in  = supplycost_in + 1024;
+		supplycost_out = supplycost_out + (hard_coded.lo_chosen_supplycost_bw * 32);
+
+		// lo_custkey: per-vector base and bit width.
+		h_lo_custkey_base[vec_idx] = find_base<1024>(custkey_in);
+		subtract_base<1024>(custkey_in, h_lo_custkey_base[vec_idx]);
+		h_lo_custkey_bw[vec_idx] = find_bw<1024>(custkey_in);
+
+		if (vec_idx + 1 < n_vec) {
+			h_lo_custkey_offset[vec_idx + 1] = h_lo_custkey_offset[vec_idx] + (h_lo_custkey_bw[vec_idx] * 32);
+		}
+
+		if (h_lo_custkey_bw[vec_idx] > 20) { throw std::runtime_error("bigger than 20 is not possible in custkey!"); }
+
+		generated::pack::fallback::scalar::pack(custkey_in, custkey_out, h_lo_custkey_bw[vec_idx]);
+		custkey_in  = custkey_in + 1024;
+		custkey_out = custkey_out + (h_lo_custkey_bw[vec_idx] * 32);
+
+		generated::pack::fallback::scalar::pack(suppkey_in, suppkey_out, hard_coded.lo_chosen_suppkey_bw);
+		suppkey_in  = suppkey_in + 1024;
+		suppkey_out = suppkey_out + (hard_coded.lo_chosen_suppkey_bw * 32);
+
+		// The packed revenue is produced but not uploaded: the query reads the plain column.
+		generated::pack::fallback::scalar::pack(revenue_in, revenue_out, hard_coded.lo_revenue_bw);
+		revenue_in  = revenue_in + 1024;
+		revenue_out = revenue_out + (hard_coded.lo_revenue_bw * 32);
+	}
+
+	std::cout << "average of  orderdate bw | " << average(h_lo_orderdate_bw, n_vec) << std::endl;
+	std::cout << "average of  custkey bw | " << average(h_lo_custkey_bw, n_vec) << std::endl;
+
+	// Upload the lineorder columns (lo_revenue is uploaded uncompressed).
+	int* d_lo_orderdate  = loadToGPU<int32_t>(h_enc_lo_orderdate, hard_coded.n_tup_line_order, g_allocator);
+	int* d_lo_custkey    = loadToGPU<int32_t>(h_enc_lo_custkey, hard_coded.n_tup_line_order, g_allocator);
+	int* d_lo_suppkey    = loadToGPU<int32_t>(h_enc_lo_suppkey, hard_coded.n_tup_line_order, g_allocator);
+	int* d_lo_revenue    = loadToGPU<int32_t>(h_lo_revenue, hard_coded.n_tup_line_order, g_allocator);
+	int* d_lo_partkey    = loadToGPU<int32_t>(h_enc_lo_partkey, hard_coded.n_tup_line_order, g_allocator);
+	int* d_lo_supplycost = loadToGPU<int32_t>(h_enc_lo_supplycost, hard_coded.n_tup_line_order, g_allocator);
+
+	// Dimension tables.
+	int* h_d_datekey      = loadColumn<int>("d_datekey", D_LEN);
+	int* h_d_year         = loadColumn<int>("d_year", D_LEN);
+
+	int* h_s_suppkey = loadColumn<int>("s_suppkey", S_LEN);
+	int* h_s_region  = loadColumn<int>("s_region", S_LEN);
+
+	int* h_p_partkey = loadColumn<int>("p_partkey", P_LEN);
+	int* h_p_mfgr    = loadColumn<int>("p_mfgr", P_LEN);
+
+	int* h_c_custkey = loadColumn<int>("c_custkey", C_LEN);
+	int* h_c_region  = loadColumn<int>("c_region", C_LEN);
+	int* h_c_nation  = loadColumn<int>("c_nation", C_LEN);
+
+	// Per-vector metadata for the two variable-width columns.
+	int* d_lo_orderdate_base   = loadToGPU<int32_t>(h_lo_orderdate_base, n_vec, g_allocator);
+	int* d_lo_orderdate_bw     = loadToGPU<int32_t>(h_lo_orderdate_bw, n_vec, g_allocator);
+	int* d_lo_orderdate_offset = loadToGPU<int32_t>(h_lo_orderdate_offset, n_vec, g_allocator);
+
+	int* d_lo_custkey_base   = loadToGPU<int32_t>(h_lo_custkey_base, n_vec, g_allocator);
+	int* d_lo_custkey_bw     = loadToGPU<int32_t>(h_lo_custkey_bw, n_vec, g_allocator);
+	int* d_lo_custkey_offset = loadToGPU<int32_t>(h_lo_custkey_offset, n_vec, g_allocator);
+
+	int* d_d_datekey = loadToGPU<int>(h_d_datekey, D_LEN, g_allocator);
+	int* d_d_year    = loadToGPU<int>(h_d_year, D_LEN, g_allocator);
+
+	int* d_p_partkey = loadToGPU<int>(h_p_partkey, P_LEN, g_allocator);
+	int* d_p_mfgr    = loadToGPU<int>(h_p_mfgr, P_LEN, g_allocator);
+
+	int* d_s_suppkey = loadToGPU<int>(h_s_suppkey, S_LEN, g_allocator);
+	int* d_s_region  = loadToGPU<int>(h_s_region, S_LEN, g_allocator);
+
+	int* d_c_custkey = loadToGPU<int>(h_c_custkey, C_LEN, g_allocator);
+	int* d_c_region  = loadToGPU<int>(h_c_region, C_LEN, g_allocator);
+	int* d_c_nation  = loadToGPU<int>(h_c_nation, C_LEN, g_allocator);
+
+	runQuery<32, 8>(d_lo_orderdate,
+	                d_lo_orderdate_bw,
+	                d_lo_orderdate_base,
+	                d_lo_orderdate_offset,
+	                d_lo_custkey,
+	                d_lo_custkey_bw,
+	                d_lo_custkey_base,
+	                d_lo_custkey_offset,
+	                d_lo_partkey,
+	                d_lo_suppkey,
+	                d_lo_revenue,
+	                d_lo_supplycost,
+	                LO_LEN,
+	                d_d_datekey,
+	                d_d_year,
+	                D_LEN,
+	                d_p_partkey,
+	                d_p_mfgr,
+	                P_LEN,
+	                d_s_suppkey,
+	                d_s_region,
+	                S_LEN,
+	                d_c_custkey,
+	                d_c_region,
+	                d_c_nation,
+	                C_LEN,
+	                g_allocator);
+
+	return 0;
+}
diff --git a/fastlanes/src/ssb/fls_q11.cu b/fastlanes/src/ssb/fls_q11.cu
new file mode 100644
index 0000000..6872716
--- /dev/null
+++ b/fastlanes/src/ssb/fls_q11.cu
@@ -0,0 +1,241 @@
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include "crystal-opt/crystal.cuh"
+#include "crystal_ssb_utils.h"
+#include "cub/test/test_util.h"
+#include "fls_gen/unpack/unpack_fused.cuh"
+#include "gpu_utils.h"
+#include "ssb_utils.h"
+#include "gtest/gtest.h"
+#include <fls_gen/pack/pack.hpp>
+#include <iostream>
+#include <stdio.h>
+
+using namespace std;
+using namespace fastlanes::gpu;
+using namespace fastlanes;
+
+/*
+
+select sum(lo_extendedprice * lo_discount) as revenue
+from lineorder
+where lo_orderdate >= 19930000 and lo_orderdate <= 19940000 and lo_discount>=1
+and lo_discount<=3
+and lo_quantity<25;
+
+*/
+
+// SSB Q1.1 kernel, variant 1: all four lineorder columns arrive bit-packed.
+// Each thread block handles the tile at blockIdx.x and adds its partial sum to *revenue with one atomicAdd.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void QueryKernel_v1(const int*          enc_lo_orderdate,
+                               const int*          enc_lo_discount,
+                               const int*          enc_lo_quantity,
+                               const int*          enc_lo_extendedprice,
+                               ssb::SSB            query_mtd,
+                               unsigned long long* revenue) {
+	int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
+	// Per-thread scratch: items/items2 receive unpacked column values, selection_flags the running predicate result.
+	int items[ITEMS_PER_THREAD];
+	int selection_flags[ITEMS_PER_THREAD];
+	int items2[ITEMS_PER_THREAD];
+	long long sum = 0;
+
+	int tile_offset    = blockIdx.x * TILE_SIZE;
+	int num_tiles      = (query_mtd.n_tup_line_order + TILE_SIZE - 1) / TILE_SIZE;
+	int num_tile_items = TILE_SIZE;
+	if (blockIdx.x == num_tiles - 1) { num_tile_items = query_mtd.n_tup_line_order - tile_offset; }
+	// Order date: both bounds are shifted by lo_orderdate_min, so the comparison runs on min-relative values.
+	int orderdate_tile_offset = blockIdx.x * query_mtd.lo_orderdate_bw * 32;
+	unpack_device(enc_lo_orderdate + orderdate_tile_offset, items, query_mtd.lo_orderdate_bw);
+	BlockPredGT<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    items, 19930000 - query_mtd.lo_orderdate_min, selection_flags, num_tile_items);
+	BlockPredAndLT<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    items, 19940000 - query_mtd.lo_orderdate_min, selection_flags, num_tile_items);
+	// Quantity: threshold 25, combined into the same selection_flags.
+	int quantity_tile_offset = blockIdx.x * query_mtd.lo_quantity_bw * 32;
+	unpack_device(enc_lo_quantity + quantity_tile_offset, items, query_mtd.lo_quantity_bw);
+	BlockPredAndLT<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, 25, selection_flags, num_tile_items);
+	// Discount: bounds 1 and 3; items keeps the unpacked discount for the product below.
+	int discount_tile_offset = blockIdx.x * query_mtd.lo_discount_bw * 32;
+	unpack_device(enc_lo_discount + discount_tile_offset, items, query_mtd.lo_discount_bw);
+	BlockPredAndGTE<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, 1, selection_flags, num_tile_items);
+	BlockPredAndLTE<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, 3, selection_flags, num_tile_items);
+	// Extended price is unpacked for the whole tile into items2.
+	int extendedprice_tile_offset = blockIdx.x * query_mtd.lo_extendedprice_bw * 32;
+	unpack_device(enc_lo_extendedprice + extendedprice_tile_offset, items2, query_mtd.lo_extendedprice_bw);
+	// Sum discount * extendedprice over flagged slots whose position (threadIdx.x + BLOCK_THREADS * ITEM) is in range.
+#pragma unroll
+	for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
+		if ((threadIdx.x + (BLOCK_THREADS * ITEM) < num_tile_items))
+			if (selection_flags[ITEM]) sum += items[ITEM] * items2[ITEM];
+	}
+	__syncthreads();
+	// NOTE(review): BlockSum is presumably a block-wide reduction using `buffer` as scratch — confirm in crystal.
+	static __shared__ long long buffer[32];
+	unsigned long long aggregate = BlockSum<long long, BLOCK_THREADS, ITEMS_PER_THREAD>(sum, (long long*)buffer);
+	__syncthreads();
+
+	if (threadIdx.x == 0) { atomicAdd(revenue, aggregate); }
+}
+
+// SSB Q1.1 kernel, variant 2: the three filter columns arrive bit-packed, lo_extendedprice is read as plain ints.
+// Each thread block handles the tile at blockIdx.x and adds its partial sum to *revenue with one atomicAdd.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void QueryKernel_v2(const int*          enc_lo_orderdate,
+                               const int*          enc_lo_discount,
+                               const int*          enc_lo_quantity,
+                               int*                lo_extendedprice,
+                               ssb::SSB            query_mtd,
+                               unsigned long long* revenue) {
+	int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
+	// Per-thread scratch: items/items2 receive column values, selection_flags the running predicate result.
+	int items[ITEMS_PER_THREAD];
+	int selection_flags[ITEMS_PER_THREAD];
+	int items2[ITEMS_PER_THREAD];
+	long long sum = 0;
+
+	int tile_offset    = blockIdx.x * TILE_SIZE;
+	int num_tiles      = (query_mtd.n_tup_line_order + TILE_SIZE - 1) / TILE_SIZE;
+	int num_tile_items = TILE_SIZE;
+	if (blockIdx.x == num_tiles - 1) { num_tile_items = query_mtd.n_tup_line_order - tile_offset; }
+	// Order date: both bounds are shifted by lo_orderdate_min, so the comparison runs on min-relative values.
+	int orderdate_tile_offset = blockIdx.x * query_mtd.lo_orderdate_bw * 32;
+	unpack_device(enc_lo_orderdate + orderdate_tile_offset, items, query_mtd.lo_orderdate_bw);
+	BlockPredGT<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    items, 19930000 - query_mtd.lo_orderdate_min, selection_flags, num_tile_items);
+	BlockPredAndLT<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    items, 19940000 - query_mtd.lo_orderdate_min, selection_flags, num_tile_items);
+	// Quantity: threshold 25, combined into the same selection_flags.
+	int quantity_tile_offset = blockIdx.x * query_mtd.lo_quantity_bw * 32;
+	unpack_device(enc_lo_quantity + quantity_tile_offset, items, query_mtd.lo_quantity_bw);
+	BlockPredAndLT<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, 25, selection_flags, num_tile_items);
+	// Discount: bounds 1 and 3; items keeps the unpacked discount for the product below.
+	int discount_tile_offset = blockIdx.x * query_mtd.lo_discount_bw * 32;
+	unpack_device(enc_lo_discount + discount_tile_offset, items, query_mtd.lo_discount_bw);
+	BlockPredAndGTE<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, 1, selection_flags, num_tile_items);
+	BlockPredAndLTE<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, 3, selection_flags, num_tile_items);
+	// Extended price comes from the plain column at tile_offset; BlockPredLoad is also handed selection_flags.
+	BlockPredLoad<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    lo_extendedprice + tile_offset, items2, num_tile_items, selection_flags);
+	// Sum discount * extendedprice over flagged slots whose position (threadIdx.x + BLOCK_THREADS * ITEM) is in range.
+#pragma unroll
+	for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
+		if ((threadIdx.x + (BLOCK_THREADS * ITEM) < num_tile_items))
+			if (selection_flags[ITEM]) sum += items[ITEM] * items2[ITEM];
+	}
+	__syncthreads();
+	// NOTE(review): BlockSum is presumably a block-wide reduction using `buffer` as scratch — confirm in crystal.
+	static __shared__ long long buffer[32];
+	unsigned long long aggregate = BlockSum<long long, BLOCK_THREADS, ITEMS_PER_THREAD>(sum, (long long*)buffer);
+	__syncthreads();
+
+	if (threadIdx.x == 0) { atomicAdd(revenue, aggregate); }
+}
+
+// Runs SSB Q1.1 once with the kernel chosen by `version` (1: QueryKernel_v1, 2: QueryKernel_v2) and
+// asserts that the revenue equals query_mtd.result. Throws std::runtime_error for any other version.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+void query(int*                         lo_orderdate,
+           int*                         lo_discount,
+           int*                         lo_quantity,
+           int*                         lo_extendedprice,
+           ssb::SSBQuery1               query_mtd,
+           cub::CachingDeviceAllocator& g_allocator,
+           int                          version) {
+	// Reject an unknown version before allocating, so the throw cannot leak d_sum.
+	if (version != 1 && version != 2) { throw std::runtime_error("this version does not exist"); }
+
+	unsigned long long* d_sum = NULL;
+	CubDebugExit(g_allocator.DeviceAllocate((void**)&d_sum, sizeof(*d_sum)));
+	CubDebugExit(cudaMemset(d_sum, 0, sizeof(*d_sum)));
+
+	// One thread block per tile of BLOCK_THREADS * ITEMS_PER_THREAD tuples.
+	int tile_items = BLOCK_THREADS * ITEMS_PER_THREAD;
+	int num_blocks = (query_mtd.ssb.n_tup_line_order + tile_items - 1) / tile_items;
+	if (version == 1) {
+		QueryKernel_v1<BLOCK_THREADS, ITEMS_PER_THREAD><<<num_blocks, BLOCK_THREADS>>>(
+		    lo_orderdate, lo_discount, lo_quantity, lo_extendedprice, query_mtd.ssb, d_sum);
+	} else {
+		QueryKernel_v2<BLOCK_THREADS, ITEMS_PER_THREAD><<<num_blocks, BLOCK_THREADS>>>(
+		    lo_orderdate, lo_discount, lo_quantity, lo_extendedprice, query_mtd.ssb, d_sum);
+	}
+	CubDebugExit(cudaPeekAtLastError()); // report a failed launch instead of reading back a stale zero
+
+	unsigned long long revenue = 0;
+	CubDebugExit(cudaMemcpy(&revenue, d_sum, sizeof(revenue), cudaMemcpyDeviceToHost));
+	CLEANUP(d_sum); // free first: ASSERT_EQ returns early on a mismatch
+
+	ASSERT_EQ(revenue, query_mtd.result);
+}
+
+// Driver for SSB Q1.1 over FastLanes bit-packed lineorder columns.
+// Usage: fls_q11 <version>; version 1 also bit-packs lo_extendedprice, version 2 uploads it uncompressed.
+int main(int argc, char* argv[]) {
+	// std::stoi(argv[1]) without an argument would build a std::string from a null pointer.
+	if (argc < 2) { std::cerr << "usage: fls_q11 <version: 1|2>" << std::endl; return 1; }
+	const int version = std::stoi(argv[1]);
+	if (version != 1 && version != 2) { throw std::runtime_error("this version does not exist"); }
+
+	const auto query_mtd  = ssb::ssb_q11_10;
+	auto       hard_coded = query_mtd.ssb;
+
+	int* h_lo_orderdate     = loadColumn<int>("lo_orderdate", LO_LEN);
+	int* h_lo_discount      = loadColumn<int>("lo_discount", LO_LEN);
+	int* h_lo_quantity      = loadColumn<int>("lo_quantity", LO_LEN);
+	int* h_lo_extendedprice = loadColumn<int>("lo_extendedprice", LO_LEN);
+
+	auto n_vec = hard_coded.n_vec;
+	// lo_orderdate is packed relative to its minimum. Value-initialise so slots the loop does not write are zero.
+	int* tmp = new int[n_vec * 1024]();
+	for (size_t i {0}; i < LO_LEN; ++i) {
+		tmp[i] = h_lo_orderdate[i] - hard_coded.lo_orderdate_min;
+	}
+
+	const int* h_enc_lo_orderdate     = new int[n_vec * 1024];
+	const int* h_enc_lo_discount      = new int[n_vec * 1024];
+	const int* h_enc_lo_quantity      = new int[n_vec * 1024];
+	const int* h_enc_lo_extendedprice = new int[n_vec * 1024];
+
+	auto* orderdate_in     = const_cast<const int32_t*>(tmp);
+	auto* discount_in      = const_cast<int32_t*>(h_lo_discount);
+	auto* quantity_in      = const_cast<int32_t*>(h_lo_quantity);
+	auto* extendedprice_in = const_cast<int32_t*>(h_lo_extendedprice);
+
+	auto* orderdate_out     = const_cast<int32_t*>(h_enc_lo_orderdate);
+	auto* discount_out      = const_cast<int32_t*>(h_enc_lo_discount);
+	auto* quantity_out      = const_cast<int32_t*>(h_enc_lo_quantity);
+	auto* extendedprice_out = const_cast<int32_t*>(h_enc_lo_extendedprice);
+	// Every 1024-value vector packs into bw * 32 output words.
+	// NOTE(review): pack() also reads whole vectors from the loadColumn buffers — confirm they cover n_vec * 1024.
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(orderdate_in, orderdate_out, hard_coded.lo_orderdate_bw);
+		orderdate_in  = orderdate_in + 1024;
+		orderdate_out = orderdate_out + (hard_coded.lo_orderdate_bw * 32);
+
+		generated::pack::fallback::scalar::pack(discount_in, discount_out, hard_coded.lo_discount_bw);
+		discount_in  = discount_in + 1024;
+		discount_out = discount_out + (hard_coded.lo_discount_bw * 32);
+
+		generated::pack::fallback::scalar::pack(quantity_in, quantity_out, hard_coded.lo_quantity_bw);
+		quantity_in  = quantity_in + 1024;
+		quantity_out = quantity_out + (hard_coded.lo_quantity_bw * 32);
+
+		generated::pack::fallback::scalar::pack(extendedprice_in, extendedprice_out, hard_coded.lo_extendedprice_bw);
+		extendedprice_in  = extendedprice_in + 1024;
+		extendedprice_out = extendedprice_out + (hard_coded.lo_extendedprice_bw * 32);
+	}
+
+	int* d_lo_orderdate = loadToGPU<int32_t>(h_enc_lo_orderdate, hard_coded.n_tup_line_order, g_allocator);
+	int* d_lo_discount  = loadToGPU<int32_t>(h_enc_lo_discount, hard_coded.n_tup_line_order, g_allocator);
+	int* d_lo_quantity  = loadToGPU<int32_t>(h_enc_lo_quantity, hard_coded.n_tup_line_order, g_allocator);
+
+	// Kernel v1 expects the packed extended price, kernel v2 the plain column.
+	const int* h_price            = (version == 1) ? h_enc_lo_extendedprice : h_lo_extendedprice;
+	int*       d_lo_extendedprice = loadToGPU<int32_t>(h_price, hard_coded.n_tup_line_order, g_allocator);
+
+	query<32, 32>(d_lo_orderdate, d_lo_discount, d_lo_quantity, d_lo_extendedprice, query_mtd, g_allocator, version);
+
+	return 0;
+}
\ No newline at end of file
diff --git a/fastlanes/src/ssb/fls_q11_bitpacked_opt_v2.cu b/fastlanes/src/ssb/fls_q11_bitpacked_opt_v2.cu
new file mode 100644
index 0000000..d30fbb7
--- /dev/null
+++ b/fastlanes/src/ssb/fls_q11_bitpacked_opt_v2.cu
@@ -0,0 +1,233 @@
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include "crystal.cuh"
+#include "cub/test/test_util.h"
+#include "data/footer/ssb/ssb.hpp"
+#include "fls_gen/unpack/hardcoded_16.cuh"
+// #include "fls_gen/unpack/unpack_fused.cuh"
+#include "gpu_utils.h"
+#include "ssb_utils.h"
+#include <crystal_ssb_utils.h>
+#include <fls_gen/pack/pack.hpp>
+#include <iostream>
+#include <stdio.h>
+
+using namespace std;
+using namespace fastlanes::gpu;
+using namespace fastlanes;
+
+/*
+ * COMPRESSED
+ * SIMD
+ */
+
+constexpr uint32_t CONSTANT_1 = make_simd_const(19930000 - 19920101); // order-date lower bound; 19920101 presumably equals lo_orderdate_min — confirm
+constexpr uint32_t CONSTANT_2 = make_simd_const(19940000 - 19920101); // order-date upper bound, same offset
+constexpr uint32_t CONSTANT_3 = make_simd_const(25);                  // bound used for the quantity predicate
+constexpr uint32_t CONSTANT_4 = make_simd_const(3);                   // upper bound used for the discount predicate
+constexpr uint32_t CONSTANT_5 = make_simd_const(1);                   // lower bound used for the discount predicate
+
+// SSB Q1.1 kernel that evaluates predicates on 16-bit values, two per 32-bit word (see CONSTANT_1..5).
+// Each thread block handles the tile at blockIdx.x and adds its partial sum to *revenue with one atomicAdd.
+template <int BLOCK_THREADS, int IPT>
+__global__ void QueryKernel(const int*          enc_lo_orderdate,
+                            const int*          enc_lo_discount,
+                            const int*          enc_lo_quantity,
+                            int*                lo_extendedprice,
+                            ssb::SSB            query_mtd,
+                            unsigned long long* revenue) {
+	static_assert(IPT % 2 == 0, "IPT must be even: the flag scan below walks IPT / 2 two-lane words");
+	constexpr int TILE_SIZE = BLOCK_THREADS * IPT;
+	// items and selection_flags are reinterpreted as uint32_t below, so give them uint32_t alignment.
+	alignas(uint32_t) uint16_t items[IPT];
+	alignas(uint32_t) uint16_t selection_flags[IPT];
+	int                        items2[IPT];
+	long long                  sum = 0;
+
+	int tile_offset    = blockIdx.x * TILE_SIZE;
+	int num_tiles      = (query_mtd.n_tup_line_order + TILE_SIZE - 1) / TILE_SIZE;
+	int num_tile_items = TILE_SIZE;
+	if (blockIdx.x == num_tiles - 1) { num_tile_items = query_mtd.n_tup_line_order - tile_offset; }
+
+	int orderdate_tile_offset = blockIdx.x * query_mtd.lo_orderdate_bw * 32;
+	hardcoded::unpack_device(enc_lo_orderdate + orderdate_tile_offset, items, query_mtd.lo_orderdate_bw);
+	// NOTE(review): the casts name uint32_t[IPT] (2x the bytes of uint16_t[IPT]) — confirm the helpers read IPT / 2 words.
+	BlockPredGT_int_16_2<BLOCK_THREADS, IPT>(
+	    //
+	    reinterpret_cast<uint32_t(&)[IPT]>(items),
+	    CONSTANT_1,
+	    reinterpret_cast<uint32_t(&)[IPT]>(selection_flags),
+	    num_tile_items);
+	BlockPredAndLTX<BLOCK_THREADS, IPT>(
+	    //
+	    reinterpret_cast<uint32_t(&)[IPT]>(items),
+	    CONSTANT_2,
+	    reinterpret_cast<uint32_t(&)[IPT]>(selection_flags),
+	    num_tile_items);
+
+	int quantity_tile_offset = blockIdx.x * query_mtd.lo_quantity_bw * 32;
+	hardcoded::unpack_device(enc_lo_quantity + quantity_tile_offset, items, query_mtd.lo_quantity_bw);
+	// Quantity predicate against CONSTANT_3 (25).
+	BlockPredAndLTX<BLOCK_THREADS, IPT>(
+	    //
+	    reinterpret_cast<uint32_t(&)[IPT]>(items),
+	    CONSTANT_3,
+	    reinterpret_cast<uint32_t(&)[IPT]>(selection_flags),
+	    num_tile_items);
+
+	int discount_tile_offset = blockIdx.x * query_mtd.lo_discount_bw * 32;
+	hardcoded::unpack_device(enc_lo_discount + discount_tile_offset, items, query_mtd.lo_discount_bw);
+	// Discount lower bound against CONSTANT_5 (1).
+	BlockPredAndGTEX<BLOCK_THREADS, IPT>(
+	    //
+	    reinterpret_cast<uint32_t(&)[IPT]>(items),
+	    CONSTANT_5,
+	    reinterpret_cast<uint32_t(&)[IPT]>(selection_flags),
+	    num_tile_items);
+	// Discount upper bound against CONSTANT_4 (3).
+	BlockPredAndLTEX<BLOCK_THREADS, IPT>(
+	    //
+	    reinterpret_cast<uint32_t(&)[IPT]>(items),
+	    CONSTANT_4,
+	    reinterpret_cast<uint32_t(&)[IPT]>(selection_flags),
+	    num_tile_items);
+
+	BlockPredLoad<int, uint16_t, BLOCK_THREADS, IPT>(
+	    lo_extendedprice + tile_offset, items2, num_tile_items, selection_flags);
+
+	// One uint32_t spans two 16-bit flags; a zero word means neither slot is selected, so it is skipped.
+	auto simd_flag = reinterpret_cast<uint32_t(&)[IPT]>(selection_flags);
+#pragma unroll
+	for (int SIMD_I = 0; SIMD_I < IPT / 2; ++SIMD_I) {
+		int REAL_I = SIMD_I * 2;
+		if (!simd_flag[SIMD_I]) { continue; }
+
+		if ((threadIdx.x + (BLOCK_THREADS * REAL_I) < num_tile_items))
+			if (selection_flags[REAL_I]) sum += items[REAL_I] * items2[REAL_I];
+
+		REAL_I = REAL_I + 1;
+		if ((threadIdx.x + (BLOCK_THREADS * REAL_I) < num_tile_items))
+			if (selection_flags[REAL_I]) sum += items[REAL_I] * items2[REAL_I];
+	}
+
+	__syncthreads();
+	static __shared__ long long buffer[32];
+	unsigned long long          aggregate = BlockSum<long long, BLOCK_THREADS, IPT>(sum, (long long*)buffer);
+	__syncthreads();
+	if (threadIdx.x == 0) { atomicAdd(revenue, aggregate); }
+}
+
+// Runs QueryKernel once and returns the time cudaEventElapsedTime reports between `start` and `stop`.
+// Throws std::runtime_error when the computed revenue differs from query_mtd.result.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+float query(int*                         lo_orderdate,
+            int*                         lo_discount,
+            int*                         lo_quantity,
+            int*                         lo_extendedprice,
+            ssb::SsbQueryMtd             query_mtd,
+            cub::CachingDeviceAllocator& g_allocator) {
+	SETUP_TIMING();
+
+	float                                     time_query = 0.0f;
+	chrono::high_resolution_clock::time_point st, finish;
+	st = chrono::high_resolution_clock::now();
+
+	cudaEventRecord(start, 0);
+
+	unsigned long long* d_sum = NULL;
+	CubDebugExit(g_allocator.DeviceAllocate((void**)&d_sum, sizeof(*d_sum)));
+	CubDebugExit(cudaMemset(d_sum, 0, sizeof(*d_sum)));
+
+	// One thread block per tile of BLOCK_THREADS * ITEMS_PER_THREAD tuples.
+	int tile_items = BLOCK_THREADS * ITEMS_PER_THREAD;
+	int num_blocks = (query_mtd.ssb.n_tup_line_order + tile_items - 1) / tile_items;
+	QueryKernel<BLOCK_THREADS, ITEMS_PER_THREAD>
+	    <<<num_blocks, BLOCK_THREADS>>>(lo_orderdate, lo_discount, lo_quantity, lo_extendedprice, query_mtd.ssb, d_sum);
+
+	cudaEventRecord(stop, 0);
+	cudaEventSynchronize(stop);
+	cudaEventElapsedTime(&time_query, start, stop);
+	CubDebugExit(cudaPeekAtLastError()); // checked after timing so the measured interval is unchanged
+
+	unsigned long long revenue = 0;
+	CubDebugExit(cudaMemcpy(&revenue, d_sum, sizeof(revenue), cudaMemcpyDeviceToHost));
+
+	finish                             = chrono::high_resolution_clock::now();
+	std::chrono::duration<double> diff = finish - st;
+
+	double total_time_taken {diff.count() * 1000};
+	FLS_SHOW(total_time_taken)
+
+	// Release the accumulator before the check, so a mismatch does not leak it.
+	CLEANUP(d_sum);
+	FLS_SHOW(revenue)
+	if (revenue != query_mtd.result) { throw std::runtime_error("RESULT INCOREECT!"); }
+	FLS_SUCCESS(query_mtd.ssb.name)
+	return time_query;
+}
+
+// Benchmark driver: bit-packs lo_orderdate, lo_discount and lo_quantity on the host, uploads them together
+// with the plain lo_extendedprice column, and runs the query num_trials times per entry of queries_mtd.
+int main() {
+	int  num_trials  = 3;
+	auto queries_mtd = {ssb::ssb_q11_10};
+	for (const auto& query_mtd : queries_mtd) {
+		auto hard_coded         = query_mtd.ssb;
+		int* h_lo_orderdate     = loadColumn<int>("lo_orderdate", LO_LEN);
+		int* h_lo_discount      = loadColumn<int>("lo_discount", LO_LEN);
+		int* h_lo_quantity      = loadColumn<int>("lo_quantity", LO_LEN);
+		int* h_lo_extendedprice = loadColumn<int>("lo_extendedprice", LO_LEN);
+
+		auto n_vec = hard_coded.n_vec;
+
+		// lo_orderdate is packed relative to its minimum. Value-initialise so slots the loop does not write are zero.
+		int* tmp = new int[n_vec * 1024]();
+		for (size_t i {0}; i < LO_LEN; ++i) {
+			tmp[i] = h_lo_orderdate[i] - hard_coded.lo_orderdate_min;
+		}
+
+		const int* h_enc_lo_orderdate = new int[n_vec * 1024];
+		const int* h_enc_lo_discount  = new int[n_vec * 1024];
+		const int* h_enc_lo_quantity  = new int[n_vec * 1024];
+
+		auto* orderdate_in = const_cast<const int32_t*>(tmp);
+		auto* discount_in  = const_cast<int32_t*>(h_lo_discount);
+		auto* quantity_in  = const_cast<int32_t*>(h_lo_quantity);
+
+		auto* orderdate_out = const_cast<int32_t*>(h_enc_lo_orderdate);
+		auto* discount_out  = const_cast<int32_t*>(h_enc_lo_discount);
+		auto* quantity_out  = const_cast<int32_t*>(h_enc_lo_quantity);
+
+		// Every 1024-value vector packs into bw * 32 output words.
+		for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+			generated::pack::fallback::scalar::pack(orderdate_in, orderdate_out, hard_coded.lo_orderdate_bw);
+			orderdate_in  = orderdate_in + 1024;
+			orderdate_out = orderdate_out + (hard_coded.lo_orderdate_bw * 32);
+
+			generated::pack::fallback::scalar::pack(discount_in, discount_out, hard_coded.lo_discount_bw);
+			discount_in  = discount_in + 1024;
+			discount_out = discount_out + (hard_coded.lo_discount_bw * 32);
+
+			generated::pack::fallback::scalar::pack(quantity_in, quantity_out, hard_coded.lo_quantity_bw);
+			quantity_in  = quantity_in + 1024;
+			quantity_out = quantity_out + (hard_coded.lo_quantity_bw * 32);
+		}
+
+		FLS_LOG("LOADED DATA")
+
+		int* d_lo_orderdate     = loadToGPU<int32_t>(h_enc_lo_orderdate, hard_coded.n_tup_line_order, g_allocator);
+		int* d_lo_discount      = loadToGPU<int32_t>(h_enc_lo_discount, hard_coded.n_tup_line_order, g_allocator);
+		int* d_lo_quantity      = loadToGPU<int32_t>(h_enc_lo_quantity, hard_coded.n_tup_line_order, g_allocator);
+		int* d_lo_extendedprice = loadToGPU<int32_t>(h_lo_extendedprice, hard_coded.n_tup_line_order, g_allocator);
+
+		FLS_LOG("LOADED DATA TO GPU")
+
+		for (int n = 0; n < num_trials; n++) {
+			auto t =
+			    query<32, 32>(d_lo_orderdate, d_lo_discount, d_lo_quantity, d_lo_extendedprice, query_mtd, g_allocator);
+			FLS_RESULT(t)
+		}
+	}
+	return 0;
+}
\ No newline at end of file
diff --git a/fastlanes/src/ssb/fls_q11_bitpacked_opt_v3.cu b/fastlanes/src/ssb/fls_q11_bitpacked_opt_v3.cu
new file mode 100644
index 0000000..f451052
--- /dev/null
+++ b/fastlanes/src/ssb/fls_q11_bitpacked_opt_v3.cu
@@ -0,0 +1,207 @@
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include "crystal-opt/crystal.cuh"
+#include "cub/test/test_util.h"
+#include "fls_gen/unpack/unpack_fused.cuh"
+#include "gpu_utils.h"
+#include "ssb_utils.h"
+#include <crystal_ssb_utils.h>
+#include <fls_gen/pack/pack.hpp>
+#include <iostream>
+#include <stdio.h>
+
+using namespace std;
+using namespace fastlanes::gpu;
+using namespace fastlanes;
+
+/*
+
+MULTIPLE CHECK
+
+*/
+
+// SSB Q1.1 kernel with 1-byte selection flags that are scanned four at a time through a uint32_t view.
+// Each thread block handles the tile at blockIdx.x and adds its partial sum to *revenue with one atomicAdd.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void QueryKernel(const int*          enc_lo_orderdate,
+                            const int*          enc_lo_discount,
+                            const int*          enc_lo_quantity,
+                            int*                lo_extendedprice,
+                            ssb::SSB            query_mtd,
+                            unsigned long long* revenue) {
+	using SEL_T   = int8_t;
+	static_assert(ITEMS_PER_THREAD % 4 == 0, "ITEMS_PER_THREAD must be a multiple of 4 for the 4-flag scan");
+	int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
+	// selection_flags is reinterpreted as uint32_t below, so give it uint32_t alignment.
+	int                     items[ITEMS_PER_THREAD];
+	alignas(uint32_t) SEL_T selection_flags[ITEMS_PER_THREAD];
+	int                     items2[ITEMS_PER_THREAD];
+	long long               sum = 0;
+
+	int tile_offset    = blockIdx.x * TILE_SIZE;
+	int num_tiles      = (query_mtd.n_tup_line_order + TILE_SIZE - 1) / TILE_SIZE;
+	int num_tile_items = TILE_SIZE;
+	if (blockIdx.x == num_tiles - 1) { num_tile_items = query_mtd.n_tup_line_order - tile_offset; }
+
+	int orderdate_tile_offset = blockIdx.x * query_mtd.lo_orderdate_bw * 32;
+	unpack_device(enc_lo_orderdate + orderdate_tile_offset, items, query_mtd.lo_orderdate_bw);
+	BlockPredGT<int, SEL_T, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    items, 19930000 - query_mtd.lo_orderdate_min, selection_flags, num_tile_items);
+	BlockPredAndLT<int, SEL_T, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    items, 19940000 - query_mtd.lo_orderdate_min, selection_flags, num_tile_items);
+
+	int quantity_tile_offset = blockIdx.x * query_mtd.lo_quantity_bw * 32;
+	unpack_device(enc_lo_quantity + quantity_tile_offset, items, query_mtd.lo_quantity_bw);
+	BlockPredAndLT<int, SEL_T, BLOCK_THREADS, ITEMS_PER_THREAD>(items, 25, selection_flags, num_tile_items);
+
+	int discount_tile_offset = blockIdx.x * query_mtd.lo_discount_bw * 32;
+	unpack_device(enc_lo_discount + discount_tile_offset, items, query_mtd.lo_discount_bw);
+	BlockPredAndGTE<int, SEL_T, BLOCK_THREADS, ITEMS_PER_THREAD>(items, 1, selection_flags, num_tile_items);
+	BlockPredAndLTE<int, SEL_T, BLOCK_THREADS, ITEMS_PER_THREAD>(items, 3, selection_flags, num_tile_items);
+
+	BlockPredLoad<int, SEL_T, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    lo_extendedprice + tile_offset, items2, num_tile_items, selection_flags);
+	// One uint32_t spans four 1-byte flags; a zero word means none of the four slots is selected, so it is skipped.
+	auto simd_flag = reinterpret_cast<uint32_t(&)[ITEMS_PER_THREAD / 4]>(selection_flags);
+#pragma unroll
+	for (int SIMD_I = 0; SIMD_I < ITEMS_PER_THREAD / 4; ++SIMD_I) {
+		int REAL_I = SIMD_I * 4;
+		if (!simd_flag[SIMD_I]) { continue; }
+
+		if ((threadIdx.x + (BLOCK_THREADS * REAL_I) < num_tile_items))
+			if (selection_flags[REAL_I]) sum += items[REAL_I] * items2[REAL_I];
+
+		REAL_I = REAL_I + 1;
+		if ((threadIdx.x + (BLOCK_THREADS * REAL_I) < num_tile_items))
+			if (selection_flags[REAL_I]) sum += items[REAL_I] * items2[REAL_I];
+
+		REAL_I = REAL_I + 1;
+		if ((threadIdx.x + (BLOCK_THREADS * REAL_I) < num_tile_items))
+			if (selection_flags[REAL_I]) sum += items[REAL_I] * items2[REAL_I];
+
+		REAL_I = REAL_I + 1;
+		if ((threadIdx.x + (BLOCK_THREADS * REAL_I) < num_tile_items))
+			if (selection_flags[REAL_I]) sum += items[REAL_I] * items2[REAL_I];
+	}
+
+	__syncthreads();
+	static __shared__ long long buffer[32];
+	unsigned long long aggregate = BlockSum<long long, BLOCK_THREADS, ITEMS_PER_THREAD>(sum, (long long*)buffer);
+	__syncthreads();
+	if (threadIdx.x == 0) { atomicAdd(revenue, aggregate); }
+}
+
+// Runs QueryKernel once and returns the time cudaEventElapsedTime reports between `start` and `stop`.
+// Throws std::runtime_error when the computed revenue differs from query_mtd.result.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+float query(int*                         lo_orderdate,
+            int*                         lo_discount,
+            int*                         lo_quantity,
+            int*                         lo_extendedprice,
+            ssb::SSBQuery1               query_mtd,
+            cub::CachingDeviceAllocator& g_allocator) {
+	SETUP_TIMING();
+
+	float                                     time_query = 0.0f;
+	chrono::high_resolution_clock::time_point st, finish;
+	st = chrono::high_resolution_clock::now();
+
+	cudaEventRecord(start, 0);
+
+	unsigned long long* d_sum = NULL;
+	CubDebugExit(g_allocator.DeviceAllocate((void**)&d_sum, sizeof(*d_sum)));
+	CubDebugExit(cudaMemset(d_sum, 0, sizeof(*d_sum)));
+
+	// One thread block per tile of BLOCK_THREADS * ITEMS_PER_THREAD tuples.
+	int tile_items = BLOCK_THREADS * ITEMS_PER_THREAD;
+	int num_blocks = (query_mtd.ssb.n_tup_line_order + tile_items - 1) / tile_items;
+	QueryKernel<BLOCK_THREADS, ITEMS_PER_THREAD>
+	    <<<num_blocks, BLOCK_THREADS>>>(lo_orderdate, lo_discount, lo_quantity, lo_extendedprice, query_mtd.ssb, d_sum);
+
+	cudaEventRecord(stop, 0);
+	cudaEventSynchronize(stop);
+	cudaEventElapsedTime(&time_query, start, stop);
+	CubDebugExit(cudaPeekAtLastError()); // checked after timing so the measured interval is unchanged
+
+	unsigned long long revenue = 0;
+	CubDebugExit(cudaMemcpy(&revenue, d_sum, sizeof(revenue), cudaMemcpyDeviceToHost));
+
+	finish                             = chrono::high_resolution_clock::now();
+	std::chrono::duration<double> diff = finish - st;
+
+	double total_time_taken {diff.count() * 1000};
+	FLS_SHOW(total_time_taken)
+
+	// Release the accumulator before the check, so a mismatch does not leak it.
+	CLEANUP(d_sum);
+	FLS_SHOW(revenue)
+	if (revenue != query_mtd.result) { throw std::runtime_error("RESULT INCOREECT!"); }
+	FLS_SUCCESS(query_mtd.ssb.name)
+	return time_query;
+}
+
+// Benchmark driver: bit-packs lo_orderdate, lo_discount and lo_quantity on the host, uploads them together
+// with the plain lo_extendedprice column, and runs the query num_trials times per entry of queries_mtd.
+int main() {
+	int  num_trials  = 3;
+	auto queries_mtd = {ssb::ssb_q11_10};
+	for (const auto& query_mtd : queries_mtd) {
+		auto hard_coded         = query_mtd.ssb;
+		int* h_lo_orderdate     = loadColumn<int>("lo_orderdate", LO_LEN);
+		int* h_lo_discount      = loadColumn<int>("lo_discount", LO_LEN);
+		int* h_lo_quantity      = loadColumn<int>("lo_quantity", LO_LEN);
+		int* h_lo_extendedprice = loadColumn<int>("lo_extendedprice", LO_LEN);
+
+		auto n_vec = hard_coded.n_vec;
+
+		// lo_orderdate is packed relative to its minimum. Value-initialise so slots the loop does not write are zero.
+		int* tmp = new int[n_vec * 1024]();
+		for (size_t i {0}; i < LO_LEN; ++i) {
+			tmp[i] = h_lo_orderdate[i] - hard_coded.lo_orderdate_min;
+		}
+
+		const int* h_enc_lo_orderdate = new int[n_vec * 1024];
+		const int* h_enc_lo_discount  = new int[n_vec * 1024];
+		const int* h_enc_lo_quantity  = new int[n_vec * 1024];
+
+		auto* orderdate_in = const_cast<const int32_t*>(tmp);
+		auto* discount_in  = const_cast<int32_t*>(h_lo_discount);
+		auto* quantity_in  = const_cast<int32_t*>(h_lo_quantity);
+
+		auto* orderdate_out = const_cast<int32_t*>(h_enc_lo_orderdate);
+		auto* discount_out  = const_cast<int32_t*>(h_enc_lo_discount);
+		auto* quantity_out  = const_cast<int32_t*>(h_enc_lo_quantity);
+
+		// Every 1024-value vector packs into bw * 32 output words.
+		for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+			generated::pack::fallback::scalar::pack(orderdate_in, orderdate_out, hard_coded.lo_orderdate_bw);
+			orderdate_in  = orderdate_in + 1024;
+			orderdate_out = orderdate_out + (hard_coded.lo_orderdate_bw * 32);
+
+			generated::pack::fallback::scalar::pack(discount_in, discount_out, hard_coded.lo_discount_bw);
+			discount_in  = discount_in + 1024;
+			discount_out = discount_out + (hard_coded.lo_discount_bw * 32);
+
+			generated::pack::fallback::scalar::pack(quantity_in, quantity_out, hard_coded.lo_quantity_bw);
+			quantity_in  = quantity_in + 1024;
+			quantity_out = quantity_out + (hard_coded.lo_quantity_bw * 32);
+		}
+
+		FLS_LOG("LOADED DATA")
+
+		int* d_lo_orderdate     = loadToGPU<int32_t>(h_enc_lo_orderdate, hard_coded.n_tup_line_order, g_allocator);
+		int* d_lo_discount      = loadToGPU<int32_t>(h_enc_lo_discount, hard_coded.n_tup_line_order, g_allocator);
+		int* d_lo_quantity      = loadToGPU<int32_t>(h_enc_lo_quantity, hard_coded.n_tup_line_order, g_allocator);
+		int* d_lo_extendedprice = loadToGPU<int32_t>(h_lo_extendedprice, hard_coded.n_tup_line_order, g_allocator);
+
+		FLS_LOG("LOADED DATA TO GPU")
+
+		for (int n = 0; n < num_trials; n++) {
+			auto t =
+			    query<32, 32>(d_lo_orderdate, d_lo_discount, d_lo_quantity, d_lo_extendedprice, query_mtd, g_allocator);
+			FLS_RESULT(t)
+		}
+	}
+	return 0;
+}
\ No newline at end of file
diff --git a/fastlanes/src/ssb/fls_q11_bitpacked_opt_v4.cu b/fastlanes/src/ssb/fls_q11_bitpacked_opt_v4.cu
new file mode 100644
index 0000000..df36788
--- /dev/null
+++ b/fastlanes/src/ssb/fls_q11_bitpacked_opt_v4.cu
@@ -0,0 +1,194 @@
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include "crystal-opt/crystal.cuh"
+#include "cub/test/test_util.h"
+#include "fls_gen/unpack/unpack_fused.cuh"
+#include "gpu_utils.h"
+#include "ssb_utils.h"
+#include <crystal_ssb_utils.h>
+#include <fls_gen/pack/pack.hpp>
+#include <iostream>
+#include <stdio.h>
+
+using namespace std;
+using namespace fastlanes::gpu;
+using namespace fastlanes;
+
+/*
+
+select sum(lo_extendedprice * lo_discount) as revenue
+from lineorder
+where lo_orderdate >= 19930000 and lo_orderdate <= 19940000 and lo_discount>=1
+and lo_discount<=3
+and lo_quantity<25;
+
+*/
+
+// SSB Q1.1 over bit-packed columns: each block unpacks one tile of lo_orderdate,
+// lo_quantity and lo_discount, ANDs the predicates into selection_flags, and
+// atomically adds the tile's sum(lo_extendedprice * lo_discount) to *revenue.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void QueryKernel(const int*          enc_lo_orderdate,
+                            const int*          enc_lo_discount,
+                            const int*          enc_lo_quantity,
+                            int*                lo_extendedprice,
+                            ssb::SSB            query_mtd,
+                            unsigned long long* revenue) {
+	int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
+	int items[ITEMS_PER_THREAD];           // column being filtered; holds lo_discount at the end
+	int selection_flags[ITEMS_PER_THREAD]; // outcome of the predicates applied so far
+	int items2[ITEMS_PER_THREAD];          // lo_extendedprice, loaded under selection_flags
+	long long sum = 0;
+
+	int tile_offset    = blockIdx.x * TILE_SIZE;
+	int num_tiles      = (query_mtd.n_tup_line_order + TILE_SIZE - 1) / TILE_SIZE;
+	int num_tile_items = TILE_SIZE;
+	if (blockIdx.x == num_tiles - 1) { num_tile_items = query_mtd.n_tup_line_order - tile_offset; }
+
+	// The date bounds are shifted by lo_orderdate_min, like the packed values themselves.
+	int orderdate_tile_offset = blockIdx.x * query_mtd.lo_orderdate_bw * 32;
+	unpack_device(enc_lo_orderdate + orderdate_tile_offset, items, query_mtd.lo_orderdate_bw);
+	BlockPredGT<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    items, 19930000 - query_mtd.lo_orderdate_min, selection_flags, num_tile_items);
+	BlockPredAndLT<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    items, 19940000 - query_mtd.lo_orderdate_min, selection_flags, num_tile_items);
+
+	int quantity_tile_offset = blockIdx.x * query_mtd.lo_quantity_bw * 32;
+	unpack_device(enc_lo_quantity + quantity_tile_offset, items, query_mtd.lo_quantity_bw);
+	BlockPredAndLT<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, 25, selection_flags, num_tile_items);
+
+	int discount_tile_offset = blockIdx.x * query_mtd.lo_discount_bw * 32;
+	unpack_device(enc_lo_discount + discount_tile_offset, items, query_mtd.lo_discount_bw);
+	BlockPredAndGTE<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, 1, selection_flags, num_tile_items);
+	BlockPredAndLTE<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, 3, selection_flags, num_tile_items);
+
+	BlockPredLoad<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    lo_extendedprice + tile_offset, items2, num_tile_items, selection_flags);
+
+#pragma unroll
+	for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
+		if ((threadIdx.x + (BLOCK_THREADS * ITEM) < num_tile_items) && selection_flags[ITEM])
+			sum += static_cast<long long>(items[ITEM]) * items2[ITEM]; // 64-bit product: no int overflow
+	}
+
+	__syncthreads();
+	static __shared__ long long buffer[32];
+	unsigned long long aggregate = BlockSum<long long, BLOCK_THREADS, ITEMS_PER_THREAD>(sum, (long long*)buffer);
+	__syncthreads();
+	if (threadIdx.x == 0) { atomicAdd(revenue, aggregate); }
+}
+
+// Runs Q1.1 once on the device-resident columns and checks the result.
+// Returns the time between the start/stop events (d_sum setup plus kernel), in
+// milliseconds as reported by cudaEventElapsedTime; throws std::runtime_error
+// when the computed revenue differs from query_mtd.result.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+float query(int*                         lo_orderdate,
+            int*                         lo_discount,
+            int*                         lo_quantity,
+            int*                         lo_extendedprice,
+            ssb::SSBQuery1             query_mtd,
+            cub::CachingDeviceAllocator& g_allocator) {
+	SETUP_TIMING();
+	float                                     time_query;
+	chrono::high_resolution_clock::time_point st, finish;
+	st = chrono::high_resolution_clock::now();
+
+	cudaEventRecord(start, 0);
+
+	unsigned long long* d_sum = NULL;
+	CubDebugExit(g_allocator.DeviceAllocate((void**)&d_sum, sizeof(long long)));
+	CubDebugExit(cudaMemset(d_sum, 0, sizeof(long long)));
+
+	// Run: one thread block per tile of BLOCK_THREADS * ITEMS_PER_THREAD rows.
+	int tile_items = BLOCK_THREADS * ITEMS_PER_THREAD;
+	int num_blocks = (query_mtd.ssb.n_tup_line_order + tile_items - 1) / tile_items;
+	QueryKernel<BLOCK_THREADS, ITEMS_PER_THREAD>
+	    <<<num_blocks, BLOCK_THREADS>>>(lo_orderdate, lo_discount, lo_quantity, lo_extendedprice, query_mtd.ssb, d_sum);
+
+	cudaEventRecord(stop, 0);
+	cudaEventSynchronize(stop);
+	cudaEventElapsedTime(&time_query, start, stop);
+	CubDebugExit(cudaGetLastError()); // report a failed launch instead of a bogus mismatch below
+
+	unsigned long long revenue = 0;
+	CubDebugExit(cudaMemcpy(&revenue, d_sum, sizeof(long long), cudaMemcpyDeviceToHost));
+
+	finish                             = chrono::high_resolution_clock::now();
+	std::chrono::duration<double> diff = finish - st;
+	double total_time_taken {diff.count() * 1000};
+	FLS_SHOW(total_time_taken)
+
+	// Release the accumulator before the check so the throwing path does not leak it.
+	CLEANUP(d_sum);
+	FLS_SHOW(revenue)
+	if (revenue != query_mtd.result) { throw std::runtime_error("RESULT INCOREECT!"); }
+	FLS_SUCCESS(query_mtd.ssb.name)
+	return time_query;
+}
+
+// Driver: bit-packs the three predicate columns one 1024-value vector at a time,
+// uploads them next to the plain lo_extendedprice column and runs Q1.1 num_trials times.
+int main() {
+	int  num_trials  = 3;
+	auto queries_mtd = {
+	    //
+	    ssb::ssb_q11_10,
+	    //
+	};
+	for (const auto query_mtd : queries_mtd) {
+		auto hard_coded         = query_mtd.ssb;
+		int* h_lo_orderdate     = loadColumn<int>("lo_orderdate", LO_LEN);
+		int* h_lo_discount      = loadColumn<int>("lo_discount", LO_LEN);
+		int* h_lo_quantity      = loadColumn<int>("lo_quantity", LO_LEN);
+		int* h_lo_extendedprice = loadColumn<int>("lo_extendedprice", LO_LEN);
+		auto n_vec = hard_coded.n_vec;
+		// Zero-filled so the padding after the last row is defined when the final vector is packed.
+		// NOTE(review): assumes n_vec * 1024 >= LO_LEN -- verify against the hard-coded metadata.
+		int* tmp = new int[n_vec * 1024]();
+		for (size_t i {0}; i < LO_LEN; ++i) { tmp[i] = h_lo_orderdate[i] - hard_coded.lo_orderdate_min; }
+
+		const int* h_enc_lo_orderdate = new int[n_vec * 1024]();
+		const int* h_enc_lo_discount  = new int[n_vec * 1024]();
+		const int* h_enc_lo_quantity  = new int[n_vec * 1024]();
+
+		auto* orderdate_in = const_cast<const int32_t*>(tmp);
+		auto* discount_in  = const_cast<int32_t*>(h_lo_discount);
+		auto* quantity_in  = const_cast<int32_t*>(h_lo_quantity);
+
+		auto* orderdate_out = const_cast<int32_t*>(h_enc_lo_orderdate);
+		auto* discount_out  = const_cast<int32_t*>(h_enc_lo_discount);
+		auto* quantity_out  = const_cast<int32_t*>(h_enc_lo_quantity);
+
+		for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+			generated::pack::fallback::scalar::pack(orderdate_in, orderdate_out, hard_coded.lo_orderdate_bw);
+			orderdate_in  = orderdate_in + 1024;
+			orderdate_out = orderdate_out + (hard_coded.lo_orderdate_bw * 32);
+
+			generated::pack::fallback::scalar::pack(discount_in, discount_out, hard_coded.lo_discount_bw);
+			discount_in  = discount_in + 1024;
+			discount_out = discount_out + (hard_coded.lo_discount_bw * 32);
+
+			generated::pack::fallback::scalar::pack(quantity_in, quantity_out, hard_coded.lo_quantity_bw);
+			quantity_in  = quantity_in + 1024;
+			quantity_out = quantity_out + (hard_coded.lo_quantity_bw * 32);
+		}
+		FLS_LOG("LOADED DATA")
+		int* d_lo_orderdate     = loadToGPU<int32_t>(h_enc_lo_orderdate, hard_coded.n_tup_line_order, g_allocator);
+		int* d_lo_discount      = loadToGPU<int32_t>(h_enc_lo_discount, hard_coded.n_tup_line_order, g_allocator);
+		int* d_lo_quantity      = loadToGPU<int32_t>(h_enc_lo_quantity, hard_coded.n_tup_line_order, g_allocator);
+		int* d_lo_extendedprice = loadToGPU<int32_t>(h_lo_extendedprice, hard_coded.n_tup_line_order, g_allocator);
+		FLS_LOG("LOADED DATA TO GPU")
+		for (int n = 0; n < num_trials; n++) {
+			auto t =
+			    query<32, 32>(d_lo_orderdate, d_lo_discount, d_lo_quantity, d_lo_extendedprice, query_mtd, g_allocator);
+			FLS_RESULT(t)
+		}
+		delete[] tmp; // host staging buffers are no longer needed once the trials are done
+		delete[] h_enc_lo_orderdate;
+		delete[] h_enc_lo_discount;
+		delete[] h_enc_lo_quantity;
+	}
+	return 0;
+}
\ No newline at end of file
diff --git a/fastlanes/src/ssb/fls_q11_bp_crystal_opt.cu b/fastlanes/src/ssb/fls_q11_bp_crystal_opt.cu
new file mode 100644
index 0000000..f4c2832
--- /dev/null
+++ b/fastlanes/src/ssb/fls_q11_bp_crystal_opt.cu
@@ -0,0 +1,264 @@
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include "crystal.cuh"
+#include "cub/test/test_util.h"
+#include "data/footer/ssb/ssb.hpp"
+#include "fls_gen/unpack/unpack.cuh"
+#include "gpu_utils.h"
+#include "ssb_utils.h"
+#include <fls_gen/pack/pack.hpp>
+#include <iostream>
+#include <stdio.h>
+
+using namespace std;
+using namespace fastlanes::gpu;
+using namespace fastlanes;
+
+/*
+select sum(lo_extendedprice * lo_discount) as revenue
+from lineorder
+where lo_orderdate >= 19930000 and lo_orderdate <= 19940000 and lo_discount>=1
+and lo_discount<=3
+and lo_quantity<25;
+*/
+
+// Q1.1 kernel of the "crystal opt" variant: all four columns are bit-packed.
+// For each column the block unpacks its tile into the shared `unpacked` buffer and
+// loads it into per-thread arrays; the predicates are ANDed into selection_flags and
+// sum(lo_extendedprice * lo_discount) of the surviving rows is added atomically to *revenue.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void QueryKernelOpt(const int*          enc_lo_orderdate,
+                               const int*          enc_lo_discount,
+                               const int*          enc_lo_quantity,
+                               const int*          enc_lo_extendedprice,
+                               ssb::SSB            query_mtd,
+                               unsigned long long* revenue) {
+
+	int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
+
+	int items[ITEMS_PER_THREAD];           // column being filtered; holds lo_discount at the end
+	int selection_flags[ITEMS_PER_THREAD]; // outcome of the predicates applied so far
+	int items2[ITEMS_PER_THREAD];          // lo_extendedprice, loaded under selection_flags
+
+	// NOTE(review): every unpack_device call below rewrites `unpacked` and no barrier is
+	// visible between a load and the next unpack; confirm threads only read their own slots.
+	static __shared__ int unpacked[1024];
+
+	long long sum = 0;
+
+	int tile_offset    = blockIdx.x * TILE_SIZE;
+	int num_tiles      = (query_mtd.n_tup_line_order + TILE_SIZE - 1) / TILE_SIZE;
+	int num_tile_items = TILE_SIZE;
+	if (blockIdx.x == num_tiles - 1) { num_tile_items = query_mtd.n_tup_line_order - tile_offset; }
+
+	// The date bounds are shifted by lo_orderdate_min, like the packed values themselves.
+	int orderdate_tile_offset = blockIdx.x * query_mtd.lo_orderdate_bw * 32;
+	unpack_device(enc_lo_orderdate + orderdate_tile_offset, unpacked, query_mtd.lo_orderdate_bw);
+
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(unpacked, items, num_tile_items);
+	BlockPredGT<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    items, 19930000 - query_mtd.lo_orderdate_min, selection_flags, num_tile_items);
+	BlockPredAndLT<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    items, 19940000 - query_mtd.lo_orderdate_min, selection_flags, num_tile_items);
+
+	int quantity_tile_offset = blockIdx.x * query_mtd.lo_quantity_bw * 32;
+	unpack_device(enc_lo_quantity + quantity_tile_offset, unpacked, query_mtd.lo_quantity_bw);
+
+	BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(unpacked, items, num_tile_items, selection_flags);
+	BlockPredAndLT<int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, 25, selection_flags, num_tile_items);
+
+	int discount_tile_offset = blockIdx.x * query_mtd.lo_discount_bw * 32;
+	unpack_device(enc_lo_discount + discount_tile_offset, unpacked, query_mtd.lo_discount_bw);
+
+	BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(unpacked, items, num_tile_items, selection_flags);
+	BlockPredAndGTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, 1, selection_flags, num_tile_items);
+	BlockPredAndLTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, 3, selection_flags, num_tile_items);
+
+	int extendedprice_tile_offset = blockIdx.x * query_mtd.lo_extendedprice_bw * 32;
+	unpack_device(enc_lo_extendedprice + extendedprice_tile_offset, unpacked, query_mtd.lo_extendedprice_bw);
+
+	BlockPredLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(unpacked, items2, num_tile_items, selection_flags);
+
+#pragma unroll
+	for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
+		if ((threadIdx.x + (BLOCK_THREADS * ITEM) < num_tile_items) && selection_flags[ITEM]) {
+			// Widen before multiplying so the product cannot overflow 32-bit int.
+			sum += static_cast<long long>(items[ITEM]) * items2[ITEM];
+		}
+	}
+
+	__syncthreads();
+
+	static __shared__ long long buffer[32];
+	unsigned long long aggregate =
+	    BlockSum<long long, BLOCK_THREADS, ITEMS_PER_THREAD>(sum, (long long*)buffer);
+	__syncthreads();
+
+	if (threadIdx.x == 0) { atomicAdd(revenue, aggregate); }
+}
+
+// Runs the "crystal opt" Q1.1 kernel once on the device-resident packed columns.
+// Returns the time between the start/stop events (d_sum setup plus kernel), in
+// milliseconds as reported by cudaEventElapsedTime; throws std::runtime_error
+// when the computed revenue differs from query_mtd.result.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+float query(int*                         lo_orderdate,
+            int*                         lo_discount,
+            int*                         lo_quantity,
+            int*                         lo_extendedprice,
+            ssb::SsbQueryMtd             query_mtd,
+            cub::CachingDeviceAllocator& g_allocator) {
+	SETUP_TIMING();
+	float                                     time_query;
+	chrono::high_resolution_clock::time_point st, finish;
+	st = chrono::high_resolution_clock::now();
+
+	cudaEventRecord(start, 0);
+
+	unsigned long long* d_sum = NULL;
+	CubDebugExit(g_allocator.DeviceAllocate((void**)&d_sum, sizeof(long long)));
+	CubDebugExit(cudaMemset(d_sum, 0, sizeof(long long)));
+
+	// Run: one thread block per tile of BLOCK_THREADS * ITEMS_PER_THREAD rows.
+	int tile_items = BLOCK_THREADS * ITEMS_PER_THREAD;
+	int num_blocks = (query_mtd.ssb.n_tup_line_order + tile_items - 1) / tile_items;
+	QueryKernelOpt<BLOCK_THREADS, ITEMS_PER_THREAD>
+	    <<<num_blocks, BLOCK_THREADS>>>(lo_orderdate, lo_discount, lo_quantity, lo_extendedprice, query_mtd.ssb, d_sum);
+
+	cudaEventRecord(stop, 0);
+	cudaEventSynchronize(stop);
+	cudaEventElapsedTime(&time_query, start, stop);
+	CubDebugExit(cudaGetLastError()); // report a failed launch instead of a bogus mismatch below
+
+	unsigned long long revenue = 0;
+	CubDebugExit(cudaMemcpy(&revenue, d_sum, sizeof(long long), cudaMemcpyDeviceToHost));
+
+	finish                             = chrono::high_resolution_clock::now();
+	std::chrono::duration<double> diff = finish - st;
+	double total_time_taken {diff.count() * 1000};
+	FLS_SHOW(total_time_taken)
+
+	// Release the accumulator before the check so the throwing path does not leak it.
+	CLEANUP(d_sum);
+	FLS_SHOW(revenue)
+	if (revenue != query_mtd.result) { throw std::runtime_error("RESULT INCOREECT!"); }
+	FLS_SUCCESS(query_mtd.ssb.name)
+	return time_query;
+}
+
+// Driver for the "crystal opt" variant: reads lineorder through Reader::ReadCsv, shows
+// each column's min/max, bit-packs all four columns one 1024-value vector at a time with
+// the hard-coded bit widths, uploads them and runs Q1.1 num_trials times.
+int main() {
+	int  num_trials  = 1;
+	auto queries_mtd = {
+	    //
+	    //	    ssb::ssb_q11_1,
+	    //	    ssb::ssb_q11_0_1
+	    ssb::ssb_q11_10,
+	    //
+	};
+	for (const auto query_mtd : queries_mtd) {
+		auto       hard_coded         = query_mtd.ssb;
+		const auto lineorder          = Reader::ReadCsv(hard_coded.dir);
+		const auto rowgroup           = lineorder->GetFirstRowGroup();
+		const int* h_lo_orderdate     = rowgroup->GetChunk("LO_ORDERDATE").GetData<int32_t>();
+		const int* h_lo_discount      = rowgroup->GetChunk("LO_DISCOUNT").GetData<int32_t>();
+		const int* h_lo_quantity      = rowgroup->GetChunk("LO_QUANTITY").GetData<int32_t>();
+		const int* h_lo_extendedprice = rowgroup->GetChunk("LO_EXTENDEDPRICE").GetData<int32_t>();
+
+		// Column ranges, gathered in a single pass and shown for inspection only.
+		int32_t lo_orderdate_min     = h_lo_orderdate[0];
+		int32_t lo_orderdate_max     = h_lo_orderdate[0];
+		int32_t lo_discount_min      = h_lo_discount[0];
+		int32_t lo_discount_max      = h_lo_discount[0];
+		int32_t lo_quantity_min      = h_lo_quantity[0];
+		int32_t lo_quantity_max      = h_lo_quantity[0];
+		int32_t lo_extendedprice_min = h_lo_extendedprice[0];
+		int32_t lo_extendedprice_max = h_lo_extendedprice[0];
+		for (size_t i {0}; i < hard_coded.n_tup_line_order; ++i) {
+			lo_orderdate_min     = std::min(lo_orderdate_min, h_lo_orderdate[i]);
+			lo_orderdate_max     = std::max(lo_orderdate_max, h_lo_orderdate[i]);
+			lo_discount_min      = std::min(lo_discount_min, h_lo_discount[i]);
+			lo_discount_max      = std::max(lo_discount_max, h_lo_discount[i]);
+			lo_quantity_min      = std::min(lo_quantity_min, h_lo_quantity[i]);
+			lo_quantity_max      = std::max(lo_quantity_max, h_lo_quantity[i]);
+			lo_extendedprice_min = std::min(lo_extendedprice_min, h_lo_extendedprice[i]);
+			lo_extendedprice_max = std::max(lo_extendedprice_max, h_lo_extendedprice[i]);
+		}
+		FLS_SHOW(lo_orderdate_min)
+		FLS_SHOW(lo_orderdate_max)
+		FLS_SHOW(lo_discount_min)
+		FLS_SHOW(lo_discount_max)
+		FLS_SHOW(lo_quantity_min)
+		FLS_SHOW(lo_quantity_max)
+		FLS_SHOW(lo_extendedprice_min)
+		FLS_SHOW(lo_extendedprice_max)
+
+		uint16_t x = RANGE_BIT(lo_orderdate_max - lo_orderdate_min);
+		FLS_SHOW(x)
+
+		auto n_vec = hard_coded.n_vec;
+		// Zero-filled so the padding after the last row is defined when the final vector is packed.
+		// NOTE(review): assumes n_vec * 1024 >= rowgroup->RowCount() -- verify against the metadata.
+		int* tmp = new int[n_vec * 1024]();
+		for (size_t i {0}; i < rowgroup->RowCount(); ++i) { tmp[i] = h_lo_orderdate[i] - hard_coded.lo_orderdate_min; }
+
+		const int* h_enc_lo_orderdate     = new int[n_vec * 1024]();
+		const int* h_enc_lo_discount      = new int[n_vec * 1024]();
+		const int* h_enc_lo_quantity      = new int[n_vec * 1024]();
+		const int* h_enc_lo_extendedprice = new int[n_vec * 1024]();
+
+		auto* orderdate_in     = const_cast<const int32_t*>(tmp);
+		auto* discount_in      = const_cast<int32_t*>(h_lo_discount);
+		auto* quantity_in      = const_cast<int32_t*>(h_lo_quantity);
+		auto* extendedprice_in = const_cast<int32_t*>(h_lo_extendedprice);
+
+		auto* orderdate_out     = const_cast<int32_t*>(h_enc_lo_orderdate);
+		auto* discount_out      = const_cast<int32_t*>(h_enc_lo_discount);
+		auto* quantity_out      = const_cast<int32_t*>(h_enc_lo_quantity);
+		auto* extendedprice_out = const_cast<int32_t*>(h_enc_lo_extendedprice);
+
+		for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+			generated::pack::fallback::scalar::pack(orderdate_in, orderdate_out, hard_coded.lo_orderdate_bw);
+			orderdate_in  = orderdate_in + 1024;
+			orderdate_out = orderdate_out + (hard_coded.lo_orderdate_bw * 32);
+
+			generated::pack::fallback::scalar::pack(discount_in, discount_out, hard_coded.lo_discount_bw);
+			discount_in  = discount_in + 1024;
+			discount_out = discount_out + (hard_coded.lo_discount_bw * 32);
+
+			generated::pack::fallback::scalar::pack(quantity_in, quantity_out, hard_coded.lo_quantity_bw);
+			quantity_in  = quantity_in + 1024;
+			quantity_out = quantity_out + (hard_coded.lo_quantity_bw * 32);
+
+			generated::pack::fallback::scalar::pack(
+			    extendedprice_in, extendedprice_out, hard_coded.lo_extendedprice_bw);
+			extendedprice_in  = extendedprice_in + 1024;
+			extendedprice_out = extendedprice_out + (hard_coded.lo_extendedprice_bw * 32);
+		}
+
+		FLS_LOG("LOADED DATA")
+
+		int* d_lo_orderdate     = loadToGPU<int32_t>(h_enc_lo_orderdate, hard_coded.n_tup_line_order, g_allocator);
+		int* d_lo_discount      = loadToGPU<int32_t>(h_enc_lo_discount, hard_coded.n_tup_line_order, g_allocator);
+		int* d_lo_quantity      = loadToGPU<int32_t>(h_enc_lo_quantity, hard_coded.n_tup_line_order, g_allocator);
+		int* d_lo_extendedprice = loadToGPU<int32_t>(h_enc_lo_extendedprice, hard_coded.n_tup_line_order, g_allocator);
+
+		FLS_LOG("LOADED DATA TO GPU")
+
+		for (int n = 0; n < num_trials; n++) {
+			auto t =
+			    query<32, 32>(d_lo_orderdate, d_lo_discount, d_lo_quantity, d_lo_extendedprice, query_mtd, g_allocator);
+			FLS_RESULT(t)
+		}
+		// The host staging buffers are no longer needed once the trials are done.
+		delete[] tmp;
+		delete[] h_enc_lo_orderdate;
+		delete[] h_enc_lo_discount;
+		delete[] h_enc_lo_quantity;
+		delete[] h_enc_lo_extendedprice;
+	}
+	return 0;
+}
\ No newline at end of file
diff --git a/fastlanes/src/ssb/fls_q21.cu b/fastlanes/src/ssb/fls_q21.cu
new file mode 100644
index 0000000..a5c2d1a
--- /dev/null
+++ b/fastlanes/src/ssb/fls_q21.cu
@@ -0,0 +1,482 @@
+#include "crystal/crystal.cuh"
+#include "crystal_ssb_utils.h"
+#include "cub/test/test_util.h"
+#include "fls_gen/pack/pack.hpp"
+#include "fls_gen/unpack/hardcoded_16.cuh"
+#include "fls_gen/unpack/unpack_fused.cuh"
+#include "gpu_utils.h"
+#include "query/query_21.hpp"
+#include "ssb_utils.h"
+#include "gtest/gtest.h"
+#include <cub/util_allocator.cuh>
+#include <iostream>
+#include <vector>
+
+using namespace std;
+using namespace fastlanes::gpu;
+using namespace fastlanes;
+
+inline auto query_mtd = ssb::ssb_q21_10; // metadata whose column bit widths the probe kernels below read
+
+// Q2.1 probe, version 1: all four lineorder columns arrive bit-packed. Each block
+// unpacks its tile of lo_suppkey, lo_partkey and lo_orderdate, probes ht_s, ht_p and
+// ht_d with them, and adds lo_revenue of the selected rows to a (year, brand) slot of res.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void probe_v1(int* lo_orderdate,
+                         int* lo_partkey,
+                         int* lo_suppkey,
+                         int* lo_revenue,
+                         int  lo_len,
+                         int* ht_s,
+                         int  s_len,
+                         int* ht_p,
+                         int  p_len,
+                         int* ht_d,
+                         int  d_len,
+                         int* res) {
+	const int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
+	int keys[ITEMS_PER_THREAD]; // keys of the column currently being probed
+	int selection_flags[ITEMS_PER_THREAD];
+	int brand[ITEMS_PER_THREAD]; // output of the ht_p probe
+	int year[ITEMS_PER_THREAD];  // output of the ht_d probe
+	int revenue[ITEMS_PER_THREAD];
+
+	const int num_tiles   = (lo_len + TILE_SIZE - 1) / TILE_SIZE;
+	const int tile_offset = blockIdx.x * TILE_SIZE;
+	int num_tile_items    = (blockIdx.x == num_tiles - 1) ? lo_len - tile_offset : TILE_SIZE; // short last tile
+
+	InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags);
+
+	int suppkey_pos = blockIdx.x * query_mtd.ssb.lo_chosen_suppkey_bw * 32;
+	unpack_device(lo_suppkey + suppkey_pos, keys, query_mtd.ssb.lo_chosen_suppkey_bw);
+	BlockProbeAndPHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, selection_flags, ht_s, s_len, num_tile_items);
+
+	int partkey_pos = blockIdx.x * query_mtd.ssb.lo_partkey_bw * 32;
+	unpack_device(lo_partkey + partkey_pos, keys, query_mtd.ssb.lo_partkey_bw);
+	BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    keys, brand, selection_flags, ht_p, p_len, num_tile_items);
+
+	int orderdate_pos = blockIdx.x * query_mtd.ssb.lo_orderdate_bw * 32;
+	unpack_device(lo_orderdate + orderdate_pos, keys, query_mtd.ssb.lo_orderdate_bw);
+	BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    keys, year, selection_flags, ht_d, d_len, 0, num_tile_items);
+
+	int revenue_pos = blockIdx.x * query_mtd.ssb.lo_revenue_bw * 32;
+	unpack_device(lo_revenue + revenue_pos, revenue, query_mtd.ssb.lo_revenue_bw);
+
+	const int num_groups = (1998 - 1992 + 1) * (5 * 5 * 40);
+#pragma unroll
+	for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
+		const bool in_tile = (threadIdx.x + (BLOCK_THREADS * ITEM)) < num_tile_items;
+		if (!in_tile || !selection_flags[ITEM]) { continue; }
+		const int group = (brand[ITEM] * 7 + (year[ITEM] - 1992)) % num_groups;
+		int*      slot  = res + group * 4; // [0] year, [1] brand, [2..3] revenue sum as one 64-bit value
+		slot[0]         = year[ITEM];
+		slot[1]         = brand[ITEM];
+		atomicAdd(reinterpret_cast<unsigned long long*>(slot + 2), (long long)(revenue[ITEM]));
+	}
+}
+
+// Q2.1 probe, version 2: the three key columns are unpacked with
+// unpack_8_at_a_time::unpack_device (tile offsets advance by bw * 8 per block), while
+// lo_revenue is read as a plain int column at tile_offset with BlockLoad.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void probe_v2(int* lo_orderdate,
+                         int* lo_partkey,
+                         int* lo_suppkey,
+                         int* lo_revenue,
+                         int  lo_len,
+                         int* ht_s,
+                         int  s_len,
+                         int* ht_p,
+                         int  p_len,
+                         int* ht_d,
+                         int  d_len,
+                         int* res) {
+	const int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
+	int keys[ITEMS_PER_THREAD]; // keys of the column currently being probed
+	int selection_flags[ITEMS_PER_THREAD];
+	int brand[ITEMS_PER_THREAD]; // output of the ht_p probe
+	int year[ITEMS_PER_THREAD];  // output of the ht_d probe
+	int revenue[ITEMS_PER_THREAD];
+
+	const int num_tiles   = (lo_len + TILE_SIZE - 1) / TILE_SIZE;
+	const int tile_offset = blockIdx.x * TILE_SIZE;
+	int num_tile_items    = (blockIdx.x == num_tiles - 1) ? lo_len - tile_offset : TILE_SIZE; // short last tile
+
+	InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags);
+
+	int suppkey_pos = blockIdx.x * query_mtd.ssb.lo_chosen_suppkey_bw * 8;
+	unpack_8_at_a_time::unpack_device(lo_suppkey + suppkey_pos, keys, query_mtd.ssb.lo_chosen_suppkey_bw);
+	BlockProbeAndPHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, selection_flags, ht_s, s_len, num_tile_items);
+
+	int partkey_pos = blockIdx.x * query_mtd.ssb.lo_partkey_bw * 8;
+	unpack_8_at_a_time::unpack_device(lo_partkey + partkey_pos, keys, query_mtd.ssb.lo_partkey_bw);
+	BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    keys, brand, selection_flags, ht_p, p_len, num_tile_items);
+
+	int orderdate_pos = blockIdx.x * query_mtd.ssb.lo_orderdate_bw * 8;
+	unpack_8_at_a_time::unpack_device(lo_orderdate + orderdate_pos, keys, query_mtd.ssb.lo_orderdate_bw);
+	BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    keys, year, selection_flags, ht_d, d_len, 0, num_tile_items);
+
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_revenue + tile_offset, revenue, num_tile_items);
+
+	const int num_groups = (1998 - 1992 + 1) * (5 * 5 * 40);
+#pragma unroll
+	for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
+		const bool in_tile = (threadIdx.x + (BLOCK_THREADS * ITEM)) < num_tile_items;
+		if (!in_tile || !selection_flags[ITEM]) { continue; }
+		const int group = (brand[ITEM] * 7 + (year[ITEM] - 1992)) % num_groups;
+		int*      slot  = res + group * 4; // [0] year, [1] brand, [2..3] revenue sum as one 64-bit value
+		slot[0]         = year[ITEM];
+		slot[1]         = brand[ITEM];
+		atomicAdd(reinterpret_cast<unsigned long long*>(slot + 2), (long long)(revenue[ITEM]));
+	}
+}
+
+// Q2.1 probe, version 3: like version 2 (keys unpacked with
+// unpack_8_at_a_time::unpack_device, offsets of bw * 8 per block), except that the plain
+// lo_revenue column is read with BlockPredLoad, which also receives selection_flags.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void probe_v3(int* lo_orderdate,
+                         int* lo_partkey,
+                         int* lo_suppkey,
+                         int* lo_revenue,
+                         int  lo_len,
+                         int* ht_s,
+                         int  s_len,
+                         int* ht_p,
+                         int  p_len,
+                         int* ht_d,
+                         int  d_len,
+                         int* res) {
+	const int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
+	int keys[ITEMS_PER_THREAD]; // keys of the column currently being probed
+	int selection_flags[ITEMS_PER_THREAD];
+	int brand[ITEMS_PER_THREAD]; // output of the ht_p probe
+	int year[ITEMS_PER_THREAD];  // output of the ht_d probe
+	int revenue[ITEMS_PER_THREAD];
+
+	const int num_tiles   = (lo_len + TILE_SIZE - 1) / TILE_SIZE;
+	const int tile_offset = blockIdx.x * TILE_SIZE;
+	int num_tile_items    = (blockIdx.x == num_tiles - 1) ? lo_len - tile_offset : TILE_SIZE; // short last tile
+
+	InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags);
+
+	int suppkey_pos = blockIdx.x * query_mtd.ssb.lo_chosen_suppkey_bw * 8;
+	unpack_8_at_a_time::unpack_device(lo_suppkey + suppkey_pos, keys, query_mtd.ssb.lo_chosen_suppkey_bw);
+	BlockProbeAndPHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, selection_flags, ht_s, s_len, num_tile_items);
+
+	int partkey_pos = blockIdx.x * query_mtd.ssb.lo_partkey_bw * 8;
+	unpack_8_at_a_time::unpack_device(lo_partkey + partkey_pos, keys, query_mtd.ssb.lo_partkey_bw);
+	BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    keys, brand, selection_flags, ht_p, p_len, num_tile_items);
+
+	int orderdate_pos = blockIdx.x * query_mtd.ssb.lo_orderdate_bw * 8;
+	unpack_8_at_a_time::unpack_device(lo_orderdate + orderdate_pos, keys, query_mtd.ssb.lo_orderdate_bw);
+	BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    keys, year, selection_flags, ht_d, d_len, 0, num_tile_items);
+
+	BlockPredLoad<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    lo_revenue + tile_offset, revenue, num_tile_items, selection_flags);
+
+	const int num_groups = (1998 - 1992 + 1) * (5 * 5 * 40);
+#pragma unroll
+	for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
+		const bool in_tile = (threadIdx.x + (BLOCK_THREADS * ITEM)) < num_tile_items;
+		if (!in_tile || !selection_flags[ITEM]) { continue; }
+		const int group = (brand[ITEM] * 7 + (year[ITEM] - 1992)) % num_groups;
+		int*      slot  = res + group * 4; // [0] year, [1] brand, [2..3] revenue sum as one 64-bit value
+		slot[0]         = year[ITEM];
+		slot[1]         = brand[ITEM];
+		atomicAdd(reinterpret_cast<unsigned long long*>(slot + 2), (long long)(revenue[ITEM]));
+	}
+}
+
+// Hash-table build used for the supplier table: flags the rows whose filter_col value
+// equals 1 and hands dim_key plus those flags to BlockBuildSelectivePHT_1.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_s(int* filter_col, int* dim_key, int num_tuples, int* hash_table, int num_slots) {
+	const int TILE_SIZE   = BLOCK_THREADS * ITEMS_PER_THREAD;
+	const int num_tiles   = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+	const int tile_offset = blockIdx.x * TILE_SIZE;
+	int num_tile_items    = (blockIdx.x == num_tiles - 1) ? num_tuples - tile_offset : TILE_SIZE; // short last tile
+
+	int column[ITEMS_PER_THREAD]; // filter values first, then the keys
+	int selection_flags[ITEMS_PER_THREAD];
+
+	// Evaluate filter_col == 1 into selection_flags.
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + tile_offset, column, num_tile_items);
+	BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(column, 1, selection_flags, num_tile_items);
+
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_offset, column, num_tile_items);
+	BlockBuildSelectivePHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    column, selection_flags, hash_table, num_slots, num_tile_items);
+}
+
+// Hash-table build used for the part table: flags the rows whose filter_col value equals 1
+// and hands the dim_key / dim_val pairs plus those flags to BlockBuildSelectivePHT_2.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void
+build_hashtable_p(int* filter_col, int* dim_key, int* dim_val, int num_tuples, int* hash_table, int num_slots) {
+	const int TILE_SIZE   = BLOCK_THREADS * ITEMS_PER_THREAD;
+	const int num_tiles   = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+	const int tile_offset = blockIdx.x * TILE_SIZE;
+	int num_tile_items    = (blockIdx.x == num_tiles - 1) ? num_tuples - tile_offset : TILE_SIZE; // short last tile
+
+	int column[ITEMS_PER_THREAD]; // filter values first, then the keys
+	int payload[ITEMS_PER_THREAD]; // dim_val entries stored next to the keys
+	int selection_flags[ITEMS_PER_THREAD];
+
+	// Evaluate filter_col == 1 into selection_flags.
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + tile_offset, column, num_tile_items);
+	BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(column, 1, selection_flags, num_tile_items);
+
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_offset, column, num_tile_items);
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_offset, payload, num_tile_items);
+	BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    column, payload, selection_flags, hash_table, num_slots, num_tile_items);
+}
+
+// Hash-table build used for the date table: no filter column is evaluated; the
+// dim_key / dim_val pairs are handed to BlockBuildSelectivePHT_2 together with val_min.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void
+build_hashtable_d(int* dim_key, int* dim_val, int num_tuples, int* hash_table, int num_slots, int val_min) {
+	const int TILE_SIZE   = BLOCK_THREADS * ITEMS_PER_THREAD;
+	const int num_tiles   = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+	const int tile_offset = blockIdx.x * TILE_SIZE;
+	int num_tile_items    = (blockIdx.x == num_tiles - 1) ? num_tuples - tile_offset : TILE_SIZE; // short last tile
+
+	int keys[ITEMS_PER_THREAD];
+	int vals[ITEMS_PER_THREAD];
+	int selection_flags[ITEMS_PER_THREAD];
+
+	// Presumably marks every row as selected, since no predicate follows -- confirm in InitFlags.
+	InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags);
+
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_offset, keys, num_tile_items);
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_offset, vals, num_tile_items);
+	BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    keys, vals, selection_flags, hash_table, num_slots, val_min, num_tile_items);
+}
+
+/**
+ * Runs SSB Q2.1 end to end on the GPU and checks the answer.
+ *
+ * Builds the supplier, part and date hash tables, launches the probe kernel chosen by
+ * `version`, copies the aggregate buffer back to the host and compares the non-empty
+ * slots with ssb::ssb_q21_10.reuslt.
+ *
+ * Every column pointer is handed to a kernel launch unchanged; the *_len arguments size
+ * the launch grids and the part/supplier hash tables.
+ *
+ * @param version     probe kernel to launch: 1, 2 or 3
+ * @param g_allocator allocator used for the hash tables and the result buffer
+ * @throws std::runtime_error if `version` is not 1, 2 or 3
+ */
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+void runQuery(int*                         lo_orderdate,
+              int*                         lo_partkey,
+              int*                         lo_suppkey,
+              int*                         lo_revenue,
+              int                          lo_len,
+              int*                         p_partkey,
+              int*                         p_brand1,
+              int*                         p_category,
+              int                          p_len,
+              int*                         d_datekey,
+              int*                         d_year,
+              int                          d_len,
+              int*                         s_suppkey,
+              int*                         s_region,
+              int                          s_len,
+              cub::CachingDeviceAllocator& g_allocator,
+              int                          version) {
+	// Validate first so an unknown version cannot leak the device buffers allocated below.
+	if (version < 1 || version > 3) { throw std::runtime_error("this version does not exist"); }
+
+	SETUP_TIMING();
+
+	float time_query;
+
+	cudaEventRecord(start, 0);
+
+	int *ht_d, *ht_p, *ht_s;
+	int  d_val_len = 19981230 - 19920101 + 1;
+	CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_d, 2 * d_val_len * sizeof(int)));
+	CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_p, 2 * p_len * sizeof(int)));
+	CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_s, 2 * s_len * sizeof(int)));
+
+	CubDebugExit(cudaMemset(ht_d, 0, 2 * d_val_len * sizeof(int)));
+	CubDebugExit(cudaMemset(ht_p, 0, 2 * p_len * sizeof(int)));
+	CubDebugExit(cudaMemset(ht_s, 0, 2 * s_len * sizeof(int)));
+
+	int tile_items = BLOCK_THREADS * ITEMS_PER_THREAD;
+
+	build_hashtable_s<BLOCK_THREADS, ITEMS_PER_THREAD>
+	    <<<(s_len + tile_items - 1) / tile_items, BLOCK_THREADS>>>(s_region, s_suppkey, s_len, ht_s, s_len);
+	/*CHECK_ERROR();*/
+
+	build_hashtable_p<BLOCK_THREADS, ITEMS_PER_THREAD>
+	    <<<(p_len + tile_items - 1) / tile_items, BLOCK_THREADS>>>(p_category, p_partkey, p_brand1, p_len, ht_p, p_len);
+	/*CHECK_ERROR();*/
+
+	int d_val_min = 19920101;
+	build_hashtable_d<BLOCK_THREADS, ITEMS_PER_THREAD><<<(d_len + tile_items - 1) / tile_items, BLOCK_THREADS>>>(
+	    d_datekey, d_year, d_len, ht_d, d_val_len, d_val_min);
+	/*CHECK_ERROR();*/
+
+	// A result slot is four ints wide (two ints followed by one 64-bit value, as decoded by
+	// the read-back loop below). The buffer size must follow that layout, not the kernel's
+	// ITEMS_PER_THREAD tuning parameter.
+	constexpr int RES_SLOT_INTS = 4;
+
+	int* res;
+	int  res_size       = ((1998 - 1992 + 1) * (5 * 5 * 40));
+	int  res_array_size = res_size * RES_SLOT_INTS;
+
+	CubDebugExit(g_allocator.DeviceAllocate((void**)&res, res_array_size * sizeof(int)));
+	CubDebugExit(cudaMemset(res, 0, res_array_size * sizeof(int)));
+
+	const int probe_grid = (lo_len + tile_items - 1) / tile_items;
+	if (version == 1) {
+		probe_v1<BLOCK_THREADS, ITEMS_PER_THREAD><<<probe_grid, BLOCK_THREADS>>>(
+		    lo_orderdate, lo_partkey, lo_suppkey, lo_revenue, lo_len, ht_s, s_len, ht_p, p_len, ht_d, d_val_len, res);
+	} else if (version == 2) {
+		probe_v2<BLOCK_THREADS, ITEMS_PER_THREAD><<<probe_grid, BLOCK_THREADS>>>(
+		    lo_orderdate, lo_partkey, lo_suppkey, lo_revenue, lo_len, ht_s, s_len, ht_p, p_len, ht_d, d_val_len, res);
+	} else {
+		// version == 3: every other value was rejected at the top of the function.
+		probe_v3<BLOCK_THREADS, ITEMS_PER_THREAD><<<probe_grid, BLOCK_THREADS>>>(
+		    lo_orderdate, lo_partkey, lo_suppkey, lo_revenue, lo_len, ht_s, s_len, ht_p, p_len, ht_d, d_val_len, res);
+	}
+
+	cudaEventRecord(stop, 0);
+	cudaEventSynchronize(stop);
+	cudaEventElapsedTime(&time_query, start, stop);
+
+	int* h_res = new int[res_array_size];
+	CubDebugExit(cudaMemcpy(h_res, res, res_array_size * sizeof(int), cudaMemcpyDeviceToHost));
+
+	ssb::SSBQuery2ResultTable result_of_query;
+	for (int i = 0; i < res_size; i++) {
+		int* slot = h_res + RES_SLOT_INTS * i;
+		// The buffer was zero-filled before the probe; slots whose first int is still zero are skipped.
+		if (slot[0] != 0) {
+			result_of_query.emplace_back(slot[0], slot[1], reinterpret_cast<unsigned long long*>(&slot[2])[0]);
+		}
+	}
+
+	ASSERT_EQ(result_of_query.size(), ssb::ssb_q21_10.reuslt.size());
+	ASSERT_EQ(result_of_query, ssb::ssb_q21_10.reuslt);
+
+	delete[] h_res;
+
+	CLEANUP(res);
+	CLEANUP(ht_d);
+	CLEANUP(ht_p);
+	CLEANUP(ht_s);
+}
+
+/**
+ * Entry point for the bit-packed Q2.1 variants.
+ *
+ * argv[1] selects the probe kernel (1, 2 or 3). The lineorder columns are loaded,
+ * bit-packed in 1024-value vectors with the widths found in query_mtd.ssb and uploaded
+ * together with the dimension columns; runQuery then executes and verifies the query.
+ *
+ * @throws std::runtime_error if the version argument is missing or is not 1, 2 or 3
+ */
+int main(int argc, char* argv[]) {
+	// argv[1] is mandatory; without this guard std::stoi would be handed a null pointer.
+	if (argc < 2) { throw std::runtime_error("missing version argument: expected 1, 2 or 3"); }
+	const int version = std::stoi(argv[1]);
+	// Reject unknown versions before any column is loaded, packed or uploaded.
+	if (version < 1 || version > 3) { throw std::runtime_error("this version does not exist"); }
+
+	auto hard_coded = query_mtd.ssb;
+
+	int* h_lo_orderdate = loadColumn<int>("lo_orderdate", LO_LEN);
+	int* h_lo_partkey   = loadColumn<int>("lo_partkey", LO_LEN);
+	int* h_lo_suppkey   = loadColumn<int>("lo_suppkey", LO_LEN);
+	int* h_lo_revenue   = loadColumn<int>("lo_revenue", LO_LEN);
+
+	auto n_vec = hard_coded.n_vec;
+
+	// Value-initialised: only the first LO_LEN entries are written below, but pack() consumes
+	// n_vec full 1024-value vectors, so the padding must hold defined values.
+	int* tmp = new int[n_vec * 1024]();
+	for (size_t i {0}; i < LO_LEN; ++i) {
+		tmp[i] = h_lo_orderdate[i] - hard_coded.lo_orderdate_min;
+	}
+
+	const int* h_enc_lo_orderdate = new int[n_vec * 1024];
+	const int* h_enc_lo_partkey   = new int[n_vec * 1024];
+	const int* h_enc_lo_suppkey   = new int[n_vec * 1024];
+	const int* h_enc_lo_revenue   = new int[n_vec * 1024];
+
+	auto* orderdate_in = const_cast<int32_t*>(tmp);
+	auto* partkey_in   = const_cast<int32_t*>(h_lo_partkey);
+	auto* suppkey_in   = const_cast<int32_t*>(h_lo_suppkey);
+	auto* revenue_in   = const_cast<int32_t*>(h_lo_revenue);
+
+	auto* orderdate_out = const_cast<int32_t*>(h_enc_lo_orderdate);
+	auto* partkey_out   = const_cast<int32_t*>(h_enc_lo_partkey);
+	auto* suppkey_out   = const_cast<int32_t*>(h_enc_lo_suppkey);
+	auto* revenue_out   = const_cast<int32_t*>(h_enc_lo_revenue);
+
+	// Pack one 1024-value vector of every column per iteration; the output cursor advances
+	// by bit width * 32 ints.
+	// NOTE(review): partkey/suppkey/revenue are read straight from the loaded columns in
+	// 1024-value strides -- confirm loadColumn pads them to n_vec * 1024 values.
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(orderdate_in, orderdate_out, hard_coded.lo_orderdate_bw);
+		orderdate_in  = orderdate_in + 1024;
+		orderdate_out = orderdate_out + (hard_coded.lo_orderdate_bw * 32);
+
+		generated::pack::fallback::scalar::pack(partkey_in, partkey_out, hard_coded.lo_partkey_bw);
+		partkey_in  = partkey_in + 1024;
+		partkey_out = partkey_out + (hard_coded.lo_partkey_bw * 32);
+
+		generated::pack::fallback::scalar::pack(suppkey_in, suppkey_out, hard_coded.lo_chosen_suppkey_bw);
+		suppkey_in  = suppkey_in + 1024;
+		suppkey_out = suppkey_out + (hard_coded.lo_chosen_suppkey_bw * 32);
+
+		generated::pack::fallback::scalar::pack(revenue_in, revenue_out, hard_coded.lo_revenue_bw);
+		revenue_in  = revenue_in + 1024;
+		revenue_out = revenue_out + (hard_coded.lo_revenue_bw * 32);
+	}
+
+	int* d_lo_orderdate = loadToGPU<int32_t>(h_enc_lo_orderdate, hard_coded.n_tup_line_order, g_allocator);
+	int* d_lo_partkey   = loadToGPU<int32_t>(h_enc_lo_partkey, hard_coded.n_tup_line_order, g_allocator);
+	int* d_lo_suppkey   = loadToGPU<int32_t>(h_enc_lo_suppkey, hard_coded.n_tup_line_order, g_allocator);
+	int* d_lo_revenue;
+
+	// Version 1 is given the packed revenue column, versions 2 and 3 the plain one.
+	if (version == 1) {
+		d_lo_revenue = loadToGPU<int32_t>(h_enc_lo_revenue, hard_coded.n_tup_line_order, g_allocator);
+	} else {
+		d_lo_revenue = loadToGPU<int32_t>(h_lo_revenue, hard_coded.n_tup_line_order, g_allocator);
+	}
+
+	int* h_p_partkey  = loadColumn<int>("p_partkey", P_LEN);
+	int* h_p_brand1   = loadColumn<int>("p_brand1", P_LEN);
+	int* h_p_category = loadColumn<int>("p_category", P_LEN);
+
+	int* h_d_datekey = loadColumn<int>("d_datekey", D_LEN);
+	int* h_d_year    = loadColumn<int>("d_year", D_LEN);
+
+	int* h_s_suppkey = loadColumn<int>("s_suppkey", S_LEN);
+	int* h_s_region  = loadColumn<int>("s_region", S_LEN);
+
+	int* d_d_datekey = loadToGPU<int>(h_d_datekey, D_LEN, g_allocator);
+	int* d_d_year    = loadToGPU<int>(h_d_year, D_LEN, g_allocator);
+
+	int* d_p_partkey  = loadToGPU<int>(h_p_partkey, P_LEN, g_allocator);
+	int* d_p_brand1   = loadToGPU<int>(h_p_brand1, P_LEN, g_allocator);
+	int* d_p_category = loadToGPU<int>(h_p_category, P_LEN, g_allocator);
+
+	int* d_s_suppkey = loadToGPU<int>(h_s_suppkey, S_LEN, g_allocator);
+	int* d_s_region  = loadToGPU<int>(h_s_region, S_LEN, g_allocator);
+
+	if (version == 1) {
+		// Version 1 runs with 32 items per thread.
+		runQuery<32, 32>(d_lo_orderdate,
+		                 d_lo_partkey,
+		                 d_lo_suppkey,
+		                 d_lo_revenue,
+		                 LO_LEN,
+		                 d_p_partkey,
+		                 d_p_brand1,
+		                 d_p_category,
+		                 P_LEN,
+		                 d_d_datekey,
+		                 d_d_year,
+		                 D_LEN,
+		                 d_s_suppkey,
+		                 d_s_region,
+		                 S_LEN,
+		                 g_allocator,
+		                 version);
+	} else {
+		// Versions 2 and 3 run with 8 items per thread.
+		runQuery<32, 8>(d_lo_orderdate,
+		                d_lo_partkey,
+		                d_lo_suppkey,
+		                d_lo_revenue,
+		                LO_LEN,
+		                d_p_partkey,
+		                d_p_brand1,
+		                d_p_category,
+		                P_LEN,
+		                d_d_datekey,
+		                d_d_year,
+		                D_LEN,
+		                d_s_suppkey,
+		                d_s_region,
+		                S_LEN,
+		                g_allocator,
+		                version);
+	}
+}
diff --git a/fastlanes/src/ssb/fls_q21_bitpacked_opt_v4.cu b/fastlanes/src/ssb/fls_q21_bitpacked_opt_v4.cu
new file mode 100644
index 0000000..67c9d1a
--- /dev/null
+++ b/fastlanes/src/ssb/fls_q21_bitpacked_opt_v4.cu
@@ -0,0 +1,394 @@
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+#define SORTED
+
+#include "crystal/crystal.cuh"
+#include "crystal_ssb_utils.h"
+#include "cub/test/test_util.h"
+#include "fls_gen/pack/pack.hpp"
+#include "fls_gen/unpack/hardcoded_16.cuh"
+#include "fls_gen/unpack/unpack_fused.cuh"
+#include "gpu_utils.h"
+#include "query/query_21.hpp"
+#include "ssb_utils.h"
+#include "gtest/gtest.h"
+#include <cub/util_allocator.cuh>
+#include <iostream>
+#include <vector>
+
+using namespace std;
+using namespace fastlanes::gpu;
+using namespace fastlanes;
+
+// File-wide copy of ssb::ssb_q21_10. Its `ssb` member supplies the fixed bit widths read by
+// probe_v3 and the vector count, tuple count and orderdate minimum read by main.
+inline auto query_mtd = ssb::ssb_q21_10;
+
+/**
+ * Q2.1 probe kernel: one thread block handles one tile of BLOCK_THREADS * ITEMS_PER_THREAD
+ * lineorder rows.
+ *
+ * suppkey and partkey are decoded with the fixed bit widths from query_mtd.ssb. orderdate
+ * is decoded with a per-vector bit width, base and offset looked up at index
+ * blockIdx.x / 4 (four consecutive blocks share one metadata entry). Rows that survive the
+ * three probes add their revenue to the 64-bit sum of the result slot chosen by
+ * (brand, year).
+ *
+ * @param lo_orderdate_bw     per-vector bit width of the packed orderdate column
+ * @param lo_orderdate_base   per-vector value added back after decoding
+ * @param lo_orderdate_offset per-vector start position inside lo_orderdate
+ * @param res                 result table, four ints per slot: year, brand, 64-bit revenue sum
+ */
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void probe_v3(int* lo_orderdate,
+                         int* lo_orderdate_bw,
+                         int* lo_orderdate_base,
+                         int* lo_orderdate_offset,
+                         int* lo_partkey,
+                         int* lo_suppkey,
+                         int* lo_revenue,
+                         int  lo_len,
+                         int* ht_s,
+                         int  s_len,
+                         int* ht_p,
+                         int  p_len,
+                         int* ht_d,
+                         int  d_len,
+                         int* res) {
+	constexpr int TILE_SIZE  = BLOCK_THREADS * ITEMS_PER_THREAD;
+	constexpr int NUM_GROUPS = (1998 - 1992 + 1) * (5 * 5 * 40);
+
+	// Per-thread scratch: decoded keys, match flags and the payloads produced by the probes.
+	int keys[ITEMS_PER_THREAD];
+	int flags[ITEMS_PER_THREAD];
+	int brand[ITEMS_PER_THREAD];
+	int year[ITEMS_PER_THREAD];
+	int revenue[ITEMS_PER_THREAD];
+
+	int tile_offset = blockIdx.x * TILE_SIZE;
+	int last_tile   = (lo_len + TILE_SIZE - 1) / TILE_SIZE - 1;
+	// Every tile is full except possibly the last one.
+	int num_tile_items = (blockIdx.x == last_tile) ? lo_len - tile_offset : TILE_SIZE;
+
+	InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(flags);
+
+	// Supplier: fixed-width decode, then a probe that produces no payload.
+	int suppkey_start = blockIdx.x * query_mtd.ssb.lo_chosen_suppkey_bw * 8;
+	unpack_8_at_a_time::unpack_device(lo_suppkey + suppkey_start, keys, query_mtd.ssb.lo_chosen_suppkey_bw);
+	BlockProbeAndPHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, flags, ht_s, s_len, num_tile_items);
+
+	// Part: fixed-width decode, the probe fills `brand`.
+	int partkey_start = blockIdx.x * query_mtd.ssb.lo_partkey_bw * 8;
+	unpack_8_at_a_time::unpack_device(lo_partkey + partkey_start, keys, query_mtd.ssb.lo_partkey_bw);
+	BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, brand, flags, ht_p, p_len, num_tile_items);
+
+	// Order date: width, base and start position come from the per-vector metadata arrays.
+	int vec_idx         = blockIdx.x / 4;
+	int date_bw         = lo_orderdate_bw[vec_idx];
+	int date_base       = lo_orderdate_base[vec_idx];
+	int orderdate_start = lo_orderdate_offset[vec_idx] + (blockIdx.x % 4) * date_bw * 8;
+
+	unpack_8_at_a_time::unpack_device(lo_orderdate + orderdate_start, keys, date_bw);
+#pragma unroll
+	for (int k = 0; k < ITEMS_PER_THREAD; ++k) {
+		keys[k] = keys[k] + date_base;
+	}
+
+	BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, year, flags, ht_d, d_len, 0, num_tile_items);
+
+	// Revenue is read from the plain (unpacked) column at the tile's row offset.
+	BlockPredLoad<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_revenue + tile_offset, revenue, num_tile_items, flags);
+
+#pragma unroll
+	for (int k = 0; k < ITEMS_PER_THREAD; ++k) {
+		// Skip positions past the end of the tile and rows rejected by a probe.
+		if ((threadIdx.x + (BLOCK_THREADS * k)) >= num_tile_items || !flags[k]) { continue; }
+
+		int  group = (brand[k] * 7 + (year[k] - 1992)) % NUM_GROUPS;
+		int* slot  = res + group * 4;
+		slot[0]    = year[k];
+		slot[1]    = brand[k];
+		// Ints 2 and 3 of the slot together hold the 64-bit running sum.
+		atomicAdd(reinterpret_cast<unsigned long long*>(slot + 2), (long long)(revenue[k]));
+	}
+}
+
+/**
+ * Build kernel for the supplier hash table: one block per tile of the dimension table.
+ * Rows are selected by BlockPredEQ on `filter_col` against the constant 1, and the
+ * `dim_key` values of the selected rows are handed to BlockBuildSelectivePHT_1.
+ */
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_s(int* filter_col, int* dim_key, int num_tuples, int* hash_table, int num_slots) {
+	constexpr int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
+
+	int tile_offset = blockIdx.x * TILE_SIZE;
+	int last_tile   = (num_tuples + TILE_SIZE - 1) / TILE_SIZE - 1;
+	// Only the final tile can be short.
+	int num_tile_items = (blockIdx.x == last_tile) ? num_tuples - tile_offset : TILE_SIZE;
+
+	int column[ITEMS_PER_THREAD];
+	int keep[ITEMS_PER_THREAD];
+
+	// First pass: `column` holds the filter values and `keep` receives the predicate result.
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + tile_offset, column, num_tile_items);
+	BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(column, 1, keep, num_tile_items);
+
+	// Second pass: the same array is reloaded with the keys to insert.
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_offset, column, num_tile_items);
+	BlockBuildSelectivePHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(column, keep, hash_table, num_slots, num_tile_items);
+}
+
+/**
+ * Build kernel for the part hash table: one block per tile of the dimension table.
+ * Rows are selected by BlockPredEQ on `filter_col` against the constant 1; the
+ * (`dim_key`, `dim_val`) pairs of the selected rows are handed to BlockBuildSelectivePHT_2.
+ */
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void
+build_hashtable_p(int* filter_col, int* dim_key, int* dim_val, int num_tuples, int* hash_table, int num_slots) {
+	constexpr int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
+
+	int tile_offset = blockIdx.x * TILE_SIZE;
+	int last_tile   = (num_tuples + TILE_SIZE - 1) / TILE_SIZE - 1;
+	// Only the final tile can be short.
+	int num_tile_items = (blockIdx.x == last_tile) ? num_tuples - tile_offset : TILE_SIZE;
+
+	int column[ITEMS_PER_THREAD];
+	int payload[ITEMS_PER_THREAD];
+	int keep[ITEMS_PER_THREAD];
+
+	// First pass: `column` holds the filter values and `keep` receives the predicate result.
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + tile_offset, column, num_tile_items);
+	BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(column, 1, keep, num_tile_items);
+
+	// Second pass: `column` is reloaded with the keys, `payload` with the values stored beside them.
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_offset, column, num_tile_items);
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_offset, payload, num_tile_items);
+	BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    column, payload, keep, hash_table, num_slots, num_tile_items);
+}
+
+/**
+ * Build kernel for the date hash table: one block per tile of the dimension table.
+ * No filter column is evaluated; the flags come straight from InitFlags, and the
+ * (`dim_key`, `dim_val`) pairs are handed to BlockBuildSelectivePHT_2 together with
+ * `val_min`.
+ */
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void
+build_hashtable_d(int* dim_key, int* dim_val, int num_tuples, int* hash_table, int num_slots, int val_min) {
+	constexpr int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
+
+	int tile_offset = blockIdx.x * TILE_SIZE;
+	int last_tile   = (num_tuples + TILE_SIZE - 1) / TILE_SIZE - 1;
+	// Only the final tile can be short.
+	int num_tile_items = (blockIdx.x == last_tile) ? num_tuples - tile_offset : TILE_SIZE;
+
+	int keys[ITEMS_PER_THREAD];
+	int payload[ITEMS_PER_THREAD];
+	int keep[ITEMS_PER_THREAD];
+
+	InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(keep);
+
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_offset, keys, num_tile_items);
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_offset, payload, num_tile_items);
+	BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    keys, payload, keep, hash_table, num_slots, val_min, num_tile_items);
+}
+
+/**
+ * Runs SSB Q2.1 with per-vector encoded order dates and checks the answer.
+ *
+ * Builds the supplier, part and date hash tables, launches probe_v3, copies the
+ * aggregate buffer back to the host and compares the non-empty slots with
+ * ssb::ssb_q21_10.reuslt.
+ *
+ * Every column pointer is handed to a kernel launch unchanged; the *_len arguments size
+ * the launch grids and the part/supplier hash tables.
+ *
+ * @param d_lo_orderdate_bw     per-vector bit widths, forwarded to probe_v3
+ * @param d_lo_orderdate_base   per-vector bases, forwarded to probe_v3
+ * @param d_lo_orderdate_offset per-vector start offsets, forwarded to probe_v3
+ * @param g_allocator           allocator used for the hash tables and the result buffer
+ */
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+void runQuery(int*                         lo_orderdate,
+              int*                         d_lo_orderdate_bw,
+              int*                         d_lo_orderdate_base,
+              int*                         d_lo_orderdate_offset,
+              int*                         lo_partkey,
+              int*                         lo_suppkey,
+              int*                         lo_revenue,
+              int                          lo_len,
+              int*                         p_partkey,
+              int*                         p_brand1,
+              int*                         p_category,
+              int                          p_len,
+              int*                         d_datekey,
+              int*                         d_year,
+              int                          d_len,
+              int*                         s_suppkey,
+              int*                         s_region,
+              int                          s_len,
+              cub::CachingDeviceAllocator& g_allocator) {
+	SETUP_TIMING();
+
+	cudaEventRecord(start, 0);
+
+	int *ht_d, *ht_p, *ht_s;
+	int  d_val_len = 19981230 - 19920101 + 1;
+	CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_d, 2 * d_val_len * sizeof(int)));
+	CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_p, 2 * p_len * sizeof(int)));
+	CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_s, 2 * s_len * sizeof(int)));
+
+	CubDebugExit(cudaMemset(ht_d, 0, 2 * d_val_len * sizeof(int)));
+	CubDebugExit(cudaMemset(ht_p, 0, 2 * p_len * sizeof(int)));
+	CubDebugExit(cudaMemset(ht_s, 0, 2 * s_len * sizeof(int)));
+
+	int tile_items = BLOCK_THREADS * ITEMS_PER_THREAD;
+
+	build_hashtable_s<BLOCK_THREADS, ITEMS_PER_THREAD>
+	    <<<(s_len + tile_items - 1) / tile_items, BLOCK_THREADS>>>(s_region, s_suppkey, s_len, ht_s, s_len);
+	/*CHECK_ERROR();*/
+
+	build_hashtable_p<BLOCK_THREADS, ITEMS_PER_THREAD>
+	    <<<(p_len + tile_items - 1) / tile_items, BLOCK_THREADS>>>(p_category, p_partkey, p_brand1, p_len, ht_p, p_len);
+	/*CHECK_ERROR();*/
+
+	int d_val_min = 19920101;
+	build_hashtable_d<BLOCK_THREADS, ITEMS_PER_THREAD><<<(d_len + tile_items - 1) / tile_items, BLOCK_THREADS>>>(
+	    d_datekey, d_year, d_len, ht_d, d_val_len, d_val_min);
+	/*CHECK_ERROR();*/
+
+	// probe_v3 addresses the result table as res[hash * 4 + k] and the read-back loop below
+	// uses the same stride, so a slot is four ints wide regardless of ITEMS_PER_THREAD.
+	constexpr int RES_SLOT_INTS = 4;
+
+	int* res;
+	int  res_size       = ((1998 - 1992 + 1) * (5 * 5 * 40));
+	int  res_array_size = res_size * RES_SLOT_INTS;
+
+	CubDebugExit(g_allocator.DeviceAllocate((void**)&res, res_array_size * sizeof(int)));
+	CubDebugExit(cudaMemset(res, 0, res_array_size * sizeof(int)));
+
+	probe_v3<BLOCK_THREADS, ITEMS_PER_THREAD>
+	    <<<(lo_len + tile_items - 1) / tile_items, BLOCK_THREADS>>>(lo_orderdate,
+	                                                                d_lo_orderdate_bw,
+	                                                                d_lo_orderdate_base,
+	                                                                d_lo_orderdate_offset,
+	                                                                lo_partkey,
+	                                                                lo_suppkey,
+	                                                                lo_revenue,
+	                                                                lo_len,
+	                                                                ht_s,
+	                                                                s_len,
+	                                                                ht_p,
+	                                                                p_len,
+	                                                                ht_d,
+	                                                                d_val_len,
+	                                                                res);
+
+	int* h_res = new int[res_array_size];
+	CubDebugExit(cudaMemcpy(h_res, res, res_array_size * sizeof(int), cudaMemcpyDeviceToHost));
+
+	ssb::SSBQuery2ResultTable result_of_query;
+	for (int i = 0; i < res_size; i++) {
+		int* slot = h_res + RES_SLOT_INTS * i;
+		// The buffer was zero-filled before the probe; slots whose first int is still zero are skipped.
+		if (slot[0] != 0) {
+			result_of_query.emplace_back(slot[0], slot[1], reinterpret_cast<unsigned long long*>(&slot[2])[0]);
+		}
+	}
+
+	ASSERT_EQ(result_of_query.size(), ssb::ssb_q21_10.reuslt.size());
+	ASSERT_EQ(result_of_query, ssb::ssb_q21_10.reuslt);
+
+	delete[] h_res;
+
+	CLEANUP(res);
+	CLEANUP(ht_d);
+	CLEANUP(ht_p);
+	CLEANUP(ht_s);
+}
+
+/**
+ * Entry point for the Q2.1 variant with per-vector encoded order dates.
+ *
+ * Loads the lineorder columns, encodes orderdate vector by vector (own base and bit
+ * width per 1024-value vector), packs partkey/suppkey/revenue with the fixed widths in
+ * query_mtd.ssb, uploads everything together with the dimension columns and calls
+ * runQuery<32, 8>.
+ *
+ * @throws std::runtime_error if is_sorted rejects the loaded orderdate column
+ */
+int main(int argc, char* argv[]) {
+	auto hard_coded = query_mtd.ssb;
+
+	int* h_lo_orderdate = loadColumn<int>("lo_orderdate", LO_LEN);
+	int* h_lo_partkey   = loadColumn<int>("lo_partkey", LO_LEN);
+	int* h_lo_suppkey   = loadColumn<int>("lo_suppkey", LO_LEN);
+	int* h_lo_revenue   = loadColumn<int>("lo_revenue", LO_LEN);
+
+	auto n_vec = hard_coded.n_vec;
+
+	// Value-initialised: only the first LO_LEN entries are written below, but the encoder
+	// consumes n_vec full 1024-value vectors, so the padding must hold defined values.
+	int* tmp = new int[n_vec * 1024]();
+	for (size_t i {0}; i < LO_LEN; ++i) {
+		tmp[i] = h_lo_orderdate[i] - hard_coded.lo_orderdate_min;
+	}
+
+	const int* h_enc_lo_orderdate = new int[n_vec * 1024];
+	const int* h_enc_lo_partkey   = new int[n_vec * 1024];
+	const int* h_enc_lo_suppkey   = new int[n_vec * 1024];
+	const int* h_enc_lo_revenue   = new int[n_vec * 1024];
+
+	int* h_lo_orderdate_base = new int[n_vec];
+	int* h_lo_orderdate_bw   = new int[n_vec];
+	// Value-initialised so that entry 0 is a defined 0: each later entry is accumulated from
+	// its predecessor and the probe kernel reads every entry.
+	int* h_lo_orderdate_offset = new int[n_vec]();
+
+	if (!is_sorted(h_lo_orderdate, LO_LEN)) {
+		throw std::runtime_error("should be sorted!");
+	}
+
+	auto* orderdate_in = const_cast<int32_t*>(tmp);
+	auto* partkey_in   = const_cast<int32_t*>(h_lo_partkey);
+	auto* suppkey_in   = const_cast<int32_t*>(h_lo_suppkey);
+	auto* revenue_in   = const_cast<int32_t*>(h_lo_revenue);
+
+	auto* orderdate_out = const_cast<int32_t*>(h_enc_lo_orderdate);
+	auto* partkey_out   = const_cast<int32_t*>(h_enc_lo_partkey);
+	auto* suppkey_out   = const_cast<int32_t*>(h_enc_lo_suppkey);
+	auto* revenue_out   = const_cast<int32_t*>(h_enc_lo_revenue);
+
+	constexpr int SF10_LAST_VECTOR_IDX = 58580;
+	constexpr int LAST_VECTOR_SIZE     = 294;
+	// NOTE(review): partkey/suppkey/revenue are read straight from the loaded columns in
+	// 1024-value strides -- confirm loadColumn pads them to n_vec * 1024 values.
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		if (vec_idx == SF10_LAST_VECTOR_IDX) { set_zero_after<1024>(orderdate_in, LAST_VECTOR_SIZE); }
+
+		// Per-vector encoding of orderdate: store the base, subtract it, then pick the bit width.
+		h_lo_orderdate_base[vec_idx] = find_base<1024>(orderdate_in);
+		subtract_base<1024>(orderdate_in, h_lo_orderdate_base[vec_idx]);
+		h_lo_orderdate_bw[vec_idx] = find_bw<1024>(orderdate_in);
+
+		// Running start offset (in ints) of the next packed vector.
+		if (vec_idx + 1 < n_vec) {
+			h_lo_orderdate_offset[vec_idx + 1] = h_lo_orderdate_offset[vec_idx] + (h_lo_orderdate_bw[vec_idx] * 32);
+		}
+
+		if (h_lo_orderdate_bw[vec_idx] > 16) {
+			std::cout << h_lo_orderdate_bw[vec_idx] << " bigger than 16 is not possible in orderdate! \n";
+			exit(-2);
+		}
+
+		generated::pack::fallback::scalar::pack(orderdate_in, orderdate_out, h_lo_orderdate_bw[vec_idx]);
+		orderdate_in  = orderdate_in + 1024;
+		orderdate_out = orderdate_out + (h_lo_orderdate_bw[vec_idx] * 32);
+
+		generated::pack::fallback::scalar::pack(partkey_in, partkey_out, hard_coded.lo_partkey_bw);
+		partkey_in  = partkey_in + 1024;
+		partkey_out = partkey_out + (hard_coded.lo_partkey_bw * 32);
+
+		generated::pack::fallback::scalar::pack(suppkey_in, suppkey_out, hard_coded.lo_chosen_suppkey_bw);
+		suppkey_in  = suppkey_in + 1024;
+		suppkey_out = suppkey_out + (hard_coded.lo_chosen_suppkey_bw * 32);
+
+		// The packed revenue buffer is filled here, but the plain h_lo_revenue column is what
+		// gets uploaded below.
+		generated::pack::fallback::scalar::pack(revenue_in, revenue_out, hard_coded.lo_revenue_bw);
+		revenue_in  = revenue_in + 1024;
+		revenue_out = revenue_out + (hard_coded.lo_revenue_bw * 32);
+	}
+
+	int* d_lo_orderdate = loadToGPU<int32_t>(h_enc_lo_orderdate, hard_coded.n_tup_line_order, g_allocator);
+	int* d_lo_partkey   = loadToGPU<int32_t>(h_enc_lo_partkey, hard_coded.n_tup_line_order, g_allocator);
+	int* d_lo_suppkey   = loadToGPU<int32_t>(h_enc_lo_suppkey, hard_coded.n_tup_line_order, g_allocator);
+	int* d_lo_revenue   = loadToGPU<int32_t>(h_lo_revenue, hard_coded.n_tup_line_order, g_allocator);
+
+	int* d_lo_orderdate_base   = loadToGPU<int32_t>(h_lo_orderdate_base, n_vec, g_allocator);
+	int* d_lo_orderdate_bw     = loadToGPU<int32_t>(h_lo_orderdate_bw, n_vec, g_allocator);
+	int* d_lo_orderdate_offset = loadToGPU<int32_t>(h_lo_orderdate_offset, n_vec, g_allocator);
+
+	int* h_p_partkey  = loadColumn<int>("p_partkey", P_LEN);
+	int* h_p_brand1   = loadColumn<int>("p_brand1", P_LEN);
+	int* h_p_category = loadColumn<int>("p_category", P_LEN);
+
+	int* h_d_datekey = loadColumn<int>("d_datekey", D_LEN);
+	int* h_d_year    = loadColumn<int>("d_year", D_LEN);
+
+	int* h_s_suppkey = loadColumn<int>("s_suppkey", S_LEN);
+	int* h_s_region  = loadColumn<int>("s_region", S_LEN);
+
+	int* d_d_datekey = loadToGPU<int>(h_d_datekey, D_LEN, g_allocator);
+	int* d_d_year    = loadToGPU<int>(h_d_year, D_LEN, g_allocator);
+
+	int* d_p_partkey  = loadToGPU<int>(h_p_partkey, P_LEN, g_allocator);
+	int* d_p_brand1   = loadToGPU<int>(h_p_brand1, P_LEN, g_allocator);
+	int* d_p_category = loadToGPU<int>(h_p_category, P_LEN, g_allocator);
+
+	int* d_s_suppkey = loadToGPU<int>(h_s_suppkey, S_LEN, g_allocator);
+	int* d_s_region  = loadToGPU<int>(h_s_region, S_LEN, g_allocator);
+
+	runQuery<32, 8>(d_lo_orderdate,
+	                d_lo_orderdate_bw,
+	                d_lo_orderdate_base,
+	                d_lo_orderdate_offset,
+	                d_lo_partkey,
+	                d_lo_suppkey,
+	                d_lo_revenue,
+	                LO_LEN,
+	                d_p_partkey,
+	                d_p_brand1,
+	                d_p_category,
+	                P_LEN,
+	                d_d_datekey,
+	                d_d_year,
+	                D_LEN,
+	                d_s_suppkey,
+	                d_s_region,
+	                S_LEN,
+	                g_allocator);
+}
diff --git a/fastlanes/src/ssb/fls_q31.cu b/fastlanes/src/ssb/fls_q31.cu
new file mode 100644
index 0000000..fe89abb
--- /dev/null
+++ b/fastlanes/src/ssb/fls_q31.cu
@@ -0,0 +1,626 @@
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include "crystal/crystal.cuh"
+#include "crystal_ssb_utils.h"
+#include "cub/test/test_util.h"
+#include "fastlanes/join.cuh"
+#include "fls_gen/unpack/unpack.cuh"
+#include "gpu_utils.h"
+#include "ssb_utils.h"
+#include "gtest/gtest.h"
+#include <cub/util_allocator.cuh>
+#include <fls_gen/pack/pack.hpp>
+#include <fls_gen/unpack/hardcoded_16.cuh>
+#include <iostream>
+#include <query/query_31.hpp>
+#include <stdio.h>
+
+using namespace std;
+using namespace fastlanes::gpu;
+using namespace fastlanes;
+
+using namespace std;
+
+// File-wide copy of ssb::ssb_q31_10. The probe kernels below read the column bit widths
+// from its `ssb` member.
+auto query_mtd = ssb::ssb_q31_10;
+
+// Q3.1 probe kernel, variant 1: every packed column is decoded into one shared scratch
+// buffer and then copied into per-thread arrays with BlockLoad.
+//
+// One block handles one tile of BLOCK_THREADS * ITEMS_PER_THREAD lineorder rows. suppkey,
+// custkey and orderdate are probed against ht_s, ht_c and ht_d in that order; rows still
+// flagged afterwards add their revenue to the result slot selected by
+// (s_nation, c_nation, year).
+//
+// res is addressed with a stride of six ints per slot: year, c_nation, s_nation at ints
+// 0..2 and a 64-bit revenue sum starting at int 4. Int 3 is not written by this kernel.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void probe_v1(int* lo_orderdate,
+                         int* lo_custkey,
+                         int* lo_suppkey,
+                         int* lo_revenue,
+                         int  lo_len,
+                         int* ht_s,
+                         int  s_len,
+                         int* ht_c,
+                         int  c_len,
+                         int* ht_d,
+                         int  d_len,
+                         int* res) {
+	int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
+	// Per-thread arrays; the bounds check in the final loop treats element ITEM of thread t
+	// as tile position t + BLOCK_THREADS * ITEM.
+	int items[ITEMS_PER_THREAD];
+	int selection_flags[ITEMS_PER_THREAD];
+	int c_nation[ITEMS_PER_THREAD];
+	int s_nation[ITEMS_PER_THREAD];
+	int year[ITEMS_PER_THREAD];
+	int revenue[ITEMS_PER_THREAD];
+
+	int tile_offset    = blockIdx.x * TILE_SIZE;
+	int num_tiles      = (lo_len + TILE_SIZE - 1) / TILE_SIZE;
+	int num_tile_items = TILE_SIZE;
+
+	// Only the last tile can hold fewer than TILE_SIZE rows.
+	if (blockIdx.x == num_tiles - 1) { num_tile_items = lo_len - tile_offset; }
+
+	InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags);
+	// Shared scratch buffer reused for all four columns below.
+	// NOTE(review): no __syncthreads() appears in this kernel between the unpack_device writes
+	// and the BlockLoad reads of `unpacked` -- confirm unpack_device synchronises internally.
+	static __shared__ int unpacked[1024];
+
+	// Each block's packed input starts bit width * 32 ints after the previous block's.
+	int suppkey_tile_offset = blockIdx.x * query_mtd.ssb.lo_chosen_suppkey_bw * 32;
+	unpack_device(lo_suppkey + suppkey_tile_offset, unpacked, query_mtd.ssb.lo_chosen_suppkey_bw);
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(unpacked, items, num_tile_items);
+	BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    items, s_nation, selection_flags, ht_s, s_len, num_tile_items);
+
+	int custkey_tile_offset = blockIdx.x * query_mtd.ssb.lo_chosen_custkey_bw * 32;
+	unpack_device(lo_custkey + custkey_tile_offset, unpacked, query_mtd.ssb.lo_chosen_custkey_bw);
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(unpacked, items, num_tile_items);
+	BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    items, c_nation, selection_flags, ht_c, c_len, num_tile_items);
+
+	// The date probe passes an extra 0 argument before the item count
+	// (presumably the key minimum -- TODO confirm against BlockProbeAndPHT_2).
+	int orderdate_tile_offset = blockIdx.x * query_mtd.ssb.lo_orderdate_bw * 32;
+	unpack_device(lo_orderdate + orderdate_tile_offset, unpacked, query_mtd.ssb.lo_orderdate_bw);
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(unpacked, items, num_tile_items);
+	BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    items, year, selection_flags, ht_d, d_len, 0, num_tile_items);
+
+	// Revenue is decoded and loaded for the whole tile, whatever the selection flags say.
+	int revenue_tile_offset = blockIdx.x * query_mtd.ssb.lo_revenue_bw * 32;
+	unpack_device(lo_revenue + revenue_tile_offset, unpacked, query_mtd.ssb.lo_revenue_bw);
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(unpacked, revenue, num_tile_items);
+
+#pragma unroll
+	for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
+		if ((threadIdx.x + (BLOCK_THREADS * ITEM)) < num_tile_items) {
+			if (selection_flags[ITEM]) {
+				// Slot index derived from (s_nation, c_nation, year - 1992), modulo 7 * 25 * 25.
+				int hash = (s_nation[ITEM] * 25 * 7 + c_nation[ITEM] * 7 + (year[ITEM] - 1992)) %
+				           ((1998 - 1992 + 1) * 25 * 25);
+				res[hash * 6]     = year[ITEM];
+				res[hash * 6 + 1] = c_nation[ITEM];
+				res[hash * 6 + 2] = s_nation[ITEM];
+				// The sum is accumulated as one 64-bit value spanning ints 4 and 5 of the slot.
+				atomicAdd(reinterpret_cast<unsigned long long*>(&res[hash * 6 + 4]), (long long)(revenue[ITEM]));
+			}
+		}
+	}
+}
+
+// Q3.1 probe kernel, variant 2: the payloads produced by the probes (s_nation, c_nation,
+// year) and the decoded revenue live in shared arrays instead of per-thread arrays.
+//
+// One block handles one tile of BLOCK_THREADS * ITEMS_PER_THREAD lineorder rows. The three
+// key columns are decoded into shared_1_revenue, copied to `items` and probed; the
+// revenue column is decoded into shared_1_revenue last and read from there in the final loop.
+//
+// res uses the same six-int slot stride as probe_v1: year, c_nation, s_nation at ints 0..2
+// and a 64-bit revenue sum starting at int 4.
+//
+// NOTE(review): the shared arrays hold 1024 ints each and are indexed up to
+// BLOCK_THREADS * ITEMS_PER_THREAD - 1 -- confirm the kernel is only instantiated with a
+// tile size of at most 1024.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void probe_v2(int* lo_orderdate,
+                         int* lo_custkey,
+                         int* lo_suppkey,
+                         int* lo_revenue,
+                         int  lo_len,
+                         int* ht_s,
+                         int  s_len,
+                         int* ht_c,
+                         int  c_len,
+                         int* ht_d,
+                         int  d_len,
+                         int* res) {
+	int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
+	// Per-thread key and flag arrays; the payload columns are kept in the shared arrays below.
+	int items[ITEMS_PER_THREAD];
+	int selection_flags[ITEMS_PER_THREAD];
+	static __shared__ int shared_4_c_nation[1024];
+	static __shared__ int shared_3_s_nation[1024];
+	static __shared__ int shared_2_year[1024];
+	// Doubles as the decode scratch buffer for the three key columns before it finally holds revenue.
+	static __shared__ int shared_1_revenue[1024];
+
+	int tile_offset    = blockIdx.x * TILE_SIZE;
+	int num_tiles      = (lo_len + TILE_SIZE - 1) / TILE_SIZE;
+	int num_tile_items = TILE_SIZE;
+
+	// Only the last tile can hold fewer than TILE_SIZE rows.
+	if (blockIdx.x == num_tiles - 1) { num_tile_items = lo_len - tile_offset; }
+
+	InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags);
+
+	// NOTE(review): no __syncthreads() appears in this kernel between the successive writes
+	// to and reads from shared_1_revenue -- confirm the helpers synchronise internally.
+	// Each block's packed input starts bit width * 32 ints after the previous block's.
+	int suppkey_tile_offset = blockIdx.x * query_mtd.ssb.lo_chosen_suppkey_bw * 32;
+	unpack_device(lo_suppkey + suppkey_tile_offset, shared_1_revenue, query_mtd.ssb.lo_chosen_suppkey_bw);
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(shared_1_revenue, items, num_tile_items);
+	BlockProbeAndPHT_2_R_S<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    items, shared_3_s_nation, selection_flags, ht_s, s_len, num_tile_items);
+
+	int custkey_tile_offset = blockIdx.x * query_mtd.ssb.lo_chosen_custkey_bw * 32;
+	unpack_device(lo_custkey + custkey_tile_offset, shared_1_revenue, query_mtd.ssb.lo_chosen_custkey_bw);
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(shared_1_revenue, items, num_tile_items);
+	BlockProbeAndPHT_2_R_S<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    items, shared_4_c_nation, selection_flags, ht_c, c_len, num_tile_items);
+
+	// The date probe passes an extra int(0) argument before the item count
+	// (presumably the key minimum -- TODO confirm against BlockProbeAndPHT_2_R_S).
+	int orderdate_tile_offset = blockIdx.x * query_mtd.ssb.lo_orderdate_bw * 32;
+	unpack_device(lo_orderdate + orderdate_tile_offset, shared_1_revenue, query_mtd.ssb.lo_orderdate_bw);
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(shared_1_revenue, items, num_tile_items);
+	BlockProbeAndPHT_2_R_S<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    items, shared_2_year, selection_flags, ht_d, d_len, int(0), num_tile_items);
+
+	// Revenue stays in shared memory; there is no BlockLoad into a per-thread array here.
+	int revenue_tile_offset = blockIdx.x * query_mtd.ssb.lo_revenue_bw * 32;
+	unpack_device(lo_revenue + revenue_tile_offset, shared_1_revenue, query_mtd.ssb.lo_revenue_bw);
+
+#pragma unroll
+	for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
+		// Position of this thread's ITEM-th element inside the shared arrays.
+		auto shared_idx = BLOCK_THREADS * ITEM + threadIdx.x;
+		if ((threadIdx.x + (BLOCK_THREADS * ITEM)) < num_tile_items) {
+			if (selection_flags[ITEM]) {
+				// Slot index derived from (s_nation, c_nation, year - 1992), modulo 7 * 25 * 25.
+				int hash = (shared_3_s_nation[shared_idx] * 25 * 7 + shared_4_c_nation[shared_idx] * 7 +
+				            (shared_2_year[shared_idx] - 1992)) %
+				           ((1998 - 1992 + 1) * 25 * 25);
+				res[hash * 6]     = shared_2_year[shared_idx];
+				res[hash * 6 + 1] = shared_4_c_nation[shared_idx];
+				res[hash * 6 + 2] = shared_3_s_nation[shared_idx];
+				// The sum is accumulated as one 64-bit value spanning ints 4 and 5 of the slot.
+				atomicAdd(reinterpret_cast<unsigned long long*>(&res[hash * 6 + 4]),
+				          (long long)(shared_1_revenue[shared_idx]));
+			}
+		}
+	}
+}
+
+// probe_v3: each thread block processes one tile of BLOCK_THREADS * ITEMS_PER_THREAD lineorder rows.
+// lo_suppkey, lo_custkey and lo_orderdate are decoded with unpack_8_at_a_time::unpack_device and handed
+// to BlockProbeAndPHT_2 together with ht_s, ht_c and ht_d; lo_revenue is read as a plain int column.
+// Every in-range row whose selection flag is still set adds its revenue to a 6-int group slot of res.
+// runQuery launches it with one block per tile.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void probe_v3(int* lo_orderdate, int* lo_custkey, int* lo_suppkey, int* lo_revenue, int lo_len,
+                         int* ht_s, int s_len, int* ht_c, int c_len, int* ht_d, int d_len, int* res) {
+	constexpr int TILE_SIZE  = BLOCK_THREADS * ITEMS_PER_THREAD;
+	constexpr int NUM_GROUPS = (1998 - 1992 + 1) * 25 * 25; // modulus of the group index into res
+
+	// Per-thread working arrays, ITEMS_PER_THREAD values each.
+	int keys[ITEMS_PER_THREAD];
+	int selection_flags[ITEMS_PER_THREAD];
+	int cust_nation[ITEMS_PER_THREAD];
+	int supp_nation[ITEMS_PER_THREAD];
+	int order_year[ITEMS_PER_THREAD];
+	int rev[ITEMS_PER_THREAD];
+
+	// Only the last tile can hold fewer than TILE_SIZE rows.
+	int tile_offset    = blockIdx.x * TILE_SIZE;
+	int num_tiles      = (lo_len + TILE_SIZE - 1) / TILE_SIZE;
+	int num_tile_items = (blockIdx.x == num_tiles - 1) ? lo_len - tile_offset : TILE_SIZE;
+
+	InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags);
+
+	// The packed tile of a column with bit width bw starts blockIdx.x * bw * 8 ints into that column.
+	int supp_start = blockIdx.x * query_mtd.ssb.lo_chosen_suppkey_bw * 8;
+	unpack_8_at_a_time::unpack_device(lo_suppkey + supp_start, keys, query_mtd.ssb.lo_chosen_suppkey_bw);
+	BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    keys, supp_nation, selection_flags, ht_s, s_len, num_tile_items);
+
+	int cust_start = blockIdx.x * query_mtd.ssb.lo_chosen_custkey_bw * 8;
+	unpack_8_at_a_time::unpack_device(lo_custkey + cust_start, keys, query_mtd.ssb.lo_chosen_custkey_bw);
+	BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    keys, cust_nation, selection_flags, ht_c, c_len, num_tile_items);
+
+	// The date probe takes one extra argument (0) ahead of num_tile_items.
+	int date_start = blockIdx.x * query_mtd.ssb.lo_orderdate_bw * 8;
+	unpack_8_at_a_time::unpack_device(lo_orderdate + date_start, keys, query_mtd.ssb.lo_orderdate_bw);
+	BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    keys, order_year, selection_flags, ht_d, d_len, 0, num_tile_items);
+
+	// Revenue is not unpacked in this variant; it is loaded straight from lo_revenue.
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_revenue + tile_offset, rev, num_tile_items);
+
+	// Aggregate. Slot layout in res: [0] year, [1] c_nation, [2] s_nation, [3] not written here,
+	// [4..5] 64-bit revenue sum.
+#pragma unroll
+	for (int item = 0; item < ITEMS_PER_THREAD; ++item) {
+		const bool in_tile = (threadIdx.x + (BLOCK_THREADS * item)) < num_tile_items;
+		if (!in_tile || !selection_flags[item]) { continue; }
+
+		int group = (supp_nation[item] * 25 * 7 + cust_nation[item] * 7 + (order_year[item] - 1992)) %
+		            NUM_GROUPS;
+		int* slot = res + group * 6;
+		slot[0]   = order_year[item];
+		slot[1]   = cust_nation[item];
+		slot[2]   = supp_nation[item];
+		atomicAdd(reinterpret_cast<unsigned long long*>(slot + 4), (long long)(rev[item]));
+	}
+}
+
+// probe_v4: each thread block processes one tile of BLOCK_THREADS * ITEMS_PER_THREAD lineorder rows.
+// lo_suppkey, lo_custkey and lo_orderdate are decoded with unpack_8_at_a_time::unpack_device and handed
+// to BlockProbeAndPHT_2 together with ht_s, ht_c and ht_d; lo_revenue is read through BlockPredLoad.
+// Every in-range row whose selection flag is still set adds its revenue to a 6-int group slot of res.
+// runQuery launches it with one block per tile.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void probe_v4(int* lo_orderdate, int* lo_custkey, int* lo_suppkey, int* lo_revenue, int lo_len,
+                         int* ht_s, int s_len, int* ht_c, int c_len, int* ht_d, int d_len, int* res) {
+	constexpr int TILE_SIZE  = BLOCK_THREADS * ITEMS_PER_THREAD;
+	constexpr int NUM_GROUPS = (1998 - 1992 + 1) * 25 * 25; // modulus of the group index into res
+
+	// Per-thread working arrays, ITEMS_PER_THREAD values each.
+	int keys[ITEMS_PER_THREAD];
+	int selection_flags[ITEMS_PER_THREAD];
+	int cust_nation[ITEMS_PER_THREAD];
+	int supp_nation[ITEMS_PER_THREAD];
+	int order_year[ITEMS_PER_THREAD];
+	int rev[ITEMS_PER_THREAD];
+
+	// Only the last tile can hold fewer than TILE_SIZE rows.
+	int tile_offset    = blockIdx.x * TILE_SIZE;
+	int num_tiles      = (lo_len + TILE_SIZE - 1) / TILE_SIZE;
+	int num_tile_items = (blockIdx.x == num_tiles - 1) ? lo_len - tile_offset : TILE_SIZE;
+
+	InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags);
+
+	// The packed tile of a column with bit width bw starts blockIdx.x * bw * 8 ints into that column.
+	int supp_start = blockIdx.x * query_mtd.ssb.lo_chosen_suppkey_bw * 8;
+	unpack_8_at_a_time::unpack_device(lo_suppkey + supp_start, keys, query_mtd.ssb.lo_chosen_suppkey_bw);
+	BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    keys, supp_nation, selection_flags, ht_s, s_len, num_tile_items);
+
+	int cust_start = blockIdx.x * query_mtd.ssb.lo_chosen_custkey_bw * 8;
+	unpack_8_at_a_time::unpack_device(lo_custkey + cust_start, keys, query_mtd.ssb.lo_chosen_custkey_bw);
+	BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    keys, cust_nation, selection_flags, ht_c, c_len, num_tile_items);
+
+	// The date probe takes one extra argument (0) ahead of num_tile_items.
+	int date_start = blockIdx.x * query_mtd.ssb.lo_orderdate_bw * 8;
+	unpack_8_at_a_time::unpack_device(lo_orderdate + date_start, keys, query_mtd.ssb.lo_orderdate_bw);
+	BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    keys, order_year, selection_flags, ht_d, d_len, 0, num_tile_items);
+
+	// Revenue is not unpacked in this variant. BlockPredLoad also receives the selection flags,
+	// presumably so that rows already filtered out are not loaded - confirm against the helper.
+	BlockPredLoad<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    lo_revenue + tile_offset, rev, num_tile_items, selection_flags);
+
+	// Aggregate. Slot layout in res: [0] year, [1] c_nation, [2] s_nation, [3] not written here,
+	// [4..5] 64-bit revenue sum.
+#pragma unroll
+	for (int item = 0; item < ITEMS_PER_THREAD; ++item) {
+		const bool in_tile = (threadIdx.x + (BLOCK_THREADS * item)) < num_tile_items;
+		if (!in_tile || !selection_flags[item]) { continue; }
+
+		int group = (supp_nation[item] * 25 * 7 + cust_nation[item] * 7 + (order_year[item] - 1992)) %
+		            NUM_GROUPS;
+		int* slot = res + group * 6;
+		slot[0]   = order_year[item];
+		slot[1]   = cust_nation[item];
+		slot[2]   = supp_nation[item];
+		atomicAdd(reinterpret_cast<unsigned long long*>(slot + 4), (long long)(rev[item]));
+	}
+}
+
+// build_hashtable_s: per tile, runs the BlockPredEQ predicate with constant 2 on filter_col and passes
+// dim_key, dim_val and the resulting flags to BlockBuildSelectivePHT_2 with hash_table / num_slots.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void
+build_hashtable_s(int* filter_col, int* dim_key, int* dim_val, int num_tuples, int* hash_table, int num_slots) {
+	constexpr int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
+	int           keys[ITEMS_PER_THREAD];
+	int           vals[ITEMS_PER_THREAD];
+	int           selection_flags[ITEMS_PER_THREAD];
+
+	// Only the last tile can hold fewer than TILE_SIZE rows.
+	int tile_offset    = blockIdx.x * TILE_SIZE;
+	int num_tiles      = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+	int num_tile_items = (blockIdx.x == num_tiles - 1) ? num_tuples - tile_offset : TILE_SIZE;
+
+	// keys[] first holds the filter column and is then reused for dim_key.
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + tile_offset, keys, num_tile_items);
+	BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, 2, selection_flags, num_tile_items);
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_offset, keys, num_tile_items);
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_offset, vals, num_tile_items);
+	BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    keys, vals, selection_flags, hash_table, num_slots, num_tile_items);
+}
+
+// build_hashtable_c: per tile, runs the BlockPredEQ predicate with constant 2 on filter_col and passes
+// dim_key, dim_val and the resulting flags to BlockBuildSelectivePHT_2 with hash_table / num_slots.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void
+build_hashtable_c(int* filter_col, int* dim_key, int* dim_val, int num_tuples, int* hash_table, int num_slots) {
+	constexpr int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
+	int           keys[ITEMS_PER_THREAD];
+	int           vals[ITEMS_PER_THREAD];
+	int           selection_flags[ITEMS_PER_THREAD];
+
+	// Only the last tile can hold fewer than TILE_SIZE rows.
+	int tile_offset    = blockIdx.x * TILE_SIZE;
+	int num_tiles      = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+	int num_tile_items = (blockIdx.x == num_tiles - 1) ? num_tuples - tile_offset : TILE_SIZE;
+
+	// keys[] first holds the filter column and is then reused for dim_key.
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + tile_offset, keys, num_tile_items);
+	BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, 2, selection_flags, num_tile_items);
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_offset, keys, num_tile_items);
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_offset, vals, num_tile_items);
+	BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    keys, vals, selection_flags, hash_table, num_slots, num_tile_items);
+}
+
+// build_hashtable_d: per tile, runs the BlockPredGTE(1992) / BlockPredLTE(1997) predicates on dim_val and
+// passes dim_key, dim_val and the flags to BlockBuildSelectivePHT_2 with val_min as the extra argument.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void
+build_hashtable_d(int* dim_key, int* dim_val, int num_tuples, int* hash_table, int num_slots, int val_min) {
+	constexpr int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
+	int           years[ITEMS_PER_THREAD];
+	int           keys[ITEMS_PER_THREAD];
+	int           selection_flags[ITEMS_PER_THREAD];
+
+	int tile_offset    = blockIdx.x * TILE_SIZE;
+	int num_tiles      = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+	int num_tile_items = (blockIdx.x == num_tiles - 1) ? num_tuples - tile_offset : TILE_SIZE;
+
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_offset, years, num_tile_items);
+	BlockPredGTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(years, 1992, selection_flags, num_tile_items);
+	// NOTE(review): confirm BlockPredLTE ANDs into the flags; if it overwrites them the 1992 bound is lost.
+	BlockPredLTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(years, 1997, selection_flags, num_tile_items);
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_offset, keys, num_tile_items);
+	// Use the caller-supplied val_min (runQuery passes 19920101) instead of a hard-coded literal.
+	BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    keys, years, selection_flags, hash_table, num_slots, val_min, num_tile_items);
+}
+
+/**
+ * Builds the supplier, customer and date hash tables, launches the probe kernel selected by
+ * `version` (1-4) and checks the aggregated groups against ssb::ssb_q31_10.reuslt.
+ *
+ * The column pointers are handed directly to kernels, so they are expected to be device pointers.
+ * Throws std::runtime_error for any `version` outside 1..4.
+ */
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+void runQuery(int* lo_orderdate, int* lo_custkey, int* lo_suppkey, int* lo_revenue, int lo_len,
+              int* d_datekey, int* d_year, int d_len,
+              int* s_suppkey, int* s_region, int* s_nation, int s_len,
+              int* c_custkey, int* c_region, int* c_nation, int c_len,
+              cub::CachingDeviceAllocator& g_allocator, int version) {
+	int *ht_d, *ht_c, *ht_s;
+	int  d_val_len = 19981230 - 19920101 + 1;
+	CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_d, 2 * d_val_len * sizeof(int)));
+	CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_s, 2 * s_len * sizeof(int)));
+	CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_c, 2 * c_len * sizeof(int)));
+
+	// Zero all three tables (ht_c included) before the build kernels write into them.
+	CubDebugExit(cudaMemset(ht_d, 0, 2 * d_val_len * sizeof(int)));
+	CubDebugExit(cudaMemset(ht_s, 0, 2 * s_len * sizeof(int)));
+	CubDebugExit(cudaMemset(ht_c, 0, 2 * c_len * sizeof(int)));
+
+	int tile_items = BLOCK_THREADS * ITEMS_PER_THREAD;
+	build_hashtable_s<BLOCK_THREADS, ITEMS_PER_THREAD>
+	    <<<(s_len + tile_items - 1) / tile_items, BLOCK_THREADS>>>(s_region, s_suppkey, s_nation, s_len, ht_s, s_len);
+
+	build_hashtable_c<BLOCK_THREADS, ITEMS_PER_THREAD>
+	    <<<(c_len + tile_items - 1) / tile_items, BLOCK_THREADS>>>(c_region, c_custkey, c_nation, c_len, ht_c, c_len);
+
+	int d_val_min = 19920101;
+	build_hashtable_d<BLOCK_THREADS, ITEMS_PER_THREAD><<<(d_len + tile_items - 1) / tile_items, BLOCK_THREADS>>>(
+	    d_datekey, d_year, d_len, ht_d, d_val_len, d_val_min);
+
+	int* res;
+	int  res_size       = ((1998 - 1992 + 1) * 25 * 25);
+	int  res_array_size = res_size * 6;
+	CubDebugExit(g_allocator.DeviceAllocate((void**)&res, res_array_size * sizeof(int)));
+	CubDebugExit(cudaMemset(res, 0, res_array_size * sizeof(int)));
+
+	// Probe. probe_v2 is only instantiated for ITEMS_PER_THREAD == 32 and probe_v3 / probe_v4 only for
+	// ITEMS_PER_THREAD == 8; any other version / template combination launches no kernel.
+	if (version == 1) {
+		probe_v1<BLOCK_THREADS, ITEMS_PER_THREAD><<<(lo_len + tile_items - 1) / tile_items, BLOCK_THREADS>>>(
+		    lo_orderdate, lo_custkey, lo_suppkey, lo_revenue, lo_len, ht_s, s_len, ht_c, c_len, ht_d, d_val_len, res);
+	} else if (version == 2) {
+		if constexpr (ITEMS_PER_THREAD == 32) {
+			probe_v2<BLOCK_THREADS, ITEMS_PER_THREAD>
+			    <<<(lo_len + tile_items - 1) / tile_items, BLOCK_THREADS>>>(lo_orderdate,
+			                                                                lo_custkey,
+			                                                                lo_suppkey,
+			                                                                lo_revenue,
+			                                                                lo_len,
+			                                                                ht_s,
+			                                                                s_len,
+			                                                                ht_c,
+			                                                                c_len,
+			                                                                ht_d,
+			                                                                d_val_len,
+			                                                                res);
+		}
+	} else if (version == 3) {
+		if constexpr (ITEMS_PER_THREAD == 8) {
+			probe_v3<BLOCK_THREADS, ITEMS_PER_THREAD>
+			    <<<(lo_len + tile_items - 1) / tile_items, BLOCK_THREADS>>>(lo_orderdate,
+			                                                                lo_custkey,
+			                                                                lo_suppkey,
+			                                                                lo_revenue,
+			                                                                lo_len,
+			                                                                ht_s,
+			                                                                s_len,
+			                                                                ht_c,
+			                                                                c_len,
+			                                                                ht_d,
+			                                                                d_val_len,
+			                                                                res);
+		}
+	} else if (version == 4) {
+		if constexpr (ITEMS_PER_THREAD == 8) {
+			probe_v4<BLOCK_THREADS, ITEMS_PER_THREAD>
+			    <<<(lo_len + tile_items - 1) / tile_items, BLOCK_THREADS>>>(lo_orderdate,
+			                                                                lo_custkey,
+			                                                                lo_suppkey,
+			                                                                lo_revenue,
+			                                                                lo_len,
+			                                                                ht_s,
+			                                                                s_len,
+			                                                                ht_c,
+			                                                                c_len,
+			                                                                ht_d,
+			                                                                d_val_len,
+			                                                                res);
+		}
+	} else {
+		throw std::runtime_error("this version does not exist");
+	}
+
+	// The blocking copy also waits for the kernels queued above on the default stream.
+	int* h_res = new int[res_array_size];
+	CubDebugExit(cudaMemcpy(h_res, res, res_array_size * sizeof(int), cudaMemcpyDeviceToHost));
+
+	// The device-side buffers are not needed once the result is on the host.
+	CubDebugExit(g_allocator.DeviceFree(res));
+	CubDebugExit(g_allocator.DeviceFree(ht_c));
+	CubDebugExit(g_allocator.DeviceFree(ht_s));
+	CubDebugExit(g_allocator.DeviceFree(ht_d));
+
+	// A group slot counts as populated when its year field (int 0 of the slot) is non-zero.
+	ssb::SSBQuery3ResultTable result_of_query;
+	for (int i = 0; i < res_size; i++) {
+		if (h_res[6 * i] != 0) {
+			result_of_query.emplace_back(h_res[6 * i],
+			                             h_res[6 * i + 1],
+			                             h_res[6 * i + 2],
+			                             reinterpret_cast<unsigned long long*>(&h_res[6 * i + 4])[0]);
+		}
+	}
+	// Release the host copy before the assertions: a failing ASSERT_EQ returns from this function.
+	delete[] h_res;
+
+	ASSERT_EQ(result_of_query.size(), ssb::ssb_q31_10.reuslt.size());
+	ASSERT_EQ(result_of_query, ssb::ssb_q31_10.reuslt);
+}
+
+/**
+ * Entry point. argv[1] selects the probe kernel version (1-4): versions 1 and 2 call
+ * runQuery<32, 32> with a bit-packed lo_revenue, versions 3 and 4 call runQuery<32, 8> with the
+ * plain lo_revenue column.
+ * Throws std::runtime_error when the version argument is missing or outside 1..4.
+ */
+int main(int argc, char* argv[]) {
+	// Validate the command line before any column is loaded.
+	if (argc < 2) { throw std::runtime_error("usage: <binary> <version: 1-4>"); }
+	const int version = std::stoi(argv[1]);
+	if (version < 1 || version > 4) { throw std::runtime_error("this version does not exist"); }
+
+	auto hard_coded     = query_mtd.ssb;
+	int* h_lo_orderdate = loadColumn<int>("lo_orderdate", LO_LEN);
+	int* h_lo_custkey   = loadColumn<int>("lo_custkey", LO_LEN);
+	int* h_lo_suppkey   = loadColumn<int>("lo_suppkey", LO_LEN);
+	int* h_lo_revenue   = loadColumn<int>("lo_revenue", LO_LEN);
+
+	auto n_vec = hard_coded.n_vec;
+
+	// orderdate is packed as an offset from lo_orderdate_min. The buffer is value-initialised so that any
+	// slot past LO_LEN which pack() still reads holds 0 instead of an indeterminate value.
+	int* tmp = new int[n_vec * 1024]();
+	for (size_t i {0}; i < LO_LEN; ++i) {
+		tmp[i] = h_lo_orderdate[i] - hard_coded.lo_orderdate_min;
+	}
+
+	// Output buffers for the packed columns.
+	const int* h_enc_lo_orderdate = new int[n_vec * 1024];
+	const int* h_enc_lo_custkey   = new int[n_vec * 1024];
+	const int* h_enc_lo_suppkey   = new int[n_vec * 1024];
+	const int* h_enc_lo_revenue   = new int[n_vec * 1024];
+
+	auto* orderdate_in = const_cast<const int32_t*>(tmp);
+	auto* custkey_in   = const_cast<int32_t*>(h_lo_custkey);
+	auto* suppkey_in   = const_cast<int32_t*>(h_lo_suppkey);
+	auto* revenue_in   = const_cast<int32_t*>(h_lo_revenue);
+
+	auto* orderdate_out = const_cast<int32_t*>(h_enc_lo_orderdate);
+	auto* custkey_out   = const_cast<int32_t*>(h_enc_lo_custkey);
+	auto* suppkey_out   = const_cast<int32_t*>(h_enc_lo_suppkey);
+	auto* revenue_out   = const_cast<int32_t*>(h_enc_lo_revenue);
+
+	// Pack one vector per iteration: every input cursor advances by 1024 values and every output cursor
+	// by bit_width * 32 ints.
+	// NOTE(review): the custkey/suppkey/revenue inputs were loaded with LO_LEN elements but n_vec * 1024
+	// values are read here - confirm loadColumn pads the buffers when LO_LEN is not a multiple of 1024.
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(orderdate_in, orderdate_out, hard_coded.lo_orderdate_bw);
+		orderdate_in  = orderdate_in + 1024;
+		orderdate_out = orderdate_out + (hard_coded.lo_orderdate_bw * 32);
+
+		generated::pack::fallback::scalar::pack(custkey_in, custkey_out, hard_coded.lo_chosen_custkey_bw);
+		custkey_in  = custkey_in + 1024;
+		custkey_out = custkey_out + (hard_coded.lo_chosen_custkey_bw * 32);
+
+		generated::pack::fallback::scalar::pack(suppkey_in, suppkey_out, hard_coded.lo_chosen_suppkey_bw);
+		suppkey_in  = suppkey_in + 1024;
+		suppkey_out = suppkey_out + (hard_coded.lo_chosen_suppkey_bw * 32);
+
+		generated::pack::fallback::scalar::pack(revenue_in, revenue_out, hard_coded.lo_revenue_bw);
+		revenue_in  = revenue_in + 1024;
+		revenue_out = revenue_out + (hard_coded.lo_revenue_bw * 32);
+	}
+
+	// The offset copy of orderdate was only needed as packing input.
+	delete[] tmp;
+
+	// NOTE(review): n_tup_line_order ints are uploaded per packed column; presumably that covers the
+	// n_vec * bit_width * 32 ints written above - confirm.
+	int* d_lo_orderdate = loadToGPU<int32_t>(h_enc_lo_orderdate, hard_coded.n_tup_line_order, g_allocator);
+	int* d_lo_custkey   = loadToGPU<int32_t>(h_enc_lo_custkey, hard_coded.n_tup_line_order, g_allocator);
+	int* d_lo_suppkey   = loadToGPU<int32_t>(h_enc_lo_suppkey, hard_coded.n_tup_line_order, g_allocator);
+	int* d_lo_revenue;
+
+	// Versions 1 and 2 get the packed revenue column, versions 3 and 4 the plain one.
+	// The final else is unreachable after the check at the top and only keeps d_lo_revenue definitely set.
+	if (version == 1 || version == 2) {
+		d_lo_revenue = loadToGPU<int32_t>(h_enc_lo_revenue, hard_coded.n_tup_line_order, g_allocator);
+	} else if (version == 3 || version == 4) {
+		d_lo_revenue = loadToGPU<int32_t>(h_lo_revenue, hard_coded.n_tup_line_order, g_allocator);
+	} else {
+		throw std::runtime_error("this version does not exist");
+	}
+
+	// Dimension tables are loaded and uploaded as plain int columns.
+	int* h_d_datekey = loadColumn<int>("d_datekey", D_LEN);
+	int* h_d_year    = loadColumn<int>("d_year", D_LEN);
+
+	int* h_s_suppkey = loadColumn<int>("s_suppkey", S_LEN);
+	int* h_s_nation  = loadColumn<int>("s_nation", S_LEN);
+	int* h_s_region  = loadColumn<int>("s_region", S_LEN);
+
+	int* h_c_custkey = loadColumn<int>("c_custkey", C_LEN);
+	int* h_c_nation  = loadColumn<int>("c_nation", C_LEN);
+	int* h_c_region  = loadColumn<int>("c_region", C_LEN);
+
+	int* d_d_datekey = loadToGPU<int>(h_d_datekey, D_LEN, g_allocator);
+	int* d_d_year    = loadToGPU<int>(h_d_year, D_LEN, g_allocator);
+
+	int* d_s_suppkey = loadToGPU<int>(h_s_suppkey, S_LEN, g_allocator);
+	int* d_s_region  = loadToGPU<int>(h_s_region, S_LEN, g_allocator);
+	int* d_s_nation  = loadToGPU<int>(h_s_nation, S_LEN, g_allocator);
+
+	int* d_c_custkey = loadToGPU<int>(h_c_custkey, C_LEN, g_allocator);
+	int* d_c_region  = loadToGPU<int>(h_c_region, C_LEN, g_allocator);
+	int* d_c_nation  = loadToGPU<int>(h_c_nation, C_LEN, g_allocator);
+
+	// Run
+	if (version == 1 || version == 2) {
+		runQuery<32, 32>(d_lo_orderdate, d_lo_custkey, d_lo_suppkey, d_lo_revenue, LO_LEN,
+		                 d_d_datekey, d_d_year, D_LEN,
+		                 d_s_suppkey, d_s_region, d_s_nation, S_LEN,
+		                 d_c_custkey, d_c_region, d_c_nation, C_LEN,
+		                 g_allocator, version);
+	} else if (version == 3 || version == 4) {
+		runQuery<32, 8>(d_lo_orderdate, d_lo_custkey, d_lo_suppkey, d_lo_revenue, LO_LEN,
+		                d_d_datekey, d_d_year, D_LEN,
+		                d_s_suppkey, d_s_region, d_s_nation, S_LEN,
+		                d_c_custkey, d_c_region, d_c_nation, C_LEN,
+		                g_allocator, version);
+	} else {
+		throw std::runtime_error("this version does not exist");
+	}
+
+	// Release the host-side packing buffers allocated above.
+	delete[] h_enc_lo_orderdate;
+	delete[] h_enc_lo_custkey;
+	delete[] h_enc_lo_suppkey;
+	delete[] h_enc_lo_revenue;
+}
\ No newline at end of file
diff --git a/fastlanes/src/ssb/fls_q31_bitpacked_opt_v5.cu b/fastlanes/src/ssb/fls_q31_bitpacked_opt_v5.cu
new file mode 100644
index 0000000..e853c49
--- /dev/null
+++ b/fastlanes/src/ssb/fls_q31_bitpacked_opt_v5.cu
@@ -0,0 +1,393 @@
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+#define SORTED
+
+#include "crystal/crystal.cuh"
+#include "crystal_ssb_utils.h"
+#include "cub/test/test_util.h"
+#include "fastlanes/join.cuh"
+#include "fls_gen/unpack/unpack.cuh"
+#include "gpu_utils.h"
+#include "ssb_utils.h"
+#include "gtest/gtest.h"
+#include <cub/util_allocator.cuh>
+#include <fls_gen/pack/pack.hpp>
+#include <fls_gen/unpack/hardcoded_16.cuh>
+#include <iostream>
+#include <query/query_31.hpp>
+#include <stdio.h>
+
+using namespace std;
+using namespace fastlanes::gpu;
+using namespace fastlanes;
+
+using namespace std;
+
+auto query_mtd = ssb::ssb_q31_10;
+
+// probe_v4: each thread block processes one tile of BLOCK_THREADS * ITEMS_PER_THREAD lineorder rows.
+// lo_suppkey and lo_custkey are decoded with the bit widths from query_mtd; lo_orderdate is decoded with
+// a bit width, base and start offset read from lo_orderdate_bw / _base / _offset at blockIdx.x / 4.
+// The decoded keys go to BlockProbeAndPHT_2 with ht_s, ht_c and ht_d; every in-range row whose
+// selection flag is still set adds its revenue to a 6-int group slot of res.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void probe_v4(int* lo_orderdate, int* lo_orderdate_bw, int* lo_orderdate_base,
+                         int* lo_orderdate_offset, int* lo_custkey, int* lo_suppkey, int* lo_revenue,
+                         int lo_len, int* ht_s, int s_len, int* ht_c, int c_len, int* ht_d, int d_len,
+                         int* res) {
+	constexpr int TILE_SIZE  = BLOCK_THREADS * ITEMS_PER_THREAD;
+	constexpr int NUM_GROUPS = (1998 - 1992 + 1) * 25 * 25; // modulus of the group index into res
+
+	// Four consecutive blocks share one entry of the orderdate metadata arrays.
+	int mtd_offset = blockIdx.x / 4;
+
+	// Per-thread working arrays, ITEMS_PER_THREAD values each.
+	int keys[ITEMS_PER_THREAD];
+	int selection_flags[ITEMS_PER_THREAD];
+	int cust_nation[ITEMS_PER_THREAD];
+	int supp_nation[ITEMS_PER_THREAD];
+	int order_year[ITEMS_PER_THREAD];
+	int rev[ITEMS_PER_THREAD];
+
+	// Only the last tile can hold fewer than TILE_SIZE rows.
+	int tile_offset    = blockIdx.x * TILE_SIZE;
+	int num_tiles      = (lo_len + TILE_SIZE - 1) / TILE_SIZE;
+	int num_tile_items = (blockIdx.x == num_tiles - 1) ? lo_len - tile_offset : TILE_SIZE;
+
+	InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags);
+
+	// The packed tile of a column with bit width bw starts blockIdx.x * bw * 8 ints into that column.
+	int supp_start = blockIdx.x * query_mtd.ssb.lo_chosen_suppkey_bw * 8;
+	unpack_8_at_a_time::unpack_device(lo_suppkey + supp_start, keys, query_mtd.ssb.lo_chosen_suppkey_bw);
+	BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    keys, supp_nation, selection_flags, ht_s, s_len, num_tile_items);
+
+	int cust_start = blockIdx.x * query_mtd.ssb.lo_chosen_custkey_bw * 8;
+	unpack_8_at_a_time::unpack_device(lo_custkey + cust_start, keys, query_mtd.ssb.lo_chosen_custkey_bw);
+	BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    keys, cust_nation, selection_flags, ht_c, c_len, num_tile_items);
+
+	// orderdate: the tile starts at the metadata entry's offset plus (blockIdx.x % 4) * bw * 8 ints,
+	// and the entry's base is added back to every decoded value before probing.
+	int bw         = lo_orderdate_bw[mtd_offset];
+	int base       = lo_orderdate_base[mtd_offset];
+	int date_start = lo_orderdate_offset[mtd_offset] + (blockIdx.x % 4) * bw * 8;
+	unpack_8_at_a_time::unpack_device(lo_orderdate + date_start, keys, bw);
+
+#pragma unroll
+	for (int item = 0; item < ITEMS_PER_THREAD; ++item) {
+		keys[item] += base;
+	}
+
+	BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    keys, order_year, selection_flags, ht_d, d_len, 0, num_tile_items);
+
+	// Revenue is not unpacked here. BlockPredLoad also receives the selection flags,
+	// presumably so that rows already filtered out are not loaded - confirm against the helper.
+	BlockPredLoad<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    lo_revenue + tile_offset, rev, num_tile_items, selection_flags);
+
+	// Aggregate. Slot layout in res: [0] year, [1] c_nation, [2] s_nation, [3] not written here,
+	// [4..5] 64-bit revenue sum.
+#pragma unroll
+	for (int item = 0; item < ITEMS_PER_THREAD; ++item) {
+		const bool in_tile = (threadIdx.x + (BLOCK_THREADS * item)) < num_tile_items;
+		if (!in_tile || !selection_flags[item]) { continue; }
+
+		int group = (supp_nation[item] * 25 * 7 + cust_nation[item] * 7 + (order_year[item] - 1992)) %
+		            NUM_GROUPS;
+		int* slot = res + group * 6;
+		slot[0]   = order_year[item];
+		slot[1]   = cust_nation[item];
+		slot[2]   = supp_nation[item];
+		atomicAdd(reinterpret_cast<unsigned long long*>(slot + 4), (long long)(rev[item]));
+	}
+}
+
+// build_hashtable_s: per tile, runs the BlockPredEQ predicate with constant 2 on filter_col and passes
+// dim_key, dim_val and the resulting flags to BlockBuildSelectivePHT_2 with hash_table / num_slots.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void
+build_hashtable_s(int* filter_col, int* dim_key, int* dim_val, int num_tuples, int* hash_table, int num_slots) {
+	constexpr int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
+	int           keys[ITEMS_PER_THREAD];
+	int           vals[ITEMS_PER_THREAD];
+	int           selection_flags[ITEMS_PER_THREAD];
+
+	// Only the last tile can hold fewer than TILE_SIZE rows.
+	int tile_offset    = blockIdx.x * TILE_SIZE;
+	int num_tiles      = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+	int num_tile_items = (blockIdx.x == num_tiles - 1) ? num_tuples - tile_offset : TILE_SIZE;
+
+	// keys[] first holds the filter column and is then reused for dim_key.
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + tile_offset, keys, num_tile_items);
+	BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, 2, selection_flags, num_tile_items);
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_offset, keys, num_tile_items);
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_offset, vals, num_tile_items);
+	BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    keys, vals, selection_flags, hash_table, num_slots, num_tile_items);
+}
+
+// build_hashtable_c: per tile, runs the BlockPredEQ predicate with constant 2 on filter_col and passes
+// dim_key, dim_val and the resulting flags to BlockBuildSelectivePHT_2 with hash_table / num_slots.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void
+build_hashtable_c(int* filter_col, int* dim_key, int* dim_val, int num_tuples, int* hash_table, int num_slots) {
+	constexpr int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
+	int           keys[ITEMS_PER_THREAD];
+	int           vals[ITEMS_PER_THREAD];
+	int           selection_flags[ITEMS_PER_THREAD];
+
+	// Only the last tile can hold fewer than TILE_SIZE rows.
+	int tile_offset    = blockIdx.x * TILE_SIZE;
+	int num_tiles      = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+	int num_tile_items = (blockIdx.x == num_tiles - 1) ? num_tuples - tile_offset : TILE_SIZE;
+
+	// keys[] first holds the filter column and is then reused for dim_key.
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + tile_offset, keys, num_tile_items);
+	BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, 2, selection_flags, num_tile_items);
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_offset, keys, num_tile_items);
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_offset, vals, num_tile_items);
+	BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    keys, vals, selection_flags, hash_table, num_slots, num_tile_items);
+}
+
+// build_hashtable_d: per tile, runs the BlockPredGTE(1992) / BlockPredLTE(1997) predicates on dim_val and
+// passes dim_key, dim_val and the flags to BlockBuildSelectivePHT_2 with val_min as the extra argument.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void
+build_hashtable_d(int* dim_key, int* dim_val, int num_tuples, int* hash_table, int num_slots, int val_min) {
+	constexpr int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
+	int           years[ITEMS_PER_THREAD];
+	int           keys[ITEMS_PER_THREAD];
+	int           selection_flags[ITEMS_PER_THREAD];
+
+	int tile_offset    = blockIdx.x * TILE_SIZE;
+	int num_tiles      = (num_tuples + TILE_SIZE - 1) / TILE_SIZE;
+	int num_tile_items = (blockIdx.x == num_tiles - 1) ? num_tuples - tile_offset : TILE_SIZE;
+
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_offset, years, num_tile_items);
+	BlockPredGTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(years, 1992, selection_flags, num_tile_items);
+	// NOTE(review): confirm BlockPredLTE ANDs into the flags; if it overwrites them the 1992 bound is lost.
+	BlockPredLTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(years, 1997, selection_flags, num_tile_items);
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_offset, keys, num_tile_items);
+	// Use the caller-supplied val_min (runQuery passes 19920101) instead of a hard-coded literal.
+	BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    keys, years, selection_flags, hash_table, num_slots, val_min, num_tile_items);
+}
+
+/**
+ * Builds the supplier, customer and date hash tables, launches probe_v4 with the per-entry
+ * orderdate metadata (bit width, base, offset) and checks the aggregated groups against
+ * ssb::ssb_q31_10.reuslt.
+ *
+ * The column pointers are handed directly to kernels, so they are expected to be device pointers.
+ */
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+void runQuery(int* lo_orderdate, int* d_lo_orderdate_bw, int* d_lo_orderdate_base, int* d_lo_orderdate_offset,
+              int* lo_custkey, int* lo_suppkey, int* lo_revenue, int lo_len,
+              int* d_datekey, int* d_year, int d_len,
+              int* s_suppkey, int* s_region, int* s_nation, int s_len,
+              int* c_custkey, int* c_region, int* c_nation, int c_len,
+              cub::CachingDeviceAllocator& g_allocator) {
+	int *ht_d, *ht_c, *ht_s;
+	int  d_val_len = 19981230 - 19920101 + 1;
+	CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_d, 2 * d_val_len * sizeof(int)));
+	CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_s, 2 * s_len * sizeof(int)));
+	CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_c, 2 * c_len * sizeof(int)));
+
+	// Zero all three tables (ht_c included) before the build kernels write into them.
+	CubDebugExit(cudaMemset(ht_d, 0, 2 * d_val_len * sizeof(int)));
+	CubDebugExit(cudaMemset(ht_s, 0, 2 * s_len * sizeof(int)));
+	CubDebugExit(cudaMemset(ht_c, 0, 2 * c_len * sizeof(int)));
+
+	// One block per tile of tile_items rows for every kernel below.
+	int tile_items = BLOCK_THREADS * ITEMS_PER_THREAD;
+	build_hashtable_s<BLOCK_THREADS, ITEMS_PER_THREAD>
+	    <<<(s_len + tile_items - 1) / tile_items, BLOCK_THREADS>>>(s_region, s_suppkey, s_nation, s_len, ht_s, s_len);
+
+	build_hashtable_c<BLOCK_THREADS, ITEMS_PER_THREAD>
+	    <<<(c_len + tile_items - 1) / tile_items, BLOCK_THREADS>>>(c_region, c_custkey, c_nation, c_len, ht_c, c_len);
+
+	int d_val_min = 19920101;
+	build_hashtable_d<BLOCK_THREADS, ITEMS_PER_THREAD><<<(d_len + tile_items - 1) / tile_items, BLOCK_THREADS>>>(
+	    d_datekey, d_year, d_len, ht_d, d_val_len, d_val_min);
+
+	int* res;
+	int  res_size       = ((1998 - 1992 + 1) * 25 * 25);
+	int  res_array_size = res_size * 6;
+	CubDebugExit(g_allocator.DeviceAllocate((void**)&res, res_array_size * sizeof(int)));
+	CubDebugExit(cudaMemset(res, 0, res_array_size * sizeof(int)));
+
+	// Probe: res holds res_size group slots of 6 ints each; the date table length passed is
+	// d_val_len (the datekey value range), not the row count d_len.
+	probe_v4<BLOCK_THREADS, ITEMS_PER_THREAD>
+	    <<<(lo_len + tile_items - 1) / tile_items, BLOCK_THREADS>>>(lo_orderdate,
+	                                                                d_lo_orderdate_bw,
+	                                                                d_lo_orderdate_base,
+	                                                                d_lo_orderdate_offset,
+	                                                                lo_custkey,
+	                                                                lo_suppkey,
+	                                                                lo_revenue,
+	                                                                lo_len,
+	                                                                ht_s,
+	                                                                s_len,
+	                                                                ht_c,
+	                                                                c_len,
+	                                                                ht_d,
+	                                                                d_val_len,
+	                                                                res);
+
+	// The blocking copy also waits for the kernels queued above on the default stream.
+	int* h_res = new int[res_array_size];
+	CubDebugExit(cudaMemcpy(h_res, res, res_array_size * sizeof(int), cudaMemcpyDeviceToHost));
+
+	// The device-side buffers are not needed once the result is on the host.
+	CubDebugExit(g_allocator.DeviceFree(res));
+	CubDebugExit(g_allocator.DeviceFree(ht_c));
+	CubDebugExit(g_allocator.DeviceFree(ht_s));
+	CubDebugExit(g_allocator.DeviceFree(ht_d));
+
+	// A group slot counts as populated when its year field (int 0 of the slot) is non-zero.
+	ssb::SSBQuery3ResultTable result_of_query;
+	for (int i = 0; i < res_size; i++) {
+		if (h_res[6 * i] != 0) {
+			result_of_query.emplace_back(h_res[6 * i],
+			                             h_res[6 * i + 1],
+			                             h_res[6 * i + 2],
+			                             reinterpret_cast<unsigned long long*>(&h_res[6 * i + 4])[0]);
+		}
+	}
+	// Release the host copy before the assertions: a failing ASSERT_EQ returns from this function.
+	delete[] h_res;
+
+	ASSERT_EQ(result_of_query.size(), ssb::ssb_q31_10.reuslt.size());
+	ASSERT_EQ(result_of_query, ssb::ssb_q31_10.reuslt);
+}
+
+/**
+ * Main
+ *
+ * Prepares the inputs and runs the query:
+ *  1. load the four lineorder columns from disk,
+ *  2. bit-pack them one 1024-value vector at a time; lo_orderdate is first
+ *     rebased on its global minimum, then on a per-vector base, and is packed
+ *     with a per-vector bit width,
+ *  3. upload the lineorder columns (lo_revenue is uploaded unpacked), the
+ *     per-vector lo_orderdate metadata and the dimension columns to the GPU,
+ *  4. call runQuery.
+ */
+int main(int argc, char* argv[]) {
+	auto hard_coded     = query_mtd.ssb;
+	int* h_lo_orderdate = loadColumn<int>("lo_orderdate", LO_LEN);
+	int* h_lo_custkey   = loadColumn<int>("lo_custkey", LO_LEN);
+	int* h_lo_suppkey   = loadColumn<int>("lo_suppkey", LO_LEN);
+	int* h_lo_revenue   = loadColumn<int>("lo_revenue", LO_LEN);
+
+	auto n_vec = hard_coded.n_vec;
+
+	// Rebase lo_orderdate on its global minimum before the per-vector encoding.
+	int* tmp = new int[n_vec * 1024];
+	for (size_t i {0}; i < LO_LEN; ++i) {
+		tmp[i] = h_lo_orderdate[i] - hard_coded.lo_orderdate_min;
+	}
+
+	// Output buffers for the packed columns (sized for the unpacked data).
+	const int* h_enc_lo_orderdate = new int[n_vec * 1024];
+	const int* h_enc_lo_custkey   = new int[n_vec * 1024];
+	const int* h_enc_lo_suppkey   = new int[n_vec * 1024];
+	const int* h_enc_lo_revenue   = new int[n_vec * 1024];
+
+	// Per-vector metadata for lo_orderdate: base value, bit width and the start
+	// position (in ints) of the vector inside the packed buffer.
+	int* h_lo_orderdate_base = new int[n_vec];
+	int* h_lo_orderdate_bw   = new int[n_vec];
+	// Value-initialised: offset[0] must be 0 because every later offset is a
+	// running sum that starts from it and nothing else ever writes element 0.
+	int* h_lo_orderdate_offset = new int[n_vec]();
+
+	auto* orderdate_in = const_cast<int32_t*>(tmp);
+	auto* custkey_in   = const_cast<int32_t*>(h_lo_custkey);
+	auto* suppkey_in   = const_cast<int32_t*>(h_lo_suppkey);
+	auto* revenue_in   = const_cast<int32_t*>(h_lo_revenue);
+
+	auto* orderdate_out = const_cast<int32_t*>(h_enc_lo_orderdate);
+	auto* custkey_out   = const_cast<int32_t*>(h_enc_lo_custkey);
+	auto* suppkey_out   = const_cast<int32_t*>(h_enc_lo_suppkey);
+	auto* revenue_out   = const_cast<int32_t*>(h_enc_lo_revenue);
+
+	// The last vector of the scale-factor-10 data set is only partially filled.
+	constexpr int SF10_LAST_VECTOR_IDX = 58580;
+	constexpr int LAST_VECTOR_SIZE     = 294;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		// Clear the unused tail of the partial vector before it is analysed.
+		if (vec_idx == SF10_LAST_VECTOR_IDX) { set_zero_after<1024>(orderdate_in, LAST_VECTOR_SIZE); }
+
+		h_lo_orderdate_base[vec_idx] = find_base<1024>(orderdate_in);
+		subtract_base<1024>(orderdate_in, h_lo_orderdate_base[vec_idx]);
+		h_lo_orderdate_bw[vec_idx] = find_bw<1024>(orderdate_in);
+
+		// A vector packed with width bw occupies bw * 32 ints, so the next vector
+		// starts that many ints after this one.
+		if (vec_idx + 1 < n_vec) {
+			h_lo_orderdate_offset[vec_idx + 1] = h_lo_orderdate_offset[vec_idx] + (h_lo_orderdate_bw[vec_idx] * 32);
+		}
+
+		if (h_lo_orderdate_bw[vec_idx] > 16) {
+			std::cout << h_lo_orderdate_bw[vec_idx] << " bigger than 16 is not possible in orderdate! \n";
+			exit(-2);
+		}
+
+		generated::pack::fallback::scalar::pack(orderdate_in, orderdate_out, h_lo_orderdate_bw[vec_idx]);
+		orderdate_in  = orderdate_in + 1024;
+		orderdate_out = orderdate_out + (h_lo_orderdate_bw[vec_idx] * 32);
+
+		// The remaining columns use one fixed bit width for every vector.
+		generated::pack::fallback::scalar::pack(custkey_in, custkey_out, hard_coded.lo_chosen_custkey_bw);
+		custkey_in  = custkey_in + 1024;
+		custkey_out = custkey_out + (hard_coded.lo_chosen_custkey_bw * 32);
+
+		generated::pack::fallback::scalar::pack(suppkey_in, suppkey_out, hard_coded.lo_chosen_suppkey_bw);
+		suppkey_in  = suppkey_in + 1024;
+		suppkey_out = suppkey_out + (hard_coded.lo_chosen_suppkey_bw * 32);
+
+		generated::pack::fallback::scalar::pack(revenue_in, revenue_out, hard_coded.lo_revenue_bw);
+		revenue_in  = revenue_in + 1024;
+		revenue_out = revenue_out + (hard_coded.lo_revenue_bw * 32);
+	}
+
+	int* d_lo_orderdate = loadToGPU<int32_t>(h_enc_lo_orderdate, hard_coded.n_tup_line_order, g_allocator);
+	int* d_lo_custkey   = loadToGPU<int32_t>(h_enc_lo_custkey, hard_coded.n_tup_line_order, g_allocator);
+	int* d_lo_suppkey   = loadToGPU<int32_t>(h_enc_lo_suppkey, hard_coded.n_tup_line_order, g_allocator);
+	int* d_lo_revenue;
+
+	// lo_revenue is uploaded unpacked; the packed copy built above is not used here.
+	d_lo_revenue = loadToGPU<int32_t>(h_lo_revenue, hard_coded.n_tup_line_order, g_allocator);
+
+	int* h_d_datekey = loadColumn<int>("d_datekey", D_LEN);
+	int* h_d_year    = loadColumn<int>("d_year", D_LEN);
+
+	int* h_s_suppkey = loadColumn<int>("s_suppkey", S_LEN);
+	int* h_s_nation  = loadColumn<int>("s_nation", S_LEN);
+	int* h_s_region  = loadColumn<int>("s_region", S_LEN);
+
+	int* h_c_custkey = loadColumn<int>("c_custkey", C_LEN);
+	int* h_c_nation  = loadColumn<int>("c_nation", C_LEN);
+	int* h_c_region  = loadColumn<int>("c_region", C_LEN);
+
+	int* d_d_datekey = loadToGPU<int>(h_d_datekey, D_LEN, g_allocator);
+	int* d_d_year    = loadToGPU<int>(h_d_year, D_LEN, g_allocator);
+
+	int* d_s_suppkey = loadToGPU<int>(h_s_suppkey, S_LEN, g_allocator);
+	int* d_s_region  = loadToGPU<int>(h_s_region, S_LEN, g_allocator);
+	int* d_s_nation  = loadToGPU<int>(h_s_nation, S_LEN, g_allocator);
+
+	int* d_c_custkey = loadToGPU<int>(h_c_custkey, C_LEN, g_allocator);
+	int* d_c_region  = loadToGPU<int>(h_c_region, C_LEN, g_allocator);
+	int* d_c_nation  = loadToGPU<int>(h_c_nation, C_LEN, g_allocator);
+
+	int* d_lo_orderdate_base   = loadToGPU<int32_t>(h_lo_orderdate_base, n_vec, g_allocator);
+	int* d_lo_orderdate_bw     = loadToGPU<int32_t>(h_lo_orderdate_bw, n_vec, g_allocator);
+	int* d_lo_orderdate_offset = loadToGPU<int32_t>(h_lo_orderdate_offset, n_vec, g_allocator);
+
+	// Run
+	runQuery<32, 8>(d_lo_orderdate,
+	                d_lo_orderdate_bw,
+	                d_lo_orderdate_base,
+	                d_lo_orderdate_offset,
+	                d_lo_custkey,
+	                d_lo_suppkey,
+	                d_lo_revenue,
+	                LO_LEN,
+	                d_d_datekey,
+	                d_d_year,
+	                D_LEN,
+	                d_s_suppkey,
+	                d_s_region,
+	                d_s_nation,
+	                S_LEN,
+	                d_c_custkey,
+	                d_c_region,
+	                d_c_nation,
+	                C_LEN,
+	                g_allocator);
+}
\ No newline at end of file
diff --git a/fastlanes/src/ssb/fls_q41.cu b/fastlanes/src/ssb/fls_q41.cu
new file mode 100644
index 0000000..1b33bf8
--- /dev/null
+++ b/fastlanes/src/ssb/fls_q41.cu
@@ -0,0 +1,603 @@
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include "crystal/crystal.cuh"
+#include "crystal_ssb_utils.h"
+#include "cub/test/test_util.h"
+#include "fls_gen/unpack/hardcoded_16.cuh"
+#include "fls_gen/unpack/unpack.cuh"
+#include "gpu_utils.h"
+#include "ssb_utils.h"
+#include "gtest/gtest.h"
+#include <cub/util_allocator.cuh>
+#include <fls_gen/pack/pack.hpp>
+#include <iostream>
+#include <query/query_41.hpp>
+#include <stdio.h>
+
+using namespace std;
+using namespace fastlanes::gpu;
+using namespace fastlanes;
+
+using namespace std;
+
+auto query_mtd = ssb::ssb_q41_10;
+
+/**
+ * Probe kernel, version 1: every lineorder column is bit-packed.
+ *
+ * Each block handles one tile of BLOCK_THREADS * ITEMS_PER_THREAD rows. For every
+ * column the block's packed tile is unpacked into a 1024-int shared buffer and
+ * then loaded into per-thread arrays. The key columns are run through the
+ * dimension hash tables (supplier, customer, part, date); rows that are still
+ * selected afterwards contribute (revenue - supplycost) to the group identified
+ * by (year, c_nation).
+ *
+ * The shared buffer holds 1024 values and the tile offsets assume bw * 32 ints per
+ * tile, so the kernel is written for a tile size of 1024 (the host launches it as
+ * <32, 32>).
+ *
+ * NOTE(review): the shared buffer is rewritten for each column without an explicit
+ * __syncthreads() in this kernel; presumably safe because the host launches 32
+ * threads per block or because unpack_device synchronises internally — confirm.
+ *
+ * @param lo_*    packed lineorder columns
+ * @param lo_len  number of lineorder rows
+ * @param ht_p, ht_s, ht_c, ht_d  dimension hash tables
+ * @param p_len, s_len, c_len, d_len  slot counts passed to the probe helpers
+ * @param res     result table, 4 ints per group: year, c_nation and a 64-bit sum
+ */
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void probe_v1(int* lo_orderdate,
+                         int* lo_partkey,
+                         int* lo_custkey,
+                         int* lo_suppkey,
+                         int* lo_revenue,
+                         int* lo_supplycost,
+                         int  lo_len,
+                         int* ht_p,
+                         int  p_len,
+                         int* ht_s,
+                         int  s_len,
+                         int* ht_c,
+                         int  c_len,
+                         int* ht_d,
+                         int  d_len,
+                         int* res) {
+	// Number of rows handled by one block
+	int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
+
+	// Staging buffer for one unpacked tile, reused for every column
+	static __shared__ int unpacked[1024];
+
+	int items[ITEMS_PER_THREAD];
+	int selection_flags[ITEMS_PER_THREAD];
+	int c_nation[ITEMS_PER_THREAD];
+	int year[ITEMS_PER_THREAD];
+	int revenue[ITEMS_PER_THREAD];
+
+	int tile_offset    = blockIdx.x * TILE_SIZE;
+	int num_tiles      = (lo_len + TILE_SIZE - 1) / TILE_SIZE;
+	int num_tile_items = TILE_SIZE;
+
+	// The last tile may be only partially filled
+	if (blockIdx.x == num_tiles - 1) { num_tile_items = lo_len - tile_offset; }
+
+	InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags);
+
+	// Supplier key: a tile packed with width bw starts at blockIdx.x * bw * 32 ints
+	int suppkey_tile_offset = blockIdx.x * query_mtd.ssb.lo_chosen_suppkey_bw * 32;
+	unpack_device(lo_suppkey + suppkey_tile_offset, unpacked, query_mtd.ssb.lo_chosen_suppkey_bw);
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(unpacked, items, num_tile_items);
+	BlockProbeAndPHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, selection_flags, ht_s, s_len, num_tile_items);
+
+	// Customer key: the probe also fills c_nation for each row
+	int custkey_tile_offset = blockIdx.x * query_mtd.ssb.lo_chosen_custkey_bw * 32;
+	unpack_device(lo_custkey + custkey_tile_offset, unpacked, query_mtd.ssb.lo_chosen_custkey_bw);
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(unpacked, items, num_tile_items);
+	BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    items, c_nation, selection_flags, ht_c, c_len, num_tile_items);
+
+	// Part key
+	int partkey_tile_offset = blockIdx.x * query_mtd.ssb.lo_partkey_bw * 32;
+	unpack_device(lo_partkey + partkey_tile_offset, unpacked, query_mtd.ssb.lo_partkey_bw);
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(unpacked, items, num_tile_items);
+	BlockProbeAndPHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, selection_flags, ht_p, p_len, num_tile_items);
+
+	// Order date: the probe also fills year; 0 is passed as the key minimum
+	// (the host subtracts lo_orderdate_min from the column before packing it)
+	int orderdate_tile_offset = blockIdx.x * query_mtd.ssb.lo_orderdate_bw * 32;
+	unpack_device(lo_orderdate + orderdate_tile_offset, unpacked, query_mtd.ssb.lo_orderdate_bw);
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(unpacked, items, num_tile_items);
+	BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    items, year, selection_flags, ht_d, d_len, 0, num_tile_items);
+
+	// Revenue
+	int revenue_tile_offset = blockIdx.x * query_mtd.ssb.lo_revenue_bw * 32;
+	unpack_device(lo_revenue + revenue_tile_offset, unpacked, query_mtd.ssb.lo_revenue_bw);
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(unpacked, revenue, num_tile_items);
+
+	// Supply cost: loaded into items, which is read in the aggregation below
+	int supplycost_tile_offset = blockIdx.x * query_mtd.ssb.lo_supplycost_bw * 32;
+	unpack_device(lo_supplycost + supplycost_tile_offset, unpacked, query_mtd.ssb.lo_supplycost_bw);
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(unpacked, items, num_tile_items);
+
+#pragma unroll
+	for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
+		if (threadIdx.x + (BLOCK_THREADS * ITEM) < num_tile_items) {
+			if (selection_flags[ITEM]) {
+				// One result slot per (c_nation, year) pair: 25 * 7 slots of 4 ints each
+				int hash          = (c_nation[ITEM] * 7 + (year[ITEM] - 1992)) % ((1998 - 1992 + 1) * 25);
+				res[hash * 4]     = year[ITEM];
+				res[hash * 4 + 1] = c_nation[ITEM];
+				// Ints 2 and 3 of the slot form one 64-bit accumulator for revenue - supplycost
+				atomicAdd(reinterpret_cast<unsigned long long*>(&res[hash * 4 + 2]),
+				          (long long)(revenue[ITEM] - items[ITEM]));
+			}
+		}
+	}
+}
+
+/**
+ * Probe kernel, version 2: key columns and supply cost are bit-packed and
+ * unpacked straight into per-thread arrays; revenue is read unpacked.
+ *
+ * Each block handles one tile of BLOCK_THREADS * ITEMS_PER_THREAD rows. The key
+ * columns are run through the dimension hash tables (supplier, customer, part,
+ * date); rows that are still selected afterwards contribute
+ * (revenue - supplycost) to the group identified by (year, c_nation).
+ *
+ * The tile offsets assume bw * 8 ints per tile (the host launches the kernel as
+ * <32, 8>, i.e. 256 rows per block).
+ *
+ * @param lo_*    lineorder columns; lo_revenue is unpacked, the others are packed
+ * @param lo_len  number of lineorder rows
+ * @param ht_p, ht_s, ht_c, ht_d  dimension hash tables
+ * @param p_len, s_len, c_len, d_len  slot counts passed to the probe helpers
+ * @param res     result table, 4 ints per group: year, c_nation and a 64-bit sum
+ */
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void probe_v2(int* lo_orderdate,
+                         int* lo_partkey,
+                         int* lo_custkey,
+                         int* lo_suppkey,
+                         int* lo_revenue,
+                         int* lo_supplycost,
+                         int  lo_len,
+                         int* ht_p,
+                         int  p_len,
+                         int* ht_s,
+                         int  s_len,
+                         int* ht_c,
+                         int  c_len,
+                         int* ht_d,
+                         int  d_len,
+                         int* res) {
+	// Number of rows handled by one block
+	int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
+
+	int items[ITEMS_PER_THREAD];
+	int selection_flags[ITEMS_PER_THREAD];
+	int c_nation[ITEMS_PER_THREAD];
+	int year[ITEMS_PER_THREAD];
+	int revenue[ITEMS_PER_THREAD];
+
+	int tile_offset    = blockIdx.x * TILE_SIZE;
+	int num_tiles      = (lo_len + TILE_SIZE - 1) / TILE_SIZE;
+	int num_tile_items = TILE_SIZE;
+
+	// The last tile may be only partially filled
+	if (blockIdx.x == num_tiles - 1) { num_tile_items = lo_len - tile_offset; }
+
+	InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags);
+
+	// Supplier key: a tile packed with width bw starts at blockIdx.x * bw * 8 ints
+	int suppkey_tile_offset = blockIdx.x * query_mtd.ssb.lo_chosen_suppkey_bw * 8;
+	unpack_8_at_a_time::unpack_device(lo_suppkey + suppkey_tile_offset, items, query_mtd.ssb.lo_chosen_suppkey_bw);
+	BlockProbeAndPHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, selection_flags, ht_s, s_len, num_tile_items);
+
+	// Customer key: the probe also fills c_nation for each row
+	int custkey_tile_offset = blockIdx.x * query_mtd.ssb.lo_chosen_custkey_bw * 8;
+	unpack_8_at_a_time::unpack_device(lo_custkey + custkey_tile_offset, items, query_mtd.ssb.lo_chosen_custkey_bw);
+	BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    items, c_nation, selection_flags, ht_c, c_len, num_tile_items);
+
+	// Part key
+	int partkey_tile_offset = blockIdx.x * query_mtd.ssb.lo_partkey_bw * 8;
+	unpack_8_at_a_time::unpack_device(lo_partkey + partkey_tile_offset, items, query_mtd.ssb.lo_partkey_bw);
+	BlockProbeAndPHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, selection_flags, ht_p, p_len, num_tile_items);
+
+	// Order date: the probe also fills year; 0 is passed as the key minimum
+	// (the host subtracts lo_orderdate_min from the column before packing it)
+	int orderdate_tile_offset = blockIdx.x * query_mtd.ssb.lo_orderdate_bw * 8;
+	unpack_8_at_a_time::unpack_device(lo_orderdate + orderdate_tile_offset, items, query_mtd.ssb.lo_orderdate_bw);
+	BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    items, year, selection_flags, ht_d, d_len, 0, num_tile_items);
+
+	// Revenue is stored unpacked, so it is addressed by row offset and loaded
+	// with the selection flags
+	BlockPredLoad<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    lo_revenue + tile_offset, revenue, num_tile_items, selection_flags);
+
+	// Supply cost: unpacked into items, which is read in the aggregation below
+	int supplycost_tile_offset = blockIdx.x * query_mtd.ssb.lo_chosen_supplycost_bw * 8;
+	unpack_8_at_a_time::unpack_device(
+	    lo_supplycost + supplycost_tile_offset, items, query_mtd.ssb.lo_chosen_supplycost_bw);
+
+#pragma unroll
+	for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
+		if (threadIdx.x + (BLOCK_THREADS * ITEM) < num_tile_items) {
+			if (selection_flags[ITEM]) {
+				// One result slot per (c_nation, year) pair: 25 * 7 slots of 4 ints each
+				int hash          = (c_nation[ITEM] * 7 + (year[ITEM] - 1992)) % ((1998 - 1992 + 1) * 25);
+				res[hash * 4]     = year[ITEM];
+				res[hash * 4 + 1] = c_nation[ITEM];
+				// Ints 2 and 3 of the slot form one 64-bit accumulator for revenue - supplycost
+				atomicAdd(reinterpret_cast<unsigned long long*>(&res[hash * 4 + 2]),
+				          (long long)(revenue[ITEM] - items[ITEM]));
+			}
+		}
+	}
+}
+
+/**
+ * Builds the supplier hash table from the rows whose filter column equals 1.
+ *
+ * Each block handles one tile of BLOCK_THREADS * ITEMS_PER_THREAD rows.
+ *
+ * @param filter_col  column the equality predicate is evaluated on
+ * @param dim_key     key column inserted into the hash table
+ * @param num_tuples  number of rows in both columns
+ * @param hash_table  output hash table
+ * @param num_slots   slot count passed to the build helper
+ */
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_s(int* filter_col, int* dim_key, int num_tuples, int* hash_table, int num_slots) {
+	int rows_per_tile = BLOCK_THREADS * ITEMS_PER_THREAD;
+	int first_row     = blockIdx.x * rows_per_tile;
+	int tile_count    = (num_tuples + rows_per_tile - 1) / rows_per_tile;
+
+	// Every tile is full except possibly the last one.
+	int valid_rows = rows_per_tile;
+	if (blockIdx.x == tile_count - 1) { valid_rows = num_tuples - first_row; }
+
+	int values[ITEMS_PER_THREAD];
+	int keep[ITEMS_PER_THREAD];
+
+	// Evaluate the predicate filter_col == 1.
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + first_row, values, valid_rows);
+	BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(values, 1, keep, valid_rows);
+
+	// Insert the keys of the selected rows.
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + first_row, values, valid_rows);
+	BlockBuildSelectivePHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(values, keep, hash_table, num_slots, valid_rows);
+}
+
+/**
+ * Builds the part hash table from the rows selected by two predicates on the
+ * filter column: an equality test against 0 followed by BlockPredOrEQ against 1
+ * (presumably "equals 0 or equals 1" — confirm against BlockPredOrEQ).
+ *
+ * Each block handles one tile of BLOCK_THREADS * ITEMS_PER_THREAD rows.
+ *
+ * @param filter_col  column the predicates are evaluated on
+ * @param dim_key     key column inserted into the hash table
+ * @param num_tuples  number of rows in both columns
+ * @param hash_table  output hash table
+ * @param num_slots   slot count passed to the build helper
+ */
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_p(int* filter_col, int* dim_key, int num_tuples, int* hash_table, int num_slots) {
+	int rows_per_tile = BLOCK_THREADS * ITEMS_PER_THREAD;
+	int first_row     = blockIdx.x * rows_per_tile;
+	int tile_count    = (num_tuples + rows_per_tile - 1) / rows_per_tile;
+
+	// Every tile is full except possibly the last one.
+	int valid_rows = rows_per_tile;
+	if (blockIdx.x == tile_count - 1) { valid_rows = num_tuples - first_row; }
+
+	int values[ITEMS_PER_THREAD];
+	int keep[ITEMS_PER_THREAD];
+
+	// Evaluate the two predicates on the filter column.
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + first_row, values, valid_rows);
+	BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(values, 0, keep, valid_rows);
+	BlockPredOrEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(values, 1, keep, valid_rows);
+
+	// Insert the keys of the selected rows.
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + first_row, values, valid_rows);
+	BlockBuildSelectivePHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(values, keep, hash_table, num_slots, valid_rows);
+}
+
+/**
+ * Builds the customer hash table (key plus one payload value) from the rows
+ * whose filter column equals 1.
+ *
+ * Each block handles one tile of BLOCK_THREADS * ITEMS_PER_THREAD rows.
+ *
+ * @param filter_col  column the equality predicate is evaluated on
+ * @param dim_key     key column inserted into the hash table
+ * @param dim_val     payload column stored alongside each key
+ * @param num_tuples  number of rows in the columns
+ * @param hash_table  output hash table
+ * @param num_slots   slot count passed to the build helper
+ */
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void
+build_hashtable_c(int* filter_col, int* dim_key, int* dim_val, int num_tuples, int* hash_table, int num_slots) {
+	int rows_per_tile = BLOCK_THREADS * ITEMS_PER_THREAD;
+	int first_row     = blockIdx.x * rows_per_tile;
+	int tile_count    = (num_tuples + rows_per_tile - 1) / rows_per_tile;
+
+	// Every tile is full except possibly the last one.
+	int valid_rows = rows_per_tile;
+	if (blockIdx.x == tile_count - 1) { valid_rows = num_tuples - first_row; }
+
+	int keys[ITEMS_PER_THREAD];
+	int payloads[ITEMS_PER_THREAD];
+	int keep[ITEMS_PER_THREAD];
+
+	// Evaluate the predicate filter_col == 1 (keys is used as scratch space here).
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + first_row, keys, valid_rows);
+	BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, 1, keep, valid_rows);
+
+	// Insert the (key, payload) pairs of the selected rows.
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + first_row, keys, valid_rows);
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + first_row, payloads, valid_rows);
+	BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    keys, payloads, keep, hash_table, num_slots, valid_rows);
+}
+
+/**
+ * Builds the date hash table (key plus one payload value) from every row; no
+ * predicate is applied, the selection flags are only initialised.
+ *
+ * Each block handles one tile of BLOCK_THREADS * ITEMS_PER_THREAD rows.
+ *
+ * @param dim_key     key column inserted into the hash table
+ * @param dim_val     payload column stored alongside each key
+ * @param num_tuples  number of rows in both columns
+ * @param hash_table  output hash table
+ * @param num_slots   slot count passed to the build helper
+ * @param val_min     key minimum passed to the build helper
+ */
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void
+build_hashtable_d(int* dim_key, int* dim_val, int num_tuples, int* hash_table, int num_slots, int val_min) {
+	int rows_per_tile = BLOCK_THREADS * ITEMS_PER_THREAD;
+	int first_row     = blockIdx.x * rows_per_tile;
+	int tile_count    = (num_tuples + rows_per_tile - 1) / rows_per_tile;
+
+	// Every tile is full except possibly the last one.
+	int valid_rows = rows_per_tile;
+	if (blockIdx.x == tile_count - 1) { valid_rows = num_tuples - first_row; }
+
+	int keys[ITEMS_PER_THREAD];
+	int payloads[ITEMS_PER_THREAD];
+	int keep[ITEMS_PER_THREAD];
+
+	InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(keep);
+
+	// Insert the (key, payload) pairs.
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + first_row, keys, valid_rows);
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + first_row, payloads, valid_rows);
+	BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    keys, payloads, keep, hash_table, num_slots, val_min, valid_rows);
+}
+
+/**
+ * Runs SSB Q4.1 on the GPU and checks the result.
+ *
+ * Builds one hash table per dimension (supplier, customer, part, date), launches
+ * the probe kernel selected by `version` over the lineorder columns, copies the
+ * result table back to the host and compares it with ssb::ssb_q41_10.reuslt.
+ * All memory allocated here is released before the comparison, because a failing
+ * ASSERT_EQ returns from the function immediately.
+ *
+ * @param lo_*         device pointers to the lineorder columns, laid out as the
+ *                     selected probe kernel expects
+ * @param lo_len       number of lineorder rows
+ * @param d_*, p_*, s_*, c_*  device pointers to the dimension columns
+ * @param d_len, p_len, s_len, c_len  row counts of the dimension tables
+ * @param g_allocator  allocator used for the hash tables and the result table
+ * @param version      1 selects probe_v1, 2 selects probe_v2
+ * @throws std::runtime_error if `version` is neither 1 nor 2
+ */
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+void runQuery(int*                         lo_orderdate,
+              int*                         lo_custkey,
+              int*                         lo_partkey,
+              int*                         lo_suppkey,
+              int*                         lo_revenue,
+              int*                         lo_supplycost,
+              int                          lo_len,
+              int*                         d_datekey,
+              int*                         d_year,
+              int                          d_len,
+              int*                         p_partkey,
+              int*                         p_mfgr,
+              int                          p_len,
+              int*                         s_suppkey,
+              int*                         s_region,
+              int                          s_len,
+              int*                         c_custkey,
+              int*                         c_region,
+              int*                         c_nation,
+              int                          c_len,
+              cub::CachingDeviceAllocator& g_allocator,
+              int                          version) {
+	// Reject unknown versions before anything is allocated or launched.
+	if (version != 1 && version != 2) { throw std::runtime_error("this version does not exist."); }
+
+	// The date table has one slot per value in the date-key range; the other
+	// tables have one slot per dimension row. Each slot takes two ints.
+	int *ht_d, *ht_c, *ht_s, *ht_p;
+	int  d_val_len = 19981230 - 19920101 + 1;
+	CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_d, 2 * d_val_len * sizeof(int)));
+	CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_s, 2 * s_len * sizeof(int)));
+	CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_c, 2 * c_len * sizeof(int)));
+	CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_p, 2 * p_len * sizeof(int)));
+
+	CubDebugExit(cudaMemset(ht_d, 0, 2 * d_val_len * sizeof(int)));
+	CubDebugExit(cudaMemset(ht_s, 0, 2 * s_len * sizeof(int)));
+	CubDebugExit(cudaMemset(ht_c, 0, 2 * c_len * sizeof(int)));
+	CubDebugExit(cudaMemset(ht_p, 0, 2 * p_len * sizeof(int)));
+
+	int tile_items = BLOCK_THREADS * ITEMS_PER_THREAD;
+	build_hashtable_s<BLOCK_THREADS, ITEMS_PER_THREAD>
+	    <<<(s_len + tile_items - 1) / tile_items, BLOCK_THREADS>>>(s_region, s_suppkey, s_len, ht_s, s_len);
+
+	build_hashtable_c<BLOCK_THREADS, ITEMS_PER_THREAD>
+	    <<<(c_len + tile_items - 1) / tile_items, BLOCK_THREADS>>>(c_region, c_custkey, c_nation, c_len, ht_c, c_len);
+
+	build_hashtable_p<BLOCK_THREADS, ITEMS_PER_THREAD>
+	    <<<(p_len + tile_items - 1) / tile_items, BLOCK_THREADS>>>(p_mfgr, p_partkey, p_len, ht_p, p_len);
+
+	int d_val_min = 19920101;
+	build_hashtable_d<BLOCK_THREADS, ITEMS_PER_THREAD><<<(d_len + tile_items - 1) / tile_items, BLOCK_THREADS>>>(
+	    d_datekey, d_year, d_len, ht_d, d_val_len, d_val_min);
+
+	// Result table: one slot per (year, c_nation) pair.
+	int* res;
+	int  res_size       = ((1998 - 1992 + 1) * 25);
+	int  ht_entries     = 4; // int,int,long long
+	int  res_array_size = res_size * ht_entries;
+	CubDebugExit(g_allocator.DeviceAllocate((void**)&res, res_array_size * sizeof(int)));
+
+	CubDebugExit(cudaMemset(res, 0, res_array_size * sizeof(int)));
+
+	// Run
+	if (version == 1) {
+		probe_v1<BLOCK_THREADS, ITEMS_PER_THREAD>
+		    <<<(lo_len + tile_items - 1) / tile_items, BLOCK_THREADS>>>(lo_orderdate,
+		                                                                lo_partkey,
+		                                                                lo_custkey,
+		                                                                lo_suppkey,
+		                                                                lo_revenue,
+		                                                                lo_supplycost,
+		                                                                lo_len,
+		                                                                ht_p,
+		                                                                p_len,
+		                                                                ht_s,
+		                                                                s_len,
+		                                                                ht_c,
+		                                                                c_len,
+		                                                                ht_d,
+		                                                                d_val_len,
+		                                                                res);
+	} else {
+		// version == 2, guaranteed by the check at the top.
+		probe_v2<BLOCK_THREADS, ITEMS_PER_THREAD>
+		    <<<(lo_len + tile_items - 1) / tile_items, BLOCK_THREADS>>>(lo_orderdate,
+		                                                                lo_partkey,
+		                                                                lo_custkey,
+		                                                                lo_suppkey,
+		                                                                lo_revenue,
+		                                                                lo_supplycost,
+		                                                                lo_len,
+		                                                                ht_p,
+		                                                                p_len,
+		                                                                ht_s,
+		                                                                s_len,
+		                                                                ht_c,
+		                                                                c_len,
+		                                                                ht_d,
+		                                                                d_val_len,
+		                                                                res);
+	}
+
+	// This copy also reports, through CubDebugExit, any error raised by the kernels above.
+	int* h_res = new int[res_array_size];
+	CubDebugExit(cudaMemcpy(h_res, res, res_array_size * sizeof(int), cudaMemcpyDeviceToHost));
+
+	// A slot whose first int (the year) is still 0 was never written by the probe.
+	ssb::SSBQuery4ResultTable result_of_query;
+	for (int i = 0; i < res_size; i++) {
+		if (h_res[4 * i] != 0) {
+			result_of_query.emplace_back(
+			    h_res[4 * i], h_res[4 * i + 1], reinterpret_cast<unsigned long long*>(&h_res[4 * i + 2])[0]);
+		}
+	}
+
+	// Release everything before the assertions so a failing ASSERT_EQ cannot leak.
+	delete[] h_res;
+	CubDebugExit(g_allocator.DeviceFree(res));
+	CubDebugExit(g_allocator.DeviceFree(ht_p));
+	CubDebugExit(g_allocator.DeviceFree(ht_c));
+	CubDebugExit(g_allocator.DeviceFree(ht_s));
+	CubDebugExit(g_allocator.DeviceFree(ht_d));
+
+	ASSERT_EQ(result_of_query.size(), ssb::ssb_q41_10.reuslt.size());
+	ASSERT_EQ(result_of_query, ssb::ssb_q41_10.reuslt);
+}
+
+/**
+ * Main
+ *
+ * Usage: <program> <version>
+ *     - v1 : all six lineorder columns bit-packed; the kernel unpacks 1024
+ *            values per block through a shared buffer
+ *     - v2 : 8 value at a time + predicate load on uncompressed data
+ *            (lo_revenue is uploaded unpacked)
+ *
+ * Loads the lineorder columns, bit-packs them one 1024-value vector at a time,
+ * uploads them together with the dimension columns and calls runQuery.
+ *
+ * @return 0 on success, 1 when the version argument is missing
+ * @throws std::runtime_error if the version is neither 1 nor 2
+ */
+int main(int argc, char* argv[]) {
+	if (argc < 2) {
+		std::cerr << "usage: " << (argc > 0 ? argv[0] : "fls_q41") << " <version: 1|2>\n";
+		return 1;
+	}
+
+	// Validate once, up front, before any column is loaded.
+	const int version = std::stoi(argv[1]);
+	if (version != 1 && version != 2) { throw std::runtime_error("this version is not supported"); }
+
+	auto hard_coded      = query_mtd.ssb;
+	int* h_lo_orderdate  = loadColumn<int>("lo_orderdate", LO_LEN);
+	int* h_lo_suppkey    = loadColumn<int>("lo_suppkey", LO_LEN);
+	int* h_lo_custkey    = loadColumn<int>("lo_custkey", LO_LEN);
+	int* h_lo_partkey    = loadColumn<int>("lo_partkey", LO_LEN);
+	int* h_lo_revenue    = loadColumn<int>("lo_revenue", LO_LEN);
+	int* h_lo_supplycost = loadColumn<int>("lo_supplycost", LO_LEN);
+
+	auto n_vec = hard_coded.n_vec;
+
+	// Rebase lo_orderdate on its minimum. The buffer is value-initialised so the
+	// padding after row LO_LEN in the last vector is zero, not indeterminate,
+	// when that vector is packed.
+	int* tmp = new int[n_vec * 1024]();
+	for (size_t i {0}; i < LO_LEN; ++i) {
+		tmp[i] = h_lo_orderdate[i] - hard_coded.lo_orderdate_min;
+	}
+
+	// Output buffers for the packed columns (sized for the unpacked data).
+	const int* h_enc_lo_orderdate  = new int[n_vec * 1024];
+	const int* h_enc_lo_custkey    = new int[n_vec * 1024];
+	const int* h_enc_lo_suppkey    = new int[n_vec * 1024];
+	const int* h_enc_lo_revenue    = new int[n_vec * 1024];
+	const int* h_enc_lo_partkey    = new int[n_vec * 1024];
+	const int* h_enc_lo_supplycost = new int[n_vec * 1024];
+
+	auto* orderdate_in  = const_cast<const int32_t*>(tmp);
+	auto* custkey_in    = const_cast<int32_t*>(h_lo_custkey);
+	auto* suppkey_in    = const_cast<int32_t*>(h_lo_suppkey);
+	auto* revenue_in    = const_cast<int32_t*>(h_lo_revenue);
+	auto* partkey_in    = const_cast<int32_t*>(h_lo_partkey);
+	auto* supplycost_in = const_cast<int32_t*>(h_lo_supplycost);
+
+	auto* orderdate_out  = const_cast<int32_t*>(h_enc_lo_orderdate);
+	auto* custkey_out    = const_cast<int32_t*>(h_enc_lo_custkey);
+	auto* suppkey_out    = const_cast<int32_t*>(h_enc_lo_suppkey);
+	auto* revenue_out    = const_cast<int32_t*>(h_enc_lo_revenue);
+	auto* partkey_out    = const_cast<int32_t*>(h_enc_lo_partkey);
+	auto* supplycost_out = const_cast<int32_t*>(h_enc_lo_supplycost);
+
+	// Pack every column vector by vector; a vector packed with width bw
+	// occupies bw * 32 ints in the output.
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(orderdate_in, orderdate_out, hard_coded.lo_orderdate_bw);
+		orderdate_in  = orderdate_in + 1024;
+		orderdate_out = orderdate_out + (hard_coded.lo_orderdate_bw * 32);
+
+		generated::pack::fallback::scalar::pack(partkey_in, partkey_out, hard_coded.lo_partkey_bw);
+		partkey_in  = partkey_in + 1024;
+		partkey_out = partkey_out + (hard_coded.lo_partkey_bw * 32);
+
+		// The two kernel versions read lo_supplycost with different bit widths.
+		if (version == 1) {
+			generated::pack::fallback::scalar::pack(supplycost_in, supplycost_out, hard_coded.lo_supplycost_bw);
+			supplycost_in  = supplycost_in + 1024;
+			supplycost_out = supplycost_out + (hard_coded.lo_supplycost_bw * 32);
+		} else {
+			generated::pack::fallback::scalar::pack(supplycost_in, supplycost_out, hard_coded.lo_chosen_supplycost_bw);
+			supplycost_in  = supplycost_in + 1024;
+			supplycost_out = supplycost_out + (hard_coded.lo_chosen_supplycost_bw * 32);
+		}
+
+		generated::pack::fallback::scalar::pack(custkey_in, custkey_out, hard_coded.lo_chosen_custkey_bw);
+		custkey_in  = custkey_in + 1024;
+		custkey_out = custkey_out + (hard_coded.lo_chosen_custkey_bw * 32);
+
+		generated::pack::fallback::scalar::pack(suppkey_in, suppkey_out, hard_coded.lo_chosen_suppkey_bw);
+		suppkey_in  = suppkey_in + 1024;
+		suppkey_out = suppkey_out + (hard_coded.lo_chosen_suppkey_bw * 32);
+
+		generated::pack::fallback::scalar::pack(revenue_in, revenue_out, hard_coded.lo_revenue_bw);
+		revenue_in  = revenue_in + 1024;
+		revenue_out = revenue_out + (hard_coded.lo_revenue_bw * 32);
+	}
+
+	int* d_lo_orderdate = loadToGPU<int32_t>(h_enc_lo_orderdate, hard_coded.n_tup_line_order, g_allocator);
+	int* d_lo_custkey   = loadToGPU<int32_t>(h_enc_lo_custkey, hard_coded.n_tup_line_order, g_allocator);
+	int* d_lo_suppkey   = loadToGPU<int32_t>(h_enc_lo_suppkey, hard_coded.n_tup_line_order, g_allocator);
+	int* d_lo_revenue;
+	int* d_lo_partkey    = loadToGPU<int32_t>(h_enc_lo_partkey, hard_coded.n_tup_line_order, g_allocator);
+	int* d_lo_supplycost = loadToGPU<int32_t>(h_enc_lo_supplycost, hard_coded.n_tup_line_order, g_allocator);
+
+	// Version 1 reads packed revenue; version 2 reads the unpacked column.
+	if (version == 1) {
+		d_lo_revenue = loadToGPU<int32_t>(h_enc_lo_revenue, hard_coded.n_tup_line_order, g_allocator);
+	} else {
+		d_lo_revenue = loadToGPU<int32_t>(h_lo_revenue, hard_coded.n_tup_line_order, g_allocator);
+	}
+
+	int* h_d_datekey = loadColumn<int>("d_datekey", D_LEN);
+	int* h_d_year    = loadColumn<int>("d_year", D_LEN);
+
+	int* h_s_suppkey = loadColumn<int>("s_suppkey", S_LEN);
+	int* h_s_region  = loadColumn<int>("s_region", S_LEN);
+
+	int* h_p_partkey = loadColumn<int>("p_partkey", P_LEN);
+	int* h_p_mfgr    = loadColumn<int>("p_mfgr", P_LEN);
+
+	int* h_c_custkey = loadColumn<int>("c_custkey", C_LEN);
+	int* h_c_region  = loadColumn<int>("c_region", C_LEN);
+	int* h_c_nation  = loadColumn<int>("c_nation", C_LEN);
+
+	int* d_d_datekey = loadToGPU<int>(h_d_datekey, D_LEN, g_allocator);
+	int* d_d_year    = loadToGPU<int>(h_d_year, D_LEN, g_allocator);
+
+	int* d_p_partkey = loadToGPU<int>(h_p_partkey, P_LEN, g_allocator);
+	int* d_p_mfgr    = loadToGPU<int>(h_p_mfgr, P_LEN, g_allocator);
+
+	int* d_s_suppkey = loadToGPU<int>(h_s_suppkey, S_LEN, g_allocator);
+	int* d_s_region  = loadToGPU<int>(h_s_region, S_LEN, g_allocator);
+
+	int* d_c_custkey = loadToGPU<int>(h_c_custkey, C_LEN, g_allocator);
+	int* d_c_region  = loadToGPU<int>(h_c_region, C_LEN, g_allocator);
+	int* d_c_nation  = loadToGPU<int>(h_c_nation, C_LEN, g_allocator);
+
+	// Version 1 processes 1024 rows per block, version 2 processes 256.
+	if (version == 1) {
+		runQuery<32, 32>(d_lo_orderdate,
+		                 d_lo_custkey,
+		                 d_lo_partkey,
+		                 d_lo_suppkey,
+		                 d_lo_revenue,
+		                 d_lo_supplycost,
+		                 LO_LEN,
+		                 d_d_datekey,
+		                 d_d_year,
+		                 D_LEN,
+		                 d_p_partkey,
+		                 d_p_mfgr,
+		                 P_LEN,
+		                 d_s_suppkey,
+		                 d_s_region,
+		                 S_LEN,
+		                 d_c_custkey,
+		                 d_c_region,
+		                 d_c_nation,
+		                 C_LEN,
+		                 g_allocator,
+		                 version);
+	} else {
+		runQuery<32, 8>(d_lo_orderdate,
+		                d_lo_custkey,
+		                d_lo_partkey,
+		                d_lo_suppkey,
+		                d_lo_revenue,
+		                d_lo_supplycost,
+		                LO_LEN,
+		                d_d_datekey,
+		                d_d_year,
+		                D_LEN,
+		                d_p_partkey,
+		                d_p_mfgr,
+		                P_LEN,
+		                d_s_suppkey,
+		                d_s_region,
+		                S_LEN,
+		                d_c_custkey,
+		                d_c_region,
+		                d_c_nation,
+		                C_LEN,
+		                g_allocator,
+		                version);
+	}
+
+	return 0;
+}
diff --git a/fastlanes/src/ssb/fls_q41_bitpacked_opt_v3.cu b/fastlanes/src/ssb/fls_q41_bitpacked_opt_v3.cu
new file mode 100644
index 0000000..29763ef
--- /dev/null
+++ b/fastlanes/src/ssb/fls_q41_bitpacked_opt_v3.cu
@@ -0,0 +1,531 @@
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+#define SORTED
+
+#include "crystal/crystal.cuh"
+#include "crystal_ssb_utils.h"
+#include "cub/test/test_util.h"
+#include "fls_gen/unpack/unpack.cuh"
+#include "gpu_utils.h"
+#include "ssb_utils.h"
+#include "gtest/gtest.h"
+#include <cub/util_allocator.cuh>
+#include <cuda.h>
+#include <fls_gen/pack/pack.hpp>
+#include <fls_gen/unpack/hardcoded_16.cuh>
+#include <iostream>
+#include <query/query_41.hpp>
+#include <stdio.h>
+
+using namespace std;
+using namespace fastlanes::gpu;
+using namespace fastlanes;
+
+// Hard-coded metadata record for this query. Its `.ssb` member supplies the
+// fixed column bit widths and the tuple/vector counts that probe() and main()
+// read. Presumably the scale-factor-10 variant of SSB Q4.1 declared in
+// query/query_41.hpp -- TODO confirm.
+auto query_mtd = ssb::ssb_q41_10;
+
+// Probe kernel: every thread block processes one tile of
+// BLOCK_THREADS * ITEMS_PER_THREAD line-order rows.
+//
+// The bit-packed suppkey, custkey, partkey and orderdate columns are unpacked
+// one after another into the same scratch array and handed, together with the
+// matching hash table, to the BlockProbeAndPHT_* helpers, which maintain the
+// per-row selection flags (and fill c_nation / year). Rows that are still
+// selected at the end add (revenue - unpacked supplycost) to the 64-bit sum of
+// the `res` slot chosen from (c_nation, year).
+//
+// lo_orderdate is packed with a per-vector bit width, base and start offset
+// (lo_orderdate_bw / lo_orderdate_base / lo_orderdate_offset); the other
+// packed columns use the fixed bit widths in query_mtd.ssb. lo_revenue is read
+// unpacked.
+//
+// NOTE(review): the `/ 4`, `% 4` and `* 8` factors look tied to a 256-row tile
+// (the <32, 8> launch in main) over 1024-value vectors -- confirm before
+// instantiating with other template arguments.
+// NOTE(review): query_mtd is an ordinary namespace-scope variable that is read
+// from device code here -- confirm the build configuration supports that.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void probe(int* lo_orderdate,
+                      int* lo_orderdate_bw,
+                      int* lo_orderdate_base,
+                      int* lo_orderdate_offset,
+                      int* lo_partkey,
+                      int* lo_custkey,
+                      int* lo_suppkey,
+                      int* lo_revenue,
+                      int* lo_supplycost,
+                      int  lo_len,
+                      int* ht_p,
+                      int  p_len,
+                      int* ht_s,
+                      int  s_len,
+                      int* ht_c,
+                      int  c_len,
+                      int* ht_d,
+                      int  d_len,
+                      int* res) {
+	// Rows covered by one thread block.
+	constexpr int kTileSize = BLOCK_THREADS * ITEMS_PER_THREAD;
+
+	int unpacked[ITEMS_PER_THREAD]; // scratch: holds whichever column was unpacked last
+	int keep[ITEMS_PER_THREAD];     // per-row selection flags
+	int c_nation[ITEMS_PER_THREAD];
+	int year[ITEMS_PER_THREAD];
+	int revenue[ITEMS_PER_THREAD];
+
+	int first_row  = blockIdx.x * kTileSize;
+	int tile_count = (lo_len + kTileSize - 1) / kTileSize;
+	// Only the final tile can be partially filled.
+	int rows_in_tile = (blockIdx.x == tile_count - 1) ? (lo_len - first_row) : kTileSize;
+
+	// Four consecutive tiles share one entry of the per-vector metadata arrays.
+	int vec_idx = blockIdx.x / 4;
+
+	InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(keep);
+
+	// Supplier key (fixed bit width).
+	int suppkey_start = blockIdx.x * query_mtd.ssb.lo_chosen_suppkey_bw * 8;
+	unpack_8_at_a_time::unpack_device(lo_suppkey + suppkey_start, unpacked, query_mtd.ssb.lo_chosen_suppkey_bw);
+	BlockProbeAndPHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(unpacked, keep, ht_s, s_len, rows_in_tile);
+
+	// Customer key (fixed bit width); the probe also fills c_nation.
+	int custkey_start = blockIdx.x * query_mtd.ssb.lo_chosen_custkey_bw * 8;
+	unpack_8_at_a_time::unpack_device(lo_custkey + custkey_start, unpacked, query_mtd.ssb.lo_chosen_custkey_bw);
+	BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    unpacked, c_nation, keep, ht_c, c_len, rows_in_tile);
+
+	// Part key (fixed bit width).
+	int partkey_start = blockIdx.x * query_mtd.ssb.lo_partkey_bw * 8;
+	unpack_8_at_a_time::unpack_device(lo_partkey + partkey_start, unpacked, query_mtd.ssb.lo_partkey_bw);
+	BlockProbeAndPHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(unpacked, keep, ht_p, p_len, rows_in_tile);
+
+	// Order date: bit width, base and start offset are looked up per vector.
+	int orderdate_bw    = lo_orderdate_bw[vec_idx];
+	int orderdate_base  = lo_orderdate_base[vec_idx];
+	int orderdate_start = lo_orderdate_offset[vec_idx] + (blockIdx.x % 4) * orderdate_bw * 8;
+
+	unpack_8_at_a_time::unpack_device(lo_orderdate + orderdate_start, unpacked, orderdate_bw);
+#pragma unroll
+	for (int i = 0; i < ITEMS_PER_THREAD; ++i) {
+		unpacked[i] = unpacked[i] + orderdate_base;
+	}
+	// The key minimum passed here is 0 (the values were already shifted on the
+	// host before packing); the probe also fills year.
+	BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    unpacked, year, keep, ht_d, d_len, 0, rows_in_tile);
+
+	// Revenue is stored unpacked and loaded under the selection flags.
+	BlockPredLoad<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    lo_revenue + first_row, revenue, rows_in_tile, keep);
+
+	// Supply cost (fixed bit width) ends up in the scratch array.
+	int supplycost_start = blockIdx.x * query_mtd.ssb.lo_chosen_supplycost_bw * 8;
+	unpack_8_at_a_time::unpack_device(
+	    lo_supplycost + supplycost_start, unpacked, query_mtd.ssb.lo_chosen_supplycost_bw);
+
+	// Aggregate: each res slot is four ints -- year, c_nation, and a 64-bit sum.
+#pragma unroll
+	for (int i = 0; i < ITEMS_PER_THREAD; ++i) {
+		const bool in_range = threadIdx.x + (BLOCK_THREADS * i) < rows_in_tile;
+		if (!in_range || !keep[i]) { continue; }
+
+		int slot          = (c_nation[i] * 7 + (year[i] - 1992)) % ((1998 - 1992 + 1) * 25);
+		res[slot * 4]     = year[i];
+		res[slot * 4 + 1] = c_nation[i];
+		atomicAdd(reinterpret_cast<unsigned long long*>(&res[slot * 4 + 2]),
+		          (long long)(revenue[i] - unpacked[i]));
+	}
+}
+
+// Builds the supplier hash table. Each thread block loads one tile of
+// `filter_col`, flags the rows whose value equals 1, then loads the matching
+// tile of `dim_key` and passes keys and flags to BlockBuildSelectivePHT_1
+// together with `hash_table` / `num_slots`.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_s(int* filter_col, int* dim_key, int num_tuples, int* hash_table, int num_slots) {
+	constexpr int kTileSize = BLOCK_THREADS * ITEMS_PER_THREAD;
+
+	int column_vals[ITEMS_PER_THREAD];
+	int keep[ITEMS_PER_THREAD];
+
+	int first_row  = blockIdx.x * kTileSize;
+	int tile_count = (num_tuples + kTileSize - 1) / kTileSize;
+	// Only the final tile can be partially filled.
+	int rows_in_tile = (blockIdx.x == tile_count - 1) ? (num_tuples - first_row) : kTileSize;
+
+	// Filter: keep rows whose filter column equals 1.
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + first_row, column_vals, rows_in_tile);
+	BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(column_vals, 1, keep, rows_in_tile);
+
+	// Reuse the buffer for the keys and hand them to the table builder.
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + first_row, column_vals, rows_in_tile);
+	BlockBuildSelectivePHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    column_vals, keep, hash_table, num_slots, rows_in_tile);
+}
+
+// Builds the part hash table. Each thread block loads one tile of
+// `filter_col`, flags the rows whose value equals 0 or 1, then loads the
+// matching tile of `dim_key` and passes keys and flags to
+// BlockBuildSelectivePHT_1 together with `hash_table` / `num_slots`.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_p(int* filter_col, int* dim_key, int num_tuples, int* hash_table, int num_slots) {
+	constexpr int kTileSize = BLOCK_THREADS * ITEMS_PER_THREAD;
+
+	int column_vals[ITEMS_PER_THREAD];
+	int keep[ITEMS_PER_THREAD];
+
+	int first_row  = blockIdx.x * kTileSize;
+	int tile_count = (num_tuples + kTileSize - 1) / kTileSize;
+	// Only the final tile can be partially filled.
+	int rows_in_tile = (blockIdx.x == tile_count - 1) ? (num_tuples - first_row) : kTileSize;
+
+	// Filter: value == 0, then OR-in value == 1.
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + first_row, column_vals, rows_in_tile);
+	BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(column_vals, 0, keep, rows_in_tile);
+	BlockPredOrEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(column_vals, 1, keep, rows_in_tile);
+
+	// Reuse the buffer for the keys and hand them to the table builder.
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + first_row, column_vals, rows_in_tile);
+	BlockBuildSelectivePHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    column_vals, keep, hash_table, num_slots, rows_in_tile);
+}
+
+// Builds the customer hash table. Each thread block loads one tile of
+// `filter_col`, flags the rows whose value equals 1, then loads the matching
+// tiles of `dim_key` and `dim_val` and passes keys, values and flags to
+// BlockBuildSelectivePHT_2 together with `hash_table` / `num_slots`.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void
+build_hashtable_c(int* filter_col, int* dim_key, int* dim_val, int num_tuples, int* hash_table, int num_slots) {
+	constexpr int kTileSize = BLOCK_THREADS * ITEMS_PER_THREAD;
+
+	int keys[ITEMS_PER_THREAD];
+	int payload[ITEMS_PER_THREAD];
+	int keep[ITEMS_PER_THREAD];
+
+	int first_row  = blockIdx.x * kTileSize;
+	int tile_count = (num_tuples + kTileSize - 1) / kTileSize;
+	// Only the final tile can be partially filled.
+	int rows_in_tile = (blockIdx.x == tile_count - 1) ? (num_tuples - first_row) : kTileSize;
+
+	// Filter: keep rows whose filter column equals 1 (the key buffer is used as scratch).
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + first_row, keys, rows_in_tile);
+	BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, 1, keep, rows_in_tile);
+
+	// Load key/value pairs and hand them to the table builder.
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + first_row, keys, rows_in_tile);
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + first_row, payload, rows_in_tile);
+	BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    keys, payload, keep, hash_table, num_slots, rows_in_tile);
+}
+
+// Builds the date hash table. No filter is applied: the flags are initialised
+// with InitFlags, then each thread block loads one tile of `dim_key` and
+// `dim_val` and passes them to BlockBuildSelectivePHT_2 together with
+// `hash_table`, `num_slots` and the key minimum `val_min`.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void
+build_hashtable_d(int* dim_key, int* dim_val, int num_tuples, int* hash_table, int num_slots, int val_min) {
+	constexpr int kTileSize = BLOCK_THREADS * ITEMS_PER_THREAD;
+
+	int keys[ITEMS_PER_THREAD];
+	int payload[ITEMS_PER_THREAD];
+	int keep[ITEMS_PER_THREAD];
+
+	int first_row  = blockIdx.x * kTileSize;
+	int tile_count = (num_tuples + kTileSize - 1) / kTileSize;
+	// Only the final tile can be partially filled.
+	int rows_in_tile = (blockIdx.x == tile_count - 1) ? (num_tuples - first_row) : kTileSize;
+
+	InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(keep);
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + first_row, keys, rows_in_tile);
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + first_row, payload, rows_in_tile);
+	BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    keys, payload, keep, hash_table, num_slots, val_min, rows_in_tile);
+}
+
+// Host-side driver for the query.
+//
+// Steps:
+//   1. allocate and zero the supplier, customer, part and date hash tables;
+//   2. launch the four build_hashtable_* kernels;
+//   3. allocate and zero the aggregate table `res` and launch probe();
+//   4. copy `res` back, collect the non-empty groups and compare them with the
+//      reference answer in ssb::ssb_q41_10 using ASSERT_EQ.
+//
+// All pointer arguments are device pointers produced by the caller; the *_len
+// arguments are the row counts of the corresponding tables. Every buffer
+// allocated here (host and device) is released before the assertions run,
+// because ASSERT_EQ returns from this function on failure.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+void runQuery(int*                         lo_orderdate,
+              int*                         lo_orderdate_bw,
+              int*                         lo_orderdate_base,
+              int*                         lo_orderdate_offset,
+              int*                         lo_custkey,
+              int*                         lo_partkey,
+              int*                         lo_suppkey,
+              int*                         lo_revenue,
+              int*                         lo_supplycost,
+              int                          lo_len,
+              int*                         d_datekey,
+              int*                         d_year,
+              int                          d_len,
+              int*                         p_partkey,
+              int*                         p_mfgr,
+              int                          p_len,
+              int*                         s_suppkey,
+              int*                         s_region,
+              int                          s_len,
+              int*                         c_custkey,
+              int*                         c_region,
+              int*                         c_nation,
+              int                          c_len,
+              cub::CachingDeviceAllocator& g_allocator) {
+	SETUP_TIMING();
+
+	float                                     time_query;
+	chrono::high_resolution_clock::time_point st, finish;
+	st = chrono::high_resolution_clock::now();
+
+	cudaEventRecord(start, 0);
+
+	// Hash tables hold two ints per slot. The date table gets one slot for
+	// every value between the smallest and largest date key.
+	int *ht_d, *ht_c, *ht_s, *ht_p;
+	int  d_val_len = 19981230 - 19920101 + 1;
+	CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_d, 2 * d_val_len * sizeof(int)));
+	CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_s, 2 * s_len * sizeof(int)));
+	CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_c, 2 * c_len * sizeof(int)));
+	CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_p, 2 * p_len * sizeof(int)));
+
+	CubDebugExit(cudaMemset(ht_d, 0, 2 * d_val_len * sizeof(int)));
+	CubDebugExit(cudaMemset(ht_s, 0, 2 * s_len * sizeof(int)));
+	CubDebugExit(cudaMemset(ht_c, 0, 2 * c_len * sizeof(int)));
+	CubDebugExit(cudaMemset(ht_p, 0, 2 * p_len * sizeof(int)));
+
+	int tile_items = BLOCK_THREADS * ITEMS_PER_THREAD;
+	build_hashtable_s<BLOCK_THREADS, ITEMS_PER_THREAD>
+	    <<<(s_len + tile_items - 1) / tile_items, BLOCK_THREADS>>>(s_region, s_suppkey, s_len, ht_s, s_len);
+	/*CHECK_ERROR();*/
+
+	// The host copies of the hash tables are never inspected. The transfers
+	// are kept because they are synchronisation points inside the timed
+	// region; the host buffers are released right away instead of leaking.
+	int* s_res = new int[s_len * 2];
+	CubDebugExit(cudaMemcpy(s_res, ht_s, s_len * 2 * sizeof(int), cudaMemcpyDeviceToHost));
+	delete[] s_res;
+
+	build_hashtable_c<BLOCK_THREADS, ITEMS_PER_THREAD>
+	    <<<(c_len + tile_items - 1) / tile_items, BLOCK_THREADS>>>(c_region, c_custkey, c_nation, c_len, ht_c, c_len);
+	/*CHECK_ERROR();*/
+
+	int* c_res = new int[c_len * 2];
+	CubDebugExit(cudaMemcpy(c_res, ht_c, c_len * 2 * sizeof(int), cudaMemcpyDeviceToHost));
+	delete[] c_res;
+
+	build_hashtable_p<BLOCK_THREADS, ITEMS_PER_THREAD>
+	    <<<(p_len + tile_items - 1) / tile_items, BLOCK_THREADS>>>(p_mfgr, p_partkey, p_len, ht_p, p_len);
+	/*CHECK_ERROR();*/
+
+	int* p_res = new int[p_len * 2];
+	CubDebugExit(cudaMemcpy(p_res, ht_p, p_len * 2 * sizeof(int), cudaMemcpyDeviceToHost));
+	delete[] p_res;
+
+	int d_val_min = 19920101;
+	build_hashtable_d<BLOCK_THREADS, ITEMS_PER_THREAD><<<(d_len + tile_items - 1) / tile_items, BLOCK_THREADS>>>(
+	    d_datekey, d_year, d_len, ht_d, d_val_len, d_val_min);
+	/*CHECK_ERROR();*/
+
+	// Aggregate table: one slot per (year, nation) combination, four ints per
+	// slot (year, c_nation, and a 64-bit sum occupying the last two ints).
+	int* res;
+	int  res_size       = ((1998 - 1992 + 1) * 25);
+	int  ht_entries     = 4; // int,int,long long
+	int  res_array_size = res_size * ht_entries;
+	CubDebugExit(g_allocator.DeviceAllocate((void**)&res, res_array_size * sizeof(int)));
+
+	CubDebugExit(cudaMemset(res, 0, res_array_size * sizeof(int)));
+
+	// Run
+	probe<BLOCK_THREADS, ITEMS_PER_THREAD>
+	    <<<(lo_len + tile_items - 1) / tile_items, BLOCK_THREADS>>>(lo_orderdate,
+	                                                                lo_orderdate_bw,
+	                                                                lo_orderdate_base,
+	                                                                lo_orderdate_offset,
+	                                                                lo_partkey,
+	                                                                lo_custkey,
+	                                                                lo_suppkey,
+	                                                                lo_revenue,
+	                                                                lo_supplycost,
+	                                                                lo_len,
+	                                                                ht_p,
+	                                                                p_len,
+	                                                                ht_s,
+	                                                                s_len,
+	                                                                ht_c,
+	                                                                c_len,
+	                                                                ht_d,
+	                                                                d_val_len,
+	                                                                res);
+
+	cudaEventRecord(stop, 0);
+	cudaEventSynchronize(stop);
+	cudaEventElapsedTime(&time_query, start, stop);
+
+	int* h_res = new int[res_array_size];
+	CubDebugExit(cudaMemcpy(h_res, res, res_array_size * sizeof(int), cudaMemcpyDeviceToHost));
+	finish                             = chrono::high_resolution_clock::now();
+	std::chrono::duration<double> diff = finish - st;
+
+	// A slot whose first int (the year) is still zero was never written.
+	ssb::SSBQuery4ResultTable result_of_query;
+	for (int i = 0; i < res_size; i++) {
+		if (h_res[4 * i] != 0) {
+			result_of_query.emplace_back(
+			    h_res[4 * i], h_res[4 * i + 1], reinterpret_cast<unsigned long long*>(&h_res[4 * i + 2])[0]);
+		}
+	}
+
+	// Release everything before asserting: a failing ASSERT_EQ returns from
+	// this function immediately and would otherwise skip the cleanup.
+	delete[] h_res;
+	CubDebugExit(g_allocator.DeviceFree(res));
+	CubDebugExit(g_allocator.DeviceFree(ht_p));
+	CubDebugExit(g_allocator.DeviceFree(ht_c));
+	CubDebugExit(g_allocator.DeviceFree(ht_s));
+	CubDebugExit(g_allocator.DeviceFree(ht_d));
+
+	ASSERT_EQ(result_of_query.size(), ssb::ssb_q41_10.reuslt.size());
+	ASSERT_EQ(result_of_query, ssb::ssb_q41_10.reuslt);
+}
+
+/**
+ * Main
+ *
+ * Loads the line-order columns, bit-packs them vector by vector (1024 values
+ * per vector), uploads the packed columns and the dimension tables to the GPU
+ * and runs the query with a <32, 8> launch configuration.
+ *
+ * Returns 0 when the result check in runQuery() passed and 1 when GoogleTest
+ * recorded an assertion failure.
+ */
+int main() {
+	auto hard_coded      = query_mtd.ssb;
+	int* h_lo_orderdate  = loadColumn<int>("lo_orderdate", LO_LEN);
+	int* h_lo_suppkey    = loadColumn<int>("lo_suppkey", LO_LEN);
+	int* h_lo_custkey    = loadColumn<int>("lo_custkey", LO_LEN);
+	int* h_lo_partkey    = loadColumn<int>("lo_partkey", LO_LEN);
+	int* h_lo_revenue    = loadColumn<int>("lo_revenue", LO_LEN);
+	int* h_lo_supplycost = loadColumn<int>("lo_supplycost", LO_LEN);
+
+	auto n_vec = hard_coded.n_vec;
+
+	// Order dates are shifted by the column minimum before packing.
+	int* tmp = new int[n_vec * 1024];
+	for (size_t i {0}; i < LO_LEN; ++i) {
+		tmp[i] = h_lo_orderdate[i] - hard_coded.lo_orderdate_min;
+	}
+
+	if (!is_sorted(h_lo_orderdate, LO_LEN)) {
+		throw std::runtime_error("should be sorted!");
+	}
+
+	// Per-vector metadata of the packed order-date column: base value, bit
+	// width, and start offset (in ints) inside the packed buffer.
+	int* h_lo_orderdate_base   = new int[n_vec];
+	int* h_lo_orderdate_bw     = new int[n_vec];
+	int* h_lo_orderdate_offset = new int[n_vec];
+
+	const int* h_enc_lo_orderdate  = new int[n_vec * 1024];
+	const int* h_enc_lo_custkey    = new int[n_vec * 1024];
+	const int* h_enc_lo_suppkey    = new int[n_vec * 1024];
+	const int* h_enc_lo_revenue    = new int[n_vec * 1024];
+	const int* h_enc_lo_partkey    = new int[n_vec * 1024];
+	const int* h_enc_lo_supplycost = new int[n_vec * 1024];
+
+	// Read cursors, advanced by one vector per loop iteration.
+	auto* orderdate_in  = const_cast<int32_t*>(tmp);
+	auto* custkey_in    = const_cast<int32_t*>(h_lo_custkey);
+	auto* suppkey_in    = const_cast<int32_t*>(h_lo_suppkey);
+	auto* revenue_in    = const_cast<int32_t*>(h_lo_revenue);
+	auto* partkey_in    = const_cast<int32_t*>(h_lo_partkey);
+	auto* supplycost_in = const_cast<int32_t*>(h_lo_supplycost);
+
+	// Write cursors into the packed buffers.
+	auto* orderdate_out  = const_cast<int32_t*>(h_enc_lo_orderdate);
+	auto* custkey_out    = const_cast<int32_t*>(h_enc_lo_custkey);
+	auto* suppkey_out    = const_cast<int32_t*>(h_enc_lo_suppkey);
+	auto* revenue_out    = const_cast<int32_t*>(h_enc_lo_revenue);
+	auto* partkey_out    = const_cast<int32_t*>(h_enc_lo_partkey);
+	auto* supplycost_out = const_cast<int32_t*>(h_enc_lo_supplycost);
+
+	// NOTE(review): these two constants presumably describe the scale-factor-10
+	// data set (index of the last vector and the number of valid rows in it) --
+	// confirm before running against a different scale factor.
+	constexpr int SF10_LAST_VECTOR_IDX = 58580;
+	constexpr int LAST_VECTOR_SIZE     = 294;
+
+	h_lo_orderdate_offset[0] = 0;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		// NOTE(review): the padding of the last vector is zeroed in place and every
+		// column is packed a full 1024 values at a time -- this assumes loadColumn()
+		// returns buffers of at least n_vec * 1024 elements; confirm.
+		if (vec_idx == SF10_LAST_VECTOR_IDX) { set_zero_after<1024>(custkey_in, LAST_VECTOR_SIZE); }
+		if (vec_idx == SF10_LAST_VECTOR_IDX) { set_zero_after<1024>(orderdate_in, LAST_VECTOR_SIZE); }
+
+		h_lo_orderdate_base[vec_idx] = find_base<1024>(orderdate_in);
+		subtract_base<1024>(orderdate_in, h_lo_orderdate_base[vec_idx]);
+		h_lo_orderdate_bw[vec_idx] = find_bw<1024>(orderdate_in);
+
+		// A vector packed with bit width bw occupies bw * 32 ints.
+		if (vec_idx + 1 < n_vec) {
+			h_lo_orderdate_offset[vec_idx + 1] = h_lo_orderdate_offset[vec_idx] + (h_lo_orderdate_bw[vec_idx] * 32);
+		}
+
+		if (h_lo_orderdate_bw[vec_idx] > 16) {
+			std::cout << h_lo_orderdate_bw[vec_idx] << " bigger than 16 is not possible in orderdate! \n";
+			exit(-2);
+		}
+
+		generated::pack::fallback::scalar::pack(orderdate_in, orderdate_out, h_lo_orderdate_bw[vec_idx]);
+		orderdate_in  = orderdate_in + 1024;
+		orderdate_out = orderdate_out + (h_lo_orderdate_bw[vec_idx] * 32);
+
+		generated::pack::fallback::scalar::pack(partkey_in, partkey_out, hard_coded.lo_partkey_bw);
+		partkey_in  = partkey_in + 1024;
+		partkey_out = partkey_out + (hard_coded.lo_partkey_bw * 32);
+
+		generated::pack::fallback::scalar::pack(supplycost_in, supplycost_out, hard_coded.lo_chosen_supplycost_bw);
+		supplycost_in  = supplycost_in + 1024;
+		supplycost_out = supplycost_out + (hard_coded.lo_chosen_supplycost_bw * 32);
+
+		generated::pack::fallback::scalar::pack(custkey_in, custkey_out, hard_coded.lo_chosen_custkey_bw);
+		custkey_in  = custkey_in + 1024;
+		custkey_out = custkey_out + (hard_coded.lo_chosen_custkey_bw * 32);
+
+		generated::pack::fallback::scalar::pack(suppkey_in, suppkey_out, hard_coded.lo_chosen_suppkey_bw);
+		suppkey_in  = suppkey_in + 1024;
+		suppkey_out = suppkey_out + (hard_coded.lo_chosen_suppkey_bw * 32);
+
+		generated::pack::fallback::scalar::pack(revenue_in, revenue_out, hard_coded.lo_revenue_bw);
+		revenue_in  = revenue_in + 1024;
+		revenue_out = revenue_out + (hard_coded.lo_revenue_bw * 32);
+	}
+
+	// Revenue is uploaded unpacked (h_lo_revenue); every other line-order
+	// column is uploaded in its packed form.
+	int* d_lo_orderdate  = loadToGPU<int32_t>(h_enc_lo_orderdate, hard_coded.n_tup_line_order, g_allocator);
+	int* d_lo_custkey    = loadToGPU<int32_t>(h_enc_lo_custkey, hard_coded.n_tup_line_order, g_allocator);
+	int* d_lo_suppkey    = loadToGPU<int32_t>(h_enc_lo_suppkey, hard_coded.n_tup_line_order, g_allocator);
+	int* d_lo_revenue    = loadToGPU<int32_t>(h_lo_revenue, hard_coded.n_tup_line_order, g_allocator);
+	int* d_lo_partkey    = loadToGPU<int32_t>(h_enc_lo_partkey, hard_coded.n_tup_line_order, g_allocator);
+	int* d_lo_supplycost = loadToGPU<int32_t>(h_enc_lo_supplycost, hard_coded.n_tup_line_order, g_allocator);
+
+	int* h_d_datekey      = loadColumn<int>("d_datekey", D_LEN);
+	int* h_d_year         = loadColumn<int>("d_year", D_LEN);
+	int* h_d_yearmonthnum = loadColumn<int>("d_yearmonthnum", D_LEN);
+
+	int* h_s_suppkey = loadColumn<int>("s_suppkey", S_LEN);
+	int* h_s_region  = loadColumn<int>("s_region", S_LEN);
+
+	int* h_p_partkey = loadColumn<int>("p_partkey", P_LEN);
+	int* h_p_mfgr    = loadColumn<int>("p_mfgr", P_LEN);
+
+	int* h_c_custkey = loadColumn<int>("c_custkey", C_LEN);
+	int* h_c_region  = loadColumn<int>("c_region", C_LEN);
+	int* h_c_nation  = loadColumn<int>("c_nation", C_LEN);
+
+	int* d_lo_orderdate_base   = loadToGPU<int32_t>(h_lo_orderdate_base, n_vec, g_allocator);
+	int* d_lo_orderdate_bw     = loadToGPU<int32_t>(h_lo_orderdate_bw, n_vec, g_allocator);
+	int* d_lo_orderdate_offset = loadToGPU<int32_t>(h_lo_orderdate_offset, n_vec, g_allocator);
+
+	int* d_d_datekey = loadToGPU<int>(h_d_datekey, D_LEN, g_allocator);
+	int* d_d_year    = loadToGPU<int>(h_d_year, D_LEN, g_allocator);
+
+	int* d_p_partkey = loadToGPU<int>(h_p_partkey, P_LEN, g_allocator);
+	int* d_p_mfgr    = loadToGPU<int>(h_p_mfgr, P_LEN, g_allocator);
+
+	int* d_s_suppkey = loadToGPU<int>(h_s_suppkey, S_LEN, g_allocator);
+	int* d_s_region  = loadToGPU<int>(h_s_region, S_LEN, g_allocator);
+
+	int* d_c_custkey = loadToGPU<int>(h_c_custkey, C_LEN, g_allocator);
+	int* d_c_region  = loadToGPU<int>(h_c_region, C_LEN, g_allocator);
+	int* d_c_nation  = loadToGPU<int>(h_c_nation, C_LEN, g_allocator);
+
+	cout << "** LOADED DATA TO GPU **" << endl;
+
+	runQuery<32, 8>(d_lo_orderdate,
+	                d_lo_orderdate_bw,
+	                d_lo_orderdate_base,
+	                d_lo_orderdate_offset,
+	                d_lo_custkey,
+	                d_lo_partkey,
+	                d_lo_suppkey,
+	                d_lo_revenue,
+	                d_lo_supplycost,
+	                LO_LEN,
+	                d_d_datekey,
+	                d_d_year,
+	                D_LEN,
+	                d_p_partkey,
+	                d_p_mfgr,
+	                P_LEN,
+	                d_s_suppkey,
+	                d_s_region,
+	                S_LEN,
+	                d_c_custkey,
+	                d_c_region,
+	                d_c_nation,
+	                C_LEN,
+	                g_allocator);
+
+	// The ASSERT_* checks in runQuery() only record a failure and return.
+	// Report it through the exit status so scripts and CI can detect a wrong
+	// query result instead of always seeing success.
+	return ::testing::Test::HasFailure() ? 1 : 0;
+}
diff --git a/fastlanes/src/ssb/fls_q41_bitpacked_opt_v4.cu b/fastlanes/src/ssb/fls_q41_bitpacked_opt_v4.cu
new file mode 100644
index 0000000..4b31103
--- /dev/null
+++ b/fastlanes/src/ssb/fls_q41_bitpacked_opt_v4.cu
@@ -0,0 +1,538 @@
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+#define SORTED
+
+#include "crystal/crystal.cuh"
+#include "crystal_ssb_utils.h"
+#include "cub/test/test_util.h"
+#include "fls_gen/unpack/unpack.cuh"
+#include "gpu_utils.h"
+#include "ssb_utils.h"
+#include "gtest/gtest.h"
+#include <cub/util_allocator.cuh>
+#include <cuda.h>
+#include <fls_gen/pack/pack.hpp>
+#include <fls_gen/unpack/hardcoded_16.cuh>
+#include <iostream>
+#include <query/query_41.hpp>
+#include <stdio.h>
+
+using namespace std;
+using namespace fastlanes::gpu;
+using namespace fastlanes;
+
+// Hard-coded metadata record for this query. Its `.ssb` member supplies the
+// fixed column bit widths that probe() reads. Presumably the scale-factor-10
+// variant of SSB Q4.1 declared in query/query_41.hpp -- TODO confirm.
+auto query_mtd = ssb::ssb_q41_10;
+
+// Probe kernel: every thread block processes one tile of
+// BLOCK_THREADS * ITEMS_PER_THREAD line-order rows.
+//
+// The bit-packed suppkey, custkey, partkey and orderdate columns are unpacked
+// one after another into the same scratch array and handed, together with the
+// matching hash table, to the BlockProbeAndPHT_* helpers, which maintain the
+// per-row selection flags (and fill c_nation / year). Rows that are still
+// selected at the end add (revenue - unpacked supplycost) to the 64-bit sum of
+// the `res` slot chosen from (c_nation, year).
+//
+// lo_custkey and lo_orderdate are each packed with a per-vector bit width,
+// base and start offset (the *_bw / *_base / *_offset arrays); suppkey,
+// partkey and supplycost use the fixed bit widths in query_mtd.ssb.
+// lo_revenue is read unpacked.
+//
+// NOTE(review): the `/ 4`, `% 4` and `* 8` factors look tied to a 256-row tile
+// over 1024-value vectors -- confirm before instantiating with other template
+// arguments.
+// NOTE(review): query_mtd is an ordinary namespace-scope variable that is read
+// from device code here -- confirm the build configuration supports that.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void probe(int* lo_orderdate,
+                      int* lo_orderdate_bw,
+                      int* lo_orderdate_base,
+                      int* lo_orderdate_offset,
+                      int* lo_partkey,
+                      int* lo_custkey,
+                      int* lo_custkey_bw,
+                      int* lo_custkey_base,
+                      int* lo_custkey_offset,
+                      int* lo_suppkey,
+                      int* lo_revenue,
+                      int* lo_supplycost,
+                      int  lo_len,
+                      int* ht_p,
+                      int  p_len,
+                      int* ht_s,
+                      int  s_len,
+                      int* ht_c,
+                      int  c_len,
+                      int* ht_d,
+                      int  d_len,
+                      int* res) {
+	// Rows covered by one thread block.
+	constexpr int kTileSize = BLOCK_THREADS * ITEMS_PER_THREAD;
+
+	int unpacked[ITEMS_PER_THREAD]; // scratch: holds whichever column was unpacked last
+	int keep[ITEMS_PER_THREAD];     // per-row selection flags
+	int c_nation[ITEMS_PER_THREAD];
+	int year[ITEMS_PER_THREAD];
+	int revenue[ITEMS_PER_THREAD];
+
+	int first_row  = blockIdx.x * kTileSize;
+	int tile_count = (lo_len + kTileSize - 1) / kTileSize;
+	// Only the final tile can be partially filled.
+	int rows_in_tile = (blockIdx.x == tile_count - 1) ? (lo_len - first_row) : kTileSize;
+
+	// Four consecutive tiles share one entry of the per-vector metadata arrays.
+	int vec_idx = blockIdx.x / 4;
+
+	InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(keep);
+
+	// Supplier key (fixed bit width).
+	int suppkey_start = blockIdx.x * query_mtd.ssb.lo_chosen_suppkey_bw * 8;
+	unpack_8_at_a_time::unpack_device(lo_suppkey + suppkey_start, unpacked, query_mtd.ssb.lo_chosen_suppkey_bw);
+	BlockProbeAndPHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(unpacked, keep, ht_s, s_len, rows_in_tile);
+
+	// Customer key: bit width, base and start offset are looked up per vector;
+	// the probe also fills c_nation.
+	int custkey_bw    = lo_custkey_bw[vec_idx];
+	int custkey_base  = lo_custkey_base[vec_idx];
+	int custkey_start = lo_custkey_offset[vec_idx] + (blockIdx.x % 4) * custkey_bw * 8;
+	unpack_8_at_a_time::unpack_device(lo_custkey + custkey_start, unpacked, custkey_bw);
+#pragma unroll
+	for (int i = 0; i < ITEMS_PER_THREAD; ++i) {
+		unpacked[i] = unpacked[i] + custkey_base;
+	}
+	BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    unpacked, c_nation, keep, ht_c, c_len, rows_in_tile);
+
+	// Part key (fixed bit width).
+	int partkey_start = blockIdx.x * query_mtd.ssb.lo_partkey_bw * 8;
+	unpack_8_at_a_time::unpack_device(lo_partkey + partkey_start, unpacked, query_mtd.ssb.lo_partkey_bw);
+	BlockProbeAndPHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(unpacked, keep, ht_p, p_len, rows_in_tile);
+
+	// Order date: per-vector bit width, base and start offset.
+	int orderdate_bw    = lo_orderdate_bw[vec_idx];
+	int orderdate_base  = lo_orderdate_base[vec_idx];
+	int orderdate_start = lo_orderdate_offset[vec_idx] + (blockIdx.x % 4) * orderdate_bw * 8;
+
+	unpack_8_at_a_time::unpack_device(lo_orderdate + orderdate_start, unpacked, orderdate_bw);
+#pragma unroll
+	for (int i = 0; i < ITEMS_PER_THREAD; ++i) {
+		unpacked[i] = unpacked[i] + orderdate_base;
+	}
+	// The key minimum passed here is 0; the probe also fills year.
+	BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    unpacked, year, keep, ht_d, d_len, 0, rows_in_tile);
+
+	// Revenue is stored unpacked and loaded under the selection flags.
+	BlockPredLoad<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    lo_revenue + first_row, revenue, rows_in_tile, keep);
+
+	// Supply cost (fixed bit width) ends up in the scratch array.
+	int supplycost_start = blockIdx.x * query_mtd.ssb.lo_chosen_supplycost_bw * 8;
+	unpack_8_at_a_time::unpack_device(
+	    lo_supplycost + supplycost_start, unpacked, query_mtd.ssb.lo_chosen_supplycost_bw);
+
+	// Aggregate: each res slot is four ints -- year, c_nation, and a 64-bit sum.
+#pragma unroll
+	for (int i = 0; i < ITEMS_PER_THREAD; ++i) {
+		const bool in_range = threadIdx.x + (BLOCK_THREADS * i) < rows_in_tile;
+		if (!in_range || !keep[i]) { continue; }
+
+		int slot          = (c_nation[i] * 7 + (year[i] - 1992)) % ((1998 - 1992 + 1) * 25);
+		res[slot * 4]     = year[i];
+		res[slot * 4 + 1] = c_nation[i];
+		atomicAdd(reinterpret_cast<unsigned long long*>(&res[slot * 4 + 2]),
+		          (long long)(revenue[i] - unpacked[i]));
+	}
+}
+
+// Builds the supplier hash table. Each thread block loads one tile of
+// `filter_col`, flags the rows whose value equals 1, then loads the matching
+// tile of `dim_key` and passes keys and flags to BlockBuildSelectivePHT_1
+// together with `hash_table` / `num_slots`.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_s(int* filter_col, int* dim_key, int num_tuples, int* hash_table, int num_slots) {
+	constexpr int kTileSize = BLOCK_THREADS * ITEMS_PER_THREAD;
+
+	int column_vals[ITEMS_PER_THREAD];
+	int keep[ITEMS_PER_THREAD];
+
+	int first_row  = blockIdx.x * kTileSize;
+	int tile_count = (num_tuples + kTileSize - 1) / kTileSize;
+	// Only the final tile can be partially filled.
+	int rows_in_tile = (blockIdx.x == tile_count - 1) ? (num_tuples - first_row) : kTileSize;
+
+	// Filter: keep rows whose filter column equals 1.
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + first_row, column_vals, rows_in_tile);
+	BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(column_vals, 1, keep, rows_in_tile);
+
+	// Reuse the buffer for the keys and hand them to the table builder.
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + first_row, column_vals, rows_in_tile);
+	BlockBuildSelectivePHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    column_vals, keep, hash_table, num_slots, rows_in_tile);
+}
+
+// Builds the part hash table. Each thread block loads one tile of
+// `filter_col`, flags the rows whose value equals 0 or 1, then loads the
+// matching tile of `dim_key` and passes keys and flags to
+// BlockBuildSelectivePHT_1 together with `hash_table` / `num_slots`.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_p(int* filter_col, int* dim_key, int num_tuples, int* hash_table, int num_slots) {
+	constexpr int kTileSize = BLOCK_THREADS * ITEMS_PER_THREAD;
+
+	int column_vals[ITEMS_PER_THREAD];
+	int keep[ITEMS_PER_THREAD];
+
+	int first_row  = blockIdx.x * kTileSize;
+	int tile_count = (num_tuples + kTileSize - 1) / kTileSize;
+	// Only the final tile can be partially filled.
+	int rows_in_tile = (blockIdx.x == tile_count - 1) ? (num_tuples - first_row) : kTileSize;
+
+	// Filter: value == 0, then OR-in value == 1.
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + first_row, column_vals, rows_in_tile);
+	BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(column_vals, 0, keep, rows_in_tile);
+	BlockPredOrEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(column_vals, 1, keep, rows_in_tile);
+
+	// Reuse the buffer for the keys and hand them to the table builder.
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + first_row, column_vals, rows_in_tile);
+	BlockBuildSelectivePHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    column_vals, keep, hash_table, num_slots, rows_in_tile);
+}
+
+// Builds the customer hash table. Each thread block loads one tile of
+// `filter_col`, flags the rows whose value equals 1, then loads the matching
+// tiles of `dim_key` and `dim_val` and passes keys, values and flags to
+// BlockBuildSelectivePHT_2 together with `hash_table` / `num_slots`.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void
+build_hashtable_c(int* filter_col, int* dim_key, int* dim_val, int num_tuples, int* hash_table, int num_slots) {
+	constexpr int kTileSize = BLOCK_THREADS * ITEMS_PER_THREAD;
+
+	int keys[ITEMS_PER_THREAD];
+	int payload[ITEMS_PER_THREAD];
+	int keep[ITEMS_PER_THREAD];
+
+	int first_row  = blockIdx.x * kTileSize;
+	int tile_count = (num_tuples + kTileSize - 1) / kTileSize;
+	// Only the final tile can be partially filled.
+	int rows_in_tile = (blockIdx.x == tile_count - 1) ? (num_tuples - first_row) : kTileSize;
+
+	// Filter: keep rows whose filter column equals 1 (the key buffer is used as scratch).
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + first_row, keys, rows_in_tile);
+	BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, 1, keep, rows_in_tile);
+
+	// Load key/value pairs and hand them to the table builder.
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + first_row, keys, rows_in_tile);
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + first_row, payload, rows_in_tile);
+	BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    keys, payload, keep, hash_table, num_slots, rows_in_tile);
+}
+
+// Builds the date hash table. No filter is applied: the flags are initialised
+// with InitFlags, then each thread block loads one tile of `dim_key` and
+// `dim_val` and passes them to BlockBuildSelectivePHT_2 together with
+// `hash_table`, `num_slots` and the key minimum `val_min`.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void
+build_hashtable_d(int* dim_key, int* dim_val, int num_tuples, int* hash_table, int num_slots, int val_min) {
+	constexpr int kTileSize = BLOCK_THREADS * ITEMS_PER_THREAD;
+
+	int keys[ITEMS_PER_THREAD];
+	int payload[ITEMS_PER_THREAD];
+	int keep[ITEMS_PER_THREAD];
+
+	int first_row  = blockIdx.x * kTileSize;
+	int tile_count = (num_tuples + kTileSize - 1) / kTileSize;
+	// Only the final tile can be partially filled.
+	int rows_in_tile = (blockIdx.x == tile_count - 1) ? (num_tuples - first_row) : kTileSize;
+
+	InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(keep);
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + first_row, keys, rows_in_tile);
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + first_row, payload, rows_in_tile);
+	BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    keys, payload, keep, hash_table, num_slots, val_min, rows_in_tile);
+}
+
+/**
+ * Builds the four dimension hash tables, runs the probe kernel over lineorder and checks the
+ * aggregated groups against ssb::ssb_q41_10. All column pointers are handed straight to kernels.
+ * Host and device scratch buffers are released before the result assertions run.
+ */
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+void runQuery(int*                         lo_orderdate,
+              int*                         lo_orderdate_bw,
+              int*                         lo_orderdate_base,
+              int*                         lo_orderdate_offset,
+              int*                         lo_custkey,
+              int*                         lo_custkey_bw,
+              int*                         lo_custkey_base,
+              int*                         lo_custkey_offset,
+              int*                         lo_partkey,
+              int*                         lo_suppkey,
+              int*                         lo_revenue,
+              int*                         lo_supplycost,
+              int                          lo_len,
+              int*                         d_datekey,
+              int*                         d_year,
+              int                          d_len,
+              int*                         p_partkey,
+              int*                         p_mfgr,
+              int                          p_len,
+              int*                         s_suppkey,
+              int*                         s_region,
+              int                          s_len,
+              int*                         c_custkey,
+              int*                         c_region,
+              int*                         c_nation,
+              int                          c_len,
+              cub::CachingDeviceAllocator& g_allocator) {
+	SETUP_TIMING();
+	float time_query;
+
+	cudaEventRecord(start, 0);
+
+	// The date table is sized by the datekey value range, the other tables by their row counts.
+	int *ht_d, *ht_c, *ht_s, *ht_p;
+	int  d_val_len = 19981230 - 19920101 + 1;
+	CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_d, 2 * d_val_len * sizeof(int)));
+	CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_s, 2 * s_len * sizeof(int)));
+	CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_c, 2 * c_len * sizeof(int)));
+	CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_p, 2 * p_len * sizeof(int)));
+
+	CubDebugExit(cudaMemset(ht_d, 0, 2 * d_val_len * sizeof(int)));
+	CubDebugExit(cudaMemset(ht_s, 0, 2 * s_len * sizeof(int)));
+	CubDebugExit(cudaMemset(ht_c, 0, 2 * c_len * sizeof(int)));
+	CubDebugExit(cudaMemset(ht_p, 0, 2 * p_len * sizeof(int)));
+
+	int tile_items = BLOCK_THREADS * ITEMS_PER_THREAD;
+	build_hashtable_s<BLOCK_THREADS, ITEMS_PER_THREAD>
+	    <<<(s_len + tile_items - 1) / tile_items, BLOCK_THREADS>>>(s_region, s_suppkey, s_len, ht_s, s_len);
+
+	// The host copies of the s/c/p tables are not read afterwards. NOTE(review): they look like debugging leftovers; kept because the checked copy also reports kernel errors.
+	int* s_res = new int[s_len * 2];
+	CubDebugExit(cudaMemcpy(s_res, ht_s, s_len * 2 * sizeof(int), cudaMemcpyDeviceToHost));
+
+	build_hashtable_c<BLOCK_THREADS, ITEMS_PER_THREAD>
+	    <<<(c_len + tile_items - 1) / tile_items, BLOCK_THREADS>>>(c_region, c_custkey, c_nation, c_len, ht_c, c_len);
+
+	int* c_res = new int[c_len * 2];
+	CubDebugExit(cudaMemcpy(c_res, ht_c, c_len * 2 * sizeof(int), cudaMemcpyDeviceToHost));
+
+	build_hashtable_p<BLOCK_THREADS, ITEMS_PER_THREAD>
+	    <<<(p_len + tile_items - 1) / tile_items, BLOCK_THREADS>>>(p_mfgr, p_partkey, p_len, ht_p, p_len);
+
+	int* p_res = new int[p_len * 2];
+	CubDebugExit(cudaMemcpy(p_res, ht_p, p_len * 2 * sizeof(int), cudaMemcpyDeviceToHost));
+
+	int d_val_min = 19920101;
+	build_hashtable_d<BLOCK_THREADS, ITEMS_PER_THREAD><<<(d_len + tile_items - 1) / tile_items, BLOCK_THREADS>>>(
+	    d_datekey, d_year, d_len, ht_d, d_val_len, d_val_min);
+
+	// One 4-int slot per result group: two int columns followed by a 64-bit aggregate.
+	int* res;
+	int  res_size       = ((1998 - 1992 + 1) * 25);
+	int  ht_entries     = 4; // int,int,long long
+	int  res_array_size = res_size * ht_entries;
+	CubDebugExit(g_allocator.DeviceAllocate((void**)&res, res_array_size * sizeof(int)));
+	CubDebugExit(cudaMemset(res, 0, res_array_size * sizeof(int)));
+
+	// Run
+	probe<BLOCK_THREADS, ITEMS_PER_THREAD>
+	    <<<(lo_len + tile_items - 1) / tile_items, BLOCK_THREADS>>>(lo_orderdate,
+	                                                                lo_orderdate_bw,
+	                                                                lo_orderdate_base,
+	                                                                lo_orderdate_offset,
+	                                                                lo_partkey,
+	                                                                lo_custkey,
+	                                                                lo_custkey_bw,
+	                                                                lo_custkey_base,
+	                                                                lo_custkey_offset,
+	                                                                lo_suppkey,
+	                                                                lo_revenue,
+	                                                                lo_supplycost,
+	                                                                lo_len,
+	                                                                ht_p,
+	                                                                p_len,
+	                                                                ht_s,
+	                                                                s_len,
+	                                                                ht_c,
+	                                                                c_len,
+	                                                                ht_d,
+	                                                                d_val_len,
+	                                                                res);
+
+	cudaEventRecord(stop, 0);
+	cudaEventSynchronize(stop);
+	cudaEventElapsedTime(&time_query, start, stop);
+
+	int* h_res = new int[res_array_size];
+	CubDebugExit(cudaMemcpy(h_res, res, res_array_size * sizeof(int), cudaMemcpyDeviceToHost));
+
+	// Slots whose first int is still 0 (the memset value) are treated as empty.
+	ssb::SSBQuery4ResultTable result_of_query;
+	for (int i = 0; i < res_size; i++) {
+		if (h_res[4 * i] != 0) {
+			result_of_query.emplace_back(
+			    h_res[4 * i], h_res[4 * i + 1], reinterpret_cast<unsigned long long*>(&h_res[4 * i + 2])[0]);
+		}
+	}
+
+	// Release everything before asserting, in case a failing ASSERT_EQ leaves the function early.
+	delete[] h_res;
+	delete[] s_res;
+	delete[] c_res;
+	delete[] p_res;
+	CubDebugExit(g_allocator.DeviceFree(res));
+	CubDebugExit(g_allocator.DeviceFree(ht_d));
+	CubDebugExit(g_allocator.DeviceFree(ht_s));
+	CubDebugExit(g_allocator.DeviceFree(ht_c));
+	CubDebugExit(g_allocator.DeviceFree(ht_p));
+
+	ASSERT_EQ(result_of_query.size(), ssb::ssb_q41_10.reuslt.size());
+	ASSERT_EQ(result_of_query, ssb::ssb_q41_10.reuslt);
+}
+
+/**
+ * Main
+ */
+// Entry point: loads the lineorder columns, bit-packs them in 1024-value vectors, uploads everything and calls runQuery<32, 8>.
+int main() {
+	auto hard_coded      = query_mtd.ssb;
+	int* h_lo_orderdate  = loadColumn<int>("lo_orderdate", LO_LEN);
+	int* h_lo_suppkey    = loadColumn<int>("lo_suppkey", LO_LEN);
+	int* h_lo_custkey    = loadColumn<int>("lo_custkey", LO_LEN);
+	int* h_lo_partkey    = loadColumn<int>("lo_partkey", LO_LEN);
+	int* h_lo_revenue    = loadColumn<int>("lo_revenue", LO_LEN);
+	int* h_lo_supplycost = loadColumn<int>("lo_supplycost", LO_LEN);
+	auto n_vec = hard_coded.n_vec;
+	// orderdate is rebased by its global minimum; value-initialized so the padding of the last vector is defined.
+	int* tmp = new int[n_vec * 1024]();
+	for (size_t i {0}; i < LO_LEN; ++i) {
+		tmp[i] = h_lo_orderdate[i] - hard_coded.lo_orderdate_min;
+	}
+
+	std::cout << "h_lo_orderdate | " << std::boolalpha << is_sorted(h_lo_orderdate, LO_LEN) << "\n";
+	// Per-vector metadata (base, bit width, offset into the packed stream) for orderdate and, below, custkey.
+	int* h_lo_orderdate_base   = new int[n_vec];
+	int* h_lo_orderdate_bw     = new int[n_vec];
+	int* h_lo_orderdate_offset = new int[n_vec];
+
+	const int* h_enc_lo_orderdate  = new int[n_vec * 1024];
+	int*       h_lo_custkey_base   = new int[n_vec];
+	int*       h_lo_custkey_bw     = new int[n_vec];
+	int*       h_lo_custkey_offset = new int[n_vec];
+
+	const int* h_enc_lo_custkey    = new int[n_vec * 1024];
+	const int* h_enc_lo_suppkey    = new int[n_vec * 1024];
+	const int* h_enc_lo_revenue    = new int[n_vec * 1024];
+	const int* h_enc_lo_partkey    = new int[n_vec * 1024];
+	const int* h_enc_lo_supplycost = new int[n_vec * 1024];
+	// Read and write cursors; each advances by one vector per loop iteration.
+	auto* orderdate_in  = const_cast<int32_t*>(tmp);
+	auto* custkey_in    = const_cast<int32_t*>(h_lo_custkey);
+	auto* suppkey_in    = const_cast<int32_t*>(h_lo_suppkey);
+	auto* revenue_in    = const_cast<int32_t*>(h_lo_revenue);
+	auto* partkey_in    = const_cast<int32_t*>(h_lo_partkey);
+	auto* supplycost_in = const_cast<int32_t*>(h_lo_supplycost);
+
+	auto* orderdate_out  = const_cast<int32_t*>(h_enc_lo_orderdate);
+	auto* custkey_out    = const_cast<int32_t*>(h_enc_lo_custkey);
+	auto* suppkey_out    = const_cast<int32_t*>(h_enc_lo_suppkey);
+	auto* revenue_out    = const_cast<int32_t*>(h_enc_lo_revenue);
+	auto* partkey_out    = const_cast<int32_t*>(h_enc_lo_partkey);
+	auto* supplycost_out = const_cast<int32_t*>(h_enc_lo_supplycost);
+	// Presumably the index of the final, partially filled vector at scale factor 10 and its number of valid values - TODO confirm.
+	constexpr int SF10_LAST_VECTOR_IDX = 58580;
+	constexpr int LAST_VECTOR_SIZE     = 294;
+	// Offsets are running sums over the vectors, so element 0 of both offset arrays has to be seeded.
+	h_lo_orderdate_offset[0] = 0;
+	h_lo_custkey_offset[0]   = 0;
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		if (vec_idx == SF10_LAST_VECTOR_IDX) { set_zero_after<1024>(custkey_in, LAST_VECTOR_SIZE); }
+		if (vec_idx == SF10_LAST_VECTOR_IDX) { set_zero_after<1024>(orderdate_in, LAST_VECTOR_SIZE); }
+		h_lo_orderdate_base[vec_idx] = find_base<1024>(orderdate_in);
+		subtract_base<1024>(orderdate_in, h_lo_orderdate_base[vec_idx]);
+		h_lo_orderdate_bw[vec_idx] = find_bw<1024>(orderdate_in);
+
+		if (vec_idx + 1 < n_vec) {
+			h_lo_orderdate_offset[vec_idx + 1] = h_lo_orderdate_offset[vec_idx] + (h_lo_orderdate_bw[vec_idx] * 32);
+		}
+
+		if (h_lo_orderdate_bw[vec_idx] > 16) {
+			std::cout << h_lo_orderdate_bw[vec_idx] << " bigger than 16 is not possible in orderdate! \n";
+			exit(-2);
+		}
+
+		generated::pack::fallback::scalar::pack(orderdate_in, orderdate_out, h_lo_orderdate_bw[vec_idx]);
+		orderdate_in  = orderdate_in + 1024;
+		orderdate_out = orderdate_out + (h_lo_orderdate_bw[vec_idx] * 32);
+		// partkey, supplycost, suppkey and revenue are packed with one hard-coded bit width for every vector.
+		generated::pack::fallback::scalar::pack(partkey_in, partkey_out, hard_coded.lo_partkey_bw);
+		partkey_in  = partkey_in + 1024;
+		partkey_out = partkey_out + (hard_coded.lo_partkey_bw * 32);
+
+		generated::pack::fallback::scalar::pack(supplycost_in, supplycost_out, hard_coded.lo_chosen_supplycost_bw);
+		supplycost_in  = supplycost_in + 1024;
+		supplycost_out = supplycost_out + (hard_coded.lo_chosen_supplycost_bw * 32);
+		// custkey gets the same per-vector base / bit-width treatment as orderdate.
+		h_lo_custkey_base[vec_idx] = find_base<1024>(custkey_in);
+		subtract_base<1024>(custkey_in, h_lo_custkey_base[vec_idx]);
+		h_lo_custkey_bw[vec_idx] = find_bw<1024>(custkey_in);
+
+		if (vec_idx + 1 < n_vec) {
+			h_lo_custkey_offset[vec_idx + 1] = h_lo_custkey_offset[vec_idx] + (h_lo_custkey_bw[vec_idx] * 32);
+		}
+
+		if (h_lo_custkey_bw[vec_idx] > 20) {
+			std::cout << vec_idx << std::endl;
+			std::cout << h_lo_custkey_bw[vec_idx] << "   bigger than 20 is not possible in custkey! \n";
+			exit(-2);
+		}
+
+		generated::pack::fallback::scalar::pack(custkey_in, custkey_out, h_lo_custkey_bw[vec_idx]);
+		custkey_in  = custkey_in + 1024;
+		custkey_out = custkey_out + (h_lo_custkey_bw[vec_idx] * 32);
+
+		generated::pack::fallback::scalar::pack(suppkey_in, suppkey_out, hard_coded.lo_chosen_suppkey_bw);
+		suppkey_in  = suppkey_in + 1024;
+		suppkey_out = suppkey_out + (hard_coded.lo_chosen_suppkey_bw * 32);
+
+		generated::pack::fallback::scalar::pack(revenue_in, revenue_out, hard_coded.lo_revenue_bw);
+		revenue_in  = revenue_in + 1024;
+		revenue_out = revenue_out + (hard_coded.lo_revenue_bw * 32);
+	}
+	// NOTE(review): revenue is uploaded unencoded (h_lo_revenue) although h_enc_lo_revenue was packed above - confirm intended.
+	int* d_lo_orderdate  = loadToGPU<int32_t>(h_enc_lo_orderdate, hard_coded.n_tup_line_order, g_allocator);
+	int* d_lo_custkey    = loadToGPU<int32_t>(h_enc_lo_custkey, hard_coded.n_tup_line_order, g_allocator);
+	int* d_lo_suppkey    = loadToGPU<int32_t>(h_enc_lo_suppkey, hard_coded.n_tup_line_order, g_allocator);
+	int* d_lo_revenue    = loadToGPU<int32_t>(h_lo_revenue, hard_coded.n_tup_line_order, g_allocator);
+	int* d_lo_partkey    = loadToGPU<int32_t>(h_enc_lo_partkey, hard_coded.n_tup_line_order, g_allocator);
+	int* d_lo_supplycost = loadToGPU<int32_t>(h_enc_lo_supplycost, hard_coded.n_tup_line_order, g_allocator);
+
+	int* h_d_datekey      = loadColumn<int>("d_datekey", D_LEN);
+	int* h_d_year         = loadColumn<int>("d_year", D_LEN);
+	int* h_d_yearmonthnum = loadColumn<int>("d_yearmonthnum", D_LEN);
+
+	int* h_s_suppkey = loadColumn<int>("s_suppkey", S_LEN);
+	int* h_s_region  = loadColumn<int>("s_region", S_LEN);
+
+	int* h_p_partkey = loadColumn<int>("p_partkey", P_LEN);
+	int* h_p_mfgr    = loadColumn<int>("p_mfgr", P_LEN);
+
+	int* h_c_custkey = loadColumn<int>("c_custkey", C_LEN);
+	int* h_c_region  = loadColumn<int>("c_region", C_LEN);
+	int* h_c_nation  = loadColumn<int>("c_nation", C_LEN);
+
+	cout << "** LOADED DATA **" << endl;
+
+	int* d_lo_orderdate_base   = loadToGPU<int32_t>(h_lo_orderdate_base, n_vec, g_allocator);
+	int* d_lo_orderdate_bw     = loadToGPU<int32_t>(h_lo_orderdate_bw, n_vec, g_allocator);
+	int* d_lo_orderdate_offset = loadToGPU<int32_t>(h_lo_orderdate_offset, n_vec, g_allocator);
+
+	int* d_lo_custkey_base   = loadToGPU<int32_t>(h_lo_custkey_base, n_vec, g_allocator);
+	int* d_lo_custkey_bw     = loadToGPU<int32_t>(h_lo_custkey_bw, n_vec, g_allocator);
+	int* d_lo_custkey_offset = loadToGPU<int32_t>(h_lo_custkey_offset, n_vec, g_allocator);
+
+	int* d_d_datekey = loadToGPU<int>(h_d_datekey, D_LEN, g_allocator);
+	int* d_d_year    = loadToGPU<int>(h_d_year, D_LEN, g_allocator);
+
+	int* d_p_partkey = loadToGPU<int>(h_p_partkey, P_LEN, g_allocator);
+	int* d_p_mfgr    = loadToGPU<int>(h_p_mfgr, P_LEN, g_allocator);
+
+	int* d_s_suppkey = loadToGPU<int>(h_s_suppkey, S_LEN, g_allocator);
+	int* d_s_region  = loadToGPU<int>(h_s_region, S_LEN, g_allocator);
+
+	int* d_c_custkey = loadToGPU<int>(h_c_custkey, C_LEN, g_allocator);
+	int* d_c_region  = loadToGPU<int>(h_c_region, C_LEN, g_allocator);
+	int* d_c_nation  = loadToGPU<int>(h_c_nation, C_LEN, g_allocator);
+
+	cout << "** LOADED DATA TO GPU **" << endl;
+
+	runQuery<32, 8>(d_lo_orderdate,
+	                d_lo_orderdate_bw,
+	                d_lo_orderdate_base,
+	                d_lo_orderdate_offset,
+	                d_lo_custkey,
+	                d_lo_custkey_bw,
+	                d_lo_custkey_base,
+	                d_lo_custkey_offset,
+	                d_lo_partkey,
+	                d_lo_suppkey,
+	                d_lo_revenue,
+	                d_lo_supplycost,
+	                LO_LEN,
+	                d_d_datekey,
+	                d_d_year,
+	                D_LEN,
+	                d_p_partkey,
+	                d_p_mfgr,
+	                P_LEN,
+	                d_s_suppkey,
+	                d_s_region,
+	                S_LEN,
+	                d_c_custkey,
+	                d_c_region,
+	                d_c_nation,
+	                C_LEN,
+	                g_allocator);
+
+	return 0;
+}
diff --git a/fastlanes/src/test_g.cu b/fastlanes/src/test_g.cu
new file mode 100644
index 0000000..50ed190
--- /dev/null
+++ b/fastlanes/src/test_g.cu
@@ -0,0 +1,209 @@
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include "cub/test/test_util.h"
+#include "data/footer/ssb/ssb.hpp"
+#include "fls_gen/unpack/hardcoded_16.cuh"
+// #include "fls_gen/unpack/unpack_fused.cuh"
+#include "crystal-opt/crystal.cuh"
+#include "gpu_utils.h"
+#include "ssb_utils.h"
+#include <crystal_ssb_utils.h>
+#include <fls_gen/pack/pack.hpp>
+#include <iostream>
+#include <stdio.h>
+
+using namespace std;
+using namespace fastlanes::gpu;
+using namespace fastlanes;
+
+constexpr uint32_t CONSTANT_1 = make_simd_const(10); // only referenced from commented-out code in QueryKernel
+constexpr uint32_t CONSTANT_2 = make_simd_const(20); // bound passed to BlockPredAndLTX in QueryKernel; presumably 20 replicated per 16-bit lane - TODO confirm make_simd_const
+
+template <int BLOCK_THREADS, int IPT>
+__global__ void QueryKernel(const int*          enc_lo_orderdate,
+                            const int*          enc_lo_discount,
+                            const int*          enc_lo_quantity,
+                            int*                lo_extendedprice,
+                            ssb::SSB            query_mtd,
+                            unsigned long long* revenue) {
+	int TILE_SIZE = BLOCK_THREADS * IPT;
+	// Experimental filter-and-sum kernel with debug instrumentation: each block decodes one tile, filters it and adds its sum to *revenue.
+	uint16_t items[IPT];
+	uint16_t selection_flags[IPT];
+	// items2 receives the extendedprice values loaded further down.
+	int items2[IPT];
+
+	long long sum = 0;
+
+	int tile_offset    = blockIdx.x * TILE_SIZE;
+	int num_tiles      = (query_mtd.n_tup_line_order + TILE_SIZE - 1) / TILE_SIZE;
+	int num_tile_items = TILE_SIZE;
+	if (blockIdx.x == num_tiles - 1) { num_tile_items = query_mtd.n_tup_line_order - tile_offset; }
+	// Decode this block's packed orderdate values (tile stride: lo_orderdate_bw * 32 ints).
+	int orderdate_tile_offset = blockIdx.x * query_mtd.lo_orderdate_bw * 32;
+	hardcoded::unpack_device(enc_lo_orderdate + orderdate_tile_offset, items, query_mtd.lo_orderdate_bw);
+	// NOTE(review): debug override - the decoded orderdate values are replaced by 0..31 and every flag is set; the bound 32 only fits IPT >= 32.
+	for (size_t i {0}; i < 32; ++i) {
+		items[i] = i;
+		selection_flags[i] = 1;
+	}
+	// BlockPredGT_int_16_2<BLOCK_THREADS, IPT>(
+	//     //
+	//     reinterpret_cast<uint32_t(&)[IPT]>(items),
+	//     CONSTANT_1,
+	//     reinterpret_cast<uint32_t(&)[IPT]>(selection_flags),
+	//     num_tile_items);
+	BlockPredAndLTX<BLOCK_THREADS, IPT>(
+	    // NOTE(review): views the uint16_t[IPT] arrays as uint32_t[IPT], i.e. twice their size - confirm BlockPredAndLTX only touches IPT / 2 words.
+	    reinterpret_cast<uint32_t(&)[IPT]>(items),
+	    CONSTANT_2,
+	    reinterpret_cast<uint32_t(&)[IPT]>(selection_flags),
+	    num_tile_items);
+	// Debug output: prints the first 32 flags, one per line, from every thread that runs the kernel.
+	for (size_t i {0}; i < 32; ++i) {
+	printf("%d\n", selection_flags[i]);
+	}
+
+
+	// Quantity filter: decode the column, then BlockPredAndLT with the constant 25.
+	int quantity_tile_offset = blockIdx.x * query_mtd.lo_quantity_bw * 32;
+	hardcoded::unpack_device(enc_lo_quantity + quantity_tile_offset, items, query_mtd.lo_quantity_bw);
+	BlockPredAndLT<uint16_t, uint16_t, BLOCK_THREADS, IPT>(items, 25, selection_flags, num_tile_items);
+	// Discount filter: decode the column, then BlockPredAndGTE with 1 and BlockPredAndLTE with 3.
+	int discount_tile_offset = blockIdx.x * query_mtd.lo_discount_bw * 32;
+	hardcoded::unpack_device(enc_lo_discount + discount_tile_offset, items, query_mtd.lo_discount_bw);
+	BlockPredAndGTE<uint16_t, uint16_t, BLOCK_THREADS, IPT>(items, 1, selection_flags, num_tile_items);
+	BlockPredAndLTE<uint16_t, uint16_t, BLOCK_THREADS, IPT>(items, 3, selection_flags, num_tile_items);
+	// BlockPredLoad is given the flags - presumably it only loads extendedprice for rows still selected; TODO confirm.
+	BlockPredLoad<int, uint16_t, BLOCK_THREADS, IPT>(
+	    lo_extendedprice + tile_offset, items2, num_tile_items, selection_flags);
+	// items still holds the discount values here, so sum accumulates discount * extendedprice over the selected rows.
+#pragma unroll
+	for (int ITEM = 0; ITEM < IPT; ++ITEM) {
+		if ((threadIdx.x + (BLOCK_THREADS * ITEM) < num_tile_items))
+			if (selection_flags[ITEM]) sum += items[ITEM] * items2[ITEM];
+	}
+
+	__syncthreads();
+	// BlockSum presumably reduces the per-thread sums across the block; thread 0 then adds the result to *revenue atomically.
+	static __shared__ long long buffer[32];
+	unsigned long long          aggregate = BlockSum<long long, BLOCK_THREADS, IPT>(sum, (long long*)buffer);
+	__syncthreads();
+
+	if (threadIdx.x == 0) { atomicAdd(revenue, aggregate); }
+}
+
+// Runs QueryKernel once, compares the summed revenue with query_mtd.result and returns the time measured with CUDA events.
+// Throws std::runtime_error on a mismatch; the device accumulator is released before that check.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+float query(int*                         lo_orderdate,
+            int*                         lo_discount,
+            int*                         lo_quantity,
+            int*                         lo_extendedprice,
+            ssb::SsbQueryMtd             query_mtd,
+            cub::CachingDeviceAllocator& g_allocator) {
+	SETUP_TIMING();
+
+	float                                     time_query;
+	chrono::high_resolution_clock::time_point st, finish;
+	st = chrono::high_resolution_clock::now();
+
+	cudaEventRecord(start, 0);
+
+	unsigned long long* d_sum = NULL;
+	CubDebugExit(g_allocator.DeviceAllocate((void**)&d_sum, sizeof(long long)));
+
+	CubDebugExit(cudaMemset(d_sum, 0, sizeof(long long)));
+
+	// NOTE(review): the launch is pinned to one block of one thread, so only the first tile is processed; num_blocks is the grid size a full run would need.
+	int tile_items = BLOCK_THREADS * ITEMS_PER_THREAD;
+	int num_blocks = (query_mtd.ssb.n_tup_line_order + tile_items - 1) / tile_items;
+	QueryKernel<BLOCK_THREADS, ITEMS_PER_THREAD>
+	    <<<1, 1>>>(lo_orderdate, lo_discount, lo_quantity, lo_extendedprice, query_mtd.ssb, d_sum);
+
+	cudaEventRecord(stop, 0);
+	cudaEventSynchronize(stop);
+	cudaEventElapsedTime(&time_query, start, stop);
+
+	unsigned long long revenue;
+	CubDebugExit(cudaMemcpy(&revenue, d_sum, sizeof(long long), cudaMemcpyDeviceToHost));
+	CLEANUP(d_sum); // released here so the throw below cannot leak it
+
+	finish                             = chrono::high_resolution_clock::now();
+	std::chrono::duration<double> diff = finish - st;
+
+	double total_time_taken {diff.count() * 1000};
+	FLS_SHOW(total_time_taken)
+
+	/*Check the result*/
+	FLS_SHOW(revenue)
+	if (revenue != query_mtd.result) { throw std::runtime_error("RESULT INCOREECT!"); }
+	FLS_SUCCESS(query_mtd.ssb.name)
+
+	return time_query;
+}
+
+// Entry point: bit-packs lo_orderdate / lo_discount / lo_quantity on the host, uploads them and runs query<1, 32> num_trials times per metadata entry.
+int main() {
+	int  num_trials  = 3;
+	auto queries_mtd = {
+	    // Metadata entries of the queries to run.
+	    ssb::ssb_q11_10,
+	    //
+	};
+	for (const auto& query_mtd : queries_mtd) {
+		auto hard_coded         = query_mtd.ssb;
+		int* h_lo_orderdate     = loadColumn<int>("lo_orderdate", LO_LEN);
+		int* h_lo_discount      = loadColumn<int>("lo_discount", LO_LEN);
+		int* h_lo_quantity      = loadColumn<int>("lo_quantity", LO_LEN);
+		int* h_lo_extendedprice = loadColumn<int>("lo_extendedprice", LO_LEN);
+		auto n_vec = hard_coded.n_vec;
+		// orderdate is rebased by its minimum; value-initialized so the padding of the last vector is defined.
+		int* tmp = new int[n_vec * 1024]();
+		for (size_t i {0}; i < LO_LEN; ++i) {
+			tmp[i] = h_lo_orderdate[i] - hard_coded.lo_orderdate_min;
+		}
+
+		const int* h_enc_lo_orderdate = new int[n_vec * 1024];
+		const int* h_enc_lo_discount  = new int[n_vec * 1024];
+		const int* h_enc_lo_quantity  = new int[n_vec * 1024];
+		// NOTE(review): discount_in / quantity_in point at the loaded columns; packing n_vec * 1024 values reads past LO_LEN unless loadColumn pads - confirm.
+		auto* orderdate_in = const_cast<const int32_t*>(tmp);
+		auto* discount_in  = const_cast<int32_t*>(h_lo_discount);
+		auto* quantity_in  = const_cast<int32_t*>(h_lo_quantity);
+
+		auto* orderdate_out = const_cast<int32_t*>(h_enc_lo_orderdate);
+		auto* discount_out  = const_cast<int32_t*>(h_enc_lo_discount);
+		auto* quantity_out  = const_cast<int32_t*>(h_enc_lo_quantity);
+		// Each iteration packs one 1024-value vector per column; the output cursor advances by bit width * 32 ints.
+		for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+			generated::pack::fallback::scalar::pack(orderdate_in, orderdate_out, hard_coded.lo_orderdate_bw);
+			orderdate_in  = orderdate_in + 1024;
+			orderdate_out = orderdate_out + (hard_coded.lo_orderdate_bw * 32);
+
+			generated::pack::fallback::scalar::pack(discount_in, discount_out, hard_coded.lo_discount_bw);
+			discount_in  = discount_in + 1024;
+			discount_out = discount_out + (hard_coded.lo_discount_bw * 32);
+
+			generated::pack::fallback::scalar::pack(quantity_in, quantity_out, hard_coded.lo_quantity_bw);
+			quantity_in  = quantity_in + 1024;
+			quantity_out = quantity_out + (hard_coded.lo_quantity_bw * 32);
+		}
+
+		FLS_LOG("LOADED DATA")
+
+		int* d_lo_orderdate     = loadToGPU<int32_t>(h_enc_lo_orderdate, hard_coded.n_tup_line_order, g_allocator);
+		int* d_lo_discount      = loadToGPU<int32_t>(h_enc_lo_discount, hard_coded.n_tup_line_order, g_allocator);
+		int* d_lo_quantity      = loadToGPU<int32_t>(h_enc_lo_quantity, hard_coded.n_tup_line_order, g_allocator);
+		int* d_lo_extendedprice = loadToGPU<int32_t>(h_lo_extendedprice, hard_coded.n_tup_line_order, g_allocator);
+
+		FLS_LOG("LOADED DATA TO GPU")
+
+		for (int n = 0; n < num_trials; n++) {
+			auto t =
+			    query<1, 32>(d_lo_orderdate, d_lo_discount, d_lo_quantity, d_lo_extendedprice, query_mtd, g_allocator);
+			FLS_RESULT(t)
+		}
+	}
+	return 0;
+}
\ No newline at end of file
diff --git a/fastlanes/src/tmp/fls_q41_bitpacked_opt_v2.cu b/fastlanes/src/tmp/fls_q41_bitpacked_opt_v2.cu
new file mode 100644
index 0000000..1b5e79b
--- /dev/null
+++ b/fastlanes/src/tmp/fls_q41_bitpacked_opt_v2.cu
@@ -0,0 +1,485 @@
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include "crystal.cuh"
+#include "crystal_ssb_utils.h"
+#include "cub/test/test_util.h"
+#include "data/footer/ssb/ssb.hpp"
+#include "fls_gen/unpack/unpack.cuh"
+#include "gpu_utils.h"
+#include "ssb_utils.h"
+#include <cub/util_allocator.cuh>
+#include <cuda.h>
+#include <curand.h>
+#include <fls_gen/pack/pack.hpp>
+#include <fls_gen/unpack/hardcoded_16.cuh>
+#include <iostream>
+#include <stdio.h>
+
+
+using namespace std;
+using namespace fastlanes::gpu;
+using namespace fastlanes;
+
+using namespace std;
+
+auto query_mtd = ssb::ssb_q13_1; // read as query_mtd.ssb.* inside the probe kernel; NOTE(review): confirm a host global is usable from device code and that the q13 entry is intended in this q41 file
+
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void probe(int* lo_orderdate,
+                      int* lo_partkey,
+                      int* lo_custkey,
+                      int* lo_suppkey,
+                      int* lo_revenue,
+                      int* lo_supplycost,
+                      int  lo_len,
+                      int* ht_p,
+                      int  p_len,
+                      int* ht_s,
+                      int  s_len,
+                      int* ht_c,
+                      int  c_len,
+                      int* ht_d,
+                      int  d_len,
+                      int* res) {
+	// Probe kernel: each block decodes one tile of lineorder, probes the four hash tables and adds revenue - supplycost to the res slot of its (year, c_nation) pair.
+	int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
+	// Per-thread working arrays; s_nation is declared but never used in this kernel.
+	int items[ITEMS_PER_THREAD];
+	int selection_flags[ITEMS_PER_THREAD];
+	int c_nation[ITEMS_PER_THREAD];
+	int s_nation[ITEMS_PER_THREAD];
+	int year[ITEMS_PER_THREAD];
+	int revenue[ITEMS_PER_THREAD];
+
+	int tile_offset    = blockIdx.x * TILE_SIZE;
+	int num_tiles      = (lo_len + TILE_SIZE - 1) / TILE_SIZE;
+	int num_tile_items = TILE_SIZE;
+	// The last tile may be partial.
+	if (blockIdx.x == num_tiles - 1) { num_tile_items = lo_len - tile_offset; }
+
+	InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags);
+	// Each stage decodes one packed column (tile stride: bit width * 8 ints), probes a table and returns once IsTerm reports true - presumably when no row is still selected.
+	int suppkey_tile_offset = blockIdx.x * query_mtd.ssb.lo_chosen_suppkey_bw * 8;
+	unpack_8_at_a_time::unpack_device(lo_suppkey + suppkey_tile_offset, items, query_mtd.ssb.lo_chosen_suppkey_bw);
+	BlockProbeAndPHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, selection_flags, ht_s, s_len, num_tile_items);
+	if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags)) { return; }
+	// The customer probe is also handed c_nation - presumably filled with the value stored next to the key.
+	int custkey_tile_offset = blockIdx.x * query_mtd.ssb.lo_chosen_custkey_bw * 8;
+	unpack_8_at_a_time::unpack_device(lo_custkey + custkey_tile_offset, items, query_mtd.ssb.lo_chosen_custkey_bw);
+	BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    items, c_nation, selection_flags, ht_c, c_len, num_tile_items);
+	if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags)) { return; }
+
+	int partkey_tile_offset = blockIdx.x * query_mtd.ssb.lo_partkey_bw * 8;
+	unpack_8_at_a_time::unpack_device(lo_partkey + partkey_tile_offset, items, query_mtd.ssb.lo_partkey_bw);
+	BlockProbeAndPHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, selection_flags, ht_p, p_len, num_tile_items);
+	if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags)) { return; }
+	// The date probe is handed year; 0 is passed where build_hashtable_d passes val_min - NOTE(review): confirm the packed orderdate is already rebased.
+	int orderdate_tile_offset = blockIdx.x * query_mtd.ssb.lo_orderdate_bw * 8;
+	unpack_8_at_a_time::unpack_device(lo_orderdate + orderdate_tile_offset, items, query_mtd.ssb.lo_orderdate_bw);
+	BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    items, year, selection_flags, ht_d, d_len, 0, num_tile_items);
+	if (IsTerm<int, BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags)) { return; }
+	// Revenue is read unencoded from lo_revenue; supplycost is decoded into items.
+	BlockPredLoad<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    lo_revenue + tile_offset, revenue, num_tile_items, selection_flags);
+
+	int supplycost_tile_offset = blockIdx.x * query_mtd.ssb.lo_chosen_supplycost_bw * 8;
+	unpack_8_at_a_time::unpack_device(lo_supplycost + supplycost_tile_offset, items, query_mtd.ssb.lo_chosen_supplycost_bw);
+	// Aggregation: the two key columns are stored with plain writes, the sum with a 64-bit atomicAdd.
+#pragma unroll
+	for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
+		if (threadIdx.x + (BLOCK_THREADS * ITEM) < num_tile_items) {
+			if (selection_flags[ITEM]) {
+				int hash          = (c_nation[ITEM] * 7 + (year[ITEM] - 1992)) % ((1998 - 1992 + 1) * 25);
+				res[hash * 4]     = year[ITEM];
+				res[hash * 4 + 1] = c_nation[ITEM];
+				/*atomicAdd(&res[hash * 4 + 2], (1));*/
+				/*atomicAdd(reinterpret_cast<unsigned long long*>(&res[hash * 4 + 2]), (long long)(1));*/
+				atomicAdd(reinterpret_cast<unsigned long long*>(&res[hash * 4 + 2]),
+				          (long long)(revenue[ITEM] - items[ITEM]));
+			}
+		}
+	}
+}
+
+// Builds the key-only hash table of a dimension table from the rows that pass its filter column; one tile per thread block.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_s(int* filter_col, int* dim_key, int num_tuples, int* hash_table, int num_slots) {
+	constexpr int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
+
+	int keys[ITEMS_PER_THREAD]; // first used as scratch space for the filter column
+	int flags[ITEMS_PER_THREAD];
+
+	const int tile_offset = blockIdx.x * TILE_SIZE;
+	const int last_tile   = (num_tuples + TILE_SIZE - 1) / TILE_SIZE - 1;
+	// Only the last tile can be partial; it covers whatever rows remain.
+	const int num_tile_items = (blockIdx.x == last_tile) ? (num_tuples - tile_offset) : TILE_SIZE;
+
+	// Filter pass: BlockPredEQ is given the constant 1 - presumably it flags rows whose filter value is 1.
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + tile_offset, keys, num_tile_items);
+	BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, 1, flags, num_tile_items);
+	// Build pass: load the keys and hand the flagged rows to the hash-table builder.
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_offset, keys, num_tile_items);
+	BlockBuildSelectivePHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, flags, hash_table, num_slots, num_tile_items);
+}
+
+// Builds the key-only hash table of a dimension table from the rows that pass a two-value filter; one tile per thread block.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void build_hashtable_p(int* filter_col, int* dim_key, int num_tuples, int* hash_table, int num_slots) {
+	constexpr int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
+
+	int keys[ITEMS_PER_THREAD]; // first used as scratch space for the filter column
+	int flags[ITEMS_PER_THREAD];
+
+	const int tile_offset = blockIdx.x * TILE_SIZE;
+	const int last_tile   = (num_tuples + TILE_SIZE - 1) / TILE_SIZE - 1;
+	// Only the last tile can be partial; it covers whatever rows remain.
+	const int num_tile_items = (blockIdx.x == last_tile) ? (num_tuples - tile_offset) : TILE_SIZE;
+
+	// Filter pass: BlockPredEQ with 0 followed by BlockPredOrEQ with 1 - presumably flags rows whose filter value is 0 or 1.
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + tile_offset, keys, num_tile_items);
+	BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, 0, flags, num_tile_items);
+	BlockPredOrEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, 1, flags, num_tile_items);
+	// Build pass: load the keys and hand the flagged rows to the hash-table builder.
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_offset, keys, num_tile_items);
+	BlockBuildSelectivePHT_1<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, flags, hash_table, num_slots, num_tile_items);
+}
+
+// Builds the (key, value) hash table of a dimension table from the rows that pass its filter column.
+// Every thread block handles one tile of BLOCK_THREADS * ITEMS_PER_THREAD rows.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void
+build_hashtable_c(int* filter_col, int* dim_key, int* dim_val, int num_tuples, int* hash_table, int num_slots) {
+	constexpr int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
+	int keys[ITEMS_PER_THREAD]; // first used as scratch space for the filter column
+	int vals[ITEMS_PER_THREAD];
+	int flags[ITEMS_PER_THREAD];
+
+	const int tile_offset = blockIdx.x * TILE_SIZE;
+	const int last_tile   = (num_tuples + TILE_SIZE - 1) / TILE_SIZE - 1;
+	// Only the last tile can be partial; it covers whatever rows remain.
+	const int num_tile_items = (blockIdx.x == last_tile) ? (num_tuples - tile_offset) : TILE_SIZE;
+
+	// Filter pass: BlockPredEQ is given the constant 1 - presumably it flags rows whose filter value is 1.
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(filter_col + tile_offset, keys, num_tile_items);
+	BlockPredEQ<int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, 1, flags, num_tile_items);
+	// Build pass: load keys and values, then hand the flagged rows to the hash-table builder.
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_offset, keys, num_tile_items);
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_offset, vals, num_tile_items);
+	BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, vals, flags, hash_table, num_slots, num_tile_items);
+}
+
+// Builds the (key, value) hash table of a dimension table from all of its rows (there is no filter column).
+// val_min is forwarded to BlockBuildSelectivePHT_2 - presumably the smallest key, used as an offset; TODO confirm.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void
+build_hashtable_d(int* dim_key, int* dim_val, int num_tuples, int* hash_table, int num_slots, int val_min) {
+	constexpr int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
+
+	int keys[ITEMS_PER_THREAD];
+	int vals[ITEMS_PER_THREAD];
+	int flags[ITEMS_PER_THREAD];
+
+	const int tile_offset = blockIdx.x * TILE_SIZE;
+	const int last_tile   = (num_tuples + TILE_SIZE - 1) / TILE_SIZE - 1;
+	// Only the last tile can be partial; it covers whatever rows remain.
+	const int num_tile_items = (blockIdx.x == last_tile) ? (num_tuples - tile_offset) : TILE_SIZE;
+
+	InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(flags); // no predicate here; presumably marks every row as selected
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_offset, keys, num_tile_items);
+	BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_offset, vals, num_tile_items);
+	BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, vals, flags, hash_table, num_slots, val_min, num_tile_items);
+}
+
+/**
+ * Runs one timed execution of the query on columns that already live on the GPU.
+ *
+ * Builds the supplier, customer, part and date hash tables with the
+ * build_hashtable_* kernels, launches probe over the lineorder columns, copies
+ * the aggregate array back to the host and prints every non-empty group.
+ *
+ * Every column pointer is handed directly to a kernel launch, so all of them
+ * must be device-accessible.
+ *
+ * @param lo_orderdate,lo_custkey,lo_partkey,lo_suppkey,lo_revenue,lo_supplycost
+ *        lineorder columns; lo_len sizes the probe launch.
+ * @param d_datekey,d_year  date columns, d_len rows.
+ * @param p_partkey,p_mfgr  part columns, p_len rows.
+ * @param s_suppkey,s_region  supplier columns, s_len rows.
+ * @param c_custkey,c_region,c_nation  customer columns, c_len rows.
+ * @param g_allocator  allocator that provides, and on exit takes back, the
+ *        scratch hash tables and the result array.
+ * @return elapsed time between the start and stop CUDA events (milliseconds,
+ *         as reported by cudaEventElapsedTime).
+ */
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+float runQuery(int*                         lo_orderdate,
+               int*                         lo_custkey,
+               int*                         lo_partkey,
+               int*                         lo_suppkey,
+               int*                         lo_revenue,
+               int*                         lo_supplycost,
+               int                          lo_len,
+               int*                         d_datekey,
+               int*                         d_year,
+               int                          d_len,
+               int*                         p_partkey,
+               int*                         p_mfgr,
+               int                          p_len,
+               int*                         s_suppkey,
+               int*                         s_region,
+               int                          s_len,
+               int*                         c_custkey,
+               int*                         c_region,
+               int*                         c_nation,
+               int                          c_len,
+               cub::CachingDeviceAllocator& g_allocator) {
+	SETUP_TIMING();
+	// start and stop (used below) are not declared in this function; presumably SETUP_TIMING provides them.
+
+	float                                     time_query;
+	chrono::high_resolution_clock::time_point st, finish;
+	st = chrono::high_resolution_clock::now();
+
+	cudaEventRecord(start, 0);
+
+	// Every table reserves two ints per slot; the date table is sized by the
+	// date-key span (d_val_len) rather than by d_len.
+	int *ht_d, *ht_c, *ht_s, *ht_p;
+	int  d_val_len = 19981230 - 19920101 + 1;
+	CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_d, 2 * d_val_len * sizeof(int)));
+	CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_s, 2 * s_len * sizeof(int)));
+	CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_c, 2 * c_len * sizeof(int)));
+	CubDebugExit(g_allocator.DeviceAllocate((void**)&ht_p, 2 * p_len * sizeof(int)));
+
+	// Zero-fill the tables before building.
+	CubDebugExit(cudaMemset(ht_d, 0, 2 * d_val_len * sizeof(int)));
+	CubDebugExit(cudaMemset(ht_s, 0, 2 * s_len * sizeof(int)));
+	CubDebugExit(cudaMemset(ht_c, 0, 2 * c_len * sizeof(int)));
+	CubDebugExit(cudaMemset(ht_p, 0, 2 * p_len * sizeof(int)));
+
+	// Every launch below uses one block per tile_items rows.
+	int tile_items = BLOCK_THREADS * ITEMS_PER_THREAD;
+	build_hashtable_s<BLOCK_THREADS, ITEMS_PER_THREAD>
+	    <<<(s_len + tile_items - 1) / tile_items, BLOCK_THREADS>>>(s_region, s_suppkey, s_len, ht_s, s_len);
+	/*CHECK_ERROR();*/
+
+	// NOTE(review): s_res, c_res and p_res are never read. The blocking copies are
+	// kept because they sit inside the timed region; the buffers are freed below.
+	int* s_res = new int[s_len * 2];
+	CubDebugExit(cudaMemcpy(s_res, ht_s, s_len * 2 * sizeof(int), cudaMemcpyDeviceToHost));
+
+	build_hashtable_c<BLOCK_THREADS, ITEMS_PER_THREAD>
+	    <<<(c_len + tile_items - 1) / tile_items, BLOCK_THREADS>>>(c_region, c_custkey, c_nation, c_len, ht_c, c_len);
+	/*CHECK_ERROR();*/
+
+	int* c_res = new int[c_len * 2];
+	CubDebugExit(cudaMemcpy(c_res, ht_c, c_len * 2 * sizeof(int), cudaMemcpyDeviceToHost));
+
+	build_hashtable_p<BLOCK_THREADS, ITEMS_PER_THREAD>
+	    <<<(p_len + tile_items - 1) / tile_items, BLOCK_THREADS>>>(p_mfgr, p_partkey, p_len, ht_p, p_len);
+	/*CHECK_ERROR();*/
+
+	int* p_res = new int[p_len * 2];
+	CubDebugExit(cudaMemcpy(p_res, ht_p, p_len * 2 * sizeof(int), cudaMemcpyDeviceToHost));
+
+	int d_val_min = 19920101;
+	build_hashtable_d<BLOCK_THREADS, ITEMS_PER_THREAD><<<(d_len + tile_items - 1) / tile_items, BLOCK_THREADS>>>(
+	    d_datekey, d_year, d_len, ht_d, d_val_len, d_val_min);
+	/*CHECK_ERROR();*/
+
+	// Result array: res_size entries of ht_entries ints each, zeroed before the probe.
+	int* res;
+	int  res_size       = ((1998 - 1992 + 1) * 25);
+	int  ht_entries     = 4; // int,int,long long
+	int  res_array_size = res_size * ht_entries;
+	CubDebugExit(g_allocator.DeviceAllocate((void**)&res, res_array_size * sizeof(int)));
+
+	CubDebugExit(cudaMemset(res, 0, res_array_size * sizeof(int)));
+
+	// Run
+	probe<BLOCK_THREADS, ITEMS_PER_THREAD><<<(lo_len + tile_items - 1) / tile_items, BLOCK_THREADS>>>(
+	    lo_orderdate, lo_partkey, lo_custkey, lo_suppkey, lo_revenue, lo_supplycost, lo_len,
+	    ht_p, p_len, ht_s, s_len, ht_c, c_len, ht_d, d_val_len, res);
+
+	// Stop the GPU timer; the host-side clock keeps running until the results are copied back.
+	cudaEventRecord(stop, 0);
+	cudaEventSynchronize(stop);
+	cudaEventElapsedTime(&time_query, start, stop);
+
+	int* h_res = new int[res_array_size];
+	CubDebugExit(cudaMemcpy(h_res, res, res_array_size * sizeof(int), cudaMemcpyDeviceToHost));
+	finish                             = chrono::high_resolution_clock::now();
+	std::chrono::duration<double> diff = finish - st;
+
+	// An entry is printed when its first int is non-zero; ints 2 and 3 of the
+	// entry are read together as one unsigned 64-bit value.
+	cout << "Result:" << endl;
+	int res_count = 0;
+	for (int i = 0; i < res_size; i++) {
+		if (h_res[4 * i] != 0) {
+			cout << h_res[4 * i] << " " << h_res[4 * i + 1] << " "
+			     << reinterpret_cast<unsigned long long*>(&h_res[4 * i + 2])[0] << endl;
+			res_count += 1;
+		}
+	}
+
+	cout << "Res Count: " << res_count << endl;
+	cout << "Time Taken Total: " << diff.count() * 1000 << endl;
+
+	// Release everything acquired above. runQuery is called once per trial, so
+	// anything left allocated here would accumulate across trials.
+	delete[] h_res;
+	delete[] s_res;
+	delete[] c_res;
+	delete[] p_res;
+	CubDebugExit(g_allocator.DeviceFree(res));
+	CubDebugExit(g_allocator.DeviceFree(ht_d));
+	CubDebugExit(g_allocator.DeviceFree(ht_s));
+	CubDebugExit(g_allocator.DeviceFree(ht_c));
+	CubDebugExit(g_allocator.DeviceFree(ht_p));
+
+	return time_query;
+}
+
+/**
+ * Main: loads the SSB columns, bit-packs the lineorder columns on the host, uploads everything and runs the query num_trials times.
+ */
+int main() {
+	int num_trials = 3;
+	// NOTE(review): the metadata identifier says q11, yet the JSON line printed per trial reports query 41 — confirm this is intended.
+	auto query_mtd = ssb::ssb_q11_10;
+
+	auto hard_coded      = query_mtd.ssb;
+	int* h_lo_orderdate  = loadColumn<int>("lo_orderdate", LO_LEN);
+	int* h_lo_suppkey    = loadColumn<int>("lo_suppkey", LO_LEN);
+	int* h_lo_custkey    = loadColumn<int>("lo_custkey", LO_LEN);
+	int* h_lo_partkey    = loadColumn<int>("lo_partkey", LO_LEN);
+	int* h_lo_revenue    = loadColumn<int>("lo_revenue", LO_LEN);
+	int* h_lo_supplycost = loadColumn<int>("lo_supplycost", LO_LEN);
+
+	auto n_vec = hard_coded.n_vec;
+	// tmp receives lo_orderdate minus lo_orderdate_min. It has n_vec * 1024 slots but LO_LEN are written: assumes LO_LEN <= n_vec * 1024 — TODO confirm.
+	int* tmp = new int[n_vec * 1024];
+	for (size_t i {0}; i < LO_LEN; ++i) {
+		tmp[i] = h_lo_orderdate[i] - hard_coded.lo_orderdate_min;
+	}
+	// Output buffers for the packed lineorder columns, n_vec * 1024 ints each.
+	const int* h_enc_lo_orderdate  = new int[n_vec * 1024];
+	const int* h_enc_lo_custkey    = new int[n_vec * 1024];
+	const int* h_enc_lo_suppkey    = new int[n_vec * 1024];
+	const int* h_enc_lo_revenue    = new int[n_vec * 1024];
+	const int* h_enc_lo_partkey    = new int[n_vec * 1024];
+	const int* h_enc_lo_supplycost = new int[n_vec * 1024];
+	// Read cursors for pack(): only orderdate reads the rebased copy in tmp; the other columns are packed as loaded.
+	auto* orderdate_in  = const_cast<const int32_t*>(tmp);
+	auto* custkey_in    = const_cast<int32_t*>(h_lo_custkey);
+	auto* suppkey_in    = const_cast<int32_t*>(h_lo_suppkey);
+	auto* revenue_in    = const_cast<int32_t*>(h_lo_revenue);
+	auto* partkey_in    = const_cast<int32_t*>(h_lo_partkey);
+	auto* supplycost_in = const_cast<int32_t*>(h_lo_supplycost);
+	// Write cursors for pack(): const is cast away so the h_enc_* buffers can be filled.
+	auto* orderdate_out  = const_cast<int32_t*>(h_enc_lo_orderdate);
+	auto* custkey_out    = const_cast<int32_t*>(h_enc_lo_custkey);
+	auto* suppkey_out    = const_cast<int32_t*>(h_enc_lo_suppkey);
+	auto* revenue_out    = const_cast<int32_t*>(h_enc_lo_revenue);
+	auto* partkey_out    = const_cast<int32_t*>(h_enc_lo_partkey);
+	auto* supplycost_out = const_cast<int32_t*>(h_enc_lo_supplycost);
+	// One pack() call per column per iteration: inputs advance by 1024 values, outputs by bw * 32 ints (presumably one 1024-value vector per call — confirm against pack()).
+	for (uint64_t vec_idx {0}; vec_idx < n_vec; vec_idx++) {
+		generated::pack::fallback::scalar::pack(orderdate_in, orderdate_out, hard_coded.lo_orderdate_bw);
+		orderdate_in  = orderdate_in + 1024;
+		orderdate_out = orderdate_out + (hard_coded.lo_orderdate_bw * 32);
+
+		generated::pack::fallback::scalar::pack(partkey_in, partkey_out, hard_coded.lo_partkey_bw);
+		partkey_in  = partkey_in + 1024;
+		partkey_out = partkey_out + (hard_coded.lo_partkey_bw * 32);
+
+		generated::pack::fallback::scalar::pack(supplycost_in, supplycost_out, hard_coded.lo_chosen_supplycost_bw);
+		supplycost_in  = supplycost_in + 1024;
+		supplycost_out = supplycost_out + (hard_coded.lo_chosen_supplycost_bw * 32);
+
+		generated::pack::fallback::scalar::pack(custkey_in, custkey_out, hard_coded.lo_chosen_custkey_bw);
+		custkey_in  = custkey_in + 1024;
+		custkey_out = custkey_out + (hard_coded.lo_chosen_custkey_bw * 32);
+
+		generated::pack::fallback::scalar::pack(suppkey_in, suppkey_out, hard_coded.lo_chosen_suppkey_bw);
+		suppkey_in  = suppkey_in + 1024;
+		suppkey_out = suppkey_out + (hard_coded.lo_chosen_suppkey_bw * 32);
+
+		generated::pack::fallback::scalar::pack(revenue_in, revenue_out, hard_coded.lo_revenue_bw);
+		revenue_in  = revenue_in + 1024;
+		revenue_out = revenue_out + (hard_coded.lo_revenue_bw * 32);
+	}
+	// NOTE(review): revenue is uploaded from the unpacked h_lo_revenue; h_enc_lo_revenue is filled above but not used afterwards — confirm this is deliberate.
+	int* d_lo_orderdate  = loadToGPU<int32_t>(h_enc_lo_orderdate, hard_coded.n_tup_line_order, g_allocator);
+	int* d_lo_custkey    = loadToGPU<int32_t>(h_enc_lo_custkey, hard_coded.n_tup_line_order, g_allocator);
+	int* d_lo_suppkey    = loadToGPU<int32_t>(h_enc_lo_suppkey, hard_coded.n_tup_line_order, g_allocator);
+	int* d_lo_revenue    = loadToGPU<int32_t>(h_lo_revenue, hard_coded.n_tup_line_order, g_allocator);
+	int* d_lo_partkey    = loadToGPU<int32_t>(h_enc_lo_partkey, hard_coded.n_tup_line_order, g_allocator);
+	int* d_lo_supplycost = loadToGPU<int32_t>(h_enc_lo_supplycost, hard_coded.n_tup_line_order, g_allocator);
+	// Dimension columns. NOTE(review): h_d_yearmonthnum is loaded but not referenced again in this function.
+	int* h_d_datekey      = loadColumn<int>("d_datekey", D_LEN);
+	int* h_d_year         = loadColumn<int>("d_year", D_LEN);
+	int* h_d_yearmonthnum = loadColumn<int>("d_yearmonthnum", D_LEN);
+
+	int* h_s_suppkey = loadColumn<int>("s_suppkey", S_LEN);
+	int* h_s_region  = loadColumn<int>("s_region", S_LEN);
+
+	int* h_p_partkey = loadColumn<int>("p_partkey", P_LEN);
+	int* h_p_mfgr    = loadColumn<int>("p_mfgr", P_LEN);
+
+	int* h_c_custkey = loadColumn<int>("c_custkey", C_LEN);
+	int* h_c_region  = loadColumn<int>("c_region", C_LEN);
+	int* h_c_nation  = loadColumn<int>("c_nation", C_LEN);
+
+	cout << "** LOADED DATA **" << endl;
+	// Dimension columns are uploaded as loaded, without packing.
+	int* d_d_datekey = loadToGPU<int>(h_d_datekey, D_LEN, g_allocator);
+	int* d_d_year    = loadToGPU<int>(h_d_year, D_LEN, g_allocator);
+
+	int* d_p_partkey = loadToGPU<int>(h_p_partkey, P_LEN, g_allocator);
+	int* d_p_mfgr    = loadToGPU<int>(h_p_mfgr, P_LEN, g_allocator);
+
+	int* d_s_suppkey = loadToGPU<int>(h_s_suppkey, S_LEN, g_allocator);
+	int* d_s_region  = loadToGPU<int>(h_s_region, S_LEN, g_allocator);
+
+	int* d_c_custkey = loadToGPU<int>(h_c_custkey, C_LEN, g_allocator);
+	int* d_c_region  = loadToGPU<int>(h_c_region, C_LEN, g_allocator);
+	int* d_c_nation  = loadToGPU<int>(h_c_nation, C_LEN, g_allocator);
+
+	cout << "** LOADED DATA TO GPU **" << endl;
+	// Each trial re-runs the whole query and prints one JSON line with the time runQuery returns.
+	for (int t = 0; t < num_trials; t++) {
+		float time_query;
+
+		time_query = runQuery<32, 8>(d_lo_orderdate,
+		                              d_lo_custkey,
+		                              d_lo_partkey,
+		                              d_lo_suppkey,
+		                              d_lo_revenue,
+		                              d_lo_supplycost,
+		                              LO_LEN,
+		                              d_d_datekey,
+		                              d_d_year,
+		                              D_LEN,
+		                              d_p_partkey,
+		                              d_p_mfgr,
+		                              P_LEN,
+		                              d_s_suppkey,
+		                              d_s_region,
+		                              S_LEN,
+		                              d_c_custkey,
+		                              d_c_region,
+		                              d_c_nation,
+		                              C_LEN,
+		                              g_allocator);
+		cout << "{"
+		     << "\"query\":41"
+		     << ",\"time_query\":" << time_query << "}" << endl;
+	}
+
+	return 0;
+}
diff --git a/fastlanes/src/transpose.cpp b/fastlanes/src/transpose.cpp
new file mode 100644
index 0000000..7a09819
--- /dev/null
+++ b/fastlanes/src/transpose.cpp
@@ -0,0 +1,8215 @@
+// generated!
+// NOLINTBEGIN
+#include "fls_gen/macros.hpp"
+#include "fls_gen/transpose/transpose.hpp"
+namespace generated { namespace transpose::fallback { namespace scalar {
+void transpose_i(const uint8_t* __restrict a_in_p, uint8_t* __restrict a_out_p) {
+	a_out_p[0]    = a_in_p[0];
+	a_out_p[1]    = a_in_p[64];
+	a_out_p[2]    = a_in_p[128];
+	a_out_p[3]    = a_in_p[192];
+	a_out_p[4]    = a_in_p[256];
+	a_out_p[5]    = a_in_p[320];
+	a_out_p[6]    = a_in_p[384];
+	a_out_p[7]    = a_in_p[448];
+	a_out_p[8]    = a_in_p[512];
+	a_out_p[9]    = a_in_p[576];
+	a_out_p[10]   = a_in_p[640];
+	a_out_p[11]   = a_in_p[704];
+	a_out_p[12]   = a_in_p[768];
+	a_out_p[13]   = a_in_p[832];
+	a_out_p[14]   = a_in_p[896];
+	a_out_p[15]   = a_in_p[960];
+	a_out_p[16]   = a_in_p[32];
+	a_out_p[17]   = a_in_p[96];
+	a_out_p[18]   = a_in_p[160];
+	a_out_p[19]   = a_in_p[224];
+	a_out_p[20]   = a_in_p[288];
+	a_out_p[21]   = a_in_p[352];
+	a_out_p[22]   = a_in_p[416];
+	a_out_p[23]   = a_in_p[480];
+	a_out_p[24]   = a_in_p[544];
+	a_out_p[25]   = a_in_p[608];
+	a_out_p[26]   = a_in_p[672];
+	a_out_p[27]   = a_in_p[736];
+	a_out_p[28]   = a_in_p[800];
+	a_out_p[29]   = a_in_p[864];
+	a_out_p[30]   = a_in_p[928];
+	a_out_p[31]   = a_in_p[992];
+	a_out_p[32]   = a_in_p[16];
+	a_out_p[33]   = a_in_p[80];
+	a_out_p[34]   = a_in_p[144];
+	a_out_p[35]   = a_in_p[208];
+	a_out_p[36]   = a_in_p[272];
+	a_out_p[37]   = a_in_p[336];
+	a_out_p[38]   = a_in_p[400];
+	a_out_p[39]   = a_in_p[464];
+	a_out_p[40]   = a_in_p[528];
+	a_out_p[41]   = a_in_p[592];
+	a_out_p[42]   = a_in_p[656];
+	a_out_p[43]   = a_in_p[720];
+	a_out_p[44]   = a_in_p[784];
+	a_out_p[45]   = a_in_p[848];
+	a_out_p[46]   = a_in_p[912];
+	a_out_p[47]   = a_in_p[976];
+	a_out_p[48]   = a_in_p[48];
+	a_out_p[49]   = a_in_p[112];
+	a_out_p[50]   = a_in_p[176];
+	a_out_p[51]   = a_in_p[240];
+	a_out_p[52]   = a_in_p[304];
+	a_out_p[53]   = a_in_p[368];
+	a_out_p[54]   = a_in_p[432];
+	a_out_p[55]   = a_in_p[496];
+	a_out_p[56]   = a_in_p[560];
+	a_out_p[57]   = a_in_p[624];
+	a_out_p[58]   = a_in_p[688];
+	a_out_p[59]   = a_in_p[752];
+	a_out_p[60]   = a_in_p[816];
+	a_out_p[61]   = a_in_p[880];
+	a_out_p[62]   = a_in_p[944];
+	a_out_p[63]   = a_in_p[1008];
+	a_out_p[64]   = a_in_p[8];
+	a_out_p[65]   = a_in_p[72];
+	a_out_p[66]   = a_in_p[136];
+	a_out_p[67]   = a_in_p[200];
+	a_out_p[68]   = a_in_p[264];
+	a_out_p[69]   = a_in_p[328];
+	a_out_p[70]   = a_in_p[392];
+	a_out_p[71]   = a_in_p[456];
+	a_out_p[72]   = a_in_p[520];
+	a_out_p[73]   = a_in_p[584];
+	a_out_p[74]   = a_in_p[648];
+	a_out_p[75]   = a_in_p[712];
+	a_out_p[76]   = a_in_p[776];
+	a_out_p[77]   = a_in_p[840];
+	a_out_p[78]   = a_in_p[904];
+	a_out_p[79]   = a_in_p[968];
+	a_out_p[80]   = a_in_p[40];
+	a_out_p[81]   = a_in_p[104];
+	a_out_p[82]   = a_in_p[168];
+	a_out_p[83]   = a_in_p[232];
+	a_out_p[84]   = a_in_p[296];
+	a_out_p[85]   = a_in_p[360];
+	a_out_p[86]   = a_in_p[424];
+	a_out_p[87]   = a_in_p[488];
+	a_out_p[88]   = a_in_p[552];
+	a_out_p[89]   = a_in_p[616];
+	a_out_p[90]   = a_in_p[680];
+	a_out_p[91]   = a_in_p[744];
+	a_out_p[92]   = a_in_p[808];
+	a_out_p[93]   = a_in_p[872];
+	a_out_p[94]   = a_in_p[936];
+	a_out_p[95]   = a_in_p[1000];
+	a_out_p[96]   = a_in_p[24];
+	a_out_p[97]   = a_in_p[88];
+	a_out_p[98]   = a_in_p[152];
+	a_out_p[99]   = a_in_p[216];
+	a_out_p[100]  = a_in_p[280];
+	a_out_p[101]  = a_in_p[344];
+	a_out_p[102]  = a_in_p[408];
+	a_out_p[103]  = a_in_p[472];
+	a_out_p[104]  = a_in_p[536];
+	a_out_p[105]  = a_in_p[600];
+	a_out_p[106]  = a_in_p[664];
+	a_out_p[107]  = a_in_p[728];
+	a_out_p[108]  = a_in_p[792];
+	a_out_p[109]  = a_in_p[856];
+	a_out_p[110]  = a_in_p[920];
+	a_out_p[111]  = a_in_p[984];
+	a_out_p[112]  = a_in_p[56];
+	a_out_p[113]  = a_in_p[120];
+	a_out_p[114]  = a_in_p[184];
+	a_out_p[115]  = a_in_p[248];
+	a_out_p[116]  = a_in_p[312];
+	a_out_p[117]  = a_in_p[376];
+	a_out_p[118]  = a_in_p[440];
+	a_out_p[119]  = a_in_p[504];
+	a_out_p[120]  = a_in_p[568];
+	a_out_p[121]  = a_in_p[632];
+	a_out_p[122]  = a_in_p[696];
+	a_out_p[123]  = a_in_p[760];
+	a_out_p[124]  = a_in_p[824];
+	a_out_p[125]  = a_in_p[888];
+	a_out_p[126]  = a_in_p[952];
+	a_out_p[127]  = a_in_p[1016];
+	a_out_p[128]  = a_in_p[1];
+	a_out_p[129]  = a_in_p[65];
+	a_out_p[130]  = a_in_p[129];
+	a_out_p[131]  = a_in_p[193];
+	a_out_p[132]  = a_in_p[257];
+	a_out_p[133]  = a_in_p[321];
+	a_out_p[134]  = a_in_p[385];
+	a_out_p[135]  = a_in_p[449];
+	a_out_p[136]  = a_in_p[513];
+	a_out_p[137]  = a_in_p[577];
+	a_out_p[138]  = a_in_p[641];
+	a_out_p[139]  = a_in_p[705];
+	a_out_p[140]  = a_in_p[769];
+	a_out_p[141]  = a_in_p[833];
+	a_out_p[142]  = a_in_p[897];
+	a_out_p[143]  = a_in_p[961];
+	a_out_p[144]  = a_in_p[33];
+	a_out_p[145]  = a_in_p[97];
+	a_out_p[146]  = a_in_p[161];
+	a_out_p[147]  = a_in_p[225];
+	a_out_p[148]  = a_in_p[289];
+	a_out_p[149]  = a_in_p[353];
+	a_out_p[150]  = a_in_p[417];
+	a_out_p[151]  = a_in_p[481];
+	a_out_p[152]  = a_in_p[545];
+	a_out_p[153]  = a_in_p[609];
+	a_out_p[154]  = a_in_p[673];
+	a_out_p[155]  = a_in_p[737];
+	a_out_p[156]  = a_in_p[801];
+	a_out_p[157]  = a_in_p[865];
+	a_out_p[158]  = a_in_p[929];
+	a_out_p[159]  = a_in_p[993];
+	a_out_p[160]  = a_in_p[17];
+	a_out_p[161]  = a_in_p[81];
+	a_out_p[162]  = a_in_p[145];
+	a_out_p[163]  = a_in_p[209];
+	a_out_p[164]  = a_in_p[273];
+	a_out_p[165]  = a_in_p[337];
+	a_out_p[166]  = a_in_p[401];
+	a_out_p[167]  = a_in_p[465];
+	a_out_p[168]  = a_in_p[529];
+	a_out_p[169]  = a_in_p[593];
+	a_out_p[170]  = a_in_p[657];
+	a_out_p[171]  = a_in_p[721];
+	a_out_p[172]  = a_in_p[785];
+	a_out_p[173]  = a_in_p[849];
+	a_out_p[174]  = a_in_p[913];
+	a_out_p[175]  = a_in_p[977];
+	a_out_p[176]  = a_in_p[49];
+	a_out_p[177]  = a_in_p[113];
+	a_out_p[178]  = a_in_p[177];
+	a_out_p[179]  = a_in_p[241];
+	a_out_p[180]  = a_in_p[305];
+	a_out_p[181]  = a_in_p[369];
+	a_out_p[182]  = a_in_p[433];
+	a_out_p[183]  = a_in_p[497];
+	a_out_p[184]  = a_in_p[561];
+	a_out_p[185]  = a_in_p[625];
+	a_out_p[186]  = a_in_p[689];
+	a_out_p[187]  = a_in_p[753];
+	a_out_p[188]  = a_in_p[817];
+	a_out_p[189]  = a_in_p[881];
+	a_out_p[190]  = a_in_p[945];
+	a_out_p[191]  = a_in_p[1009];
+	a_out_p[192]  = a_in_p[9];
+	a_out_p[193]  = a_in_p[73];
+	a_out_p[194]  = a_in_p[137];
+	a_out_p[195]  = a_in_p[201];
+	a_out_p[196]  = a_in_p[265];
+	a_out_p[197]  = a_in_p[329];
+	a_out_p[198]  = a_in_p[393];
+	a_out_p[199]  = a_in_p[457];
+	a_out_p[200]  = a_in_p[521];
+	a_out_p[201]  = a_in_p[585];
+	a_out_p[202]  = a_in_p[649];
+	a_out_p[203]  = a_in_p[713];
+	a_out_p[204]  = a_in_p[777];
+	a_out_p[205]  = a_in_p[841];
+	a_out_p[206]  = a_in_p[905];
+	a_out_p[207]  = a_in_p[969];
+	a_out_p[208]  = a_in_p[41];
+	a_out_p[209]  = a_in_p[105];
+	a_out_p[210]  = a_in_p[169];
+	a_out_p[211]  = a_in_p[233];
+	a_out_p[212]  = a_in_p[297];
+	a_out_p[213]  = a_in_p[361];
+	a_out_p[214]  = a_in_p[425];
+	a_out_p[215]  = a_in_p[489];
+	a_out_p[216]  = a_in_p[553];
+	a_out_p[217]  = a_in_p[617];
+	a_out_p[218]  = a_in_p[681];
+	a_out_p[219]  = a_in_p[745];
+	a_out_p[220]  = a_in_p[809];
+	a_out_p[221]  = a_in_p[873];
+	a_out_p[222]  = a_in_p[937];
+	a_out_p[223]  = a_in_p[1001];
+	a_out_p[224]  = a_in_p[25];
+	a_out_p[225]  = a_in_p[89];
+	a_out_p[226]  = a_in_p[153];
+	a_out_p[227]  = a_in_p[217];
+	a_out_p[228]  = a_in_p[281];
+	a_out_p[229]  = a_in_p[345];
+	a_out_p[230]  = a_in_p[409];
+	a_out_p[231]  = a_in_p[473];
+	a_out_p[232]  = a_in_p[537];
+	a_out_p[233]  = a_in_p[601];
+	a_out_p[234]  = a_in_p[665];
+	a_out_p[235]  = a_in_p[729];
+	a_out_p[236]  = a_in_p[793];
+	a_out_p[237]  = a_in_p[857];
+	a_out_p[238]  = a_in_p[921];
+	a_out_p[239]  = a_in_p[985];
+	a_out_p[240]  = a_in_p[57];
+	a_out_p[241]  = a_in_p[121];
+	a_out_p[242]  = a_in_p[185];
+	a_out_p[243]  = a_in_p[249];
+	a_out_p[244]  = a_in_p[313];
+	a_out_p[245]  = a_in_p[377];
+	a_out_p[246]  = a_in_p[441];
+	a_out_p[247]  = a_in_p[505];
+	a_out_p[248]  = a_in_p[569];
+	a_out_p[249]  = a_in_p[633];
+	a_out_p[250]  = a_in_p[697];
+	a_out_p[251]  = a_in_p[761];
+	a_out_p[252]  = a_in_p[825];
+	a_out_p[253]  = a_in_p[889];
+	a_out_p[254]  = a_in_p[953];
+	a_out_p[255]  = a_in_p[1017];
+	a_out_p[256]  = a_in_p[2];
+	a_out_p[257]  = a_in_p[66];
+	a_out_p[258]  = a_in_p[130];
+	a_out_p[259]  = a_in_p[194];
+	a_out_p[260]  = a_in_p[258];
+	a_out_p[261]  = a_in_p[322];
+	a_out_p[262]  = a_in_p[386];
+	a_out_p[263]  = a_in_p[450];
+	a_out_p[264]  = a_in_p[514];
+	a_out_p[265]  = a_in_p[578];
+	a_out_p[266]  = a_in_p[642];
+	a_out_p[267]  = a_in_p[706];
+	a_out_p[268]  = a_in_p[770];
+	a_out_p[269]  = a_in_p[834];
+	a_out_p[270]  = a_in_p[898];
+	a_out_p[271]  = a_in_p[962];
+	a_out_p[272]  = a_in_p[34];
+	a_out_p[273]  = a_in_p[98];
+	a_out_p[274]  = a_in_p[162];
+	a_out_p[275]  = a_in_p[226];
+	a_out_p[276]  = a_in_p[290];
+	a_out_p[277]  = a_in_p[354];
+	a_out_p[278]  = a_in_p[418];
+	a_out_p[279]  = a_in_p[482];
+	a_out_p[280]  = a_in_p[546];
+	a_out_p[281]  = a_in_p[610];
+	a_out_p[282]  = a_in_p[674];
+	a_out_p[283]  = a_in_p[738];
+	a_out_p[284]  = a_in_p[802];
+	a_out_p[285]  = a_in_p[866];
+	a_out_p[286]  = a_in_p[930];
+	a_out_p[287]  = a_in_p[994];
+	a_out_p[288]  = a_in_p[18];
+	a_out_p[289]  = a_in_p[82];
+	a_out_p[290]  = a_in_p[146];
+	a_out_p[291]  = a_in_p[210];
+	a_out_p[292]  = a_in_p[274];
+	a_out_p[293]  = a_in_p[338];
+	a_out_p[294]  = a_in_p[402];
+	a_out_p[295]  = a_in_p[466];
+	a_out_p[296]  = a_in_p[530];
+	a_out_p[297]  = a_in_p[594];
+	a_out_p[298]  = a_in_p[658];
+	a_out_p[299]  = a_in_p[722];
+	a_out_p[300]  = a_in_p[786];
+	a_out_p[301]  = a_in_p[850];
+	a_out_p[302]  = a_in_p[914];
+	a_out_p[303]  = a_in_p[978];
+	a_out_p[304]  = a_in_p[50];
+	a_out_p[305]  = a_in_p[114];
+	a_out_p[306]  = a_in_p[178];
+	a_out_p[307]  = a_in_p[242];
+	a_out_p[308]  = a_in_p[306];
+	a_out_p[309]  = a_in_p[370];
+	a_out_p[310]  = a_in_p[434];
+	a_out_p[311]  = a_in_p[498];
+	a_out_p[312]  = a_in_p[562];
+	a_out_p[313]  = a_in_p[626];
+	a_out_p[314]  = a_in_p[690];
+	a_out_p[315]  = a_in_p[754];
+	a_out_p[316]  = a_in_p[818];
+	a_out_p[317]  = a_in_p[882];
+	a_out_p[318]  = a_in_p[946];
+	a_out_p[319]  = a_in_p[1010];
+	a_out_p[320]  = a_in_p[10];
+	a_out_p[321]  = a_in_p[74];
+	a_out_p[322]  = a_in_p[138];
+	a_out_p[323]  = a_in_p[202];
+	a_out_p[324]  = a_in_p[266];
+	a_out_p[325]  = a_in_p[330];
+	a_out_p[326]  = a_in_p[394];
+	a_out_p[327]  = a_in_p[458];
+	a_out_p[328]  = a_in_p[522];
+	a_out_p[329]  = a_in_p[586];
+	a_out_p[330]  = a_in_p[650];
+	a_out_p[331]  = a_in_p[714];
+	a_out_p[332]  = a_in_p[778];
+	a_out_p[333]  = a_in_p[842];
+	a_out_p[334]  = a_in_p[906];
+	a_out_p[335]  = a_in_p[970];
+	a_out_p[336]  = a_in_p[42];
+	a_out_p[337]  = a_in_p[106];
+	a_out_p[338]  = a_in_p[170];
+	a_out_p[339]  = a_in_p[234];
+	a_out_p[340]  = a_in_p[298];
+	a_out_p[341]  = a_in_p[362];
+	a_out_p[342]  = a_in_p[426];
+	a_out_p[343]  = a_in_p[490];
+	a_out_p[344]  = a_in_p[554];
+	a_out_p[345]  = a_in_p[618];
+	a_out_p[346]  = a_in_p[682];
+	a_out_p[347]  = a_in_p[746];
+	a_out_p[348]  = a_in_p[810];
+	a_out_p[349]  = a_in_p[874];
+	a_out_p[350]  = a_in_p[938];
+	a_out_p[351]  = a_in_p[1002];
+	a_out_p[352]  = a_in_p[26];
+	a_out_p[353]  = a_in_p[90];
+	a_out_p[354]  = a_in_p[154];
+	a_out_p[355]  = a_in_p[218];
+	a_out_p[356]  = a_in_p[282];
+	a_out_p[357]  = a_in_p[346];
+	a_out_p[358]  = a_in_p[410];
+	a_out_p[359]  = a_in_p[474];
+	a_out_p[360]  = a_in_p[538];
+	a_out_p[361]  = a_in_p[602];
+	a_out_p[362]  = a_in_p[666];
+	a_out_p[363]  = a_in_p[730];
+	a_out_p[364]  = a_in_p[794];
+	a_out_p[365]  = a_in_p[858];
+	a_out_p[366]  = a_in_p[922];
+	a_out_p[367]  = a_in_p[986];
+	a_out_p[368]  = a_in_p[58];
+	a_out_p[369]  = a_in_p[122];
+	a_out_p[370]  = a_in_p[186];
+	a_out_p[371]  = a_in_p[250];
+	a_out_p[372]  = a_in_p[314];
+	a_out_p[373]  = a_in_p[378];
+	a_out_p[374]  = a_in_p[442];
+	a_out_p[375]  = a_in_p[506];
+	a_out_p[376]  = a_in_p[570];
+	a_out_p[377]  = a_in_p[634];
+	a_out_p[378]  = a_in_p[698];
+	a_out_p[379]  = a_in_p[762];
+	a_out_p[380]  = a_in_p[826];
+	a_out_p[381]  = a_in_p[890];
+	a_out_p[382]  = a_in_p[954];
+	a_out_p[383]  = a_in_p[1018];
+	a_out_p[384]  = a_in_p[3];
+	a_out_p[385]  = a_in_p[67];
+	a_out_p[386]  = a_in_p[131];
+	a_out_p[387]  = a_in_p[195];
+	a_out_p[388]  = a_in_p[259];
+	a_out_p[389]  = a_in_p[323];
+	a_out_p[390]  = a_in_p[387];
+	a_out_p[391]  = a_in_p[451];
+	a_out_p[392]  = a_in_p[515];
+	a_out_p[393]  = a_in_p[579];
+	a_out_p[394]  = a_in_p[643];
+	a_out_p[395]  = a_in_p[707];
+	a_out_p[396]  = a_in_p[771];
+	a_out_p[397]  = a_in_p[835];
+	a_out_p[398]  = a_in_p[899];
+	a_out_p[399]  = a_in_p[963];
+	a_out_p[400]  = a_in_p[35];
+	a_out_p[401]  = a_in_p[99];
+	a_out_p[402]  = a_in_p[163];
+	a_out_p[403]  = a_in_p[227];
+	a_out_p[404]  = a_in_p[291];
+	a_out_p[405]  = a_in_p[355];
+	a_out_p[406]  = a_in_p[419];
+	a_out_p[407]  = a_in_p[483];
+	a_out_p[408]  = a_in_p[547];
+	a_out_p[409]  = a_in_p[611];
+	a_out_p[410]  = a_in_p[675];
+	a_out_p[411]  = a_in_p[739];
+	a_out_p[412]  = a_in_p[803];
+	a_out_p[413]  = a_in_p[867];
+	a_out_p[414]  = a_in_p[931];
+	a_out_p[415]  = a_in_p[995];
+	a_out_p[416]  = a_in_p[19];
+	a_out_p[417]  = a_in_p[83];
+	a_out_p[418]  = a_in_p[147];
+	a_out_p[419]  = a_in_p[211];
+	a_out_p[420]  = a_in_p[275];
+	a_out_p[421]  = a_in_p[339];
+	a_out_p[422]  = a_in_p[403];
+	a_out_p[423]  = a_in_p[467];
+	a_out_p[424]  = a_in_p[531];
+	a_out_p[425]  = a_in_p[595];
+	a_out_p[426]  = a_in_p[659];
+	a_out_p[427]  = a_in_p[723];
+	a_out_p[428]  = a_in_p[787];
+	a_out_p[429]  = a_in_p[851];
+	a_out_p[430]  = a_in_p[915];
+	a_out_p[431]  = a_in_p[979];
+	a_out_p[432]  = a_in_p[51];
+	a_out_p[433]  = a_in_p[115];
+	a_out_p[434]  = a_in_p[179];
+	a_out_p[435]  = a_in_p[243];
+	a_out_p[436]  = a_in_p[307];
+	a_out_p[437]  = a_in_p[371];
+	a_out_p[438]  = a_in_p[435];
+	a_out_p[439]  = a_in_p[499];
+	a_out_p[440]  = a_in_p[563];
+	a_out_p[441]  = a_in_p[627];
+	a_out_p[442]  = a_in_p[691];
+	a_out_p[443]  = a_in_p[755];
+	a_out_p[444]  = a_in_p[819];
+	a_out_p[445]  = a_in_p[883];
+	a_out_p[446]  = a_in_p[947];
+	a_out_p[447]  = a_in_p[1011];
+	a_out_p[448]  = a_in_p[11];
+	a_out_p[449]  = a_in_p[75];
+	a_out_p[450]  = a_in_p[139];
+	a_out_p[451]  = a_in_p[203];
+	a_out_p[452]  = a_in_p[267];
+	a_out_p[453]  = a_in_p[331];
+	a_out_p[454]  = a_in_p[395];
+	a_out_p[455]  = a_in_p[459];
+	a_out_p[456]  = a_in_p[523];
+	a_out_p[457]  = a_in_p[587];
+	a_out_p[458]  = a_in_p[651];
+	a_out_p[459]  = a_in_p[715];
+	a_out_p[460]  = a_in_p[779];
+	a_out_p[461]  = a_in_p[843];
+	a_out_p[462]  = a_in_p[907];
+	a_out_p[463]  = a_in_p[971];
+	a_out_p[464]  = a_in_p[43];
+	a_out_p[465]  = a_in_p[107];
+	a_out_p[466]  = a_in_p[171];
+	a_out_p[467]  = a_in_p[235];
+	a_out_p[468]  = a_in_p[299];
+	a_out_p[469]  = a_in_p[363];
+	a_out_p[470]  = a_in_p[427];
+	a_out_p[471]  = a_in_p[491];
+	a_out_p[472]  = a_in_p[555];
+	a_out_p[473]  = a_in_p[619];
+	a_out_p[474]  = a_in_p[683];
+	a_out_p[475]  = a_in_p[747];
+	a_out_p[476]  = a_in_p[811];
+	a_out_p[477]  = a_in_p[875];
+	a_out_p[478]  = a_in_p[939];
+	a_out_p[479]  = a_in_p[1003];
+	a_out_p[480]  = a_in_p[27];
+	a_out_p[481]  = a_in_p[91];
+	a_out_p[482]  = a_in_p[155];
+	a_out_p[483]  = a_in_p[219];
+	a_out_p[484]  = a_in_p[283];
+	a_out_p[485]  = a_in_p[347];
+	a_out_p[486]  = a_in_p[411];
+	a_out_p[487]  = a_in_p[475];
+	a_out_p[488]  = a_in_p[539];
+	a_out_p[489]  = a_in_p[603];
+	a_out_p[490]  = a_in_p[667];
+	a_out_p[491]  = a_in_p[731];
+	a_out_p[492]  = a_in_p[795];
+	a_out_p[493]  = a_in_p[859];
+	a_out_p[494]  = a_in_p[923];
+	a_out_p[495]  = a_in_p[987];
+	a_out_p[496]  = a_in_p[59];
+	a_out_p[497]  = a_in_p[123];
+	a_out_p[498]  = a_in_p[187];
+	a_out_p[499]  = a_in_p[251];
+	a_out_p[500]  = a_in_p[315];
+	a_out_p[501]  = a_in_p[379];
+	a_out_p[502]  = a_in_p[443];
+	a_out_p[503]  = a_in_p[507];
+	a_out_p[504]  = a_in_p[571];
+	a_out_p[505]  = a_in_p[635];
+	a_out_p[506]  = a_in_p[699];
+	a_out_p[507]  = a_in_p[763];
+	a_out_p[508]  = a_in_p[827];
+	a_out_p[509]  = a_in_p[891];
+	a_out_p[510]  = a_in_p[955];
+	a_out_p[511]  = a_in_p[1019];
+	a_out_p[512]  = a_in_p[4];
+	a_out_p[513]  = a_in_p[68];
+	a_out_p[514]  = a_in_p[132];
+	a_out_p[515]  = a_in_p[196];
+	a_out_p[516]  = a_in_p[260];
+	a_out_p[517]  = a_in_p[324];
+	a_out_p[518]  = a_in_p[388];
+	a_out_p[519]  = a_in_p[452];
+	a_out_p[520]  = a_in_p[516];
+	a_out_p[521]  = a_in_p[580];
+	a_out_p[522]  = a_in_p[644];
+	a_out_p[523]  = a_in_p[708];
+	a_out_p[524]  = a_in_p[772];
+	a_out_p[525]  = a_in_p[836];
+	a_out_p[526]  = a_in_p[900];
+	a_out_p[527]  = a_in_p[964];
+	a_out_p[528]  = a_in_p[36];
+	a_out_p[529]  = a_in_p[100];
+	a_out_p[530]  = a_in_p[164];
+	a_out_p[531]  = a_in_p[228];
+	a_out_p[532]  = a_in_p[292];
+	a_out_p[533]  = a_in_p[356];
+	a_out_p[534]  = a_in_p[420];
+	a_out_p[535]  = a_in_p[484];
+	a_out_p[536]  = a_in_p[548];
+	a_out_p[537]  = a_in_p[612];
+	a_out_p[538]  = a_in_p[676];
+	a_out_p[539]  = a_in_p[740];
+	a_out_p[540]  = a_in_p[804];
+	a_out_p[541]  = a_in_p[868];
+	a_out_p[542]  = a_in_p[932];
+	a_out_p[543]  = a_in_p[996];
+	a_out_p[544]  = a_in_p[20];
+	a_out_p[545]  = a_in_p[84];
+	a_out_p[546]  = a_in_p[148];
+	a_out_p[547]  = a_in_p[212];
+	a_out_p[548]  = a_in_p[276];
+	a_out_p[549]  = a_in_p[340];
+	a_out_p[550]  = a_in_p[404];
+	a_out_p[551]  = a_in_p[468];
+	a_out_p[552]  = a_in_p[532];
+	a_out_p[553]  = a_in_p[596];
+	a_out_p[554]  = a_in_p[660];
+	a_out_p[555]  = a_in_p[724];
+	a_out_p[556]  = a_in_p[788];
+	a_out_p[557]  = a_in_p[852];
+	a_out_p[558]  = a_in_p[916];
+	a_out_p[559]  = a_in_p[980];
+	a_out_p[560]  = a_in_p[52];
+	a_out_p[561]  = a_in_p[116];
+	a_out_p[562]  = a_in_p[180];
+	a_out_p[563]  = a_in_p[244];
+	a_out_p[564]  = a_in_p[308];
+	a_out_p[565]  = a_in_p[372];
+	a_out_p[566]  = a_in_p[436];
+	a_out_p[567]  = a_in_p[500];
+	a_out_p[568]  = a_in_p[564];
+	a_out_p[569]  = a_in_p[628];
+	a_out_p[570]  = a_in_p[692];
+	a_out_p[571]  = a_in_p[756];
+	a_out_p[572]  = a_in_p[820];
+	a_out_p[573]  = a_in_p[884];
+	a_out_p[574]  = a_in_p[948];
+	a_out_p[575]  = a_in_p[1012];
+	a_out_p[576]  = a_in_p[12];
+	a_out_p[577]  = a_in_p[76];
+	a_out_p[578]  = a_in_p[140];
+	a_out_p[579]  = a_in_p[204];
+	a_out_p[580]  = a_in_p[268];
+	a_out_p[581]  = a_in_p[332];
+	a_out_p[582]  = a_in_p[396];
+	a_out_p[583]  = a_in_p[460];
+	a_out_p[584]  = a_in_p[524];
+	a_out_p[585]  = a_in_p[588];
+	a_out_p[586]  = a_in_p[652];
+	a_out_p[587]  = a_in_p[716];
+	a_out_p[588]  = a_in_p[780];
+	a_out_p[589]  = a_in_p[844];
+	a_out_p[590]  = a_in_p[908];
+	a_out_p[591]  = a_in_p[972];
+	a_out_p[592]  = a_in_p[44];
+	a_out_p[593]  = a_in_p[108];
+	a_out_p[594]  = a_in_p[172];
+	a_out_p[595]  = a_in_p[236];
+	a_out_p[596]  = a_in_p[300];
+	a_out_p[597]  = a_in_p[364];
+	a_out_p[598]  = a_in_p[428];
+	a_out_p[599]  = a_in_p[492];
+	a_out_p[600]  = a_in_p[556];
+	a_out_p[601]  = a_in_p[620];
+	a_out_p[602]  = a_in_p[684];
+	a_out_p[603]  = a_in_p[748];
+	a_out_p[604]  = a_in_p[812];
+	a_out_p[605]  = a_in_p[876];
+	a_out_p[606]  = a_in_p[940];
+	a_out_p[607]  = a_in_p[1004];
+	a_out_p[608]  = a_in_p[28];
+	a_out_p[609]  = a_in_p[92];
+	a_out_p[610]  = a_in_p[156];
+	a_out_p[611]  = a_in_p[220];
+	a_out_p[612]  = a_in_p[284];
+	a_out_p[613]  = a_in_p[348];
+	a_out_p[614]  = a_in_p[412];
+	a_out_p[615]  = a_in_p[476];
+	a_out_p[616]  = a_in_p[540];
+	a_out_p[617]  = a_in_p[604];
+	a_out_p[618]  = a_in_p[668];
+	a_out_p[619]  = a_in_p[732];
+	a_out_p[620]  = a_in_p[796];
+	a_out_p[621]  = a_in_p[860];
+	a_out_p[622]  = a_in_p[924];
+	a_out_p[623]  = a_in_p[988];
+	a_out_p[624]  = a_in_p[60];
+	a_out_p[625]  = a_in_p[124];
+	a_out_p[626]  = a_in_p[188];
+	a_out_p[627]  = a_in_p[252];
+	a_out_p[628]  = a_in_p[316];
+	a_out_p[629]  = a_in_p[380];
+	a_out_p[630]  = a_in_p[444];
+	a_out_p[631]  = a_in_p[508];
+	a_out_p[632]  = a_in_p[572];
+	a_out_p[633]  = a_in_p[636];
+	a_out_p[634]  = a_in_p[700];
+	a_out_p[635]  = a_in_p[764];
+	a_out_p[636]  = a_in_p[828];
+	a_out_p[637]  = a_in_p[892];
+	a_out_p[638]  = a_in_p[956];
+	a_out_p[639]  = a_in_p[1020];
+	a_out_p[640]  = a_in_p[5];
+	a_out_p[641]  = a_in_p[69];
+	a_out_p[642]  = a_in_p[133];
+	a_out_p[643]  = a_in_p[197];
+	a_out_p[644]  = a_in_p[261];
+	a_out_p[645]  = a_in_p[325];
+	a_out_p[646]  = a_in_p[389];
+	a_out_p[647]  = a_in_p[453];
+	a_out_p[648]  = a_in_p[517];
+	a_out_p[649]  = a_in_p[581];
+	a_out_p[650]  = a_in_p[645];
+	a_out_p[651]  = a_in_p[709];
+	a_out_p[652]  = a_in_p[773];
+	a_out_p[653]  = a_in_p[837];
+	a_out_p[654]  = a_in_p[901];
+	a_out_p[655]  = a_in_p[965];
+	a_out_p[656]  = a_in_p[37];
+	a_out_p[657]  = a_in_p[101];
+	a_out_p[658]  = a_in_p[165];
+	a_out_p[659]  = a_in_p[229];
+	a_out_p[660]  = a_in_p[293];
+	a_out_p[661]  = a_in_p[357];
+	a_out_p[662]  = a_in_p[421];
+	a_out_p[663]  = a_in_p[485];
+	a_out_p[664]  = a_in_p[549];
+	a_out_p[665]  = a_in_p[613];
+	a_out_p[666]  = a_in_p[677];
+	a_out_p[667]  = a_in_p[741];
+	a_out_p[668]  = a_in_p[805];
+	a_out_p[669]  = a_in_p[869];
+	a_out_p[670]  = a_in_p[933];
+	a_out_p[671]  = a_in_p[997];
+	a_out_p[672]  = a_in_p[21];
+	a_out_p[673]  = a_in_p[85];
+	a_out_p[674]  = a_in_p[149];
+	a_out_p[675]  = a_in_p[213];
+	a_out_p[676]  = a_in_p[277];
+	a_out_p[677]  = a_in_p[341];
+	a_out_p[678]  = a_in_p[405];
+	a_out_p[679]  = a_in_p[469];
+	a_out_p[680]  = a_in_p[533];
+	a_out_p[681]  = a_in_p[597];
+	a_out_p[682]  = a_in_p[661];
+	a_out_p[683]  = a_in_p[725];
+	a_out_p[684]  = a_in_p[789];
+	a_out_p[685]  = a_in_p[853];
+	a_out_p[686]  = a_in_p[917];
+	a_out_p[687]  = a_in_p[981];
+	a_out_p[688]  = a_in_p[53];
+	a_out_p[689]  = a_in_p[117];
+	a_out_p[690]  = a_in_p[181];
+	a_out_p[691]  = a_in_p[245];
+	a_out_p[692]  = a_in_p[309];
+	a_out_p[693]  = a_in_p[373];
+	a_out_p[694]  = a_in_p[437];
+	a_out_p[695]  = a_in_p[501];
+	a_out_p[696]  = a_in_p[565];
+	a_out_p[697]  = a_in_p[629];
+	a_out_p[698]  = a_in_p[693];
+	a_out_p[699]  = a_in_p[757];
+	a_out_p[700]  = a_in_p[821];
+	a_out_p[701]  = a_in_p[885];
+	a_out_p[702]  = a_in_p[949];
+	a_out_p[703]  = a_in_p[1013];
+	a_out_p[704]  = a_in_p[13];
+	a_out_p[705]  = a_in_p[77];
+	a_out_p[706]  = a_in_p[141];
+	a_out_p[707]  = a_in_p[205];
+	a_out_p[708]  = a_in_p[269];
+	a_out_p[709]  = a_in_p[333];
+	a_out_p[710]  = a_in_p[397];
+	a_out_p[711]  = a_in_p[461];
+	a_out_p[712]  = a_in_p[525];
+	a_out_p[713]  = a_in_p[589];
+	a_out_p[714]  = a_in_p[653];
+	a_out_p[715]  = a_in_p[717];
+	a_out_p[716]  = a_in_p[781];
+	a_out_p[717]  = a_in_p[845];
+	a_out_p[718]  = a_in_p[909];
+	a_out_p[719]  = a_in_p[973];
+	a_out_p[720]  = a_in_p[45];
+	a_out_p[721]  = a_in_p[109];
+	a_out_p[722]  = a_in_p[173];
+	a_out_p[723]  = a_in_p[237];
+	a_out_p[724]  = a_in_p[301];
+	a_out_p[725]  = a_in_p[365];
+	a_out_p[726]  = a_in_p[429];
+	a_out_p[727]  = a_in_p[493];
+	a_out_p[728]  = a_in_p[557];
+	a_out_p[729]  = a_in_p[621];
+	a_out_p[730]  = a_in_p[685];
+	a_out_p[731]  = a_in_p[749];
+	a_out_p[732]  = a_in_p[813];
+	a_out_p[733]  = a_in_p[877];
+	a_out_p[734]  = a_in_p[941];
+	a_out_p[735]  = a_in_p[1005];
+	a_out_p[736]  = a_in_p[29];
+	a_out_p[737]  = a_in_p[93];
+	a_out_p[738]  = a_in_p[157];
+	a_out_p[739]  = a_in_p[221];
+	a_out_p[740]  = a_in_p[285];
+	a_out_p[741]  = a_in_p[349];
+	a_out_p[742]  = a_in_p[413];
+	a_out_p[743]  = a_in_p[477];
+	a_out_p[744]  = a_in_p[541];
+	a_out_p[745]  = a_in_p[605];
+	a_out_p[746]  = a_in_p[669];
+	a_out_p[747]  = a_in_p[733];
+	a_out_p[748]  = a_in_p[797];
+	a_out_p[749]  = a_in_p[861];
+	a_out_p[750]  = a_in_p[925];
+	a_out_p[751]  = a_in_p[989];
+	a_out_p[752]  = a_in_p[61];
+	a_out_p[753]  = a_in_p[125];
+	a_out_p[754]  = a_in_p[189];
+	a_out_p[755]  = a_in_p[253];
+	a_out_p[756]  = a_in_p[317];
+	a_out_p[757]  = a_in_p[381];
+	a_out_p[758]  = a_in_p[445];
+	a_out_p[759]  = a_in_p[509];
+	a_out_p[760]  = a_in_p[573];
+	a_out_p[761]  = a_in_p[637];
+	a_out_p[762]  = a_in_p[701];
+	a_out_p[763]  = a_in_p[765];
+	a_out_p[764]  = a_in_p[829];
+	a_out_p[765]  = a_in_p[893];
+	a_out_p[766]  = a_in_p[957];
+	a_out_p[767]  = a_in_p[1021];
+	a_out_p[768]  = a_in_p[6];
+	a_out_p[769]  = a_in_p[70];
+	a_out_p[770]  = a_in_p[134];
+	a_out_p[771]  = a_in_p[198];
+	a_out_p[772]  = a_in_p[262];
+	a_out_p[773]  = a_in_p[326];
+	a_out_p[774]  = a_in_p[390];
+	a_out_p[775]  = a_in_p[454];
+	a_out_p[776]  = a_in_p[518];
+	a_out_p[777]  = a_in_p[582];
+	a_out_p[778]  = a_in_p[646];
+	a_out_p[779]  = a_in_p[710];
+	a_out_p[780]  = a_in_p[774];
+	a_out_p[781]  = a_in_p[838];
+	a_out_p[782]  = a_in_p[902];
+	a_out_p[783]  = a_in_p[966];
+	a_out_p[784]  = a_in_p[38];
+	a_out_p[785]  = a_in_p[102];
+	a_out_p[786]  = a_in_p[166];
+	a_out_p[787]  = a_in_p[230];
+	a_out_p[788]  = a_in_p[294];
+	a_out_p[789]  = a_in_p[358];
+	a_out_p[790]  = a_in_p[422];
+	a_out_p[791]  = a_in_p[486];
+	a_out_p[792]  = a_in_p[550];
+	a_out_p[793]  = a_in_p[614];
+	a_out_p[794]  = a_in_p[678];
+	a_out_p[795]  = a_in_p[742];
+	a_out_p[796]  = a_in_p[806];
+	a_out_p[797]  = a_in_p[870];
+	a_out_p[798]  = a_in_p[934];
+	a_out_p[799]  = a_in_p[998];
+	a_out_p[800]  = a_in_p[22];
+	a_out_p[801]  = a_in_p[86];
+	a_out_p[802]  = a_in_p[150];
+	a_out_p[803]  = a_in_p[214];
+	a_out_p[804]  = a_in_p[278];
+	a_out_p[805]  = a_in_p[342];
+	a_out_p[806]  = a_in_p[406];
+	a_out_p[807]  = a_in_p[470];
+	a_out_p[808]  = a_in_p[534];
+	a_out_p[809]  = a_in_p[598];
+	a_out_p[810]  = a_in_p[662];
+	a_out_p[811]  = a_in_p[726];
+	a_out_p[812]  = a_in_p[790];
+	a_out_p[813]  = a_in_p[854];
+	a_out_p[814]  = a_in_p[918];
+	a_out_p[815]  = a_in_p[982];
+	a_out_p[816]  = a_in_p[54];
+	a_out_p[817]  = a_in_p[118];
+	a_out_p[818]  = a_in_p[182];
+	a_out_p[819]  = a_in_p[246];
+	a_out_p[820]  = a_in_p[310];
+	a_out_p[821]  = a_in_p[374];
+	a_out_p[822]  = a_in_p[438];
+	a_out_p[823]  = a_in_p[502];
+	a_out_p[824]  = a_in_p[566];
+	a_out_p[825]  = a_in_p[630];
+	a_out_p[826]  = a_in_p[694];
+	a_out_p[827]  = a_in_p[758];
+	a_out_p[828]  = a_in_p[822];
+	a_out_p[829]  = a_in_p[886];
+	a_out_p[830]  = a_in_p[950];
+	a_out_p[831]  = a_in_p[1014];
+	a_out_p[832]  = a_in_p[14];
+	a_out_p[833]  = a_in_p[78];
+	a_out_p[834]  = a_in_p[142];
+	a_out_p[835]  = a_in_p[206];
+	a_out_p[836]  = a_in_p[270];
+	a_out_p[837]  = a_in_p[334];
+	a_out_p[838]  = a_in_p[398];
+	a_out_p[839]  = a_in_p[462];
+	a_out_p[840]  = a_in_p[526];
+	a_out_p[841]  = a_in_p[590];
+	a_out_p[842]  = a_in_p[654];
+	a_out_p[843]  = a_in_p[718];
+	a_out_p[844]  = a_in_p[782];
+	a_out_p[845]  = a_in_p[846];
+	a_out_p[846]  = a_in_p[910];
+	a_out_p[847]  = a_in_p[974];
+	a_out_p[848]  = a_in_p[46];
+	a_out_p[849]  = a_in_p[110];
+	a_out_p[850]  = a_in_p[174];
+	a_out_p[851]  = a_in_p[238];
+	a_out_p[852]  = a_in_p[302];
+	a_out_p[853]  = a_in_p[366];
+	a_out_p[854]  = a_in_p[430];
+	a_out_p[855]  = a_in_p[494];
+	a_out_p[856]  = a_in_p[558];
+	a_out_p[857]  = a_in_p[622];
+	a_out_p[858]  = a_in_p[686];
+	a_out_p[859]  = a_in_p[750];
+	a_out_p[860]  = a_in_p[814];
+	a_out_p[861]  = a_in_p[878];
+	a_out_p[862]  = a_in_p[942];
+	a_out_p[863]  = a_in_p[1006];
+	a_out_p[864]  = a_in_p[30];
+	a_out_p[865]  = a_in_p[94];
+	a_out_p[866]  = a_in_p[158];
+	a_out_p[867]  = a_in_p[222];
+	a_out_p[868]  = a_in_p[286];
+	a_out_p[869]  = a_in_p[350];
+	a_out_p[870]  = a_in_p[414];
+	a_out_p[871]  = a_in_p[478];
+	a_out_p[872]  = a_in_p[542];
+	a_out_p[873]  = a_in_p[606];
+	a_out_p[874]  = a_in_p[670];
+	a_out_p[875]  = a_in_p[734];
+	a_out_p[876]  = a_in_p[798];
+	a_out_p[877]  = a_in_p[862];
+	a_out_p[878]  = a_in_p[926];
+	a_out_p[879]  = a_in_p[990];
+	a_out_p[880]  = a_in_p[62];
+	a_out_p[881]  = a_in_p[126];
+	a_out_p[882]  = a_in_p[190];
+	a_out_p[883]  = a_in_p[254];
+	a_out_p[884]  = a_in_p[318];
+	a_out_p[885]  = a_in_p[382];
+	a_out_p[886]  = a_in_p[446];
+	a_out_p[887]  = a_in_p[510];
+	a_out_p[888]  = a_in_p[574];
+	a_out_p[889]  = a_in_p[638];
+	a_out_p[890]  = a_in_p[702];
+	a_out_p[891]  = a_in_p[766];
+	a_out_p[892]  = a_in_p[830];
+	a_out_p[893]  = a_in_p[894];
+	a_out_p[894]  = a_in_p[958];
+	a_out_p[895]  = a_in_p[1022];
+	a_out_p[896]  = a_in_p[7];
+	a_out_p[897]  = a_in_p[71];
+	a_out_p[898]  = a_in_p[135];
+	a_out_p[899]  = a_in_p[199];
+	a_out_p[900]  = a_in_p[263];
+	a_out_p[901]  = a_in_p[327];
+	a_out_p[902]  = a_in_p[391];
+	a_out_p[903]  = a_in_p[455];
+	a_out_p[904]  = a_in_p[519];
+	a_out_p[905]  = a_in_p[583];
+	a_out_p[906]  = a_in_p[647];
+	a_out_p[907]  = a_in_p[711];
+	a_out_p[908]  = a_in_p[775];
+	a_out_p[909]  = a_in_p[839];
+	a_out_p[910]  = a_in_p[903];
+	a_out_p[911]  = a_in_p[967];
+	a_out_p[912]  = a_in_p[39];
+	a_out_p[913]  = a_in_p[103];
+	a_out_p[914]  = a_in_p[167];
+	a_out_p[915]  = a_in_p[231];
+	a_out_p[916]  = a_in_p[295];
+	a_out_p[917]  = a_in_p[359];
+	a_out_p[918]  = a_in_p[423];
+	a_out_p[919]  = a_in_p[487];
+	a_out_p[920]  = a_in_p[551];
+	a_out_p[921]  = a_in_p[615];
+	a_out_p[922]  = a_in_p[679];
+	a_out_p[923]  = a_in_p[743];
+	a_out_p[924]  = a_in_p[807];
+	a_out_p[925]  = a_in_p[871];
+	a_out_p[926]  = a_in_p[935];
+	a_out_p[927]  = a_in_p[999];
+	a_out_p[928]  = a_in_p[23];
+	a_out_p[929]  = a_in_p[87];
+	a_out_p[930]  = a_in_p[151];
+	a_out_p[931]  = a_in_p[215];
+	a_out_p[932]  = a_in_p[279];
+	a_out_p[933]  = a_in_p[343];
+	a_out_p[934]  = a_in_p[407];
+	a_out_p[935]  = a_in_p[471];
+	a_out_p[936]  = a_in_p[535];
+	a_out_p[937]  = a_in_p[599];
+	a_out_p[938]  = a_in_p[663];
+	a_out_p[939]  = a_in_p[727];
+	a_out_p[940]  = a_in_p[791];
+	a_out_p[941]  = a_in_p[855];
+	a_out_p[942]  = a_in_p[919];
+	a_out_p[943]  = a_in_p[983];
+	a_out_p[944]  = a_in_p[55];
+	a_out_p[945]  = a_in_p[119];
+	a_out_p[946]  = a_in_p[183];
+	a_out_p[947]  = a_in_p[247];
+	a_out_p[948]  = a_in_p[311];
+	a_out_p[949]  = a_in_p[375];
+	a_out_p[950]  = a_in_p[439];
+	a_out_p[951]  = a_in_p[503];
+	a_out_p[952]  = a_in_p[567];
+	a_out_p[953]  = a_in_p[631];
+	a_out_p[954]  = a_in_p[695];
+	a_out_p[955]  = a_in_p[759];
+	a_out_p[956]  = a_in_p[823];
+	a_out_p[957]  = a_in_p[887];
+	a_out_p[958]  = a_in_p[951];
+	a_out_p[959]  = a_in_p[1015];
+	a_out_p[960]  = a_in_p[15];
+	a_out_p[961]  = a_in_p[79];
+	a_out_p[962]  = a_in_p[143];
+	a_out_p[963]  = a_in_p[207];
+	a_out_p[964]  = a_in_p[271];
+	a_out_p[965]  = a_in_p[335];
+	a_out_p[966]  = a_in_p[399];
+	a_out_p[967]  = a_in_p[463];
+	a_out_p[968]  = a_in_p[527];
+	a_out_p[969]  = a_in_p[591];
+	a_out_p[970]  = a_in_p[655];
+	a_out_p[971]  = a_in_p[719];
+	a_out_p[972]  = a_in_p[783];
+	a_out_p[973]  = a_in_p[847];
+	a_out_p[974]  = a_in_p[911];
+	a_out_p[975]  = a_in_p[975];
+	a_out_p[976]  = a_in_p[47];
+	a_out_p[977]  = a_in_p[111];
+	a_out_p[978]  = a_in_p[175];
+	a_out_p[979]  = a_in_p[239];
+	a_out_p[980]  = a_in_p[303];
+	a_out_p[981]  = a_in_p[367];
+	a_out_p[982]  = a_in_p[431];
+	a_out_p[983]  = a_in_p[495];
+	a_out_p[984]  = a_in_p[559];
+	a_out_p[985]  = a_in_p[623];
+	a_out_p[986]  = a_in_p[687];
+	a_out_p[987]  = a_in_p[751];
+	a_out_p[988]  = a_in_p[815];
+	a_out_p[989]  = a_in_p[879];
+	a_out_p[990]  = a_in_p[943];
+	a_out_p[991]  = a_in_p[1007];
+	a_out_p[992]  = a_in_p[31];
+	a_out_p[993]  = a_in_p[95];
+	a_out_p[994]  = a_in_p[159];
+	a_out_p[995]  = a_in_p[223];
+	a_out_p[996]  = a_in_p[287];
+	a_out_p[997]  = a_in_p[351];
+	a_out_p[998]  = a_in_p[415];
+	a_out_p[999]  = a_in_p[479];
+	a_out_p[1000] = a_in_p[543];
+	a_out_p[1001] = a_in_p[607];
+	a_out_p[1002] = a_in_p[671];
+	a_out_p[1003] = a_in_p[735];
+	a_out_p[1004] = a_in_p[799];
+	a_out_p[1005] = a_in_p[863];
+	a_out_p[1006] = a_in_p[927];
+	a_out_p[1007] = a_in_p[991];
+	a_out_p[1008] = a_in_p[63];
+	a_out_p[1009] = a_in_p[127];
+	a_out_p[1010] = a_in_p[191];
+	a_out_p[1011] = a_in_p[255];
+	a_out_p[1012] = a_in_p[319];
+	a_out_p[1013] = a_in_p[383];
+	a_out_p[1014] = a_in_p[447];
+	a_out_p[1015] = a_in_p[511];
+	a_out_p[1016] = a_in_p[575];
+	a_out_p[1017] = a_in_p[639];
+	a_out_p[1018] = a_in_p[703];
+	a_out_p[1019] = a_in_p[767];
+	a_out_p[1020] = a_in_p[831];
+	a_out_p[1021] = a_in_p[895];
+	a_out_p[1022] = a_in_p[959];
+	a_out_p[1023] = a_in_p[1023];
+}
+void transpose_o(const uint8_t* __restrict a_in_p, uint8_t* __restrict a_out_p) {
+	a_out_p[0]    = a_in_p[0];
+	a_out_p[128]  = a_in_p[1];
+	a_out_p[256]  = a_in_p[2];
+	a_out_p[384]  = a_in_p[3];
+	a_out_p[512]  = a_in_p[4];
+	a_out_p[640]  = a_in_p[5];
+	a_out_p[768]  = a_in_p[6];
+	a_out_p[896]  = a_in_p[7];
+	a_out_p[64]   = a_in_p[8];
+	a_out_p[192]  = a_in_p[9];
+	a_out_p[320]  = a_in_p[10];
+	a_out_p[448]  = a_in_p[11];
+	a_out_p[576]  = a_in_p[12];
+	a_out_p[704]  = a_in_p[13];
+	a_out_p[832]  = a_in_p[14];
+	a_out_p[960]  = a_in_p[15];
+	a_out_p[32]   = a_in_p[16];
+	a_out_p[160]  = a_in_p[17];
+	a_out_p[288]  = a_in_p[18];
+	a_out_p[416]  = a_in_p[19];
+	a_out_p[544]  = a_in_p[20];
+	a_out_p[672]  = a_in_p[21];
+	a_out_p[800]  = a_in_p[22];
+	a_out_p[928]  = a_in_p[23];
+	a_out_p[96]   = a_in_p[24];
+	a_out_p[224]  = a_in_p[25];
+	a_out_p[352]  = a_in_p[26];
+	a_out_p[480]  = a_in_p[27];
+	a_out_p[608]  = a_in_p[28];
+	a_out_p[736]  = a_in_p[29];
+	a_out_p[864]  = a_in_p[30];
+	a_out_p[992]  = a_in_p[31];
+	a_out_p[16]   = a_in_p[32];
+	a_out_p[144]  = a_in_p[33];
+	a_out_p[272]  = a_in_p[34];
+	a_out_p[400]  = a_in_p[35];
+	a_out_p[528]  = a_in_p[36];
+	a_out_p[656]  = a_in_p[37];
+	a_out_p[784]  = a_in_p[38];
+	a_out_p[912]  = a_in_p[39];
+	a_out_p[80]   = a_in_p[40];
+	a_out_p[208]  = a_in_p[41];
+	a_out_p[336]  = a_in_p[42];
+	a_out_p[464]  = a_in_p[43];
+	a_out_p[592]  = a_in_p[44];
+	a_out_p[720]  = a_in_p[45];
+	a_out_p[848]  = a_in_p[46];
+	a_out_p[976]  = a_in_p[47];
+	a_out_p[48]   = a_in_p[48];
+	a_out_p[176]  = a_in_p[49];
+	a_out_p[304]  = a_in_p[50];
+	a_out_p[432]  = a_in_p[51];
+	a_out_p[560]  = a_in_p[52];
+	a_out_p[688]  = a_in_p[53];
+	a_out_p[816]  = a_in_p[54];
+	a_out_p[944]  = a_in_p[55];
+	a_out_p[112]  = a_in_p[56];
+	a_out_p[240]  = a_in_p[57];
+	a_out_p[368]  = a_in_p[58];
+	a_out_p[496]  = a_in_p[59];
+	a_out_p[624]  = a_in_p[60];
+	a_out_p[752]  = a_in_p[61];
+	a_out_p[880]  = a_in_p[62];
+	a_out_p[1008] = a_in_p[63];
+	a_out_p[1]    = a_in_p[64];
+	a_out_p[129]  = a_in_p[65];
+	a_out_p[257]  = a_in_p[66];
+	a_out_p[385]  = a_in_p[67];
+	a_out_p[513]  = a_in_p[68];
+	a_out_p[641]  = a_in_p[69];
+	a_out_p[769]  = a_in_p[70];
+	a_out_p[897]  = a_in_p[71];
+	a_out_p[65]   = a_in_p[72];
+	a_out_p[193]  = a_in_p[73];
+	a_out_p[321]  = a_in_p[74];
+	a_out_p[449]  = a_in_p[75];
+	a_out_p[577]  = a_in_p[76];
+	a_out_p[705]  = a_in_p[77];
+	a_out_p[833]  = a_in_p[78];
+	a_out_p[961]  = a_in_p[79];
+	a_out_p[33]   = a_in_p[80];
+	a_out_p[161]  = a_in_p[81];
+	a_out_p[289]  = a_in_p[82];
+	a_out_p[417]  = a_in_p[83];
+	a_out_p[545]  = a_in_p[84];
+	a_out_p[673]  = a_in_p[85];
+	a_out_p[801]  = a_in_p[86];
+	a_out_p[929]  = a_in_p[87];
+	a_out_p[97]   = a_in_p[88];
+	a_out_p[225]  = a_in_p[89];
+	a_out_p[353]  = a_in_p[90];
+	a_out_p[481]  = a_in_p[91];
+	a_out_p[609]  = a_in_p[92];
+	a_out_p[737]  = a_in_p[93];
+	a_out_p[865]  = a_in_p[94];
+	a_out_p[993]  = a_in_p[95];
+	a_out_p[17]   = a_in_p[96];
+	a_out_p[145]  = a_in_p[97];
+	a_out_p[273]  = a_in_p[98];
+	a_out_p[401]  = a_in_p[99];
+	a_out_p[529]  = a_in_p[100];
+	a_out_p[657]  = a_in_p[101];
+	a_out_p[785]  = a_in_p[102];
+	a_out_p[913]  = a_in_p[103];
+	a_out_p[81]   = a_in_p[104];
+	a_out_p[209]  = a_in_p[105];
+	a_out_p[337]  = a_in_p[106];
+	a_out_p[465]  = a_in_p[107];
+	a_out_p[593]  = a_in_p[108];
+	a_out_p[721]  = a_in_p[109];
+	a_out_p[849]  = a_in_p[110];
+	a_out_p[977]  = a_in_p[111];
+	a_out_p[49]   = a_in_p[112];
+	a_out_p[177]  = a_in_p[113];
+	a_out_p[305]  = a_in_p[114];
+	a_out_p[433]  = a_in_p[115];
+	a_out_p[561]  = a_in_p[116];
+	a_out_p[689]  = a_in_p[117];
+	a_out_p[817]  = a_in_p[118];
+	a_out_p[945]  = a_in_p[119];
+	a_out_p[113]  = a_in_p[120];
+	a_out_p[241]  = a_in_p[121];
+	a_out_p[369]  = a_in_p[122];
+	a_out_p[497]  = a_in_p[123];
+	a_out_p[625]  = a_in_p[124];
+	a_out_p[753]  = a_in_p[125];
+	a_out_p[881]  = a_in_p[126];
+	a_out_p[1009] = a_in_p[127];
+	a_out_p[2]    = a_in_p[128];
+	a_out_p[130]  = a_in_p[129];
+	a_out_p[258]  = a_in_p[130];
+	a_out_p[386]  = a_in_p[131];
+	a_out_p[514]  = a_in_p[132];
+	a_out_p[642]  = a_in_p[133];
+	a_out_p[770]  = a_in_p[134];
+	a_out_p[898]  = a_in_p[135];
+	a_out_p[66]   = a_in_p[136];
+	a_out_p[194]  = a_in_p[137];
+	a_out_p[322]  = a_in_p[138];
+	a_out_p[450]  = a_in_p[139];
+	a_out_p[578]  = a_in_p[140];
+	a_out_p[706]  = a_in_p[141];
+	a_out_p[834]  = a_in_p[142];
+	a_out_p[962]  = a_in_p[143];
+	a_out_p[34]   = a_in_p[144];
+	a_out_p[162]  = a_in_p[145];
+	a_out_p[290]  = a_in_p[146];
+	a_out_p[418]  = a_in_p[147];
+	a_out_p[546]  = a_in_p[148];
+	a_out_p[674]  = a_in_p[149];
+	a_out_p[802]  = a_in_p[150];
+	a_out_p[930]  = a_in_p[151];
+	a_out_p[98]   = a_in_p[152];
+	a_out_p[226]  = a_in_p[153];
+	a_out_p[354]  = a_in_p[154];
+	a_out_p[482]  = a_in_p[155];
+	a_out_p[610]  = a_in_p[156];
+	a_out_p[738]  = a_in_p[157];
+	a_out_p[866]  = a_in_p[158];
+	a_out_p[994]  = a_in_p[159];
+	a_out_p[18]   = a_in_p[160];
+	a_out_p[146]  = a_in_p[161];
+	a_out_p[274]  = a_in_p[162];
+	a_out_p[402]  = a_in_p[163];
+	a_out_p[530]  = a_in_p[164];
+	a_out_p[658]  = a_in_p[165];
+	a_out_p[786]  = a_in_p[166];
+	a_out_p[914]  = a_in_p[167];
+	a_out_p[82]   = a_in_p[168];
+	a_out_p[210]  = a_in_p[169];
+	a_out_p[338]  = a_in_p[170];
+	a_out_p[466]  = a_in_p[171];
+	a_out_p[594]  = a_in_p[172];
+	a_out_p[722]  = a_in_p[173];
+	a_out_p[850]  = a_in_p[174];
+	a_out_p[978]  = a_in_p[175];
+	a_out_p[50]   = a_in_p[176];
+	a_out_p[178]  = a_in_p[177];
+	a_out_p[306]  = a_in_p[178];
+	a_out_p[434]  = a_in_p[179];
+	a_out_p[562]  = a_in_p[180];
+	a_out_p[690]  = a_in_p[181];
+	a_out_p[818]  = a_in_p[182];
+	a_out_p[946]  = a_in_p[183];
+	a_out_p[114]  = a_in_p[184];
+	a_out_p[242]  = a_in_p[185];
+	a_out_p[370]  = a_in_p[186];
+	a_out_p[498]  = a_in_p[187];
+	a_out_p[626]  = a_in_p[188];
+	a_out_p[754]  = a_in_p[189];
+	a_out_p[882]  = a_in_p[190];
+	a_out_p[1010] = a_in_p[191];
+	a_out_p[3]    = a_in_p[192];
+	a_out_p[131]  = a_in_p[193];
+	a_out_p[259]  = a_in_p[194];
+	a_out_p[387]  = a_in_p[195];
+	a_out_p[515]  = a_in_p[196];
+	a_out_p[643]  = a_in_p[197];
+	a_out_p[771]  = a_in_p[198];
+	a_out_p[899]  = a_in_p[199];
+	a_out_p[67]   = a_in_p[200];
+	a_out_p[195]  = a_in_p[201];
+	a_out_p[323]  = a_in_p[202];
+	a_out_p[451]  = a_in_p[203];
+	a_out_p[579]  = a_in_p[204];
+	a_out_p[707]  = a_in_p[205];
+	a_out_p[835]  = a_in_p[206];
+	a_out_p[963]  = a_in_p[207];
+	a_out_p[35]   = a_in_p[208];
+	a_out_p[163]  = a_in_p[209];
+	a_out_p[291]  = a_in_p[210];
+	a_out_p[419]  = a_in_p[211];
+	a_out_p[547]  = a_in_p[212];
+	a_out_p[675]  = a_in_p[213];
+	a_out_p[803]  = a_in_p[214];
+	a_out_p[931]  = a_in_p[215];
+	a_out_p[99]   = a_in_p[216];
+	a_out_p[227]  = a_in_p[217];
+	a_out_p[355]  = a_in_p[218];
+	a_out_p[483]  = a_in_p[219];
+	a_out_p[611]  = a_in_p[220];
+	a_out_p[739]  = a_in_p[221];
+	a_out_p[867]  = a_in_p[222];
+	a_out_p[995]  = a_in_p[223];
+	a_out_p[19]   = a_in_p[224];
+	a_out_p[147]  = a_in_p[225];
+	a_out_p[275]  = a_in_p[226];
+	a_out_p[403]  = a_in_p[227];
+	a_out_p[531]  = a_in_p[228];
+	a_out_p[659]  = a_in_p[229];
+	a_out_p[787]  = a_in_p[230];
+	a_out_p[915]  = a_in_p[231];
+	a_out_p[83]   = a_in_p[232];
+	a_out_p[211]  = a_in_p[233];
+	a_out_p[339]  = a_in_p[234];
+	a_out_p[467]  = a_in_p[235];
+	a_out_p[595]  = a_in_p[236];
+	a_out_p[723]  = a_in_p[237];
+	a_out_p[851]  = a_in_p[238];
+	a_out_p[979]  = a_in_p[239];
+	a_out_p[51]   = a_in_p[240];
+	a_out_p[179]  = a_in_p[241];
+	a_out_p[307]  = a_in_p[242];
+	a_out_p[435]  = a_in_p[243];
+	a_out_p[563]  = a_in_p[244];
+	a_out_p[691]  = a_in_p[245];
+	a_out_p[819]  = a_in_p[246];
+	a_out_p[947]  = a_in_p[247];
+	a_out_p[115]  = a_in_p[248];
+	a_out_p[243]  = a_in_p[249];
+	a_out_p[371]  = a_in_p[250];
+	a_out_p[499]  = a_in_p[251];
+	a_out_p[627]  = a_in_p[252];
+	a_out_p[755]  = a_in_p[253];
+	a_out_p[883]  = a_in_p[254];
+	a_out_p[1011] = a_in_p[255];
+	a_out_p[4]    = a_in_p[256];
+	a_out_p[132]  = a_in_p[257];
+	a_out_p[260]  = a_in_p[258];
+	a_out_p[388]  = a_in_p[259];
+	a_out_p[516]  = a_in_p[260];
+	a_out_p[644]  = a_in_p[261];
+	a_out_p[772]  = a_in_p[262];
+	a_out_p[900]  = a_in_p[263];
+	a_out_p[68]   = a_in_p[264];
+	a_out_p[196]  = a_in_p[265];
+	a_out_p[324]  = a_in_p[266];
+	a_out_p[452]  = a_in_p[267];
+	a_out_p[580]  = a_in_p[268];
+	a_out_p[708]  = a_in_p[269];
+	a_out_p[836]  = a_in_p[270];
+	a_out_p[964]  = a_in_p[271];
+	a_out_p[36]   = a_in_p[272];
+	a_out_p[164]  = a_in_p[273];
+	a_out_p[292]  = a_in_p[274];
+	a_out_p[420]  = a_in_p[275];
+	a_out_p[548]  = a_in_p[276];
+	a_out_p[676]  = a_in_p[277];
+	a_out_p[804]  = a_in_p[278];
+	a_out_p[932]  = a_in_p[279];
+	a_out_p[100]  = a_in_p[280];
+	a_out_p[228]  = a_in_p[281];
+	a_out_p[356]  = a_in_p[282];
+	a_out_p[484]  = a_in_p[283];
+	a_out_p[612]  = a_in_p[284];
+	a_out_p[740]  = a_in_p[285];
+	a_out_p[868]  = a_in_p[286];
+	a_out_p[996]  = a_in_p[287];
+	a_out_p[20]   = a_in_p[288];
+	a_out_p[148]  = a_in_p[289];
+	a_out_p[276]  = a_in_p[290];
+	a_out_p[404]  = a_in_p[291];
+	a_out_p[532]  = a_in_p[292];
+	a_out_p[660]  = a_in_p[293];
+	a_out_p[788]  = a_in_p[294];
+	a_out_p[916]  = a_in_p[295];
+	a_out_p[84]   = a_in_p[296];
+	a_out_p[212]  = a_in_p[297];
+	a_out_p[340]  = a_in_p[298];
+	a_out_p[468]  = a_in_p[299];
+	a_out_p[596]  = a_in_p[300];
+	a_out_p[724]  = a_in_p[301];
+	a_out_p[852]  = a_in_p[302];
+	a_out_p[980]  = a_in_p[303];
+	a_out_p[52]   = a_in_p[304];
+	a_out_p[180]  = a_in_p[305];
+	a_out_p[308]  = a_in_p[306];
+	a_out_p[436]  = a_in_p[307];
+	a_out_p[564]  = a_in_p[308];
+	a_out_p[692]  = a_in_p[309];
+	a_out_p[820]  = a_in_p[310];
+	a_out_p[948]  = a_in_p[311];
+	a_out_p[116]  = a_in_p[312];
+	a_out_p[244]  = a_in_p[313];
+	a_out_p[372]  = a_in_p[314];
+	a_out_p[500]  = a_in_p[315];
+	a_out_p[628]  = a_in_p[316];
+	a_out_p[756]  = a_in_p[317];
+	a_out_p[884]  = a_in_p[318];
+	a_out_p[1012] = a_in_p[319];
+	a_out_p[5]    = a_in_p[320];
+	a_out_p[133]  = a_in_p[321];
+	a_out_p[261]  = a_in_p[322];
+	a_out_p[389]  = a_in_p[323];
+	a_out_p[517]  = a_in_p[324];
+	a_out_p[645]  = a_in_p[325];
+	a_out_p[773]  = a_in_p[326];
+	a_out_p[901]  = a_in_p[327];
+	a_out_p[69]   = a_in_p[328];
+	a_out_p[197]  = a_in_p[329];
+	a_out_p[325]  = a_in_p[330];
+	a_out_p[453]  = a_in_p[331];
+	a_out_p[581]  = a_in_p[332];
+	a_out_p[709]  = a_in_p[333];
+	a_out_p[837]  = a_in_p[334];
+	a_out_p[965]  = a_in_p[335];
+	a_out_p[37]   = a_in_p[336];
+	a_out_p[165]  = a_in_p[337];
+	a_out_p[293]  = a_in_p[338];
+	a_out_p[421]  = a_in_p[339];
+	a_out_p[549]  = a_in_p[340];
+	a_out_p[677]  = a_in_p[341];
+	a_out_p[805]  = a_in_p[342];
+	a_out_p[933]  = a_in_p[343];
+	a_out_p[101]  = a_in_p[344];
+	a_out_p[229]  = a_in_p[345];
+	a_out_p[357]  = a_in_p[346];
+	a_out_p[485]  = a_in_p[347];
+	a_out_p[613]  = a_in_p[348];
+	a_out_p[741]  = a_in_p[349];
+	a_out_p[869]  = a_in_p[350];
+	a_out_p[997]  = a_in_p[351];
+	a_out_p[21]   = a_in_p[352];
+	a_out_p[149]  = a_in_p[353];
+	a_out_p[277]  = a_in_p[354];
+	a_out_p[405]  = a_in_p[355];
+	a_out_p[533]  = a_in_p[356];
+	a_out_p[661]  = a_in_p[357];
+	a_out_p[789]  = a_in_p[358];
+	a_out_p[917]  = a_in_p[359];
+	a_out_p[85]   = a_in_p[360];
+	a_out_p[213]  = a_in_p[361];
+	a_out_p[341]  = a_in_p[362];
+	a_out_p[469]  = a_in_p[363];
+	a_out_p[597]  = a_in_p[364];
+	a_out_p[725]  = a_in_p[365];
+	a_out_p[853]  = a_in_p[366];
+	a_out_p[981]  = a_in_p[367];
+	a_out_p[53]   = a_in_p[368];
+	a_out_p[181]  = a_in_p[369];
+	a_out_p[309]  = a_in_p[370];
+	a_out_p[437]  = a_in_p[371];
+	a_out_p[565]  = a_in_p[372];
+	a_out_p[693]  = a_in_p[373];
+	a_out_p[821]  = a_in_p[374];
+	a_out_p[949]  = a_in_p[375];
+	a_out_p[117]  = a_in_p[376];
+	a_out_p[245]  = a_in_p[377];
+	a_out_p[373]  = a_in_p[378];
+	a_out_p[501]  = a_in_p[379];
+	a_out_p[629]  = a_in_p[380];
+	a_out_p[757]  = a_in_p[381];
+	a_out_p[885]  = a_in_p[382];
+	a_out_p[1013] = a_in_p[383];
+	a_out_p[6]    = a_in_p[384];
+	a_out_p[134]  = a_in_p[385];
+	a_out_p[262]  = a_in_p[386];
+	a_out_p[390]  = a_in_p[387];
+	a_out_p[518]  = a_in_p[388];
+	a_out_p[646]  = a_in_p[389];
+	a_out_p[774]  = a_in_p[390];
+	a_out_p[902]  = a_in_p[391];
+	a_out_p[70]   = a_in_p[392];
+	a_out_p[198]  = a_in_p[393];
+	a_out_p[326]  = a_in_p[394];
+	a_out_p[454]  = a_in_p[395];
+	a_out_p[582]  = a_in_p[396];
+	a_out_p[710]  = a_in_p[397];
+	a_out_p[838]  = a_in_p[398];
+	a_out_p[966]  = a_in_p[399];
+	a_out_p[38]   = a_in_p[400];
+	a_out_p[166]  = a_in_p[401];
+	a_out_p[294]  = a_in_p[402];
+	a_out_p[422]  = a_in_p[403];
+	a_out_p[550]  = a_in_p[404];
+	a_out_p[678]  = a_in_p[405];
+	a_out_p[806]  = a_in_p[406];
+	a_out_p[934]  = a_in_p[407];
+	a_out_p[102]  = a_in_p[408];
+	a_out_p[230]  = a_in_p[409];
+	a_out_p[358]  = a_in_p[410];
+	a_out_p[486]  = a_in_p[411];
+	a_out_p[614]  = a_in_p[412];
+	a_out_p[742]  = a_in_p[413];
+	a_out_p[870]  = a_in_p[414];
+	a_out_p[998]  = a_in_p[415];
+	a_out_p[22]   = a_in_p[416];
+	a_out_p[150]  = a_in_p[417];
+	a_out_p[278]  = a_in_p[418];
+	a_out_p[406]  = a_in_p[419];
+	a_out_p[534]  = a_in_p[420];
+	a_out_p[662]  = a_in_p[421];
+	a_out_p[790]  = a_in_p[422];
+	a_out_p[918]  = a_in_p[423];
+	a_out_p[86]   = a_in_p[424];
+	a_out_p[214]  = a_in_p[425];
+	a_out_p[342]  = a_in_p[426];
+	a_out_p[470]  = a_in_p[427];
+	a_out_p[598]  = a_in_p[428];
+	a_out_p[726]  = a_in_p[429];
+	a_out_p[854]  = a_in_p[430];
+	a_out_p[982]  = a_in_p[431];
+	a_out_p[54]   = a_in_p[432];
+	a_out_p[182]  = a_in_p[433];
+	a_out_p[310]  = a_in_p[434];
+	a_out_p[438]  = a_in_p[435];
+	a_out_p[566]  = a_in_p[436];
+	a_out_p[694]  = a_in_p[437];
+	a_out_p[822]  = a_in_p[438];
+	a_out_p[950]  = a_in_p[439];
+	a_out_p[118]  = a_in_p[440];
+	a_out_p[246]  = a_in_p[441];
+	a_out_p[374]  = a_in_p[442];
+	a_out_p[502]  = a_in_p[443];
+	a_out_p[630]  = a_in_p[444];
+	a_out_p[758]  = a_in_p[445];
+	a_out_p[886]  = a_in_p[446];
+	a_out_p[1014] = a_in_p[447];
+	a_out_p[7]    = a_in_p[448];
+	a_out_p[135]  = a_in_p[449];
+	a_out_p[263]  = a_in_p[450];
+	a_out_p[391]  = a_in_p[451];
+	a_out_p[519]  = a_in_p[452];
+	a_out_p[647]  = a_in_p[453];
+	a_out_p[775]  = a_in_p[454];
+	a_out_p[903]  = a_in_p[455];
+	a_out_p[71]   = a_in_p[456];
+	a_out_p[199]  = a_in_p[457];
+	a_out_p[327]  = a_in_p[458];
+	a_out_p[455]  = a_in_p[459];
+	a_out_p[583]  = a_in_p[460];
+	a_out_p[711]  = a_in_p[461];
+	a_out_p[839]  = a_in_p[462];
+	a_out_p[967]  = a_in_p[463];
+	a_out_p[39]   = a_in_p[464];
+	a_out_p[167]  = a_in_p[465];
+	a_out_p[295]  = a_in_p[466];
+	a_out_p[423]  = a_in_p[467];
+	a_out_p[551]  = a_in_p[468];
+	a_out_p[679]  = a_in_p[469];
+	a_out_p[807]  = a_in_p[470];
+	a_out_p[935]  = a_in_p[471];
+	a_out_p[103]  = a_in_p[472];
+	a_out_p[231]  = a_in_p[473];
+	a_out_p[359]  = a_in_p[474];
+	a_out_p[487]  = a_in_p[475];
+	a_out_p[615]  = a_in_p[476];
+	a_out_p[743]  = a_in_p[477];
+	a_out_p[871]  = a_in_p[478];
+	a_out_p[999]  = a_in_p[479];
+	a_out_p[23]   = a_in_p[480];
+	a_out_p[151]  = a_in_p[481];
+	a_out_p[279]  = a_in_p[482];
+	a_out_p[407]  = a_in_p[483];
+	a_out_p[535]  = a_in_p[484];
+	a_out_p[663]  = a_in_p[485];
+	a_out_p[791]  = a_in_p[486];
+	a_out_p[919]  = a_in_p[487];
+	a_out_p[87]   = a_in_p[488];
+	a_out_p[215]  = a_in_p[489];
+	a_out_p[343]  = a_in_p[490];
+	a_out_p[471]  = a_in_p[491];
+	a_out_p[599]  = a_in_p[492];
+	a_out_p[727]  = a_in_p[493];
+	a_out_p[855]  = a_in_p[494];
+	a_out_p[983]  = a_in_p[495];
+	a_out_p[55]   = a_in_p[496];
+	a_out_p[183]  = a_in_p[497];
+	a_out_p[311]  = a_in_p[498];
+	a_out_p[439]  = a_in_p[499];
+	a_out_p[567]  = a_in_p[500];
+	a_out_p[695]  = a_in_p[501];
+	a_out_p[823]  = a_in_p[502];
+	a_out_p[951]  = a_in_p[503];
+	a_out_p[119]  = a_in_p[504];
+	a_out_p[247]  = a_in_p[505];
+	a_out_p[375]  = a_in_p[506];
+	a_out_p[503]  = a_in_p[507];
+	a_out_p[631]  = a_in_p[508];
+	a_out_p[759]  = a_in_p[509];
+	a_out_p[887]  = a_in_p[510];
+	a_out_p[1015] = a_in_p[511];
+	a_out_p[8]    = a_in_p[512];
+	a_out_p[136]  = a_in_p[513];
+	a_out_p[264]  = a_in_p[514];
+	a_out_p[392]  = a_in_p[515];
+	a_out_p[520]  = a_in_p[516];
+	a_out_p[648]  = a_in_p[517];
+	a_out_p[776]  = a_in_p[518];
+	a_out_p[904]  = a_in_p[519];
+	a_out_p[72]   = a_in_p[520];
+	a_out_p[200]  = a_in_p[521];
+	a_out_p[328]  = a_in_p[522];
+	a_out_p[456]  = a_in_p[523];
+	a_out_p[584]  = a_in_p[524];
+	a_out_p[712]  = a_in_p[525];
+	a_out_p[840]  = a_in_p[526];
+	a_out_p[968]  = a_in_p[527];
+	a_out_p[40]   = a_in_p[528];
+	a_out_p[168]  = a_in_p[529];
+	a_out_p[296]  = a_in_p[530];
+	a_out_p[424]  = a_in_p[531];
+	a_out_p[552]  = a_in_p[532];
+	a_out_p[680]  = a_in_p[533];
+	a_out_p[808]  = a_in_p[534];
+	a_out_p[936]  = a_in_p[535];
+	a_out_p[104]  = a_in_p[536];
+	a_out_p[232]  = a_in_p[537];
+	a_out_p[360]  = a_in_p[538];
+	a_out_p[488]  = a_in_p[539];
+	a_out_p[616]  = a_in_p[540];
+	a_out_p[744]  = a_in_p[541];
+	a_out_p[872]  = a_in_p[542];
+	a_out_p[1000] = a_in_p[543];
+	a_out_p[24]   = a_in_p[544];
+	a_out_p[152]  = a_in_p[545];
+	a_out_p[280]  = a_in_p[546];
+	a_out_p[408]  = a_in_p[547];
+	a_out_p[536]  = a_in_p[548];
+	a_out_p[664]  = a_in_p[549];
+	a_out_p[792]  = a_in_p[550];
+	a_out_p[920]  = a_in_p[551];
+	a_out_p[88]   = a_in_p[552];
+	a_out_p[216]  = a_in_p[553];
+	a_out_p[344]  = a_in_p[554];
+	a_out_p[472]  = a_in_p[555];
+	a_out_p[600]  = a_in_p[556];
+	a_out_p[728]  = a_in_p[557];
+	a_out_p[856]  = a_in_p[558];
+	a_out_p[984]  = a_in_p[559];
+	a_out_p[56]   = a_in_p[560];
+	a_out_p[184]  = a_in_p[561];
+	a_out_p[312]  = a_in_p[562];
+	a_out_p[440]  = a_in_p[563];
+	a_out_p[568]  = a_in_p[564];
+	a_out_p[696]  = a_in_p[565];
+	a_out_p[824]  = a_in_p[566];
+	a_out_p[952]  = a_in_p[567];
+	a_out_p[120]  = a_in_p[568];
+	a_out_p[248]  = a_in_p[569];
+	a_out_p[376]  = a_in_p[570];
+	a_out_p[504]  = a_in_p[571];
+	a_out_p[632]  = a_in_p[572];
+	a_out_p[760]  = a_in_p[573];
+	a_out_p[888]  = a_in_p[574];
+	a_out_p[1016] = a_in_p[575];
+	a_out_p[9]    = a_in_p[576];
+	a_out_p[137]  = a_in_p[577];
+	a_out_p[265]  = a_in_p[578];
+	a_out_p[393]  = a_in_p[579];
+	a_out_p[521]  = a_in_p[580];
+	a_out_p[649]  = a_in_p[581];
+	a_out_p[777]  = a_in_p[582];
+	a_out_p[905]  = a_in_p[583];
+	a_out_p[73]   = a_in_p[584];
+	a_out_p[201]  = a_in_p[585];
+	a_out_p[329]  = a_in_p[586];
+	a_out_p[457]  = a_in_p[587];
+	a_out_p[585]  = a_in_p[588];
+	a_out_p[713]  = a_in_p[589];
+	a_out_p[841]  = a_in_p[590];
+	a_out_p[969]  = a_in_p[591];
+	a_out_p[41]   = a_in_p[592];
+	a_out_p[169]  = a_in_p[593];
+	a_out_p[297]  = a_in_p[594];
+	a_out_p[425]  = a_in_p[595];
+	a_out_p[553]  = a_in_p[596];
+	a_out_p[681]  = a_in_p[597];
+	a_out_p[809]  = a_in_p[598];
+	a_out_p[937]  = a_in_p[599];
+	a_out_p[105]  = a_in_p[600];
+	a_out_p[233]  = a_in_p[601];
+	a_out_p[361]  = a_in_p[602];
+	a_out_p[489]  = a_in_p[603];
+	a_out_p[617]  = a_in_p[604];
+	a_out_p[745]  = a_in_p[605];
+	a_out_p[873]  = a_in_p[606];
+	a_out_p[1001] = a_in_p[607];
+	a_out_p[25]   = a_in_p[608];
+	a_out_p[153]  = a_in_p[609];
+	a_out_p[281]  = a_in_p[610];
+	a_out_p[409]  = a_in_p[611];
+	a_out_p[537]  = a_in_p[612];
+	a_out_p[665]  = a_in_p[613];
+	a_out_p[793]  = a_in_p[614];
+	a_out_p[921]  = a_in_p[615];
+	a_out_p[89]   = a_in_p[616];
+	a_out_p[217]  = a_in_p[617];
+	a_out_p[345]  = a_in_p[618];
+	a_out_p[473]  = a_in_p[619];
+	a_out_p[601]  = a_in_p[620];
+	a_out_p[729]  = a_in_p[621];
+	a_out_p[857]  = a_in_p[622];
+	a_out_p[985]  = a_in_p[623];
+	a_out_p[57]   = a_in_p[624];
+	a_out_p[185]  = a_in_p[625];
+	a_out_p[313]  = a_in_p[626];
+	a_out_p[441]  = a_in_p[627];
+	a_out_p[569]  = a_in_p[628];
+	a_out_p[697]  = a_in_p[629];
+	a_out_p[825]  = a_in_p[630];
+	a_out_p[953]  = a_in_p[631];
+	a_out_p[121]  = a_in_p[632];
+	a_out_p[249]  = a_in_p[633];
+	a_out_p[377]  = a_in_p[634];
+	a_out_p[505]  = a_in_p[635];
+	a_out_p[633]  = a_in_p[636];
+	a_out_p[761]  = a_in_p[637];
+	a_out_p[889]  = a_in_p[638];
+	a_out_p[1017] = a_in_p[639];
+	a_out_p[10]   = a_in_p[640];
+	a_out_p[138]  = a_in_p[641];
+	a_out_p[266]  = a_in_p[642];
+	a_out_p[394]  = a_in_p[643];
+	a_out_p[522]  = a_in_p[644];
+	a_out_p[650]  = a_in_p[645];
+	a_out_p[778]  = a_in_p[646];
+	a_out_p[906]  = a_in_p[647];
+	a_out_p[74]   = a_in_p[648];
+	a_out_p[202]  = a_in_p[649];
+	a_out_p[330]  = a_in_p[650];
+	a_out_p[458]  = a_in_p[651];
+	a_out_p[586]  = a_in_p[652];
+	a_out_p[714]  = a_in_p[653];
+	a_out_p[842]  = a_in_p[654];
+	a_out_p[970]  = a_in_p[655];
+	a_out_p[42]   = a_in_p[656];
+	a_out_p[170]  = a_in_p[657];
+	a_out_p[298]  = a_in_p[658];
+	a_out_p[426]  = a_in_p[659];
+	a_out_p[554]  = a_in_p[660];
+	a_out_p[682]  = a_in_p[661];
+	a_out_p[810]  = a_in_p[662];
+	a_out_p[938]  = a_in_p[663];
+	a_out_p[106]  = a_in_p[664];
+	a_out_p[234]  = a_in_p[665];
+	a_out_p[362]  = a_in_p[666];
+	a_out_p[490]  = a_in_p[667];
+	a_out_p[618]  = a_in_p[668];
+	a_out_p[746]  = a_in_p[669];
+	a_out_p[874]  = a_in_p[670];
+	a_out_p[1002] = a_in_p[671];
+	a_out_p[26]   = a_in_p[672];
+	a_out_p[154]  = a_in_p[673];
+	a_out_p[282]  = a_in_p[674];
+	a_out_p[410]  = a_in_p[675];
+	a_out_p[538]  = a_in_p[676];
+	a_out_p[666]  = a_in_p[677];
+	a_out_p[794]  = a_in_p[678];
+	a_out_p[922]  = a_in_p[679];
+	a_out_p[90]   = a_in_p[680];
+	a_out_p[218]  = a_in_p[681];
+	a_out_p[346]  = a_in_p[682];
+	a_out_p[474]  = a_in_p[683];
+	a_out_p[602]  = a_in_p[684];
+	a_out_p[730]  = a_in_p[685];
+	a_out_p[858]  = a_in_p[686];
+	a_out_p[986]  = a_in_p[687];
+	a_out_p[58]   = a_in_p[688];
+	a_out_p[186]  = a_in_p[689];
+	a_out_p[314]  = a_in_p[690];
+	a_out_p[442]  = a_in_p[691];
+	a_out_p[570]  = a_in_p[692];
+	a_out_p[698]  = a_in_p[693];
+	a_out_p[826]  = a_in_p[694];
+	a_out_p[954]  = a_in_p[695];
+	a_out_p[122]  = a_in_p[696];
+	a_out_p[250]  = a_in_p[697];
+	a_out_p[378]  = a_in_p[698];
+	a_out_p[506]  = a_in_p[699];
+	a_out_p[634]  = a_in_p[700];
+	a_out_p[762]  = a_in_p[701];
+	a_out_p[890]  = a_in_p[702];
+	a_out_p[1018] = a_in_p[703];
+	a_out_p[11]   = a_in_p[704];
+	a_out_p[139]  = a_in_p[705];
+	a_out_p[267]  = a_in_p[706];
+	a_out_p[395]  = a_in_p[707];
+	a_out_p[523]  = a_in_p[708];
+	a_out_p[651]  = a_in_p[709];
+	a_out_p[779]  = a_in_p[710];
+	a_out_p[907]  = a_in_p[711];
+	a_out_p[75]   = a_in_p[712];
+	a_out_p[203]  = a_in_p[713];
+	a_out_p[331]  = a_in_p[714];
+	a_out_p[459]  = a_in_p[715];
+	a_out_p[587]  = a_in_p[716];
+	a_out_p[715]  = a_in_p[717];
+	a_out_p[843]  = a_in_p[718];
+	a_out_p[971]  = a_in_p[719];
+	a_out_p[43]   = a_in_p[720];
+	a_out_p[171]  = a_in_p[721];
+	a_out_p[299]  = a_in_p[722];
+	a_out_p[427]  = a_in_p[723];
+	a_out_p[555]  = a_in_p[724];
+	a_out_p[683]  = a_in_p[725];
+	a_out_p[811]  = a_in_p[726];
+	a_out_p[939]  = a_in_p[727];
+	a_out_p[107]  = a_in_p[728];
+	a_out_p[235]  = a_in_p[729];
+	a_out_p[363]  = a_in_p[730];
+	a_out_p[491]  = a_in_p[731];
+	a_out_p[619]  = a_in_p[732];
+	a_out_p[747]  = a_in_p[733];
+	a_out_p[875]  = a_in_p[734];
+	a_out_p[1003] = a_in_p[735];
+	a_out_p[27]   = a_in_p[736];
+	a_out_p[155]  = a_in_p[737];
+	a_out_p[283]  = a_in_p[738];
+	a_out_p[411]  = a_in_p[739];
+	a_out_p[539]  = a_in_p[740];
+	a_out_p[667]  = a_in_p[741];
+	a_out_p[795]  = a_in_p[742];
+	a_out_p[923]  = a_in_p[743];
+	a_out_p[91]   = a_in_p[744];
+	a_out_p[219]  = a_in_p[745];
+	a_out_p[347]  = a_in_p[746];
+	a_out_p[475]  = a_in_p[747];
+	a_out_p[603]  = a_in_p[748];
+	a_out_p[731]  = a_in_p[749];
+	a_out_p[859]  = a_in_p[750];
+	a_out_p[987]  = a_in_p[751];
+	a_out_p[59]   = a_in_p[752];
+	a_out_p[187]  = a_in_p[753];
+	a_out_p[315]  = a_in_p[754];
+	a_out_p[443]  = a_in_p[755];
+	a_out_p[571]  = a_in_p[756];
+	a_out_p[699]  = a_in_p[757];
+	a_out_p[827]  = a_in_p[758];
+	a_out_p[955]  = a_in_p[759];
+	a_out_p[123]  = a_in_p[760];
+	a_out_p[251]  = a_in_p[761];
+	a_out_p[379]  = a_in_p[762];
+	a_out_p[507]  = a_in_p[763];
+	a_out_p[635]  = a_in_p[764];
+	a_out_p[763]  = a_in_p[765];
+	a_out_p[891]  = a_in_p[766];
+	a_out_p[1019] = a_in_p[767];
+	a_out_p[12]   = a_in_p[768];
+	a_out_p[140]  = a_in_p[769];
+	a_out_p[268]  = a_in_p[770];
+	a_out_p[396]  = a_in_p[771];
+	a_out_p[524]  = a_in_p[772];
+	a_out_p[652]  = a_in_p[773];
+	a_out_p[780]  = a_in_p[774];
+	a_out_p[908]  = a_in_p[775];
+	a_out_p[76]   = a_in_p[776];
+	a_out_p[204]  = a_in_p[777];
+	a_out_p[332]  = a_in_p[778];
+	a_out_p[460]  = a_in_p[779];
+	a_out_p[588]  = a_in_p[780];
+	a_out_p[716]  = a_in_p[781];
+	a_out_p[844]  = a_in_p[782];
+	a_out_p[972]  = a_in_p[783];
+	a_out_p[44]   = a_in_p[784];
+	a_out_p[172]  = a_in_p[785];
+	a_out_p[300]  = a_in_p[786];
+	a_out_p[428]  = a_in_p[787];
+	a_out_p[556]  = a_in_p[788];
+	a_out_p[684]  = a_in_p[789];
+	a_out_p[812]  = a_in_p[790];
+	a_out_p[940]  = a_in_p[791];
+	a_out_p[108]  = a_in_p[792];
+	a_out_p[236]  = a_in_p[793];
+	a_out_p[364]  = a_in_p[794];
+	a_out_p[492]  = a_in_p[795];
+	a_out_p[620]  = a_in_p[796];
+	a_out_p[748]  = a_in_p[797];
+	a_out_p[876]  = a_in_p[798];
+	a_out_p[1004] = a_in_p[799];
+	a_out_p[28]   = a_in_p[800];
+	a_out_p[156]  = a_in_p[801];
+	a_out_p[284]  = a_in_p[802];
+	a_out_p[412]  = a_in_p[803];
+	a_out_p[540]  = a_in_p[804];
+	a_out_p[668]  = a_in_p[805];
+	a_out_p[796]  = a_in_p[806];
+	a_out_p[924]  = a_in_p[807];
+	a_out_p[92]   = a_in_p[808];
+	a_out_p[220]  = a_in_p[809];
+	a_out_p[348]  = a_in_p[810];
+	a_out_p[476]  = a_in_p[811];
+	a_out_p[604]  = a_in_p[812];
+	a_out_p[732]  = a_in_p[813];
+	a_out_p[860]  = a_in_p[814];
+	a_out_p[988]  = a_in_p[815];
+	a_out_p[60]   = a_in_p[816];
+	a_out_p[188]  = a_in_p[817];
+	a_out_p[316]  = a_in_p[818];
+	a_out_p[444]  = a_in_p[819];
+	a_out_p[572]  = a_in_p[820];
+	a_out_p[700]  = a_in_p[821];
+	a_out_p[828]  = a_in_p[822];
+	a_out_p[956]  = a_in_p[823];
+	a_out_p[124]  = a_in_p[824];
+	a_out_p[252]  = a_in_p[825];
+	a_out_p[380]  = a_in_p[826];
+	a_out_p[508]  = a_in_p[827];
+	a_out_p[636]  = a_in_p[828];
+	a_out_p[764]  = a_in_p[829];
+	a_out_p[892]  = a_in_p[830];
+	a_out_p[1020] = a_in_p[831];
+	a_out_p[13]   = a_in_p[832];
+	a_out_p[141]  = a_in_p[833];
+	a_out_p[269]  = a_in_p[834];
+	a_out_p[397]  = a_in_p[835];
+	a_out_p[525]  = a_in_p[836];
+	a_out_p[653]  = a_in_p[837];
+	a_out_p[781]  = a_in_p[838];
+	a_out_p[909]  = a_in_p[839];
+	a_out_p[77]   = a_in_p[840];
+	a_out_p[205]  = a_in_p[841];
+	a_out_p[333]  = a_in_p[842];
+	a_out_p[461]  = a_in_p[843];
+	a_out_p[589]  = a_in_p[844];
+	a_out_p[717]  = a_in_p[845];
+	a_out_p[845]  = a_in_p[846];
+	a_out_p[973]  = a_in_p[847];
+	a_out_p[45]   = a_in_p[848];
+	a_out_p[173]  = a_in_p[849];
+	a_out_p[301]  = a_in_p[850];
+	a_out_p[429]  = a_in_p[851];
+	a_out_p[557]  = a_in_p[852];
+	a_out_p[685]  = a_in_p[853];
+	a_out_p[813]  = a_in_p[854];
+	a_out_p[941]  = a_in_p[855];
+	a_out_p[109]  = a_in_p[856];
+	a_out_p[237]  = a_in_p[857];
+	a_out_p[365]  = a_in_p[858];
+	a_out_p[493]  = a_in_p[859];
+	a_out_p[621]  = a_in_p[860];
+	a_out_p[749]  = a_in_p[861];
+	a_out_p[877]  = a_in_p[862];
+	a_out_p[1005] = a_in_p[863];
+	a_out_p[29]   = a_in_p[864];
+	a_out_p[157]  = a_in_p[865];
+	a_out_p[285]  = a_in_p[866];
+	a_out_p[413]  = a_in_p[867];
+	a_out_p[541]  = a_in_p[868];
+	a_out_p[669]  = a_in_p[869];
+	a_out_p[797]  = a_in_p[870];
+	a_out_p[925]  = a_in_p[871];
+	a_out_p[93]   = a_in_p[872];
+	a_out_p[221]  = a_in_p[873];
+	a_out_p[349]  = a_in_p[874];
+	a_out_p[477]  = a_in_p[875];
+	a_out_p[605]  = a_in_p[876];
+	a_out_p[733]  = a_in_p[877];
+	a_out_p[861]  = a_in_p[878];
+	a_out_p[989]  = a_in_p[879];
+	a_out_p[61]   = a_in_p[880];
+	a_out_p[189]  = a_in_p[881];
+	a_out_p[317]  = a_in_p[882];
+	a_out_p[445]  = a_in_p[883];
+	a_out_p[573]  = a_in_p[884];
+	a_out_p[701]  = a_in_p[885];
+	a_out_p[829]  = a_in_p[886];
+	a_out_p[957]  = a_in_p[887];
+	a_out_p[125]  = a_in_p[888];
+	a_out_p[253]  = a_in_p[889];
+	a_out_p[381]  = a_in_p[890];
+	a_out_p[509]  = a_in_p[891];
+	a_out_p[637]  = a_in_p[892];
+	a_out_p[765]  = a_in_p[893];
+	a_out_p[893]  = a_in_p[894];
+	a_out_p[1021] = a_in_p[895];
+	a_out_p[14]   = a_in_p[896];
+	a_out_p[142]  = a_in_p[897];
+	a_out_p[270]  = a_in_p[898];
+	a_out_p[398]  = a_in_p[899];
+	a_out_p[526]  = a_in_p[900];
+	a_out_p[654]  = a_in_p[901];
+	a_out_p[782]  = a_in_p[902];
+	a_out_p[910]  = a_in_p[903];
+	a_out_p[78]   = a_in_p[904];
+	a_out_p[206]  = a_in_p[905];
+	a_out_p[334]  = a_in_p[906];
+	a_out_p[462]  = a_in_p[907];
+	a_out_p[590]  = a_in_p[908];
+	a_out_p[718]  = a_in_p[909];
+	a_out_p[846]  = a_in_p[910];
+	a_out_p[974]  = a_in_p[911];
+	a_out_p[46]   = a_in_p[912];
+	a_out_p[174]  = a_in_p[913];
+	a_out_p[302]  = a_in_p[914];
+	a_out_p[430]  = a_in_p[915];
+	a_out_p[558]  = a_in_p[916];
+	a_out_p[686]  = a_in_p[917];
+	a_out_p[814]  = a_in_p[918];
+	a_out_p[942]  = a_in_p[919];
+	a_out_p[110]  = a_in_p[920];
+	a_out_p[238]  = a_in_p[921];
+	a_out_p[366]  = a_in_p[922];
+	a_out_p[494]  = a_in_p[923];
+	a_out_p[622]  = a_in_p[924];
+	a_out_p[750]  = a_in_p[925];
+	a_out_p[878]  = a_in_p[926];
+	a_out_p[1006] = a_in_p[927];
+	a_out_p[30]   = a_in_p[928];
+	a_out_p[158]  = a_in_p[929];
+	a_out_p[286]  = a_in_p[930];
+	a_out_p[414]  = a_in_p[931];
+	a_out_p[542]  = a_in_p[932];
+	a_out_p[670]  = a_in_p[933];
+	a_out_p[798]  = a_in_p[934];
+	a_out_p[926]  = a_in_p[935];
+	a_out_p[94]   = a_in_p[936];
+	a_out_p[222]  = a_in_p[937];
+	a_out_p[350]  = a_in_p[938];
+	a_out_p[478]  = a_in_p[939];
+	a_out_p[606]  = a_in_p[940];
+	a_out_p[734]  = a_in_p[941];
+	a_out_p[862]  = a_in_p[942];
+	a_out_p[990]  = a_in_p[943];
+	a_out_p[62]   = a_in_p[944];
+	a_out_p[190]  = a_in_p[945];
+	a_out_p[318]  = a_in_p[946];
+	a_out_p[446]  = a_in_p[947];
+	a_out_p[574]  = a_in_p[948];
+	a_out_p[702]  = a_in_p[949];
+	a_out_p[830]  = a_in_p[950];
+	a_out_p[958]  = a_in_p[951];
+	a_out_p[126]  = a_in_p[952];
+	a_out_p[254]  = a_in_p[953];
+	a_out_p[382]  = a_in_p[954];
+	a_out_p[510]  = a_in_p[955];
+	a_out_p[638]  = a_in_p[956];
+	a_out_p[766]  = a_in_p[957];
+	a_out_p[894]  = a_in_p[958];
+	a_out_p[1022] = a_in_p[959];
+	a_out_p[15]   = a_in_p[960];
+	a_out_p[143]  = a_in_p[961];
+	a_out_p[271]  = a_in_p[962];
+	a_out_p[399]  = a_in_p[963];
+	a_out_p[527]  = a_in_p[964];
+	a_out_p[655]  = a_in_p[965];
+	a_out_p[783]  = a_in_p[966];
+	a_out_p[911]  = a_in_p[967];
+	a_out_p[79]   = a_in_p[968];
+	a_out_p[207]  = a_in_p[969];
+	a_out_p[335]  = a_in_p[970];
+	a_out_p[463]  = a_in_p[971];
+	a_out_p[591]  = a_in_p[972];
+	a_out_p[719]  = a_in_p[973];
+	a_out_p[847]  = a_in_p[974];
+	a_out_p[975]  = a_in_p[975];
+	a_out_p[47]   = a_in_p[976];
+	a_out_p[175]  = a_in_p[977];
+	a_out_p[303]  = a_in_p[978];
+	a_out_p[431]  = a_in_p[979];
+	a_out_p[559]  = a_in_p[980];
+	a_out_p[687]  = a_in_p[981];
+	a_out_p[815]  = a_in_p[982];
+	a_out_p[943]  = a_in_p[983];
+	a_out_p[111]  = a_in_p[984];
+	a_out_p[239]  = a_in_p[985];
+	a_out_p[367]  = a_in_p[986];
+	a_out_p[495]  = a_in_p[987];
+	a_out_p[623]  = a_in_p[988];
+	a_out_p[751]  = a_in_p[989];
+	a_out_p[879]  = a_in_p[990];
+	a_out_p[1007] = a_in_p[991];
+	a_out_p[31]   = a_in_p[992];
+	a_out_p[159]  = a_in_p[993];
+	a_out_p[287]  = a_in_p[994];
+	a_out_p[415]  = a_in_p[995];
+	a_out_p[543]  = a_in_p[996];
+	a_out_p[671]  = a_in_p[997];
+	a_out_p[799]  = a_in_p[998];
+	a_out_p[927]  = a_in_p[999];
+	a_out_p[95]   = a_in_p[1000];
+	a_out_p[223]  = a_in_p[1001];
+	a_out_p[351]  = a_in_p[1002];
+	a_out_p[479]  = a_in_p[1003];
+	a_out_p[607]  = a_in_p[1004];
+	a_out_p[735]  = a_in_p[1005];
+	a_out_p[863]  = a_in_p[1006];
+	a_out_p[991]  = a_in_p[1007];
+	a_out_p[63]   = a_in_p[1008];
+	a_out_p[191]  = a_in_p[1009];
+	a_out_p[319]  = a_in_p[1010];
+	a_out_p[447]  = a_in_p[1011];
+	a_out_p[575]  = a_in_p[1012];
+	a_out_p[703]  = a_in_p[1013];
+	a_out_p[831]  = a_in_p[1014];
+	a_out_p[959]  = a_in_p[1015];
+	a_out_p[127]  = a_in_p[1016];
+	a_out_p[255]  = a_in_p[1017];
+	a_out_p[383]  = a_in_p[1018];
+	a_out_p[511]  = a_in_p[1019];
+	a_out_p[639]  = a_in_p[1020];
+	a_out_p[767]  = a_in_p[1021];
+	a_out_p[895]  = a_in_p[1022];
+	a_out_p[1023] = a_in_p[1023];
+}
+void transpose_i(const uint16_t* __restrict a_in_p, uint16_t* __restrict a_out_p) {
+	a_out_p[0]    = a_in_p[0];
+	a_out_p[128]  = a_in_p[1];
+	a_out_p[256]  = a_in_p[2];
+	a_out_p[384]  = a_in_p[3];
+	a_out_p[512]  = a_in_p[4];
+	a_out_p[640]  = a_in_p[5];
+	a_out_p[768]  = a_in_p[6];
+	a_out_p[896]  = a_in_p[7];
+	a_out_p[64]   = a_in_p[8];
+	a_out_p[192]  = a_in_p[9];
+	a_out_p[320]  = a_in_p[10];
+	a_out_p[448]  = a_in_p[11];
+	a_out_p[576]  = a_in_p[12];
+	a_out_p[704]  = a_in_p[13];
+	a_out_p[832]  = a_in_p[14];
+	a_out_p[960]  = a_in_p[15];
+	a_out_p[32]   = a_in_p[16];
+	a_out_p[160]  = a_in_p[17];
+	a_out_p[288]  = a_in_p[18];
+	a_out_p[416]  = a_in_p[19];
+	a_out_p[544]  = a_in_p[20];
+	a_out_p[672]  = a_in_p[21];
+	a_out_p[800]  = a_in_p[22];
+	a_out_p[928]  = a_in_p[23];
+	a_out_p[96]   = a_in_p[24];
+	a_out_p[224]  = a_in_p[25];
+	a_out_p[352]  = a_in_p[26];
+	a_out_p[480]  = a_in_p[27];
+	a_out_p[608]  = a_in_p[28];
+	a_out_p[736]  = a_in_p[29];
+	a_out_p[864]  = a_in_p[30];
+	a_out_p[992]  = a_in_p[31];
+	a_out_p[16]   = a_in_p[32];
+	a_out_p[144]  = a_in_p[33];
+	a_out_p[272]  = a_in_p[34];
+	a_out_p[400]  = a_in_p[35];
+	a_out_p[528]  = a_in_p[36];
+	a_out_p[656]  = a_in_p[37];
+	a_out_p[784]  = a_in_p[38];
+	a_out_p[912]  = a_in_p[39];
+	a_out_p[80]   = a_in_p[40];
+	a_out_p[208]  = a_in_p[41];
+	a_out_p[336]  = a_in_p[42];
+	a_out_p[464]  = a_in_p[43];
+	a_out_p[592]  = a_in_p[44];
+	a_out_p[720]  = a_in_p[45];
+	a_out_p[848]  = a_in_p[46];
+	a_out_p[976]  = a_in_p[47];
+	a_out_p[48]   = a_in_p[48];
+	a_out_p[176]  = a_in_p[49];
+	a_out_p[304]  = a_in_p[50];
+	a_out_p[432]  = a_in_p[51];
+	a_out_p[560]  = a_in_p[52];
+	a_out_p[688]  = a_in_p[53];
+	a_out_p[816]  = a_in_p[54];
+	a_out_p[944]  = a_in_p[55];
+	a_out_p[112]  = a_in_p[56];
+	a_out_p[240]  = a_in_p[57];
+	a_out_p[368]  = a_in_p[58];
+	a_out_p[496]  = a_in_p[59];
+	a_out_p[624]  = a_in_p[60];
+	a_out_p[752]  = a_in_p[61];
+	a_out_p[880]  = a_in_p[62];
+	a_out_p[1008] = a_in_p[63];
+	a_out_p[1]    = a_in_p[64];
+	a_out_p[129]  = a_in_p[65];
+	a_out_p[257]  = a_in_p[66];
+	a_out_p[385]  = a_in_p[67];
+	a_out_p[513]  = a_in_p[68];
+	a_out_p[641]  = a_in_p[69];
+	a_out_p[769]  = a_in_p[70];
+	a_out_p[897]  = a_in_p[71];
+	a_out_p[65]   = a_in_p[72];
+	a_out_p[193]  = a_in_p[73];
+	a_out_p[321]  = a_in_p[74];
+	a_out_p[449]  = a_in_p[75];
+	a_out_p[577]  = a_in_p[76];
+	a_out_p[705]  = a_in_p[77];
+	a_out_p[833]  = a_in_p[78];
+	a_out_p[961]  = a_in_p[79];
+	a_out_p[33]   = a_in_p[80];
+	a_out_p[161]  = a_in_p[81];
+	a_out_p[289]  = a_in_p[82];
+	a_out_p[417]  = a_in_p[83];
+	a_out_p[545]  = a_in_p[84];
+	a_out_p[673]  = a_in_p[85];
+	a_out_p[801]  = a_in_p[86];
+	a_out_p[929]  = a_in_p[87];
+	a_out_p[97]   = a_in_p[88];
+	a_out_p[225]  = a_in_p[89];
+	a_out_p[353]  = a_in_p[90];
+	a_out_p[481]  = a_in_p[91];
+	a_out_p[609]  = a_in_p[92];
+	a_out_p[737]  = a_in_p[93];
+	a_out_p[865]  = a_in_p[94];
+	a_out_p[993]  = a_in_p[95];
+	a_out_p[17]   = a_in_p[96];
+	a_out_p[145]  = a_in_p[97];
+	a_out_p[273]  = a_in_p[98];
+	a_out_p[401]  = a_in_p[99];
+	a_out_p[529]  = a_in_p[100];
+	a_out_p[657]  = a_in_p[101];
+	a_out_p[785]  = a_in_p[102];
+	a_out_p[913]  = a_in_p[103];
+	a_out_p[81]   = a_in_p[104];
+	a_out_p[209]  = a_in_p[105];
+	a_out_p[337]  = a_in_p[106];
+	a_out_p[465]  = a_in_p[107];
+	a_out_p[593]  = a_in_p[108];
+	a_out_p[721]  = a_in_p[109];
+	a_out_p[849]  = a_in_p[110];
+	a_out_p[977]  = a_in_p[111];
+	a_out_p[49]   = a_in_p[112];
+	a_out_p[177]  = a_in_p[113];
+	a_out_p[305]  = a_in_p[114];
+	a_out_p[433]  = a_in_p[115];
+	a_out_p[561]  = a_in_p[116];
+	a_out_p[689]  = a_in_p[117];
+	a_out_p[817]  = a_in_p[118];
+	a_out_p[945]  = a_in_p[119];
+	a_out_p[113]  = a_in_p[120];
+	a_out_p[241]  = a_in_p[121];
+	a_out_p[369]  = a_in_p[122];
+	a_out_p[497]  = a_in_p[123];
+	a_out_p[625]  = a_in_p[124];
+	a_out_p[753]  = a_in_p[125];
+	a_out_p[881]  = a_in_p[126];
+	a_out_p[1009] = a_in_p[127];
+	a_out_p[2]    = a_in_p[128];
+	a_out_p[130]  = a_in_p[129];
+	a_out_p[258]  = a_in_p[130];
+	a_out_p[386]  = a_in_p[131];
+	a_out_p[514]  = a_in_p[132];
+	a_out_p[642]  = a_in_p[133];
+	a_out_p[770]  = a_in_p[134];
+	a_out_p[898]  = a_in_p[135];
+	a_out_p[66]   = a_in_p[136];
+	a_out_p[194]  = a_in_p[137];
+	a_out_p[322]  = a_in_p[138];
+	a_out_p[450]  = a_in_p[139];
+	a_out_p[578]  = a_in_p[140];
+	a_out_p[706]  = a_in_p[141];
+	a_out_p[834]  = a_in_p[142];
+	a_out_p[962]  = a_in_p[143];
+	a_out_p[34]   = a_in_p[144];
+	a_out_p[162]  = a_in_p[145];
+	a_out_p[290]  = a_in_p[146];
+	a_out_p[418]  = a_in_p[147];
+	a_out_p[546]  = a_in_p[148];
+	a_out_p[674]  = a_in_p[149];
+	a_out_p[802]  = a_in_p[150];
+	a_out_p[930]  = a_in_p[151];
+	a_out_p[98]   = a_in_p[152];
+	a_out_p[226]  = a_in_p[153];
+	a_out_p[354]  = a_in_p[154];
+	a_out_p[482]  = a_in_p[155];
+	a_out_p[610]  = a_in_p[156];
+	a_out_p[738]  = a_in_p[157];
+	a_out_p[866]  = a_in_p[158];
+	a_out_p[994]  = a_in_p[159];
+	a_out_p[18]   = a_in_p[160];
+	a_out_p[146]  = a_in_p[161];
+	a_out_p[274]  = a_in_p[162];
+	a_out_p[402]  = a_in_p[163];
+	a_out_p[530]  = a_in_p[164];
+	a_out_p[658]  = a_in_p[165];
+	a_out_p[786]  = a_in_p[166];
+	a_out_p[914]  = a_in_p[167];
+	a_out_p[82]   = a_in_p[168];
+	a_out_p[210]  = a_in_p[169];
+	a_out_p[338]  = a_in_p[170];
+	a_out_p[466]  = a_in_p[171];
+	a_out_p[594]  = a_in_p[172];
+	a_out_p[722]  = a_in_p[173];
+	a_out_p[850]  = a_in_p[174];
+	a_out_p[978]  = a_in_p[175];
+	a_out_p[50]   = a_in_p[176];
+	a_out_p[178]  = a_in_p[177];
+	a_out_p[306]  = a_in_p[178];
+	a_out_p[434]  = a_in_p[179];
+	a_out_p[562]  = a_in_p[180];
+	a_out_p[690]  = a_in_p[181];
+	a_out_p[818]  = a_in_p[182];
+	a_out_p[946]  = a_in_p[183];
+	a_out_p[114]  = a_in_p[184];
+	a_out_p[242]  = a_in_p[185];
+	a_out_p[370]  = a_in_p[186];
+	a_out_p[498]  = a_in_p[187];
+	a_out_p[626]  = a_in_p[188];
+	a_out_p[754]  = a_in_p[189];
+	a_out_p[882]  = a_in_p[190];
+	a_out_p[1010] = a_in_p[191];
+	a_out_p[3]    = a_in_p[192];
+	a_out_p[131]  = a_in_p[193];
+	a_out_p[259]  = a_in_p[194];
+	a_out_p[387]  = a_in_p[195];
+	a_out_p[515]  = a_in_p[196];
+	a_out_p[643]  = a_in_p[197];
+	a_out_p[771]  = a_in_p[198];
+	a_out_p[899]  = a_in_p[199];
+	a_out_p[67]   = a_in_p[200];
+	a_out_p[195]  = a_in_p[201];
+	a_out_p[323]  = a_in_p[202];
+	a_out_p[451]  = a_in_p[203];
+	a_out_p[579]  = a_in_p[204];
+	a_out_p[707]  = a_in_p[205];
+	a_out_p[835]  = a_in_p[206];
+	a_out_p[963]  = a_in_p[207];
+	a_out_p[35]   = a_in_p[208];
+	a_out_p[163]  = a_in_p[209];
+	a_out_p[291]  = a_in_p[210];
+	a_out_p[419]  = a_in_p[211];
+	a_out_p[547]  = a_in_p[212];
+	a_out_p[675]  = a_in_p[213];
+	a_out_p[803]  = a_in_p[214];
+	a_out_p[931]  = a_in_p[215];
+	a_out_p[99]   = a_in_p[216];
+	a_out_p[227]  = a_in_p[217];
+	a_out_p[355]  = a_in_p[218];
+	a_out_p[483]  = a_in_p[219];
+	a_out_p[611]  = a_in_p[220];
+	a_out_p[739]  = a_in_p[221];
+	a_out_p[867]  = a_in_p[222];
+	a_out_p[995]  = a_in_p[223];
+	a_out_p[19]   = a_in_p[224];
+	a_out_p[147]  = a_in_p[225];
+	a_out_p[275]  = a_in_p[226];
+	a_out_p[403]  = a_in_p[227];
+	a_out_p[531]  = a_in_p[228];
+	a_out_p[659]  = a_in_p[229];
+	a_out_p[787]  = a_in_p[230];
+	a_out_p[915]  = a_in_p[231];
+	a_out_p[83]   = a_in_p[232];
+	a_out_p[211]  = a_in_p[233];
+	a_out_p[339]  = a_in_p[234];
+	a_out_p[467]  = a_in_p[235];
+	a_out_p[595]  = a_in_p[236];
+	a_out_p[723]  = a_in_p[237];
+	a_out_p[851]  = a_in_p[238];
+	a_out_p[979]  = a_in_p[239];
+	a_out_p[51]   = a_in_p[240];
+	a_out_p[179]  = a_in_p[241];
+	a_out_p[307]  = a_in_p[242];
+	a_out_p[435]  = a_in_p[243];
+	a_out_p[563]  = a_in_p[244];
+	a_out_p[691]  = a_in_p[245];
+	a_out_p[819]  = a_in_p[246];
+	a_out_p[947]  = a_in_p[247];
+	a_out_p[115]  = a_in_p[248];
+	a_out_p[243]  = a_in_p[249];
+	a_out_p[371]  = a_in_p[250];
+	a_out_p[499]  = a_in_p[251];
+	a_out_p[627]  = a_in_p[252];
+	a_out_p[755]  = a_in_p[253];
+	a_out_p[883]  = a_in_p[254];
+	a_out_p[1011] = a_in_p[255];
+	a_out_p[4]    = a_in_p[256];
+	a_out_p[132]  = a_in_p[257];
+	a_out_p[260]  = a_in_p[258];
+	a_out_p[388]  = a_in_p[259];
+	a_out_p[516]  = a_in_p[260];
+	a_out_p[644]  = a_in_p[261];
+	a_out_p[772]  = a_in_p[262];
+	a_out_p[900]  = a_in_p[263];
+	a_out_p[68]   = a_in_p[264];
+	a_out_p[196]  = a_in_p[265];
+	a_out_p[324]  = a_in_p[266];
+	a_out_p[452]  = a_in_p[267];
+	a_out_p[580]  = a_in_p[268];
+	a_out_p[708]  = a_in_p[269];
+	a_out_p[836]  = a_in_p[270];
+	a_out_p[964]  = a_in_p[271];
+	a_out_p[36]   = a_in_p[272];
+	a_out_p[164]  = a_in_p[273];
+	a_out_p[292]  = a_in_p[274];
+	a_out_p[420]  = a_in_p[275];
+	a_out_p[548]  = a_in_p[276];
+	a_out_p[676]  = a_in_p[277];
+	a_out_p[804]  = a_in_p[278];
+	a_out_p[932]  = a_in_p[279];
+	a_out_p[100]  = a_in_p[280];
+	a_out_p[228]  = a_in_p[281];
+	a_out_p[356]  = a_in_p[282];
+	a_out_p[484]  = a_in_p[283];
+	a_out_p[612]  = a_in_p[284];
+	a_out_p[740]  = a_in_p[285];
+	a_out_p[868]  = a_in_p[286];
+	a_out_p[996]  = a_in_p[287];
+	a_out_p[20]   = a_in_p[288];
+	a_out_p[148]  = a_in_p[289];
+	a_out_p[276]  = a_in_p[290];
+	a_out_p[404]  = a_in_p[291];
+	a_out_p[532]  = a_in_p[292];
+	a_out_p[660]  = a_in_p[293];
+	a_out_p[788]  = a_in_p[294];
+	a_out_p[916]  = a_in_p[295];
+	a_out_p[84]   = a_in_p[296];
+	a_out_p[212]  = a_in_p[297];
+	a_out_p[340]  = a_in_p[298];
+	a_out_p[468]  = a_in_p[299];
+	a_out_p[596]  = a_in_p[300];
+	a_out_p[724]  = a_in_p[301];
+	a_out_p[852]  = a_in_p[302];
+	a_out_p[980]  = a_in_p[303];
+	a_out_p[52]   = a_in_p[304];
+	a_out_p[180]  = a_in_p[305];
+	a_out_p[308]  = a_in_p[306];
+	a_out_p[436]  = a_in_p[307];
+	a_out_p[564]  = a_in_p[308];
+	a_out_p[692]  = a_in_p[309];
+	a_out_p[820]  = a_in_p[310];
+	a_out_p[948]  = a_in_p[311];
+	a_out_p[116]  = a_in_p[312];
+	a_out_p[244]  = a_in_p[313];
+	a_out_p[372]  = a_in_p[314];
+	a_out_p[500]  = a_in_p[315];
+	a_out_p[628]  = a_in_p[316];
+	a_out_p[756]  = a_in_p[317];
+	a_out_p[884]  = a_in_p[318];
+	a_out_p[1012] = a_in_p[319];
+	a_out_p[5]    = a_in_p[320];
+	a_out_p[133]  = a_in_p[321];
+	a_out_p[261]  = a_in_p[322];
+	a_out_p[389]  = a_in_p[323];
+	a_out_p[517]  = a_in_p[324];
+	a_out_p[645]  = a_in_p[325];
+	a_out_p[773]  = a_in_p[326];
+	a_out_p[901]  = a_in_p[327];
+	a_out_p[69]   = a_in_p[328];
+	a_out_p[197]  = a_in_p[329];
+	a_out_p[325]  = a_in_p[330];
+	a_out_p[453]  = a_in_p[331];
+	a_out_p[581]  = a_in_p[332];
+	a_out_p[709]  = a_in_p[333];
+	a_out_p[837]  = a_in_p[334];
+	a_out_p[965]  = a_in_p[335];
+	a_out_p[37]   = a_in_p[336];
+	a_out_p[165]  = a_in_p[337];
+	a_out_p[293]  = a_in_p[338];
+	a_out_p[421]  = a_in_p[339];
+	a_out_p[549]  = a_in_p[340];
+	a_out_p[677]  = a_in_p[341];
+	a_out_p[805]  = a_in_p[342];
+	a_out_p[933]  = a_in_p[343];
+	a_out_p[101]  = a_in_p[344];
+	a_out_p[229]  = a_in_p[345];
+	a_out_p[357]  = a_in_p[346];
+	a_out_p[485]  = a_in_p[347];
+	a_out_p[613]  = a_in_p[348];
+	a_out_p[741]  = a_in_p[349];
+	a_out_p[869]  = a_in_p[350];
+	a_out_p[997]  = a_in_p[351];
+	a_out_p[21]   = a_in_p[352];
+	a_out_p[149]  = a_in_p[353];
+	a_out_p[277]  = a_in_p[354];
+	a_out_p[405]  = a_in_p[355];
+	a_out_p[533]  = a_in_p[356];
+	a_out_p[661]  = a_in_p[357];
+	a_out_p[789]  = a_in_p[358];
+	a_out_p[917]  = a_in_p[359];
+	a_out_p[85]   = a_in_p[360];
+	a_out_p[213]  = a_in_p[361];
+	a_out_p[341]  = a_in_p[362];
+	a_out_p[469]  = a_in_p[363];
+	a_out_p[597]  = a_in_p[364];
+	a_out_p[725]  = a_in_p[365];
+	a_out_p[853]  = a_in_p[366];
+	a_out_p[981]  = a_in_p[367];
+	a_out_p[53]   = a_in_p[368];
+	a_out_p[181]  = a_in_p[369];
+	a_out_p[309]  = a_in_p[370];
+	a_out_p[437]  = a_in_p[371];
+	a_out_p[565]  = a_in_p[372];
+	a_out_p[693]  = a_in_p[373];
+	a_out_p[821]  = a_in_p[374];
+	a_out_p[949]  = a_in_p[375];
+	a_out_p[117]  = a_in_p[376];
+	a_out_p[245]  = a_in_p[377];
+	a_out_p[373]  = a_in_p[378];
+	a_out_p[501]  = a_in_p[379];
+	a_out_p[629]  = a_in_p[380];
+	a_out_p[757]  = a_in_p[381];
+	a_out_p[885]  = a_in_p[382];
+	a_out_p[1013] = a_in_p[383];
+	a_out_p[6]    = a_in_p[384];
+	a_out_p[134]  = a_in_p[385];
+	a_out_p[262]  = a_in_p[386];
+	a_out_p[390]  = a_in_p[387];
+	a_out_p[518]  = a_in_p[388];
+	a_out_p[646]  = a_in_p[389];
+	a_out_p[774]  = a_in_p[390];
+	a_out_p[902]  = a_in_p[391];
+	a_out_p[70]   = a_in_p[392];
+	a_out_p[198]  = a_in_p[393];
+	a_out_p[326]  = a_in_p[394];
+	a_out_p[454]  = a_in_p[395];
+	a_out_p[582]  = a_in_p[396];
+	a_out_p[710]  = a_in_p[397];
+	a_out_p[838]  = a_in_p[398];
+	a_out_p[966]  = a_in_p[399];
+	a_out_p[38]   = a_in_p[400];
+	a_out_p[166]  = a_in_p[401];
+	a_out_p[294]  = a_in_p[402];
+	a_out_p[422]  = a_in_p[403];
+	a_out_p[550]  = a_in_p[404];
+	a_out_p[678]  = a_in_p[405];
+	a_out_p[806]  = a_in_p[406];
+	a_out_p[934]  = a_in_p[407];
+	a_out_p[102]  = a_in_p[408];
+	a_out_p[230]  = a_in_p[409];
+	a_out_p[358]  = a_in_p[410];
+	a_out_p[486]  = a_in_p[411];
+	a_out_p[614]  = a_in_p[412];
+	a_out_p[742]  = a_in_p[413];
+	a_out_p[870]  = a_in_p[414];
+	a_out_p[998]  = a_in_p[415];
+	a_out_p[22]   = a_in_p[416];
+	a_out_p[150]  = a_in_p[417];
+	a_out_p[278]  = a_in_p[418];
+	a_out_p[406]  = a_in_p[419];
+	a_out_p[534]  = a_in_p[420];
+	a_out_p[662]  = a_in_p[421];
+	a_out_p[790]  = a_in_p[422];
+	a_out_p[918]  = a_in_p[423];
+	a_out_p[86]   = a_in_p[424];
+	a_out_p[214]  = a_in_p[425];
+	a_out_p[342]  = a_in_p[426];
+	a_out_p[470]  = a_in_p[427];
+	a_out_p[598]  = a_in_p[428];
+	a_out_p[726]  = a_in_p[429];
+	a_out_p[854]  = a_in_p[430];
+	a_out_p[982]  = a_in_p[431];
+	a_out_p[54]   = a_in_p[432];
+	a_out_p[182]  = a_in_p[433];
+	a_out_p[310]  = a_in_p[434];
+	a_out_p[438]  = a_in_p[435];
+	a_out_p[566]  = a_in_p[436];
+	a_out_p[694]  = a_in_p[437];
+	a_out_p[822]  = a_in_p[438];
+	a_out_p[950]  = a_in_p[439];
+	a_out_p[118]  = a_in_p[440];
+	a_out_p[246]  = a_in_p[441];
+	a_out_p[374]  = a_in_p[442];
+	a_out_p[502]  = a_in_p[443];
+	a_out_p[630]  = a_in_p[444];
+	a_out_p[758]  = a_in_p[445];
+	a_out_p[886]  = a_in_p[446];
+	a_out_p[1014] = a_in_p[447];
+	a_out_p[7]    = a_in_p[448];
+	a_out_p[135]  = a_in_p[449];
+	a_out_p[263]  = a_in_p[450];
+	a_out_p[391]  = a_in_p[451];
+	a_out_p[519]  = a_in_p[452];
+	a_out_p[647]  = a_in_p[453];
+	a_out_p[775]  = a_in_p[454];
+	a_out_p[903]  = a_in_p[455];
+	a_out_p[71]   = a_in_p[456];
+	a_out_p[199]  = a_in_p[457];
+	a_out_p[327]  = a_in_p[458];
+	a_out_p[455]  = a_in_p[459];
+	a_out_p[583]  = a_in_p[460];
+	a_out_p[711]  = a_in_p[461];
+	a_out_p[839]  = a_in_p[462];
+	a_out_p[967]  = a_in_p[463];
+	a_out_p[39]   = a_in_p[464];
+	a_out_p[167]  = a_in_p[465];
+	a_out_p[295]  = a_in_p[466];
+	a_out_p[423]  = a_in_p[467];
+	a_out_p[551]  = a_in_p[468];
+	a_out_p[679]  = a_in_p[469];
+	a_out_p[807]  = a_in_p[470];
+	a_out_p[935]  = a_in_p[471];
+	a_out_p[103]  = a_in_p[472];
+	a_out_p[231]  = a_in_p[473];
+	a_out_p[359]  = a_in_p[474];
+	a_out_p[487]  = a_in_p[475];
+	a_out_p[615]  = a_in_p[476];
+	a_out_p[743]  = a_in_p[477];
+	a_out_p[871]  = a_in_p[478];
+	a_out_p[999]  = a_in_p[479];
+	a_out_p[23]   = a_in_p[480];
+	a_out_p[151]  = a_in_p[481];
+	a_out_p[279]  = a_in_p[482];
+	a_out_p[407]  = a_in_p[483];
+	a_out_p[535]  = a_in_p[484];
+	a_out_p[663]  = a_in_p[485];
+	a_out_p[791]  = a_in_p[486];
+	a_out_p[919]  = a_in_p[487];
+	a_out_p[87]   = a_in_p[488];
+	a_out_p[215]  = a_in_p[489];
+	a_out_p[343]  = a_in_p[490];
+	a_out_p[471]  = a_in_p[491];
+	a_out_p[599]  = a_in_p[492];
+	a_out_p[727]  = a_in_p[493];
+	a_out_p[855]  = a_in_p[494];
+	a_out_p[983]  = a_in_p[495];
+	a_out_p[55]   = a_in_p[496];
+	a_out_p[183]  = a_in_p[497];
+	a_out_p[311]  = a_in_p[498];
+	a_out_p[439]  = a_in_p[499];
+	a_out_p[567]  = a_in_p[500];
+	a_out_p[695]  = a_in_p[501];
+	a_out_p[823]  = a_in_p[502];
+	a_out_p[951]  = a_in_p[503];
+	a_out_p[119]  = a_in_p[504];
+	a_out_p[247]  = a_in_p[505];
+	a_out_p[375]  = a_in_p[506];
+	a_out_p[503]  = a_in_p[507];
+	a_out_p[631]  = a_in_p[508];
+	a_out_p[759]  = a_in_p[509];
+	a_out_p[887]  = a_in_p[510];
+	a_out_p[1015] = a_in_p[511];
+	a_out_p[8]    = a_in_p[512];
+	a_out_p[136]  = a_in_p[513];
+	a_out_p[264]  = a_in_p[514];
+	a_out_p[392]  = a_in_p[515];
+	a_out_p[520]  = a_in_p[516];
+	a_out_p[648]  = a_in_p[517];
+	a_out_p[776]  = a_in_p[518];
+	a_out_p[904]  = a_in_p[519];
+	a_out_p[72]   = a_in_p[520];
+	a_out_p[200]  = a_in_p[521];
+	a_out_p[328]  = a_in_p[522];
+	a_out_p[456]  = a_in_p[523];
+	a_out_p[584]  = a_in_p[524];
+	a_out_p[712]  = a_in_p[525];
+	a_out_p[840]  = a_in_p[526];
+	a_out_p[968]  = a_in_p[527];
+	a_out_p[40]   = a_in_p[528];
+	a_out_p[168]  = a_in_p[529];
+	a_out_p[296]  = a_in_p[530];
+	a_out_p[424]  = a_in_p[531];
+	a_out_p[552]  = a_in_p[532];
+	a_out_p[680]  = a_in_p[533];
+	a_out_p[808]  = a_in_p[534];
+	a_out_p[936]  = a_in_p[535];
+	a_out_p[104]  = a_in_p[536];
+	a_out_p[232]  = a_in_p[537];
+	a_out_p[360]  = a_in_p[538];
+	a_out_p[488]  = a_in_p[539];
+	a_out_p[616]  = a_in_p[540];
+	a_out_p[744]  = a_in_p[541];
+	a_out_p[872]  = a_in_p[542];
+	a_out_p[1000] = a_in_p[543];
+	a_out_p[24]   = a_in_p[544];
+	a_out_p[152]  = a_in_p[545];
+	a_out_p[280]  = a_in_p[546];
+	a_out_p[408]  = a_in_p[547];
+	a_out_p[536]  = a_in_p[548];
+	a_out_p[664]  = a_in_p[549];
+	a_out_p[792]  = a_in_p[550];
+	a_out_p[920]  = a_in_p[551];
+	a_out_p[88]   = a_in_p[552];
+	a_out_p[216]  = a_in_p[553];
+	a_out_p[344]  = a_in_p[554];
+	a_out_p[472]  = a_in_p[555];
+	a_out_p[600]  = a_in_p[556];
+	a_out_p[728]  = a_in_p[557];
+	a_out_p[856]  = a_in_p[558];
+	a_out_p[984]  = a_in_p[559];
+	a_out_p[56]   = a_in_p[560];
+	a_out_p[184]  = a_in_p[561];
+	a_out_p[312]  = a_in_p[562];
+	a_out_p[440]  = a_in_p[563];
+	a_out_p[568]  = a_in_p[564];
+	a_out_p[696]  = a_in_p[565];
+	a_out_p[824]  = a_in_p[566];
+	a_out_p[952]  = a_in_p[567];
+	a_out_p[120]  = a_in_p[568];
+	a_out_p[248]  = a_in_p[569];
+	a_out_p[376]  = a_in_p[570];
+	a_out_p[504]  = a_in_p[571];
+	a_out_p[632]  = a_in_p[572];
+	a_out_p[760]  = a_in_p[573];
+	a_out_p[888]  = a_in_p[574];
+	a_out_p[1016] = a_in_p[575];
+	a_out_p[9]    = a_in_p[576];
+	a_out_p[137]  = a_in_p[577];
+	a_out_p[265]  = a_in_p[578];
+	a_out_p[393]  = a_in_p[579];
+	a_out_p[521]  = a_in_p[580];
+	a_out_p[649]  = a_in_p[581];
+	a_out_p[777]  = a_in_p[582];
+	a_out_p[905]  = a_in_p[583];
+	a_out_p[73]   = a_in_p[584];
+	a_out_p[201]  = a_in_p[585];
+	a_out_p[329]  = a_in_p[586];
+	a_out_p[457]  = a_in_p[587];
+	a_out_p[585]  = a_in_p[588];
+	a_out_p[713]  = a_in_p[589];
+	a_out_p[841]  = a_in_p[590];
+	a_out_p[969]  = a_in_p[591];
+	a_out_p[41]   = a_in_p[592];
+	a_out_p[169]  = a_in_p[593];
+	a_out_p[297]  = a_in_p[594];
+	a_out_p[425]  = a_in_p[595];
+	a_out_p[553]  = a_in_p[596];
+	a_out_p[681]  = a_in_p[597];
+	a_out_p[809]  = a_in_p[598];
+	a_out_p[937]  = a_in_p[599];
+	a_out_p[105]  = a_in_p[600];
+	a_out_p[233]  = a_in_p[601];
+	a_out_p[361]  = a_in_p[602];
+	a_out_p[489]  = a_in_p[603];
+	a_out_p[617]  = a_in_p[604];
+	a_out_p[745]  = a_in_p[605];
+	a_out_p[873]  = a_in_p[606];
+	a_out_p[1001] = a_in_p[607];
+	a_out_p[25]   = a_in_p[608];
+	a_out_p[153]  = a_in_p[609];
+	a_out_p[281]  = a_in_p[610];
+	a_out_p[409]  = a_in_p[611];
+	a_out_p[537]  = a_in_p[612];
+	a_out_p[665]  = a_in_p[613];
+	a_out_p[793]  = a_in_p[614];
+	a_out_p[921]  = a_in_p[615];
+	a_out_p[89]   = a_in_p[616];
+	a_out_p[217]  = a_in_p[617];
+	a_out_p[345]  = a_in_p[618];
+	a_out_p[473]  = a_in_p[619];
+	a_out_p[601]  = a_in_p[620];
+	a_out_p[729]  = a_in_p[621];
+	a_out_p[857]  = a_in_p[622];
+	a_out_p[985]  = a_in_p[623];
+	a_out_p[57]   = a_in_p[624];
+	a_out_p[185]  = a_in_p[625];
+	a_out_p[313]  = a_in_p[626];
+	a_out_p[441]  = a_in_p[627];
+	a_out_p[569]  = a_in_p[628];
+	a_out_p[697]  = a_in_p[629];
+	a_out_p[825]  = a_in_p[630];
+	a_out_p[953]  = a_in_p[631];
+	a_out_p[121]  = a_in_p[632];
+	a_out_p[249]  = a_in_p[633];
+	a_out_p[377]  = a_in_p[634];
+	a_out_p[505]  = a_in_p[635];
+	a_out_p[633]  = a_in_p[636];
+	a_out_p[761]  = a_in_p[637];
+	a_out_p[889]  = a_in_p[638];
+	a_out_p[1017] = a_in_p[639];
+	a_out_p[10]   = a_in_p[640];
+	a_out_p[138]  = a_in_p[641];
+	a_out_p[266]  = a_in_p[642];
+	a_out_p[394]  = a_in_p[643];
+	a_out_p[522]  = a_in_p[644];
+	a_out_p[650]  = a_in_p[645];
+	a_out_p[778]  = a_in_p[646];
+	a_out_p[906]  = a_in_p[647];
+	a_out_p[74]   = a_in_p[648];
+	a_out_p[202]  = a_in_p[649];
+	a_out_p[330]  = a_in_p[650];
+	a_out_p[458]  = a_in_p[651];
+	a_out_p[586]  = a_in_p[652];
+	a_out_p[714]  = a_in_p[653];
+	a_out_p[842]  = a_in_p[654];
+	a_out_p[970]  = a_in_p[655];
+	a_out_p[42]   = a_in_p[656];
+	a_out_p[170]  = a_in_p[657];
+	a_out_p[298]  = a_in_p[658];
+	a_out_p[426]  = a_in_p[659];
+	a_out_p[554]  = a_in_p[660];
+	a_out_p[682]  = a_in_p[661];
+	a_out_p[810]  = a_in_p[662];
+	a_out_p[938]  = a_in_p[663];
+	a_out_p[106]  = a_in_p[664];
+	a_out_p[234]  = a_in_p[665];
+	a_out_p[362]  = a_in_p[666];
+	a_out_p[490]  = a_in_p[667];
+	a_out_p[618]  = a_in_p[668];
+	a_out_p[746]  = a_in_p[669];
+	a_out_p[874]  = a_in_p[670];
+	a_out_p[1002] = a_in_p[671];
+	a_out_p[26]   = a_in_p[672];
+	a_out_p[154]  = a_in_p[673];
+	a_out_p[282]  = a_in_p[674];
+	a_out_p[410]  = a_in_p[675];
+	a_out_p[538]  = a_in_p[676];
+	a_out_p[666]  = a_in_p[677];
+	a_out_p[794]  = a_in_p[678];
+	a_out_p[922]  = a_in_p[679];
+	a_out_p[90]   = a_in_p[680];
+	a_out_p[218]  = a_in_p[681];
+	a_out_p[346]  = a_in_p[682];
+	a_out_p[474]  = a_in_p[683];
+	a_out_p[602]  = a_in_p[684];
+	a_out_p[730]  = a_in_p[685];
+	a_out_p[858]  = a_in_p[686];
+	a_out_p[986]  = a_in_p[687];
+	a_out_p[58]   = a_in_p[688];
+	a_out_p[186]  = a_in_p[689];
+	a_out_p[314]  = a_in_p[690];
+	a_out_p[442]  = a_in_p[691];
+	a_out_p[570]  = a_in_p[692];
+	a_out_p[698]  = a_in_p[693];
+	a_out_p[826]  = a_in_p[694];
+	a_out_p[954]  = a_in_p[695];
+	a_out_p[122]  = a_in_p[696];
+	a_out_p[250]  = a_in_p[697];
+	a_out_p[378]  = a_in_p[698];
+	a_out_p[506]  = a_in_p[699];
+	a_out_p[634]  = a_in_p[700];
+	a_out_p[762]  = a_in_p[701];
+	a_out_p[890]  = a_in_p[702];
+	a_out_p[1018] = a_in_p[703];
+	a_out_p[11]   = a_in_p[704];
+	a_out_p[139]  = a_in_p[705];
+	a_out_p[267]  = a_in_p[706];
+	a_out_p[395]  = a_in_p[707];
+	a_out_p[523]  = a_in_p[708];
+	a_out_p[651]  = a_in_p[709];
+	a_out_p[779]  = a_in_p[710];
+	a_out_p[907]  = a_in_p[711];
+	a_out_p[75]   = a_in_p[712];
+	a_out_p[203]  = a_in_p[713];
+	a_out_p[331]  = a_in_p[714];
+	a_out_p[459]  = a_in_p[715];
+	a_out_p[587]  = a_in_p[716];
+	a_out_p[715]  = a_in_p[717];
+	a_out_p[843]  = a_in_p[718];
+	a_out_p[971]  = a_in_p[719];
+	a_out_p[43]   = a_in_p[720];
+	a_out_p[171]  = a_in_p[721];
+	a_out_p[299]  = a_in_p[722];
+	a_out_p[427]  = a_in_p[723];
+	a_out_p[555]  = a_in_p[724];
+	a_out_p[683]  = a_in_p[725];
+	a_out_p[811]  = a_in_p[726];
+	a_out_p[939]  = a_in_p[727];
+	a_out_p[107]  = a_in_p[728];
+	a_out_p[235]  = a_in_p[729];
+	a_out_p[363]  = a_in_p[730];
+	a_out_p[491]  = a_in_p[731];
+	a_out_p[619]  = a_in_p[732];
+	a_out_p[747]  = a_in_p[733];
+	a_out_p[875]  = a_in_p[734];
+	a_out_p[1003] = a_in_p[735];
+	a_out_p[27]   = a_in_p[736];
+	a_out_p[155]  = a_in_p[737];
+	a_out_p[283]  = a_in_p[738];
+	a_out_p[411]  = a_in_p[739];
+	a_out_p[539]  = a_in_p[740];
+	a_out_p[667]  = a_in_p[741];
+	a_out_p[795]  = a_in_p[742];
+	a_out_p[923]  = a_in_p[743];
+	a_out_p[91]   = a_in_p[744];
+	a_out_p[219]  = a_in_p[745];
+	a_out_p[347]  = a_in_p[746];
+	a_out_p[475]  = a_in_p[747];
+	a_out_p[603]  = a_in_p[748];
+	a_out_p[731]  = a_in_p[749];
+	a_out_p[859]  = a_in_p[750];
+	a_out_p[987]  = a_in_p[751];
+	a_out_p[59]   = a_in_p[752];
+	a_out_p[187]  = a_in_p[753];
+	a_out_p[315]  = a_in_p[754];
+	a_out_p[443]  = a_in_p[755];
+	a_out_p[571]  = a_in_p[756];
+	a_out_p[699]  = a_in_p[757];
+	a_out_p[827]  = a_in_p[758];
+	a_out_p[955]  = a_in_p[759];
+	a_out_p[123]  = a_in_p[760];
+	a_out_p[251]  = a_in_p[761];
+	a_out_p[379]  = a_in_p[762];
+	a_out_p[507]  = a_in_p[763];
+	a_out_p[635]  = a_in_p[764];
+	a_out_p[763]  = a_in_p[765];
+	a_out_p[891]  = a_in_p[766];
+	a_out_p[1019] = a_in_p[767];
+	a_out_p[12]   = a_in_p[768];
+	a_out_p[140]  = a_in_p[769];
+	a_out_p[268]  = a_in_p[770];
+	a_out_p[396]  = a_in_p[771];
+	a_out_p[524]  = a_in_p[772];
+	a_out_p[652]  = a_in_p[773];
+	a_out_p[780]  = a_in_p[774];
+	a_out_p[908]  = a_in_p[775];
+	a_out_p[76]   = a_in_p[776];
+	a_out_p[204]  = a_in_p[777];
+	a_out_p[332]  = a_in_p[778];
+	a_out_p[460]  = a_in_p[779];
+	a_out_p[588]  = a_in_p[780];
+	a_out_p[716]  = a_in_p[781];
+	a_out_p[844]  = a_in_p[782];
+	a_out_p[972]  = a_in_p[783];
+	a_out_p[44]   = a_in_p[784];
+	a_out_p[172]  = a_in_p[785];
+	a_out_p[300]  = a_in_p[786];
+	a_out_p[428]  = a_in_p[787];
+	a_out_p[556]  = a_in_p[788];
+	a_out_p[684]  = a_in_p[789];
+	a_out_p[812]  = a_in_p[790];
+	a_out_p[940]  = a_in_p[791];
+	a_out_p[108]  = a_in_p[792];
+	a_out_p[236]  = a_in_p[793];
+	a_out_p[364]  = a_in_p[794];
+	a_out_p[492]  = a_in_p[795];
+	a_out_p[620]  = a_in_p[796];
+	a_out_p[748]  = a_in_p[797];
+	a_out_p[876]  = a_in_p[798];
+	a_out_p[1004] = a_in_p[799];
+	a_out_p[28]   = a_in_p[800];
+	a_out_p[156]  = a_in_p[801];
+	a_out_p[284]  = a_in_p[802];
+	a_out_p[412]  = a_in_p[803];
+	a_out_p[540]  = a_in_p[804];
+	a_out_p[668]  = a_in_p[805];
+	a_out_p[796]  = a_in_p[806];
+	a_out_p[924]  = a_in_p[807];
+	a_out_p[92]   = a_in_p[808];
+	a_out_p[220]  = a_in_p[809];
+	a_out_p[348]  = a_in_p[810];
+	a_out_p[476]  = a_in_p[811];
+	a_out_p[604]  = a_in_p[812];
+	a_out_p[732]  = a_in_p[813];
+	a_out_p[860]  = a_in_p[814];
+	a_out_p[988]  = a_in_p[815];
+	a_out_p[60]   = a_in_p[816];
+	a_out_p[188]  = a_in_p[817];
+	a_out_p[316]  = a_in_p[818];
+	a_out_p[444]  = a_in_p[819];
+	a_out_p[572]  = a_in_p[820];
+	a_out_p[700]  = a_in_p[821];
+	a_out_p[828]  = a_in_p[822];
+	a_out_p[956]  = a_in_p[823];
+	a_out_p[124]  = a_in_p[824];
+	a_out_p[252]  = a_in_p[825];
+	a_out_p[380]  = a_in_p[826];
+	a_out_p[508]  = a_in_p[827];
+	a_out_p[636]  = a_in_p[828];
+	a_out_p[764]  = a_in_p[829];
+	a_out_p[892]  = a_in_p[830];
+	a_out_p[1020] = a_in_p[831];
+	a_out_p[13]   = a_in_p[832];
+	a_out_p[141]  = a_in_p[833];
+	a_out_p[269]  = a_in_p[834];
+	a_out_p[397]  = a_in_p[835];
+	a_out_p[525]  = a_in_p[836];
+	a_out_p[653]  = a_in_p[837];
+	a_out_p[781]  = a_in_p[838];
+	a_out_p[909]  = a_in_p[839];
+	a_out_p[77]   = a_in_p[840];
+	a_out_p[205]  = a_in_p[841];
+	a_out_p[333]  = a_in_p[842];
+	a_out_p[461]  = a_in_p[843];
+	a_out_p[589]  = a_in_p[844];
+	a_out_p[717]  = a_in_p[845];
+	a_out_p[845]  = a_in_p[846];
+	a_out_p[973]  = a_in_p[847];
+	a_out_p[45]   = a_in_p[848];
+	a_out_p[173]  = a_in_p[849];
+	a_out_p[301]  = a_in_p[850];
+	a_out_p[429]  = a_in_p[851];
+	a_out_p[557]  = a_in_p[852];
+	a_out_p[685]  = a_in_p[853];
+	a_out_p[813]  = a_in_p[854];
+	a_out_p[941]  = a_in_p[855];
+	a_out_p[109]  = a_in_p[856];
+	a_out_p[237]  = a_in_p[857];
+	a_out_p[365]  = a_in_p[858];
+	a_out_p[493]  = a_in_p[859];
+	a_out_p[621]  = a_in_p[860];
+	a_out_p[749]  = a_in_p[861];
+	a_out_p[877]  = a_in_p[862];
+	a_out_p[1005] = a_in_p[863];
+	a_out_p[29]   = a_in_p[864];
+	a_out_p[157]  = a_in_p[865];
+	a_out_p[285]  = a_in_p[866];
+	a_out_p[413]  = a_in_p[867];
+	a_out_p[541]  = a_in_p[868];
+	a_out_p[669]  = a_in_p[869];
+	a_out_p[797]  = a_in_p[870];
+	a_out_p[925]  = a_in_p[871];
+	a_out_p[93]   = a_in_p[872];
+	a_out_p[221]  = a_in_p[873];
+	a_out_p[349]  = a_in_p[874];
+	a_out_p[477]  = a_in_p[875];
+	a_out_p[605]  = a_in_p[876];
+	a_out_p[733]  = a_in_p[877];
+	a_out_p[861]  = a_in_p[878];
+	a_out_p[989]  = a_in_p[879];
+	a_out_p[61]   = a_in_p[880];
+	a_out_p[189]  = a_in_p[881];
+	a_out_p[317]  = a_in_p[882];
+	a_out_p[445]  = a_in_p[883];
+	a_out_p[573]  = a_in_p[884];
+	a_out_p[701]  = a_in_p[885];
+	a_out_p[829]  = a_in_p[886];
+	a_out_p[957]  = a_in_p[887];
+	a_out_p[125]  = a_in_p[888];
+	a_out_p[253]  = a_in_p[889];
+	a_out_p[381]  = a_in_p[890];
+	a_out_p[509]  = a_in_p[891];
+	a_out_p[637]  = a_in_p[892];
+	a_out_p[765]  = a_in_p[893];
+	a_out_p[893]  = a_in_p[894];
+	a_out_p[1021] = a_in_p[895];
+	a_out_p[14]   = a_in_p[896];
+	a_out_p[142]  = a_in_p[897];
+	a_out_p[270]  = a_in_p[898];
+	a_out_p[398]  = a_in_p[899];
+	a_out_p[526]  = a_in_p[900];
+	a_out_p[654]  = a_in_p[901];
+	a_out_p[782]  = a_in_p[902];
+	a_out_p[910]  = a_in_p[903];
+	a_out_p[78]   = a_in_p[904];
+	a_out_p[206]  = a_in_p[905];
+	a_out_p[334]  = a_in_p[906];
+	a_out_p[462]  = a_in_p[907];
+	a_out_p[590]  = a_in_p[908];
+	a_out_p[718]  = a_in_p[909];
+	a_out_p[846]  = a_in_p[910];
+	a_out_p[974]  = a_in_p[911];
+	a_out_p[46]   = a_in_p[912];
+	a_out_p[174]  = a_in_p[913];
+	a_out_p[302]  = a_in_p[914];
+	a_out_p[430]  = a_in_p[915];
+	a_out_p[558]  = a_in_p[916];
+	a_out_p[686]  = a_in_p[917];
+	a_out_p[814]  = a_in_p[918];
+	a_out_p[942]  = a_in_p[919];
+	a_out_p[110]  = a_in_p[920];
+	a_out_p[238]  = a_in_p[921];
+	a_out_p[366]  = a_in_p[922];
+	a_out_p[494]  = a_in_p[923];
+	a_out_p[622]  = a_in_p[924];
+	a_out_p[750]  = a_in_p[925];
+	a_out_p[878]  = a_in_p[926];
+	a_out_p[1006] = a_in_p[927];
+	a_out_p[30]   = a_in_p[928];
+	a_out_p[158]  = a_in_p[929];
+	a_out_p[286]  = a_in_p[930];
+	a_out_p[414]  = a_in_p[931];
+	a_out_p[542]  = a_in_p[932];
+	a_out_p[670]  = a_in_p[933];
+	a_out_p[798]  = a_in_p[934];
+	a_out_p[926]  = a_in_p[935];
+	a_out_p[94]   = a_in_p[936];
+	a_out_p[222]  = a_in_p[937];
+	a_out_p[350]  = a_in_p[938];
+	a_out_p[478]  = a_in_p[939];
+	a_out_p[606]  = a_in_p[940];
+	a_out_p[734]  = a_in_p[941];
+	a_out_p[862]  = a_in_p[942];
+	a_out_p[990]  = a_in_p[943];
+	a_out_p[62]   = a_in_p[944];
+	a_out_p[190]  = a_in_p[945];
+	a_out_p[318]  = a_in_p[946];
+	a_out_p[446]  = a_in_p[947];
+	a_out_p[574]  = a_in_p[948];
+	a_out_p[702]  = a_in_p[949];
+	a_out_p[830]  = a_in_p[950];
+	a_out_p[958]  = a_in_p[951];
+	a_out_p[126]  = a_in_p[952];
+	a_out_p[254]  = a_in_p[953];
+	a_out_p[382]  = a_in_p[954];
+	a_out_p[510]  = a_in_p[955];
+	a_out_p[638]  = a_in_p[956];
+	a_out_p[766]  = a_in_p[957];
+	a_out_p[894]  = a_in_p[958];
+	a_out_p[1022] = a_in_p[959];
+	a_out_p[15]   = a_in_p[960];
+	a_out_p[143]  = a_in_p[961];
+	a_out_p[271]  = a_in_p[962];
+	a_out_p[399]  = a_in_p[963];
+	a_out_p[527]  = a_in_p[964];
+	a_out_p[655]  = a_in_p[965];
+	a_out_p[783]  = a_in_p[966];
+	a_out_p[911]  = a_in_p[967];
+	a_out_p[79]   = a_in_p[968];
+	a_out_p[207]  = a_in_p[969];
+	a_out_p[335]  = a_in_p[970];
+	a_out_p[463]  = a_in_p[971];
+	a_out_p[591]  = a_in_p[972];
+	a_out_p[719]  = a_in_p[973];
+	a_out_p[847]  = a_in_p[974];
+	a_out_p[975]  = a_in_p[975];
+	a_out_p[47]   = a_in_p[976];
+	a_out_p[175]  = a_in_p[977];
+	a_out_p[303]  = a_in_p[978];
+	a_out_p[431]  = a_in_p[979];
+	a_out_p[559]  = a_in_p[980];
+	a_out_p[687]  = a_in_p[981];
+	a_out_p[815]  = a_in_p[982];
+	a_out_p[943]  = a_in_p[983];
+	a_out_p[111]  = a_in_p[984];
+	a_out_p[239]  = a_in_p[985];
+	a_out_p[367]  = a_in_p[986];
+	a_out_p[495]  = a_in_p[987];
+	a_out_p[623]  = a_in_p[988];
+	a_out_p[751]  = a_in_p[989];
+	a_out_p[879]  = a_in_p[990];
+	a_out_p[1007] = a_in_p[991];
+	a_out_p[31]   = a_in_p[992];
+	a_out_p[159]  = a_in_p[993];
+	a_out_p[287]  = a_in_p[994];
+	a_out_p[415]  = a_in_p[995];
+	a_out_p[543]  = a_in_p[996];
+	a_out_p[671]  = a_in_p[997];
+	a_out_p[799]  = a_in_p[998];
+	a_out_p[927]  = a_in_p[999];
+	a_out_p[95]   = a_in_p[1000];
+	a_out_p[223]  = a_in_p[1001];
+	a_out_p[351]  = a_in_p[1002];
+	a_out_p[479]  = a_in_p[1003];
+	a_out_p[607]  = a_in_p[1004];
+	a_out_p[735]  = a_in_p[1005];
+	a_out_p[863]  = a_in_p[1006];
+	a_out_p[991]  = a_in_p[1007];
+	a_out_p[63]   = a_in_p[1008];
+	a_out_p[191]  = a_in_p[1009];
+	a_out_p[319]  = a_in_p[1010];
+	a_out_p[447]  = a_in_p[1011];
+	a_out_p[575]  = a_in_p[1012];
+	a_out_p[703]  = a_in_p[1013];
+	a_out_p[831]  = a_in_p[1014];
+	a_out_p[959]  = a_in_p[1015];
+	a_out_p[127]  = a_in_p[1016];
+	a_out_p[255]  = a_in_p[1017];
+	a_out_p[383]  = a_in_p[1018];
+	a_out_p[511]  = a_in_p[1019];
+	a_out_p[639]  = a_in_p[1020];
+	a_out_p[767]  = a_in_p[1021];
+	a_out_p[895]  = a_in_p[1022];
+	a_out_p[1023] = a_in_p[1023];
+}
+void transpose_o(const uint16_t* __restrict a_in_p, uint16_t* __restrict a_out_p) {
+	a_out_p[0]    = a_in_p[0];
+	a_out_p[128]  = a_in_p[1];
+	a_out_p[256]  = a_in_p[2];
+	a_out_p[384]  = a_in_p[3];
+	a_out_p[512]  = a_in_p[4];
+	a_out_p[640]  = a_in_p[5];
+	a_out_p[768]  = a_in_p[6];
+	a_out_p[896]  = a_in_p[7];
+	a_out_p[64]   = a_in_p[8];
+	a_out_p[192]  = a_in_p[9];
+	a_out_p[320]  = a_in_p[10];
+	a_out_p[448]  = a_in_p[11];
+	a_out_p[576]  = a_in_p[12];
+	a_out_p[704]  = a_in_p[13];
+	a_out_p[832]  = a_in_p[14];
+	a_out_p[960]  = a_in_p[15];
+	a_out_p[32]   = a_in_p[16];
+	a_out_p[160]  = a_in_p[17];
+	a_out_p[288]  = a_in_p[18];
+	a_out_p[416]  = a_in_p[19];
+	a_out_p[544]  = a_in_p[20];
+	a_out_p[672]  = a_in_p[21];
+	a_out_p[800]  = a_in_p[22];
+	a_out_p[928]  = a_in_p[23];
+	a_out_p[96]   = a_in_p[24];
+	a_out_p[224]  = a_in_p[25];
+	a_out_p[352]  = a_in_p[26];
+	a_out_p[480]  = a_in_p[27];
+	a_out_p[608]  = a_in_p[28];
+	a_out_p[736]  = a_in_p[29];
+	a_out_p[864]  = a_in_p[30];
+	a_out_p[992]  = a_in_p[31];
+	a_out_p[16]   = a_in_p[32];
+	a_out_p[144]  = a_in_p[33];
+	a_out_p[272]  = a_in_p[34];
+	a_out_p[400]  = a_in_p[35];
+	a_out_p[528]  = a_in_p[36];
+	a_out_p[656]  = a_in_p[37];
+	a_out_p[784]  = a_in_p[38];
+	a_out_p[912]  = a_in_p[39];
+	a_out_p[80]   = a_in_p[40];
+	a_out_p[208]  = a_in_p[41];
+	a_out_p[336]  = a_in_p[42];
+	a_out_p[464]  = a_in_p[43];
+	a_out_p[592]  = a_in_p[44];
+	a_out_p[720]  = a_in_p[45];
+	a_out_p[848]  = a_in_p[46];
+	a_out_p[976]  = a_in_p[47];
+	a_out_p[48]   = a_in_p[48];
+	a_out_p[176]  = a_in_p[49];
+	a_out_p[304]  = a_in_p[50];
+	a_out_p[432]  = a_in_p[51];
+	a_out_p[560]  = a_in_p[52];
+	a_out_p[688]  = a_in_p[53];
+	a_out_p[816]  = a_in_p[54];
+	a_out_p[944]  = a_in_p[55];
+	a_out_p[112]  = a_in_p[56];
+	a_out_p[240]  = a_in_p[57];
+	a_out_p[368]  = a_in_p[58];
+	a_out_p[496]  = a_in_p[59];
+	a_out_p[624]  = a_in_p[60];
+	a_out_p[752]  = a_in_p[61];
+	a_out_p[880]  = a_in_p[62];
+	a_out_p[1008] = a_in_p[63];
+	a_out_p[1]    = a_in_p[64];
+	a_out_p[129]  = a_in_p[65];
+	a_out_p[257]  = a_in_p[66];
+	a_out_p[385]  = a_in_p[67];
+	a_out_p[513]  = a_in_p[68];
+	a_out_p[641]  = a_in_p[69];
+	a_out_p[769]  = a_in_p[70];
+	a_out_p[897]  = a_in_p[71];
+	a_out_p[65]   = a_in_p[72];
+	a_out_p[193]  = a_in_p[73];
+	a_out_p[321]  = a_in_p[74];
+	a_out_p[449]  = a_in_p[75];
+	a_out_p[577]  = a_in_p[76];
+	a_out_p[705]  = a_in_p[77];
+	a_out_p[833]  = a_in_p[78];
+	a_out_p[961]  = a_in_p[79];
+	a_out_p[33]   = a_in_p[80];
+	a_out_p[161]  = a_in_p[81];
+	a_out_p[289]  = a_in_p[82];
+	a_out_p[417]  = a_in_p[83];
+	a_out_p[545]  = a_in_p[84];
+	a_out_p[673]  = a_in_p[85];
+	a_out_p[801]  = a_in_p[86];
+	a_out_p[929]  = a_in_p[87];
+	a_out_p[97]   = a_in_p[88];
+	a_out_p[225]  = a_in_p[89];
+	a_out_p[353]  = a_in_p[90];
+	a_out_p[481]  = a_in_p[91];
+	a_out_p[609]  = a_in_p[92];
+	a_out_p[737]  = a_in_p[93];
+	a_out_p[865]  = a_in_p[94];
+	a_out_p[993]  = a_in_p[95];
+	a_out_p[17]   = a_in_p[96];
+	a_out_p[145]  = a_in_p[97];
+	a_out_p[273]  = a_in_p[98];
+	a_out_p[401]  = a_in_p[99];
+	a_out_p[529]  = a_in_p[100];
+	a_out_p[657]  = a_in_p[101];
+	a_out_p[785]  = a_in_p[102];
+	a_out_p[913]  = a_in_p[103];
+	a_out_p[81]   = a_in_p[104];
+	a_out_p[209]  = a_in_p[105];
+	a_out_p[337]  = a_in_p[106];
+	a_out_p[465]  = a_in_p[107];
+	a_out_p[593]  = a_in_p[108];
+	a_out_p[721]  = a_in_p[109];
+	a_out_p[849]  = a_in_p[110];
+	a_out_p[977]  = a_in_p[111];
+	a_out_p[49]   = a_in_p[112];
+	a_out_p[177]  = a_in_p[113];
+	a_out_p[305]  = a_in_p[114];
+	a_out_p[433]  = a_in_p[115];
+	a_out_p[561]  = a_in_p[116];
+	a_out_p[689]  = a_in_p[117];
+	a_out_p[817]  = a_in_p[118];
+	a_out_p[945]  = a_in_p[119];
+	a_out_p[113]  = a_in_p[120];
+	a_out_p[241]  = a_in_p[121];
+	a_out_p[369]  = a_in_p[122];
+	a_out_p[497]  = a_in_p[123];
+	a_out_p[625]  = a_in_p[124];
+	a_out_p[753]  = a_in_p[125];
+	a_out_p[881]  = a_in_p[126];
+	a_out_p[1009] = a_in_p[127];
+	a_out_p[2]    = a_in_p[128];
+	a_out_p[130]  = a_in_p[129];
+	a_out_p[258]  = a_in_p[130];
+	a_out_p[386]  = a_in_p[131];
+	a_out_p[514]  = a_in_p[132];
+	a_out_p[642]  = a_in_p[133];
+	a_out_p[770]  = a_in_p[134];
+	a_out_p[898]  = a_in_p[135];
+	a_out_p[66]   = a_in_p[136];
+	a_out_p[194]  = a_in_p[137];
+	a_out_p[322]  = a_in_p[138];
+	a_out_p[450]  = a_in_p[139];
+	a_out_p[578]  = a_in_p[140];
+	a_out_p[706]  = a_in_p[141];
+	a_out_p[834]  = a_in_p[142];
+	a_out_p[962]  = a_in_p[143];
+	a_out_p[34]   = a_in_p[144];
+	a_out_p[162]  = a_in_p[145];
+	a_out_p[290]  = a_in_p[146];
+	a_out_p[418]  = a_in_p[147];
+	a_out_p[546]  = a_in_p[148];
+	a_out_p[674]  = a_in_p[149];
+	a_out_p[802]  = a_in_p[150];
+	a_out_p[930]  = a_in_p[151];
+	a_out_p[98]   = a_in_p[152];
+	a_out_p[226]  = a_in_p[153];
+	a_out_p[354]  = a_in_p[154];
+	a_out_p[482]  = a_in_p[155];
+	a_out_p[610]  = a_in_p[156];
+	a_out_p[738]  = a_in_p[157];
+	a_out_p[866]  = a_in_p[158];
+	a_out_p[994]  = a_in_p[159];
+	a_out_p[18]   = a_in_p[160];
+	a_out_p[146]  = a_in_p[161];
+	a_out_p[274]  = a_in_p[162];
+	a_out_p[402]  = a_in_p[163];
+	a_out_p[530]  = a_in_p[164];
+	a_out_p[658]  = a_in_p[165];
+	a_out_p[786]  = a_in_p[166];
+	a_out_p[914]  = a_in_p[167];
+	a_out_p[82]   = a_in_p[168];
+	a_out_p[210]  = a_in_p[169];
+	a_out_p[338]  = a_in_p[170];
+	a_out_p[466]  = a_in_p[171];
+	a_out_p[594]  = a_in_p[172];
+	a_out_p[722]  = a_in_p[173];
+	a_out_p[850]  = a_in_p[174];
+	a_out_p[978]  = a_in_p[175];
+	a_out_p[50]   = a_in_p[176];
+	a_out_p[178]  = a_in_p[177];
+	a_out_p[306]  = a_in_p[178];
+	a_out_p[434]  = a_in_p[179];
+	a_out_p[562]  = a_in_p[180];
+	a_out_p[690]  = a_in_p[181];
+	a_out_p[818]  = a_in_p[182];
+	a_out_p[946]  = a_in_p[183];
+	a_out_p[114]  = a_in_p[184];
+	a_out_p[242]  = a_in_p[185];
+	a_out_p[370]  = a_in_p[186];
+	a_out_p[498]  = a_in_p[187];
+	a_out_p[626]  = a_in_p[188];
+	a_out_p[754]  = a_in_p[189];
+	a_out_p[882]  = a_in_p[190];
+	a_out_p[1010] = a_in_p[191];
+	a_out_p[3]    = a_in_p[192];
+	a_out_p[131]  = a_in_p[193];
+	a_out_p[259]  = a_in_p[194];
+	a_out_p[387]  = a_in_p[195];
+	a_out_p[515]  = a_in_p[196];
+	a_out_p[643]  = a_in_p[197];
+	a_out_p[771]  = a_in_p[198];
+	a_out_p[899]  = a_in_p[199];
+	a_out_p[67]   = a_in_p[200];
+	a_out_p[195]  = a_in_p[201];
+	a_out_p[323]  = a_in_p[202];
+	a_out_p[451]  = a_in_p[203];
+	a_out_p[579]  = a_in_p[204];
+	a_out_p[707]  = a_in_p[205];
+	a_out_p[835]  = a_in_p[206];
+	a_out_p[963]  = a_in_p[207];
+	a_out_p[35]   = a_in_p[208];
+	a_out_p[163]  = a_in_p[209];
+	a_out_p[291]  = a_in_p[210];
+	a_out_p[419]  = a_in_p[211];
+	a_out_p[547]  = a_in_p[212];
+	a_out_p[675]  = a_in_p[213];
+	a_out_p[803]  = a_in_p[214];
+	a_out_p[931]  = a_in_p[215];
+	a_out_p[99]   = a_in_p[216];
+	a_out_p[227]  = a_in_p[217];
+	a_out_p[355]  = a_in_p[218];
+	a_out_p[483]  = a_in_p[219];
+	a_out_p[611]  = a_in_p[220];
+	a_out_p[739]  = a_in_p[221];
+	a_out_p[867]  = a_in_p[222];
+	a_out_p[995]  = a_in_p[223];
+	a_out_p[19]   = a_in_p[224];
+	a_out_p[147]  = a_in_p[225];
+	a_out_p[275]  = a_in_p[226];
+	a_out_p[403]  = a_in_p[227];
+	a_out_p[531]  = a_in_p[228];
+	a_out_p[659]  = a_in_p[229];
+	a_out_p[787]  = a_in_p[230];
+	a_out_p[915]  = a_in_p[231];
+	a_out_p[83]   = a_in_p[232];
+	a_out_p[211]  = a_in_p[233];
+	a_out_p[339]  = a_in_p[234];
+	a_out_p[467]  = a_in_p[235];
+	a_out_p[595]  = a_in_p[236];
+	a_out_p[723]  = a_in_p[237];
+	a_out_p[851]  = a_in_p[238];
+	a_out_p[979]  = a_in_p[239];
+	a_out_p[51]   = a_in_p[240];
+	a_out_p[179]  = a_in_p[241];
+	a_out_p[307]  = a_in_p[242];
+	a_out_p[435]  = a_in_p[243];
+	a_out_p[563]  = a_in_p[244];
+	a_out_p[691]  = a_in_p[245];
+	a_out_p[819]  = a_in_p[246];
+	a_out_p[947]  = a_in_p[247];
+	a_out_p[115]  = a_in_p[248];
+	a_out_p[243]  = a_in_p[249];
+	a_out_p[371]  = a_in_p[250];
+	a_out_p[499]  = a_in_p[251];
+	a_out_p[627]  = a_in_p[252];
+	a_out_p[755]  = a_in_p[253];
+	a_out_p[883]  = a_in_p[254];
+	a_out_p[1011] = a_in_p[255];
+	a_out_p[4]    = a_in_p[256];
+	a_out_p[132]  = a_in_p[257];
+	a_out_p[260]  = a_in_p[258];
+	a_out_p[388]  = a_in_p[259];
+	a_out_p[516]  = a_in_p[260];
+	a_out_p[644]  = a_in_p[261];
+	a_out_p[772]  = a_in_p[262];
+	a_out_p[900]  = a_in_p[263];
+	a_out_p[68]   = a_in_p[264];
+	a_out_p[196]  = a_in_p[265];
+	a_out_p[324]  = a_in_p[266];
+	a_out_p[452]  = a_in_p[267];
+	a_out_p[580]  = a_in_p[268];
+	a_out_p[708]  = a_in_p[269];
+	a_out_p[836]  = a_in_p[270];
+	a_out_p[964]  = a_in_p[271];
+	a_out_p[36]   = a_in_p[272];
+	a_out_p[164]  = a_in_p[273];
+	a_out_p[292]  = a_in_p[274];
+	a_out_p[420]  = a_in_p[275];
+	a_out_p[548]  = a_in_p[276];
+	a_out_p[676]  = a_in_p[277];
+	a_out_p[804]  = a_in_p[278];
+	a_out_p[932]  = a_in_p[279];
+	a_out_p[100]  = a_in_p[280];
+	a_out_p[228]  = a_in_p[281];
+	a_out_p[356]  = a_in_p[282];
+	a_out_p[484]  = a_in_p[283];
+	a_out_p[612]  = a_in_p[284];
+	a_out_p[740]  = a_in_p[285];
+	a_out_p[868]  = a_in_p[286];
+	a_out_p[996]  = a_in_p[287];
+	a_out_p[20]   = a_in_p[288];
+	a_out_p[148]  = a_in_p[289];
+	a_out_p[276]  = a_in_p[290];
+	a_out_p[404]  = a_in_p[291];
+	a_out_p[532]  = a_in_p[292];
+	a_out_p[660]  = a_in_p[293];
+	a_out_p[788]  = a_in_p[294];
+	a_out_p[916]  = a_in_p[295];
+	a_out_p[84]   = a_in_p[296];
+	a_out_p[212]  = a_in_p[297];
+	a_out_p[340]  = a_in_p[298];
+	a_out_p[468]  = a_in_p[299];
+	a_out_p[596]  = a_in_p[300];
+	a_out_p[724]  = a_in_p[301];
+	a_out_p[852]  = a_in_p[302];
+	a_out_p[980]  = a_in_p[303];
+	a_out_p[52]   = a_in_p[304];
+	a_out_p[180]  = a_in_p[305];
+	a_out_p[308]  = a_in_p[306];
+	a_out_p[436]  = a_in_p[307];
+	a_out_p[564]  = a_in_p[308];
+	a_out_p[692]  = a_in_p[309];
+	a_out_p[820]  = a_in_p[310];
+	a_out_p[948]  = a_in_p[311];
+	a_out_p[116]  = a_in_p[312];
+	a_out_p[244]  = a_in_p[313];
+	a_out_p[372]  = a_in_p[314];
+	a_out_p[500]  = a_in_p[315];
+	a_out_p[628]  = a_in_p[316];
+	a_out_p[756]  = a_in_p[317];
+	a_out_p[884]  = a_in_p[318];
+	a_out_p[1012] = a_in_p[319];
+	a_out_p[5]    = a_in_p[320];
+	a_out_p[133]  = a_in_p[321];
+	a_out_p[261]  = a_in_p[322];
+	a_out_p[389]  = a_in_p[323];
+	a_out_p[517]  = a_in_p[324];
+	a_out_p[645]  = a_in_p[325];
+	a_out_p[773]  = a_in_p[326];
+	a_out_p[901]  = a_in_p[327];
+	a_out_p[69]   = a_in_p[328];
+	a_out_p[197]  = a_in_p[329];
+	a_out_p[325]  = a_in_p[330];
+	a_out_p[453]  = a_in_p[331];
+	a_out_p[581]  = a_in_p[332];
+	a_out_p[709]  = a_in_p[333];
+	a_out_p[837]  = a_in_p[334];
+	a_out_p[965]  = a_in_p[335];
+	a_out_p[37]   = a_in_p[336];
+	a_out_p[165]  = a_in_p[337];
+	a_out_p[293]  = a_in_p[338];
+	a_out_p[421]  = a_in_p[339];
+	a_out_p[549]  = a_in_p[340];
+	a_out_p[677]  = a_in_p[341];
+	a_out_p[805]  = a_in_p[342];
+	a_out_p[933]  = a_in_p[343];
+	a_out_p[101]  = a_in_p[344];
+	a_out_p[229]  = a_in_p[345];
+	a_out_p[357]  = a_in_p[346];
+	a_out_p[485]  = a_in_p[347];
+	a_out_p[613]  = a_in_p[348];
+	a_out_p[741]  = a_in_p[349];
+	a_out_p[869]  = a_in_p[350];
+	a_out_p[997]  = a_in_p[351];
+	a_out_p[21]   = a_in_p[352];
+	a_out_p[149]  = a_in_p[353];
+	a_out_p[277]  = a_in_p[354];
+	a_out_p[405]  = a_in_p[355];
+	a_out_p[533]  = a_in_p[356];
+	a_out_p[661]  = a_in_p[357];
+	a_out_p[789]  = a_in_p[358];
+	a_out_p[917]  = a_in_p[359];
+	a_out_p[85]   = a_in_p[360];
+	a_out_p[213]  = a_in_p[361];
+	a_out_p[341]  = a_in_p[362];
+	a_out_p[469]  = a_in_p[363];
+	a_out_p[597]  = a_in_p[364];
+	a_out_p[725]  = a_in_p[365];
+	a_out_p[853]  = a_in_p[366];
+	a_out_p[981]  = a_in_p[367];
+	a_out_p[53]   = a_in_p[368];
+	a_out_p[181]  = a_in_p[369];
+	a_out_p[309]  = a_in_p[370];
+	a_out_p[437]  = a_in_p[371];
+	a_out_p[565]  = a_in_p[372];
+	a_out_p[693]  = a_in_p[373];
+	a_out_p[821]  = a_in_p[374];
+	a_out_p[949]  = a_in_p[375];
+	a_out_p[117]  = a_in_p[376];
+	a_out_p[245]  = a_in_p[377];
+	a_out_p[373]  = a_in_p[378];
+	a_out_p[501]  = a_in_p[379];
+	a_out_p[629]  = a_in_p[380];
+	a_out_p[757]  = a_in_p[381];
+	a_out_p[885]  = a_in_p[382];
+	a_out_p[1013] = a_in_p[383];
+	a_out_p[6]    = a_in_p[384];
+	a_out_p[134]  = a_in_p[385];
+	a_out_p[262]  = a_in_p[386];
+	a_out_p[390]  = a_in_p[387];
+	a_out_p[518]  = a_in_p[388];
+	a_out_p[646]  = a_in_p[389];
+	a_out_p[774]  = a_in_p[390];
+	a_out_p[902]  = a_in_p[391];
+	a_out_p[70]   = a_in_p[392];
+	a_out_p[198]  = a_in_p[393];
+	a_out_p[326]  = a_in_p[394];
+	a_out_p[454]  = a_in_p[395];
+	a_out_p[582]  = a_in_p[396];
+	a_out_p[710]  = a_in_p[397];
+	a_out_p[838]  = a_in_p[398];
+	a_out_p[966]  = a_in_p[399];
+	a_out_p[38]   = a_in_p[400];
+	a_out_p[166]  = a_in_p[401];
+	a_out_p[294]  = a_in_p[402];
+	a_out_p[422]  = a_in_p[403];
+	a_out_p[550]  = a_in_p[404];
+	a_out_p[678]  = a_in_p[405];
+	a_out_p[806]  = a_in_p[406];
+	a_out_p[934]  = a_in_p[407];
+	a_out_p[102]  = a_in_p[408];
+	a_out_p[230]  = a_in_p[409];
+	a_out_p[358]  = a_in_p[410];
+	a_out_p[486]  = a_in_p[411];
+	a_out_p[614]  = a_in_p[412];
+	a_out_p[742]  = a_in_p[413];
+	a_out_p[870]  = a_in_p[414];
+	a_out_p[998]  = a_in_p[415];
+	a_out_p[22]   = a_in_p[416];
+	a_out_p[150]  = a_in_p[417];
+	a_out_p[278]  = a_in_p[418];
+	a_out_p[406]  = a_in_p[419];
+	a_out_p[534]  = a_in_p[420];
+	a_out_p[662]  = a_in_p[421];
+	a_out_p[790]  = a_in_p[422];
+	a_out_p[918]  = a_in_p[423];
+	a_out_p[86]   = a_in_p[424];
+	a_out_p[214]  = a_in_p[425];
+	a_out_p[342]  = a_in_p[426];
+	a_out_p[470]  = a_in_p[427];
+	a_out_p[598]  = a_in_p[428];
+	a_out_p[726]  = a_in_p[429];
+	a_out_p[854]  = a_in_p[430];
+	a_out_p[982]  = a_in_p[431];
+	a_out_p[54]   = a_in_p[432];
+	a_out_p[182]  = a_in_p[433];
+	a_out_p[310]  = a_in_p[434];
+	a_out_p[438]  = a_in_p[435];
+	a_out_p[566]  = a_in_p[436];
+	a_out_p[694]  = a_in_p[437];
+	a_out_p[822]  = a_in_p[438];
+	a_out_p[950]  = a_in_p[439];
+	a_out_p[118]  = a_in_p[440];
+	a_out_p[246]  = a_in_p[441];
+	a_out_p[374]  = a_in_p[442];
+	a_out_p[502]  = a_in_p[443];
+	a_out_p[630]  = a_in_p[444];
+	a_out_p[758]  = a_in_p[445];
+	a_out_p[886]  = a_in_p[446];
+	a_out_p[1014] = a_in_p[447];
+	a_out_p[7]    = a_in_p[448];
+	a_out_p[135]  = a_in_p[449];
+	a_out_p[263]  = a_in_p[450];
+	a_out_p[391]  = a_in_p[451];
+	a_out_p[519]  = a_in_p[452];
+	a_out_p[647]  = a_in_p[453];
+	a_out_p[775]  = a_in_p[454];
+	a_out_p[903]  = a_in_p[455];
+	a_out_p[71]   = a_in_p[456];
+	a_out_p[199]  = a_in_p[457];
+	a_out_p[327]  = a_in_p[458];
+	a_out_p[455]  = a_in_p[459];
+	a_out_p[583]  = a_in_p[460];
+	a_out_p[711]  = a_in_p[461];
+	a_out_p[839]  = a_in_p[462];
+	a_out_p[967]  = a_in_p[463];
+	a_out_p[39]   = a_in_p[464];
+	a_out_p[167]  = a_in_p[465];
+	a_out_p[295]  = a_in_p[466];
+	a_out_p[423]  = a_in_p[467];
+	a_out_p[551]  = a_in_p[468];
+	a_out_p[679]  = a_in_p[469];
+	a_out_p[807]  = a_in_p[470];
+	a_out_p[935]  = a_in_p[471];
+	a_out_p[103]  = a_in_p[472];
+	a_out_p[231]  = a_in_p[473];
+	a_out_p[359]  = a_in_p[474];
+	a_out_p[487]  = a_in_p[475];
+	a_out_p[615]  = a_in_p[476];
+	a_out_p[743]  = a_in_p[477];
+	a_out_p[871]  = a_in_p[478];
+	a_out_p[999]  = a_in_p[479];
+	a_out_p[23]   = a_in_p[480];
+	a_out_p[151]  = a_in_p[481];
+	a_out_p[279]  = a_in_p[482];
+	a_out_p[407]  = a_in_p[483];
+	a_out_p[535]  = a_in_p[484];
+	a_out_p[663]  = a_in_p[485];
+	a_out_p[791]  = a_in_p[486];
+	a_out_p[919]  = a_in_p[487];
+	a_out_p[87]   = a_in_p[488];
+	a_out_p[215]  = a_in_p[489];
+	a_out_p[343]  = a_in_p[490];
+	a_out_p[471]  = a_in_p[491];
+	a_out_p[599]  = a_in_p[492];
+	a_out_p[727]  = a_in_p[493];
+	a_out_p[855]  = a_in_p[494];
+	a_out_p[983]  = a_in_p[495];
+	a_out_p[55]   = a_in_p[496];
+	a_out_p[183]  = a_in_p[497];
+	a_out_p[311]  = a_in_p[498];
+	a_out_p[439]  = a_in_p[499];
+	a_out_p[567]  = a_in_p[500];
+	a_out_p[695]  = a_in_p[501];
+	a_out_p[823]  = a_in_p[502];
+	a_out_p[951]  = a_in_p[503];
+	a_out_p[119]  = a_in_p[504];
+	a_out_p[247]  = a_in_p[505];
+	a_out_p[375]  = a_in_p[506];
+	a_out_p[503]  = a_in_p[507];
+	a_out_p[631]  = a_in_p[508];
+	a_out_p[759]  = a_in_p[509];
+	a_out_p[887]  = a_in_p[510];
+	a_out_p[1015] = a_in_p[511];
+	a_out_p[8]    = a_in_p[512];
+	a_out_p[136]  = a_in_p[513];
+	a_out_p[264]  = a_in_p[514];
+	a_out_p[392]  = a_in_p[515];
+	a_out_p[520]  = a_in_p[516];
+	a_out_p[648]  = a_in_p[517];
+	a_out_p[776]  = a_in_p[518];
+	a_out_p[904]  = a_in_p[519];
+	a_out_p[72]   = a_in_p[520];
+	a_out_p[200]  = a_in_p[521];
+	a_out_p[328]  = a_in_p[522];
+	a_out_p[456]  = a_in_p[523];
+	a_out_p[584]  = a_in_p[524];
+	a_out_p[712]  = a_in_p[525];
+	a_out_p[840]  = a_in_p[526];
+	a_out_p[968]  = a_in_p[527];
+	a_out_p[40]   = a_in_p[528];
+	a_out_p[168]  = a_in_p[529];
+	a_out_p[296]  = a_in_p[530];
+	a_out_p[424]  = a_in_p[531];
+	a_out_p[552]  = a_in_p[532];
+	a_out_p[680]  = a_in_p[533];
+	a_out_p[808]  = a_in_p[534];
+	a_out_p[936]  = a_in_p[535];
+	a_out_p[104]  = a_in_p[536];
+	a_out_p[232]  = a_in_p[537];
+	a_out_p[360]  = a_in_p[538];
+	a_out_p[488]  = a_in_p[539];
+	a_out_p[616]  = a_in_p[540];
+	a_out_p[744]  = a_in_p[541];
+	a_out_p[872]  = a_in_p[542];
+	a_out_p[1000] = a_in_p[543];
+	a_out_p[24]   = a_in_p[544];
+	a_out_p[152]  = a_in_p[545];
+	a_out_p[280]  = a_in_p[546];
+	a_out_p[408]  = a_in_p[547];
+	a_out_p[536]  = a_in_p[548];
+	a_out_p[664]  = a_in_p[549];
+	a_out_p[792]  = a_in_p[550];
+	a_out_p[920]  = a_in_p[551];
+	a_out_p[88]   = a_in_p[552];
+	a_out_p[216]  = a_in_p[553];
+	a_out_p[344]  = a_in_p[554];
+	a_out_p[472]  = a_in_p[555];
+	a_out_p[600]  = a_in_p[556];
+	a_out_p[728]  = a_in_p[557];
+	a_out_p[856]  = a_in_p[558];
+	a_out_p[984]  = a_in_p[559];
+	a_out_p[56]   = a_in_p[560];
+	a_out_p[184]  = a_in_p[561];
+	a_out_p[312]  = a_in_p[562];
+	a_out_p[440]  = a_in_p[563];
+	a_out_p[568]  = a_in_p[564];
+	a_out_p[696]  = a_in_p[565];
+	a_out_p[824]  = a_in_p[566];
+	a_out_p[952]  = a_in_p[567];
+	a_out_p[120]  = a_in_p[568];
+	a_out_p[248]  = a_in_p[569];
+	a_out_p[376]  = a_in_p[570];
+	a_out_p[504]  = a_in_p[571];
+	a_out_p[632]  = a_in_p[572];
+	a_out_p[760]  = a_in_p[573];
+	a_out_p[888]  = a_in_p[574];
+	a_out_p[1016] = a_in_p[575];
+	a_out_p[9]    = a_in_p[576];
+	a_out_p[137]  = a_in_p[577];
+	a_out_p[265]  = a_in_p[578];
+	a_out_p[393]  = a_in_p[579];
+	a_out_p[521]  = a_in_p[580];
+	a_out_p[649]  = a_in_p[581];
+	a_out_p[777]  = a_in_p[582];
+	a_out_p[905]  = a_in_p[583];
+	a_out_p[73]   = a_in_p[584];
+	a_out_p[201]  = a_in_p[585];
+	a_out_p[329]  = a_in_p[586];
+	a_out_p[457]  = a_in_p[587];
+	a_out_p[585]  = a_in_p[588];
+	a_out_p[713]  = a_in_p[589];
+	a_out_p[841]  = a_in_p[590];
+	a_out_p[969]  = a_in_p[591];
+	a_out_p[41]   = a_in_p[592];
+	a_out_p[169]  = a_in_p[593];
+	a_out_p[297]  = a_in_p[594];
+	a_out_p[425]  = a_in_p[595];
+	a_out_p[553]  = a_in_p[596];
+	a_out_p[681]  = a_in_p[597];
+	a_out_p[809]  = a_in_p[598];
+	a_out_p[937]  = a_in_p[599];
+	a_out_p[105]  = a_in_p[600];
+	a_out_p[233]  = a_in_p[601];
+	a_out_p[361]  = a_in_p[602];
+	a_out_p[489]  = a_in_p[603];
+	a_out_p[617]  = a_in_p[604];
+	a_out_p[745]  = a_in_p[605];
+	a_out_p[873]  = a_in_p[606];
+	a_out_p[1001] = a_in_p[607];
+	a_out_p[25]   = a_in_p[608];
+	a_out_p[153]  = a_in_p[609];
+	a_out_p[281]  = a_in_p[610];
+	a_out_p[409]  = a_in_p[611];
+	a_out_p[537]  = a_in_p[612];
+	a_out_p[665]  = a_in_p[613];
+	a_out_p[793]  = a_in_p[614];
+	a_out_p[921]  = a_in_p[615];
+	a_out_p[89]   = a_in_p[616];
+	a_out_p[217]  = a_in_p[617];
+	a_out_p[345]  = a_in_p[618];
+	a_out_p[473]  = a_in_p[619];
+	a_out_p[601]  = a_in_p[620];
+	a_out_p[729]  = a_in_p[621];
+	a_out_p[857]  = a_in_p[622];
+	a_out_p[985]  = a_in_p[623];
+	a_out_p[57]   = a_in_p[624];
+	a_out_p[185]  = a_in_p[625];
+	a_out_p[313]  = a_in_p[626];
+	a_out_p[441]  = a_in_p[627];
+	a_out_p[569]  = a_in_p[628];
+	a_out_p[697]  = a_in_p[629];
+	a_out_p[825]  = a_in_p[630];
+	a_out_p[953]  = a_in_p[631];
+	a_out_p[121]  = a_in_p[632];
+	a_out_p[249]  = a_in_p[633];
+	a_out_p[377]  = a_in_p[634];
+	a_out_p[505]  = a_in_p[635];
+	a_out_p[633]  = a_in_p[636];
+	a_out_p[761]  = a_in_p[637];
+	a_out_p[889]  = a_in_p[638];
+	a_out_p[1017] = a_in_p[639];
+	a_out_p[10]   = a_in_p[640];
+	a_out_p[138]  = a_in_p[641];
+	a_out_p[266]  = a_in_p[642];
+	a_out_p[394]  = a_in_p[643];
+	a_out_p[522]  = a_in_p[644];
+	a_out_p[650]  = a_in_p[645];
+	a_out_p[778]  = a_in_p[646];
+	a_out_p[906]  = a_in_p[647];
+	a_out_p[74]   = a_in_p[648];
+	a_out_p[202]  = a_in_p[649];
+	a_out_p[330]  = a_in_p[650];
+	a_out_p[458]  = a_in_p[651];
+	a_out_p[586]  = a_in_p[652];
+	a_out_p[714]  = a_in_p[653];
+	a_out_p[842]  = a_in_p[654];
+	a_out_p[970]  = a_in_p[655];
+	a_out_p[42]   = a_in_p[656];
+	a_out_p[170]  = a_in_p[657];
+	a_out_p[298]  = a_in_p[658];
+	a_out_p[426]  = a_in_p[659];
+	a_out_p[554]  = a_in_p[660];
+	a_out_p[682]  = a_in_p[661];
+	a_out_p[810]  = a_in_p[662];
+	a_out_p[938]  = a_in_p[663];
+	a_out_p[106]  = a_in_p[664];
+	a_out_p[234]  = a_in_p[665];
+	a_out_p[362]  = a_in_p[666];
+	a_out_p[490]  = a_in_p[667];
+	a_out_p[618]  = a_in_p[668];
+	a_out_p[746]  = a_in_p[669];
+	a_out_p[874]  = a_in_p[670];
+	a_out_p[1002] = a_in_p[671];
+	a_out_p[26]   = a_in_p[672];
+	a_out_p[154]  = a_in_p[673];
+	a_out_p[282]  = a_in_p[674];
+	a_out_p[410]  = a_in_p[675];
+	a_out_p[538]  = a_in_p[676];
+	a_out_p[666]  = a_in_p[677];
+	a_out_p[794]  = a_in_p[678];
+	a_out_p[922]  = a_in_p[679];
+	a_out_p[90]   = a_in_p[680];
+	a_out_p[218]  = a_in_p[681];
+	a_out_p[346]  = a_in_p[682];
+	a_out_p[474]  = a_in_p[683];
+	a_out_p[602]  = a_in_p[684];
+	a_out_p[730]  = a_in_p[685];
+	a_out_p[858]  = a_in_p[686];
+	a_out_p[986]  = a_in_p[687];
+	a_out_p[58]   = a_in_p[688];
+	a_out_p[186]  = a_in_p[689];
+	a_out_p[314]  = a_in_p[690];
+	a_out_p[442]  = a_in_p[691];
+	a_out_p[570]  = a_in_p[692];
+	a_out_p[698]  = a_in_p[693];
+	a_out_p[826]  = a_in_p[694];
+	a_out_p[954]  = a_in_p[695];
+	a_out_p[122]  = a_in_p[696];
+	a_out_p[250]  = a_in_p[697];
+	a_out_p[378]  = a_in_p[698];
+	a_out_p[506]  = a_in_p[699];
+	a_out_p[634]  = a_in_p[700];
+	a_out_p[762]  = a_in_p[701];
+	a_out_p[890]  = a_in_p[702];
+	a_out_p[1018] = a_in_p[703];
+	a_out_p[11]   = a_in_p[704];
+	a_out_p[139]  = a_in_p[705];
+	a_out_p[267]  = a_in_p[706];
+	a_out_p[395]  = a_in_p[707];
+	a_out_p[523]  = a_in_p[708];
+	a_out_p[651]  = a_in_p[709];
+	a_out_p[779]  = a_in_p[710];
+	a_out_p[907]  = a_in_p[711];
+	a_out_p[75]   = a_in_p[712];
+	a_out_p[203]  = a_in_p[713];
+	a_out_p[331]  = a_in_p[714];
+	a_out_p[459]  = a_in_p[715];
+	a_out_p[587]  = a_in_p[716];
+	a_out_p[715]  = a_in_p[717];
+	a_out_p[843]  = a_in_p[718];
+	a_out_p[971]  = a_in_p[719];
+	a_out_p[43]   = a_in_p[720];
+	a_out_p[171]  = a_in_p[721];
+	a_out_p[299]  = a_in_p[722];
+	a_out_p[427]  = a_in_p[723];
+	a_out_p[555]  = a_in_p[724];
+	a_out_p[683]  = a_in_p[725];
+	a_out_p[811]  = a_in_p[726];
+	a_out_p[939]  = a_in_p[727];
+	a_out_p[107]  = a_in_p[728];
+	a_out_p[235]  = a_in_p[729];
+	a_out_p[363]  = a_in_p[730];
+	a_out_p[491]  = a_in_p[731];
+	a_out_p[619]  = a_in_p[732];
+	a_out_p[747]  = a_in_p[733];
+	a_out_p[875]  = a_in_p[734];
+	a_out_p[1003] = a_in_p[735];
+	a_out_p[27]   = a_in_p[736];
+	a_out_p[155]  = a_in_p[737];
+	a_out_p[283]  = a_in_p[738];
+	a_out_p[411]  = a_in_p[739];
+	a_out_p[539]  = a_in_p[740];
+	a_out_p[667]  = a_in_p[741];
+	a_out_p[795]  = a_in_p[742];
+	a_out_p[923]  = a_in_p[743];
+	a_out_p[91]   = a_in_p[744];
+	a_out_p[219]  = a_in_p[745];
+	a_out_p[347]  = a_in_p[746];
+	a_out_p[475]  = a_in_p[747];
+	a_out_p[603]  = a_in_p[748];
+	a_out_p[731]  = a_in_p[749];
+	a_out_p[859]  = a_in_p[750];
+	a_out_p[987]  = a_in_p[751];
+	a_out_p[59]   = a_in_p[752];
+	a_out_p[187]  = a_in_p[753];
+	a_out_p[315]  = a_in_p[754];
+	a_out_p[443]  = a_in_p[755];
+	a_out_p[571]  = a_in_p[756];
+	a_out_p[699]  = a_in_p[757];
+	a_out_p[827]  = a_in_p[758];
+	a_out_p[955]  = a_in_p[759];
+	a_out_p[123]  = a_in_p[760];
+	a_out_p[251]  = a_in_p[761];
+	a_out_p[379]  = a_in_p[762];
+	a_out_p[507]  = a_in_p[763];
+	a_out_p[635]  = a_in_p[764];
+	a_out_p[763]  = a_in_p[765];
+	a_out_p[891]  = a_in_p[766];
+	a_out_p[1019] = a_in_p[767];
+	a_out_p[12]   = a_in_p[768];
+	a_out_p[140]  = a_in_p[769];
+	a_out_p[268]  = a_in_p[770];
+	a_out_p[396]  = a_in_p[771];
+	a_out_p[524]  = a_in_p[772];
+	a_out_p[652]  = a_in_p[773];
+	a_out_p[780]  = a_in_p[774];
+	a_out_p[908]  = a_in_p[775];
+	a_out_p[76]   = a_in_p[776];
+	a_out_p[204]  = a_in_p[777];
+	a_out_p[332]  = a_in_p[778];
+	a_out_p[460]  = a_in_p[779];
+	a_out_p[588]  = a_in_p[780];
+	a_out_p[716]  = a_in_p[781];
+	a_out_p[844]  = a_in_p[782];
+	a_out_p[972]  = a_in_p[783];
+	a_out_p[44]   = a_in_p[784];
+	a_out_p[172]  = a_in_p[785];
+	a_out_p[300]  = a_in_p[786];
+	a_out_p[428]  = a_in_p[787];
+	a_out_p[556]  = a_in_p[788];
+	a_out_p[684]  = a_in_p[789];
+	a_out_p[812]  = a_in_p[790];
+	a_out_p[940]  = a_in_p[791];
+	a_out_p[108]  = a_in_p[792];
+	a_out_p[236]  = a_in_p[793];
+	a_out_p[364]  = a_in_p[794];
+	a_out_p[492]  = a_in_p[795];
+	a_out_p[620]  = a_in_p[796];
+	a_out_p[748]  = a_in_p[797];
+	a_out_p[876]  = a_in_p[798];
+	a_out_p[1004] = a_in_p[799];
+	a_out_p[28]   = a_in_p[800];
+	a_out_p[156]  = a_in_p[801];
+	a_out_p[284]  = a_in_p[802];
+	a_out_p[412]  = a_in_p[803];
+	a_out_p[540]  = a_in_p[804];
+	a_out_p[668]  = a_in_p[805];
+	a_out_p[796]  = a_in_p[806];
+	a_out_p[924]  = a_in_p[807];
+	a_out_p[92]   = a_in_p[808];
+	a_out_p[220]  = a_in_p[809];
+	a_out_p[348]  = a_in_p[810];
+	a_out_p[476]  = a_in_p[811];
+	a_out_p[604]  = a_in_p[812];
+	a_out_p[732]  = a_in_p[813];
+	a_out_p[860]  = a_in_p[814];
+	a_out_p[988]  = a_in_p[815];
+	a_out_p[60]   = a_in_p[816];
+	a_out_p[188]  = a_in_p[817];
+	a_out_p[316]  = a_in_p[818];
+	a_out_p[444]  = a_in_p[819];
+	a_out_p[572]  = a_in_p[820];
+	a_out_p[700]  = a_in_p[821];
+	a_out_p[828]  = a_in_p[822];
+	a_out_p[956]  = a_in_p[823];
+	a_out_p[124]  = a_in_p[824];
+	a_out_p[252]  = a_in_p[825];
+	a_out_p[380]  = a_in_p[826];
+	a_out_p[508]  = a_in_p[827];
+	a_out_p[636]  = a_in_p[828];
+	a_out_p[764]  = a_in_p[829];
+	a_out_p[892]  = a_in_p[830];
+	a_out_p[1020] = a_in_p[831];
+	a_out_p[13]   = a_in_p[832];
+	a_out_p[141]  = a_in_p[833];
+	a_out_p[269]  = a_in_p[834];
+	a_out_p[397]  = a_in_p[835];
+	a_out_p[525]  = a_in_p[836];
+	a_out_p[653]  = a_in_p[837];
+	a_out_p[781]  = a_in_p[838];
+	a_out_p[909]  = a_in_p[839];
+	a_out_p[77]   = a_in_p[840];
+	a_out_p[205]  = a_in_p[841];
+	a_out_p[333]  = a_in_p[842];
+	a_out_p[461]  = a_in_p[843];
+	a_out_p[589]  = a_in_p[844];
+	a_out_p[717]  = a_in_p[845];
+	a_out_p[845]  = a_in_p[846];
+	a_out_p[973]  = a_in_p[847];
+	a_out_p[45]   = a_in_p[848];
+	a_out_p[173]  = a_in_p[849];
+	a_out_p[301]  = a_in_p[850];
+	a_out_p[429]  = a_in_p[851];
+	a_out_p[557]  = a_in_p[852];
+	a_out_p[685]  = a_in_p[853];
+	a_out_p[813]  = a_in_p[854];
+	a_out_p[941]  = a_in_p[855];
+	a_out_p[109]  = a_in_p[856];
+	a_out_p[237]  = a_in_p[857];
+	a_out_p[365]  = a_in_p[858];
+	a_out_p[493]  = a_in_p[859];
+	a_out_p[621]  = a_in_p[860];
+	a_out_p[749]  = a_in_p[861];
+	a_out_p[877]  = a_in_p[862];
+	a_out_p[1005] = a_in_p[863];
+	a_out_p[29]   = a_in_p[864];
+	a_out_p[157]  = a_in_p[865];
+	a_out_p[285]  = a_in_p[866];
+	a_out_p[413]  = a_in_p[867];
+	a_out_p[541]  = a_in_p[868];
+	a_out_p[669]  = a_in_p[869];
+	a_out_p[797]  = a_in_p[870];
+	a_out_p[925]  = a_in_p[871];
+	a_out_p[93]   = a_in_p[872];
+	a_out_p[221]  = a_in_p[873];
+	a_out_p[349]  = a_in_p[874];
+	a_out_p[477]  = a_in_p[875];
+	a_out_p[605]  = a_in_p[876];
+	a_out_p[733]  = a_in_p[877];
+	a_out_p[861]  = a_in_p[878];
+	a_out_p[989]  = a_in_p[879];
+	a_out_p[61]   = a_in_p[880];
+	a_out_p[189]  = a_in_p[881];
+	a_out_p[317]  = a_in_p[882];
+	a_out_p[445]  = a_in_p[883];
+	a_out_p[573]  = a_in_p[884];
+	a_out_p[701]  = a_in_p[885];
+	a_out_p[829]  = a_in_p[886];
+	a_out_p[957]  = a_in_p[887];
+	a_out_p[125]  = a_in_p[888];
+	a_out_p[253]  = a_in_p[889];
+	a_out_p[381]  = a_in_p[890];
+	a_out_p[509]  = a_in_p[891];
+	a_out_p[637]  = a_in_p[892];
+	a_out_p[765]  = a_in_p[893];
+	a_out_p[893]  = a_in_p[894];
+	a_out_p[1021] = a_in_p[895];
+	a_out_p[14]   = a_in_p[896];
+	a_out_p[142]  = a_in_p[897];
+	a_out_p[270]  = a_in_p[898];
+	a_out_p[398]  = a_in_p[899];
+	a_out_p[526]  = a_in_p[900];
+	a_out_p[654]  = a_in_p[901];
+	a_out_p[782]  = a_in_p[902];
+	a_out_p[910]  = a_in_p[903];
+	a_out_p[78]   = a_in_p[904];
+	a_out_p[206]  = a_in_p[905];
+	a_out_p[334]  = a_in_p[906];
+	a_out_p[462]  = a_in_p[907];
+	a_out_p[590]  = a_in_p[908];
+	a_out_p[718]  = a_in_p[909];
+	a_out_p[846]  = a_in_p[910];
+	a_out_p[974]  = a_in_p[911];
+	a_out_p[46]   = a_in_p[912];
+	a_out_p[174]  = a_in_p[913];
+	a_out_p[302]  = a_in_p[914];
+	a_out_p[430]  = a_in_p[915];
+	a_out_p[558]  = a_in_p[916];
+	a_out_p[686]  = a_in_p[917];
+	a_out_p[814]  = a_in_p[918];
+	a_out_p[942]  = a_in_p[919];
+	a_out_p[110]  = a_in_p[920];
+	a_out_p[238]  = a_in_p[921];
+	a_out_p[366]  = a_in_p[922];
+	a_out_p[494]  = a_in_p[923];
+	a_out_p[622]  = a_in_p[924];
+	a_out_p[750]  = a_in_p[925];
+	a_out_p[878]  = a_in_p[926];
+	a_out_p[1006] = a_in_p[927];
+	a_out_p[30]   = a_in_p[928];
+	a_out_p[158]  = a_in_p[929];
+	a_out_p[286]  = a_in_p[930];
+	a_out_p[414]  = a_in_p[931];
+	a_out_p[542]  = a_in_p[932];
+	a_out_p[670]  = a_in_p[933];
+	a_out_p[798]  = a_in_p[934];
+	a_out_p[926]  = a_in_p[935];
+	a_out_p[94]   = a_in_p[936];
+	a_out_p[222]  = a_in_p[937];
+	a_out_p[350]  = a_in_p[938];
+	a_out_p[478]  = a_in_p[939];
+	a_out_p[606]  = a_in_p[940];
+	a_out_p[734]  = a_in_p[941];
+	a_out_p[862]  = a_in_p[942];
+	a_out_p[990]  = a_in_p[943];
+	a_out_p[62]   = a_in_p[944];
+	a_out_p[190]  = a_in_p[945];
+	a_out_p[318]  = a_in_p[946];
+	a_out_p[446]  = a_in_p[947];
+	a_out_p[574]  = a_in_p[948];
+	a_out_p[702]  = a_in_p[949];
+	a_out_p[830]  = a_in_p[950];
+	a_out_p[958]  = a_in_p[951];
+	a_out_p[126]  = a_in_p[952];
+	a_out_p[254]  = a_in_p[953];
+	a_out_p[382]  = a_in_p[954];
+	a_out_p[510]  = a_in_p[955];
+	a_out_p[638]  = a_in_p[956];
+	a_out_p[766]  = a_in_p[957];
+	a_out_p[894]  = a_in_p[958];
+	a_out_p[1022] = a_in_p[959];
+	a_out_p[15]   = a_in_p[960];
+	a_out_p[143]  = a_in_p[961];
+	a_out_p[271]  = a_in_p[962];
+	a_out_p[399]  = a_in_p[963];
+	a_out_p[527]  = a_in_p[964];
+	a_out_p[655]  = a_in_p[965];
+	a_out_p[783]  = a_in_p[966];
+	a_out_p[911]  = a_in_p[967];
+	a_out_p[79]   = a_in_p[968];
+	a_out_p[207]  = a_in_p[969];
+	a_out_p[335]  = a_in_p[970];
+	a_out_p[463]  = a_in_p[971];
+	a_out_p[591]  = a_in_p[972];
+	a_out_p[719]  = a_in_p[973];
+	a_out_p[847]  = a_in_p[974];
+	a_out_p[975]  = a_in_p[975];
+	a_out_p[47]   = a_in_p[976];
+	a_out_p[175]  = a_in_p[977];
+	a_out_p[303]  = a_in_p[978];
+	a_out_p[431]  = a_in_p[979];
+	a_out_p[559]  = a_in_p[980];
+	a_out_p[687]  = a_in_p[981];
+	a_out_p[815]  = a_in_p[982];
+	a_out_p[943]  = a_in_p[983];
+	a_out_p[111]  = a_in_p[984];
+	a_out_p[239]  = a_in_p[985];
+	a_out_p[367]  = a_in_p[986];
+	a_out_p[495]  = a_in_p[987];
+	a_out_p[623]  = a_in_p[988];
+	a_out_p[751]  = a_in_p[989];
+	a_out_p[879]  = a_in_p[990];
+	a_out_p[1007] = a_in_p[991];
+	a_out_p[31]   = a_in_p[992];
+	a_out_p[159]  = a_in_p[993];
+	a_out_p[287]  = a_in_p[994];
+	a_out_p[415]  = a_in_p[995];
+	a_out_p[543]  = a_in_p[996];
+	a_out_p[671]  = a_in_p[997];
+	a_out_p[799]  = a_in_p[998];
+	a_out_p[927]  = a_in_p[999];
+	a_out_p[95]   = a_in_p[1000];
+	a_out_p[223]  = a_in_p[1001];
+	a_out_p[351]  = a_in_p[1002];
+	a_out_p[479]  = a_in_p[1003];
+	a_out_p[607]  = a_in_p[1004];
+	a_out_p[735]  = a_in_p[1005];
+	a_out_p[863]  = a_in_p[1006];
+	a_out_p[991]  = a_in_p[1007];
+	a_out_p[63]   = a_in_p[1008];
+	a_out_p[191]  = a_in_p[1009];
+	a_out_p[319]  = a_in_p[1010];
+	a_out_p[447]  = a_in_p[1011];
+	a_out_p[575]  = a_in_p[1012];
+	a_out_p[703]  = a_in_p[1013];
+	a_out_p[831]  = a_in_p[1014];
+	a_out_p[959]  = a_in_p[1015];
+	a_out_p[127]  = a_in_p[1016];
+	a_out_p[255]  = a_in_p[1017];
+	a_out_p[383]  = a_in_p[1018];
+	a_out_p[511]  = a_in_p[1019];
+	a_out_p[639]  = a_in_p[1020];
+	a_out_p[767]  = a_in_p[1021];
+	a_out_p[895]  = a_in_p[1022];
+	a_out_p[1023] = a_in_p[1023];
+}
+void transpose_i(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) {
+	a_out_p[0]    = a_in_p[0];
+	a_out_p[128]  = a_in_p[1];
+	a_out_p[256]  = a_in_p[2];
+	a_out_p[384]  = a_in_p[3];
+	a_out_p[512]  = a_in_p[4];
+	a_out_p[640]  = a_in_p[5];
+	a_out_p[768]  = a_in_p[6];
+	a_out_p[896]  = a_in_p[7];
+	a_out_p[64]   = a_in_p[8];
+	a_out_p[192]  = a_in_p[9];
+	a_out_p[320]  = a_in_p[10];
+	a_out_p[448]  = a_in_p[11];
+	a_out_p[576]  = a_in_p[12];
+	a_out_p[704]  = a_in_p[13];
+	a_out_p[832]  = a_in_p[14];
+	a_out_p[960]  = a_in_p[15];
+	a_out_p[32]   = a_in_p[16];
+	a_out_p[160]  = a_in_p[17];
+	a_out_p[288]  = a_in_p[18];
+	a_out_p[416]  = a_in_p[19];
+	a_out_p[544]  = a_in_p[20];
+	a_out_p[672]  = a_in_p[21];
+	a_out_p[800]  = a_in_p[22];
+	a_out_p[928]  = a_in_p[23];
+	a_out_p[96]   = a_in_p[24];
+	a_out_p[224]  = a_in_p[25];
+	a_out_p[352]  = a_in_p[26];
+	a_out_p[480]  = a_in_p[27];
+	a_out_p[608]  = a_in_p[28];
+	a_out_p[736]  = a_in_p[29];
+	a_out_p[864]  = a_in_p[30];
+	a_out_p[992]  = a_in_p[31];
+	a_out_p[16]   = a_in_p[32];
+	a_out_p[144]  = a_in_p[33];
+	a_out_p[272]  = a_in_p[34];
+	a_out_p[400]  = a_in_p[35];
+	a_out_p[528]  = a_in_p[36];
+	a_out_p[656]  = a_in_p[37];
+	a_out_p[784]  = a_in_p[38];
+	a_out_p[912]  = a_in_p[39];
+	a_out_p[80]   = a_in_p[40];
+	a_out_p[208]  = a_in_p[41];
+	a_out_p[336]  = a_in_p[42];
+	a_out_p[464]  = a_in_p[43];
+	a_out_p[592]  = a_in_p[44];
+	a_out_p[720]  = a_in_p[45];
+	a_out_p[848]  = a_in_p[46];
+	a_out_p[976]  = a_in_p[47];
+	a_out_p[48]   = a_in_p[48];
+	a_out_p[176]  = a_in_p[49];
+	a_out_p[304]  = a_in_p[50];
+	a_out_p[432]  = a_in_p[51];
+	a_out_p[560]  = a_in_p[52];
+	a_out_p[688]  = a_in_p[53];
+	a_out_p[816]  = a_in_p[54];
+	a_out_p[944]  = a_in_p[55];
+	a_out_p[112]  = a_in_p[56];
+	a_out_p[240]  = a_in_p[57];
+	a_out_p[368]  = a_in_p[58];
+	a_out_p[496]  = a_in_p[59];
+	a_out_p[624]  = a_in_p[60];
+	a_out_p[752]  = a_in_p[61];
+	a_out_p[880]  = a_in_p[62];
+	a_out_p[1008] = a_in_p[63];
+	a_out_p[1]    = a_in_p[64];
+	a_out_p[129]  = a_in_p[65];
+	a_out_p[257]  = a_in_p[66];
+	a_out_p[385]  = a_in_p[67];
+	a_out_p[513]  = a_in_p[68];
+	a_out_p[641]  = a_in_p[69];
+	a_out_p[769]  = a_in_p[70];
+	a_out_p[897]  = a_in_p[71];
+	a_out_p[65]   = a_in_p[72];
+	a_out_p[193]  = a_in_p[73];
+	a_out_p[321]  = a_in_p[74];
+	a_out_p[449]  = a_in_p[75];
+	a_out_p[577]  = a_in_p[76];
+	a_out_p[705]  = a_in_p[77];
+	a_out_p[833]  = a_in_p[78];
+	a_out_p[961]  = a_in_p[79];
+	a_out_p[33]   = a_in_p[80];
+	a_out_p[161]  = a_in_p[81];
+	a_out_p[289]  = a_in_p[82];
+	a_out_p[417]  = a_in_p[83];
+	a_out_p[545]  = a_in_p[84];
+	a_out_p[673]  = a_in_p[85];
+	a_out_p[801]  = a_in_p[86];
+	a_out_p[929]  = a_in_p[87];
+	a_out_p[97]   = a_in_p[88];
+	a_out_p[225]  = a_in_p[89];
+	a_out_p[353]  = a_in_p[90];
+	a_out_p[481]  = a_in_p[91];
+	a_out_p[609]  = a_in_p[92];
+	a_out_p[737]  = a_in_p[93];
+	a_out_p[865]  = a_in_p[94];
+	a_out_p[993]  = a_in_p[95];
+	a_out_p[17]   = a_in_p[96];
+	a_out_p[145]  = a_in_p[97];
+	a_out_p[273]  = a_in_p[98];
+	a_out_p[401]  = a_in_p[99];
+	a_out_p[529]  = a_in_p[100];
+	a_out_p[657]  = a_in_p[101];
+	a_out_p[785]  = a_in_p[102];
+	a_out_p[913]  = a_in_p[103];
+	a_out_p[81]   = a_in_p[104];
+	a_out_p[209]  = a_in_p[105];
+	a_out_p[337]  = a_in_p[106];
+	a_out_p[465]  = a_in_p[107];
+	a_out_p[593]  = a_in_p[108];
+	a_out_p[721]  = a_in_p[109];
+	a_out_p[849]  = a_in_p[110];
+	a_out_p[977]  = a_in_p[111];
+	a_out_p[49]   = a_in_p[112];
+	a_out_p[177]  = a_in_p[113];
+	a_out_p[305]  = a_in_p[114];
+	a_out_p[433]  = a_in_p[115];
+	a_out_p[561]  = a_in_p[116];
+	a_out_p[689]  = a_in_p[117];
+	a_out_p[817]  = a_in_p[118];
+	a_out_p[945]  = a_in_p[119];
+	a_out_p[113]  = a_in_p[120];
+	a_out_p[241]  = a_in_p[121];
+	a_out_p[369]  = a_in_p[122];
+	a_out_p[497]  = a_in_p[123];
+	a_out_p[625]  = a_in_p[124];
+	a_out_p[753]  = a_in_p[125];
+	a_out_p[881]  = a_in_p[126];
+	a_out_p[1009] = a_in_p[127];
+	a_out_p[2]    = a_in_p[128];
+	a_out_p[130]  = a_in_p[129];
+	a_out_p[258]  = a_in_p[130];
+	a_out_p[386]  = a_in_p[131];
+	a_out_p[514]  = a_in_p[132];
+	a_out_p[642]  = a_in_p[133];
+	a_out_p[770]  = a_in_p[134];
+	a_out_p[898]  = a_in_p[135];
+	a_out_p[66]   = a_in_p[136];
+	a_out_p[194]  = a_in_p[137];
+	a_out_p[322]  = a_in_p[138];
+	a_out_p[450]  = a_in_p[139];
+	a_out_p[578]  = a_in_p[140];
+	a_out_p[706]  = a_in_p[141];
+	a_out_p[834]  = a_in_p[142];
+	a_out_p[962]  = a_in_p[143];
+	a_out_p[34]   = a_in_p[144];
+	a_out_p[162]  = a_in_p[145];
+	a_out_p[290]  = a_in_p[146];
+	a_out_p[418]  = a_in_p[147];
+	a_out_p[546]  = a_in_p[148];
+	a_out_p[674]  = a_in_p[149];
+	a_out_p[802]  = a_in_p[150];
+	a_out_p[930]  = a_in_p[151];
+	a_out_p[98]   = a_in_p[152];
+	a_out_p[226]  = a_in_p[153];
+	a_out_p[354]  = a_in_p[154];
+	a_out_p[482]  = a_in_p[155];
+	a_out_p[610]  = a_in_p[156];
+	a_out_p[738]  = a_in_p[157];
+	a_out_p[866]  = a_in_p[158];
+	a_out_p[994]  = a_in_p[159];
+	a_out_p[18]   = a_in_p[160];
+	a_out_p[146]  = a_in_p[161];
+	a_out_p[274]  = a_in_p[162];
+	a_out_p[402]  = a_in_p[163];
+	a_out_p[530]  = a_in_p[164];
+	a_out_p[658]  = a_in_p[165];
+	a_out_p[786]  = a_in_p[166];
+	a_out_p[914]  = a_in_p[167];
+	a_out_p[82]   = a_in_p[168];
+	a_out_p[210]  = a_in_p[169];
+	a_out_p[338]  = a_in_p[170];
+	a_out_p[466]  = a_in_p[171];
+	a_out_p[594]  = a_in_p[172];
+	a_out_p[722]  = a_in_p[173];
+	a_out_p[850]  = a_in_p[174];
+	a_out_p[978]  = a_in_p[175];
+	a_out_p[50]   = a_in_p[176];
+	a_out_p[178]  = a_in_p[177];
+	a_out_p[306]  = a_in_p[178];
+	a_out_p[434]  = a_in_p[179];
+	a_out_p[562]  = a_in_p[180];
+	a_out_p[690]  = a_in_p[181];
+	a_out_p[818]  = a_in_p[182];
+	a_out_p[946]  = a_in_p[183];
+	a_out_p[114]  = a_in_p[184];
+	a_out_p[242]  = a_in_p[185];
+	a_out_p[370]  = a_in_p[186];
+	a_out_p[498]  = a_in_p[187];
+	a_out_p[626]  = a_in_p[188];
+	a_out_p[754]  = a_in_p[189];
+	a_out_p[882]  = a_in_p[190];
+	a_out_p[1010] = a_in_p[191];
+	a_out_p[3]    = a_in_p[192];
+	a_out_p[131]  = a_in_p[193];
+	a_out_p[259]  = a_in_p[194];
+	a_out_p[387]  = a_in_p[195];
+	a_out_p[515]  = a_in_p[196];
+	a_out_p[643]  = a_in_p[197];
+	a_out_p[771]  = a_in_p[198];
+	a_out_p[899]  = a_in_p[199];
+	a_out_p[67]   = a_in_p[200];
+	a_out_p[195]  = a_in_p[201];
+	a_out_p[323]  = a_in_p[202];
+	a_out_p[451]  = a_in_p[203];
+	a_out_p[579]  = a_in_p[204];
+	a_out_p[707]  = a_in_p[205];
+	a_out_p[835]  = a_in_p[206];
+	a_out_p[963]  = a_in_p[207];
+	a_out_p[35]   = a_in_p[208];
+	a_out_p[163]  = a_in_p[209];
+	a_out_p[291]  = a_in_p[210];
+	a_out_p[419]  = a_in_p[211];
+	a_out_p[547]  = a_in_p[212];
+	a_out_p[675]  = a_in_p[213];
+	a_out_p[803]  = a_in_p[214];
+	a_out_p[931]  = a_in_p[215];
+	a_out_p[99]   = a_in_p[216];
+	a_out_p[227]  = a_in_p[217];
+	a_out_p[355]  = a_in_p[218];
+	a_out_p[483]  = a_in_p[219];
+	a_out_p[611]  = a_in_p[220];
+	a_out_p[739]  = a_in_p[221];
+	a_out_p[867]  = a_in_p[222];
+	a_out_p[995]  = a_in_p[223];
+	a_out_p[19]   = a_in_p[224];
+	a_out_p[147]  = a_in_p[225];
+	a_out_p[275]  = a_in_p[226];
+	a_out_p[403]  = a_in_p[227];
+	a_out_p[531]  = a_in_p[228];
+	a_out_p[659]  = a_in_p[229];
+	a_out_p[787]  = a_in_p[230];
+	a_out_p[915]  = a_in_p[231];
+	a_out_p[83]   = a_in_p[232];
+	a_out_p[211]  = a_in_p[233];
+	a_out_p[339]  = a_in_p[234];
+	a_out_p[467]  = a_in_p[235];
+	a_out_p[595]  = a_in_p[236];
+	a_out_p[723]  = a_in_p[237];
+	a_out_p[851]  = a_in_p[238];
+	a_out_p[979]  = a_in_p[239];
+	a_out_p[51]   = a_in_p[240];
+	a_out_p[179]  = a_in_p[241];
+	a_out_p[307]  = a_in_p[242];
+	a_out_p[435]  = a_in_p[243];
+	a_out_p[563]  = a_in_p[244];
+	a_out_p[691]  = a_in_p[245];
+	a_out_p[819]  = a_in_p[246];
+	a_out_p[947]  = a_in_p[247];
+	a_out_p[115]  = a_in_p[248];
+	a_out_p[243]  = a_in_p[249];
+	a_out_p[371]  = a_in_p[250];
+	a_out_p[499]  = a_in_p[251];
+	a_out_p[627]  = a_in_p[252];
+	a_out_p[755]  = a_in_p[253];
+	a_out_p[883]  = a_in_p[254];
+	a_out_p[1011] = a_in_p[255];
+	a_out_p[4]    = a_in_p[256];
+	a_out_p[132]  = a_in_p[257];
+	a_out_p[260]  = a_in_p[258];
+	a_out_p[388]  = a_in_p[259];
+	a_out_p[516]  = a_in_p[260];
+	a_out_p[644]  = a_in_p[261];
+	a_out_p[772]  = a_in_p[262];
+	a_out_p[900]  = a_in_p[263];
+	a_out_p[68]   = a_in_p[264];
+	a_out_p[196]  = a_in_p[265];
+	a_out_p[324]  = a_in_p[266];
+	a_out_p[452]  = a_in_p[267];
+	a_out_p[580]  = a_in_p[268];
+	a_out_p[708]  = a_in_p[269];
+	a_out_p[836]  = a_in_p[270];
+	a_out_p[964]  = a_in_p[271];
+	a_out_p[36]   = a_in_p[272];
+	a_out_p[164]  = a_in_p[273];
+	a_out_p[292]  = a_in_p[274];
+	a_out_p[420]  = a_in_p[275];
+	a_out_p[548]  = a_in_p[276];
+	a_out_p[676]  = a_in_p[277];
+	a_out_p[804]  = a_in_p[278];
+	a_out_p[932]  = a_in_p[279];
+	a_out_p[100]  = a_in_p[280];
+	a_out_p[228]  = a_in_p[281];
+	a_out_p[356]  = a_in_p[282];
+	a_out_p[484]  = a_in_p[283];
+	a_out_p[612]  = a_in_p[284];
+	a_out_p[740]  = a_in_p[285];
+	a_out_p[868]  = a_in_p[286];
+	a_out_p[996]  = a_in_p[287];
+	a_out_p[20]   = a_in_p[288];
+	a_out_p[148]  = a_in_p[289];
+	a_out_p[276]  = a_in_p[290];
+	a_out_p[404]  = a_in_p[291];
+	a_out_p[532]  = a_in_p[292];
+	a_out_p[660]  = a_in_p[293];
+	a_out_p[788]  = a_in_p[294];
+	a_out_p[916]  = a_in_p[295];
+	a_out_p[84]   = a_in_p[296];
+	a_out_p[212]  = a_in_p[297];
+	a_out_p[340]  = a_in_p[298];
+	a_out_p[468]  = a_in_p[299];
+	a_out_p[596]  = a_in_p[300];
+	a_out_p[724]  = a_in_p[301];
+	a_out_p[852]  = a_in_p[302];
+	a_out_p[980]  = a_in_p[303];
+	a_out_p[52]   = a_in_p[304];
+	a_out_p[180]  = a_in_p[305];
+	a_out_p[308]  = a_in_p[306];
+	a_out_p[436]  = a_in_p[307];
+	a_out_p[564]  = a_in_p[308];
+	a_out_p[692]  = a_in_p[309];
+	a_out_p[820]  = a_in_p[310];
+	a_out_p[948]  = a_in_p[311];
+	a_out_p[116]  = a_in_p[312];
+	a_out_p[244]  = a_in_p[313];
+	a_out_p[372]  = a_in_p[314];
+	a_out_p[500]  = a_in_p[315];
+	a_out_p[628]  = a_in_p[316];
+	a_out_p[756]  = a_in_p[317];
+	a_out_p[884]  = a_in_p[318];
+	a_out_p[1012] = a_in_p[319];
+	a_out_p[5]    = a_in_p[320];
+	a_out_p[133]  = a_in_p[321];
+	a_out_p[261]  = a_in_p[322];
+	a_out_p[389]  = a_in_p[323];
+	a_out_p[517]  = a_in_p[324];
+	a_out_p[645]  = a_in_p[325];
+	a_out_p[773]  = a_in_p[326];
+	a_out_p[901]  = a_in_p[327];
+	a_out_p[69]   = a_in_p[328];
+	a_out_p[197]  = a_in_p[329];
+	a_out_p[325]  = a_in_p[330];
+	a_out_p[453]  = a_in_p[331];
+	a_out_p[581]  = a_in_p[332];
+	a_out_p[709]  = a_in_p[333];
+	a_out_p[837]  = a_in_p[334];
+	a_out_p[965]  = a_in_p[335];
+	a_out_p[37]   = a_in_p[336];
+	a_out_p[165]  = a_in_p[337];
+	a_out_p[293]  = a_in_p[338];
+	a_out_p[421]  = a_in_p[339];
+	a_out_p[549]  = a_in_p[340];
+	a_out_p[677]  = a_in_p[341];
+	a_out_p[805]  = a_in_p[342];
+	a_out_p[933]  = a_in_p[343];
+	a_out_p[101]  = a_in_p[344];
+	a_out_p[229]  = a_in_p[345];
+	a_out_p[357]  = a_in_p[346];
+	a_out_p[485]  = a_in_p[347];
+	a_out_p[613]  = a_in_p[348];
+	a_out_p[741]  = a_in_p[349];
+	a_out_p[869]  = a_in_p[350];
+	a_out_p[997]  = a_in_p[351];
+	a_out_p[21]   = a_in_p[352];
+	a_out_p[149]  = a_in_p[353];
+	a_out_p[277]  = a_in_p[354];
+	a_out_p[405]  = a_in_p[355];
+	a_out_p[533]  = a_in_p[356];
+	a_out_p[661]  = a_in_p[357];
+	a_out_p[789]  = a_in_p[358];
+	a_out_p[917]  = a_in_p[359];
+	a_out_p[85]   = a_in_p[360];
+	a_out_p[213]  = a_in_p[361];
+	a_out_p[341]  = a_in_p[362];
+	a_out_p[469]  = a_in_p[363];
+	a_out_p[597]  = a_in_p[364];
+	a_out_p[725]  = a_in_p[365];
+	a_out_p[853]  = a_in_p[366];
+	a_out_p[981]  = a_in_p[367];
+	a_out_p[53]   = a_in_p[368];
+	a_out_p[181]  = a_in_p[369];
+	a_out_p[309]  = a_in_p[370];
+	a_out_p[437]  = a_in_p[371];
+	a_out_p[565]  = a_in_p[372];
+	a_out_p[693]  = a_in_p[373];
+	a_out_p[821]  = a_in_p[374];
+	a_out_p[949]  = a_in_p[375];
+	a_out_p[117]  = a_in_p[376];
+	a_out_p[245]  = a_in_p[377];
+	a_out_p[373]  = a_in_p[378];
+	a_out_p[501]  = a_in_p[379];
+	a_out_p[629]  = a_in_p[380];
+	a_out_p[757]  = a_in_p[381];
+	a_out_p[885]  = a_in_p[382];
+	a_out_p[1013] = a_in_p[383];
+	a_out_p[6]    = a_in_p[384];
+	a_out_p[134]  = a_in_p[385];
+	a_out_p[262]  = a_in_p[386];
+	a_out_p[390]  = a_in_p[387];
+	a_out_p[518]  = a_in_p[388];
+	a_out_p[646]  = a_in_p[389];
+	a_out_p[774]  = a_in_p[390];
+	a_out_p[902]  = a_in_p[391];
+	a_out_p[70]   = a_in_p[392];
+	a_out_p[198]  = a_in_p[393];
+	a_out_p[326]  = a_in_p[394];
+	a_out_p[454]  = a_in_p[395];
+	a_out_p[582]  = a_in_p[396];
+	a_out_p[710]  = a_in_p[397];
+	a_out_p[838]  = a_in_p[398];
+	a_out_p[966]  = a_in_p[399];
+	a_out_p[38]   = a_in_p[400];
+	a_out_p[166]  = a_in_p[401];
+	a_out_p[294]  = a_in_p[402];
+	a_out_p[422]  = a_in_p[403];
+	a_out_p[550]  = a_in_p[404];
+	a_out_p[678]  = a_in_p[405];
+	a_out_p[806]  = a_in_p[406];
+	a_out_p[934]  = a_in_p[407];
+	a_out_p[102]  = a_in_p[408];
+	a_out_p[230]  = a_in_p[409];
+	a_out_p[358]  = a_in_p[410];
+	a_out_p[486]  = a_in_p[411];
+	a_out_p[614]  = a_in_p[412];
+	a_out_p[742]  = a_in_p[413];
+	a_out_p[870]  = a_in_p[414];
+	a_out_p[998]  = a_in_p[415];
+	a_out_p[22]   = a_in_p[416];
+	a_out_p[150]  = a_in_p[417];
+	a_out_p[278]  = a_in_p[418];
+	a_out_p[406]  = a_in_p[419];
+	a_out_p[534]  = a_in_p[420];
+	a_out_p[662]  = a_in_p[421];
+	a_out_p[790]  = a_in_p[422];
+	a_out_p[918]  = a_in_p[423];
+	a_out_p[86]   = a_in_p[424];
+	a_out_p[214]  = a_in_p[425];
+	a_out_p[342]  = a_in_p[426];
+	a_out_p[470]  = a_in_p[427];
+	a_out_p[598]  = a_in_p[428];
+	a_out_p[726]  = a_in_p[429];
+	a_out_p[854]  = a_in_p[430];
+	a_out_p[982]  = a_in_p[431];
+	a_out_p[54]   = a_in_p[432];
+	a_out_p[182]  = a_in_p[433];
+	a_out_p[310]  = a_in_p[434];
+	a_out_p[438]  = a_in_p[435];
+	a_out_p[566]  = a_in_p[436];
+	a_out_p[694]  = a_in_p[437];
+	a_out_p[822]  = a_in_p[438];
+	a_out_p[950]  = a_in_p[439];
+	a_out_p[118]  = a_in_p[440];
+	a_out_p[246]  = a_in_p[441];
+	a_out_p[374]  = a_in_p[442];
+	a_out_p[502]  = a_in_p[443];
+	a_out_p[630]  = a_in_p[444];
+	a_out_p[758]  = a_in_p[445];
+	a_out_p[886]  = a_in_p[446];
+	a_out_p[1014] = a_in_p[447];
+	a_out_p[7]    = a_in_p[448];
+	a_out_p[135]  = a_in_p[449];
+	a_out_p[263]  = a_in_p[450];
+	a_out_p[391]  = a_in_p[451];
+	a_out_p[519]  = a_in_p[452];
+	a_out_p[647]  = a_in_p[453];
+	a_out_p[775]  = a_in_p[454];
+	a_out_p[903]  = a_in_p[455];
+	a_out_p[71]   = a_in_p[456];
+	a_out_p[199]  = a_in_p[457];
+	a_out_p[327]  = a_in_p[458];
+	a_out_p[455]  = a_in_p[459];
+	a_out_p[583]  = a_in_p[460];
+	a_out_p[711]  = a_in_p[461];
+	a_out_p[839]  = a_in_p[462];
+	a_out_p[967]  = a_in_p[463];
+	a_out_p[39]   = a_in_p[464];
+	a_out_p[167]  = a_in_p[465];
+	a_out_p[295]  = a_in_p[466];
+	a_out_p[423]  = a_in_p[467];
+	a_out_p[551]  = a_in_p[468];
+	a_out_p[679]  = a_in_p[469];
+	a_out_p[807]  = a_in_p[470];
+	a_out_p[935]  = a_in_p[471];
+	a_out_p[103]  = a_in_p[472];
+	a_out_p[231]  = a_in_p[473];
+	a_out_p[359]  = a_in_p[474];
+	a_out_p[487]  = a_in_p[475];
+	a_out_p[615]  = a_in_p[476];
+	a_out_p[743]  = a_in_p[477];
+	a_out_p[871]  = a_in_p[478];
+	a_out_p[999]  = a_in_p[479];
+	a_out_p[23]   = a_in_p[480];
+	a_out_p[151]  = a_in_p[481];
+	a_out_p[279]  = a_in_p[482];
+	a_out_p[407]  = a_in_p[483];
+	a_out_p[535]  = a_in_p[484];
+	a_out_p[663]  = a_in_p[485];
+	a_out_p[791]  = a_in_p[486];
+	a_out_p[919]  = a_in_p[487];
+	a_out_p[87]   = a_in_p[488];
+	a_out_p[215]  = a_in_p[489];
+	a_out_p[343]  = a_in_p[490];
+	a_out_p[471]  = a_in_p[491];
+	a_out_p[599]  = a_in_p[492];
+	a_out_p[727]  = a_in_p[493];
+	a_out_p[855]  = a_in_p[494];
+	a_out_p[983]  = a_in_p[495];
+	a_out_p[55]   = a_in_p[496];
+	a_out_p[183]  = a_in_p[497];
+	a_out_p[311]  = a_in_p[498];
+	a_out_p[439]  = a_in_p[499];
+	a_out_p[567]  = a_in_p[500];
+	a_out_p[695]  = a_in_p[501];
+	a_out_p[823]  = a_in_p[502];
+	a_out_p[951]  = a_in_p[503];
+	a_out_p[119]  = a_in_p[504];
+	a_out_p[247]  = a_in_p[505];
+	a_out_p[375]  = a_in_p[506];
+	a_out_p[503]  = a_in_p[507];
+	a_out_p[631]  = a_in_p[508];
+	a_out_p[759]  = a_in_p[509];
+	a_out_p[887]  = a_in_p[510];
+	a_out_p[1015] = a_in_p[511];
+	a_out_p[8]    = a_in_p[512];
+	a_out_p[136]  = a_in_p[513];
+	a_out_p[264]  = a_in_p[514];
+	a_out_p[392]  = a_in_p[515];
+	a_out_p[520]  = a_in_p[516];
+	a_out_p[648]  = a_in_p[517];
+	a_out_p[776]  = a_in_p[518];
+	a_out_p[904]  = a_in_p[519];
+	a_out_p[72]   = a_in_p[520];
+	a_out_p[200]  = a_in_p[521];
+	a_out_p[328]  = a_in_p[522];
+	a_out_p[456]  = a_in_p[523];
+	a_out_p[584]  = a_in_p[524];
+	a_out_p[712]  = a_in_p[525];
+	a_out_p[840]  = a_in_p[526];
+	a_out_p[968]  = a_in_p[527];
+	a_out_p[40]   = a_in_p[528];
+	a_out_p[168]  = a_in_p[529];
+	a_out_p[296]  = a_in_p[530];
+	a_out_p[424]  = a_in_p[531];
+	a_out_p[552]  = a_in_p[532];
+	a_out_p[680]  = a_in_p[533];
+	a_out_p[808]  = a_in_p[534];
+	a_out_p[936]  = a_in_p[535];
+	a_out_p[104]  = a_in_p[536];
+	a_out_p[232]  = a_in_p[537];
+	a_out_p[360]  = a_in_p[538];
+	a_out_p[488]  = a_in_p[539];
+	a_out_p[616]  = a_in_p[540];
+	a_out_p[744]  = a_in_p[541];
+	a_out_p[872]  = a_in_p[542];
+	a_out_p[1000] = a_in_p[543];
+	a_out_p[24]   = a_in_p[544];
+	a_out_p[152]  = a_in_p[545];
+	a_out_p[280]  = a_in_p[546];
+	a_out_p[408]  = a_in_p[547];
+	a_out_p[536]  = a_in_p[548];
+	a_out_p[664]  = a_in_p[549];
+	a_out_p[792]  = a_in_p[550];
+	a_out_p[920]  = a_in_p[551];
+	a_out_p[88]   = a_in_p[552];
+	a_out_p[216]  = a_in_p[553];
+	a_out_p[344]  = a_in_p[554];
+	a_out_p[472]  = a_in_p[555];
+	a_out_p[600]  = a_in_p[556];
+	a_out_p[728]  = a_in_p[557];
+	a_out_p[856]  = a_in_p[558];
+	a_out_p[984]  = a_in_p[559];
+	a_out_p[56]   = a_in_p[560];
+	a_out_p[184]  = a_in_p[561];
+	a_out_p[312]  = a_in_p[562];
+	a_out_p[440]  = a_in_p[563];
+	a_out_p[568]  = a_in_p[564];
+	a_out_p[696]  = a_in_p[565];
+	a_out_p[824]  = a_in_p[566];
+	a_out_p[952]  = a_in_p[567];
+	a_out_p[120]  = a_in_p[568];
+	a_out_p[248]  = a_in_p[569];
+	a_out_p[376]  = a_in_p[570];
+	a_out_p[504]  = a_in_p[571];
+	a_out_p[632]  = a_in_p[572];
+	a_out_p[760]  = a_in_p[573];
+	a_out_p[888]  = a_in_p[574];
+	a_out_p[1016] = a_in_p[575];
+	a_out_p[9]    = a_in_p[576];
+	a_out_p[137]  = a_in_p[577];
+	a_out_p[265]  = a_in_p[578];
+	a_out_p[393]  = a_in_p[579];
+	a_out_p[521]  = a_in_p[580];
+	a_out_p[649]  = a_in_p[581];
+	a_out_p[777]  = a_in_p[582];
+	a_out_p[905]  = a_in_p[583];
+	a_out_p[73]   = a_in_p[584];
+	a_out_p[201]  = a_in_p[585];
+	a_out_p[329]  = a_in_p[586];
+	a_out_p[457]  = a_in_p[587];
+	a_out_p[585]  = a_in_p[588];
+	a_out_p[713]  = a_in_p[589];
+	a_out_p[841]  = a_in_p[590];
+	a_out_p[969]  = a_in_p[591];
+	a_out_p[41]   = a_in_p[592];
+	a_out_p[169]  = a_in_p[593];
+	a_out_p[297]  = a_in_p[594];
+	a_out_p[425]  = a_in_p[595];
+	a_out_p[553]  = a_in_p[596];
+	a_out_p[681]  = a_in_p[597];
+	a_out_p[809]  = a_in_p[598];
+	a_out_p[937]  = a_in_p[599];
+	a_out_p[105]  = a_in_p[600];
+	a_out_p[233]  = a_in_p[601];
+	a_out_p[361]  = a_in_p[602];
+	a_out_p[489]  = a_in_p[603];
+	a_out_p[617]  = a_in_p[604];
+	a_out_p[745]  = a_in_p[605];
+	a_out_p[873]  = a_in_p[606];
+	a_out_p[1001] = a_in_p[607];
+	a_out_p[25]   = a_in_p[608];
+	a_out_p[153]  = a_in_p[609];
+	a_out_p[281]  = a_in_p[610];
+	a_out_p[409]  = a_in_p[611];
+	a_out_p[537]  = a_in_p[612];
+	a_out_p[665]  = a_in_p[613];
+	a_out_p[793]  = a_in_p[614];
+	a_out_p[921]  = a_in_p[615];
+	a_out_p[89]   = a_in_p[616];
+	a_out_p[217]  = a_in_p[617];
+	a_out_p[345]  = a_in_p[618];
+	a_out_p[473]  = a_in_p[619];
+	a_out_p[601]  = a_in_p[620];
+	a_out_p[729]  = a_in_p[621];
+	a_out_p[857]  = a_in_p[622];
+	a_out_p[985]  = a_in_p[623];
+	a_out_p[57]   = a_in_p[624];
+	a_out_p[185]  = a_in_p[625];
+	a_out_p[313]  = a_in_p[626];
+	a_out_p[441]  = a_in_p[627];
+	a_out_p[569]  = a_in_p[628];
+	a_out_p[697]  = a_in_p[629];
+	a_out_p[825]  = a_in_p[630];
+	a_out_p[953]  = a_in_p[631];
+	a_out_p[121]  = a_in_p[632];
+	a_out_p[249]  = a_in_p[633];
+	a_out_p[377]  = a_in_p[634];
+	a_out_p[505]  = a_in_p[635];
+	a_out_p[633]  = a_in_p[636];
+	a_out_p[761]  = a_in_p[637];
+	a_out_p[889]  = a_in_p[638];
+	a_out_p[1017] = a_in_p[639];
+	a_out_p[10]   = a_in_p[640];
+	a_out_p[138]  = a_in_p[641];
+	a_out_p[266]  = a_in_p[642];
+	a_out_p[394]  = a_in_p[643];
+	a_out_p[522]  = a_in_p[644];
+	a_out_p[650]  = a_in_p[645];
+	a_out_p[778]  = a_in_p[646];
+	a_out_p[906]  = a_in_p[647];
+	a_out_p[74]   = a_in_p[648];
+	a_out_p[202]  = a_in_p[649];
+	a_out_p[330]  = a_in_p[650];
+	a_out_p[458]  = a_in_p[651];
+	a_out_p[586]  = a_in_p[652];
+	a_out_p[714]  = a_in_p[653];
+	a_out_p[842]  = a_in_p[654];
+	a_out_p[970]  = a_in_p[655];
+	a_out_p[42]   = a_in_p[656];
+	a_out_p[170]  = a_in_p[657];
+	a_out_p[298]  = a_in_p[658];
+	a_out_p[426]  = a_in_p[659];
+	a_out_p[554]  = a_in_p[660];
+	a_out_p[682]  = a_in_p[661];
+	a_out_p[810]  = a_in_p[662];
+	a_out_p[938]  = a_in_p[663];
+	a_out_p[106]  = a_in_p[664];
+	a_out_p[234]  = a_in_p[665];
+	a_out_p[362]  = a_in_p[666];
+	a_out_p[490]  = a_in_p[667];
+	a_out_p[618]  = a_in_p[668];
+	a_out_p[746]  = a_in_p[669];
+	a_out_p[874]  = a_in_p[670];
+	a_out_p[1002] = a_in_p[671];
+	a_out_p[26]   = a_in_p[672];
+	a_out_p[154]  = a_in_p[673];
+	a_out_p[282]  = a_in_p[674];
+	a_out_p[410]  = a_in_p[675];
+	a_out_p[538]  = a_in_p[676];
+	a_out_p[666]  = a_in_p[677];
+	a_out_p[794]  = a_in_p[678];
+	a_out_p[922]  = a_in_p[679];
+	a_out_p[90]   = a_in_p[680];
+	a_out_p[218]  = a_in_p[681];
+	a_out_p[346]  = a_in_p[682];
+	a_out_p[474]  = a_in_p[683];
+	a_out_p[602]  = a_in_p[684];
+	a_out_p[730]  = a_in_p[685];
+	a_out_p[858]  = a_in_p[686];
+	a_out_p[986]  = a_in_p[687];
+	a_out_p[58]   = a_in_p[688];
+	a_out_p[186]  = a_in_p[689];
+	a_out_p[314]  = a_in_p[690];
+	a_out_p[442]  = a_in_p[691];
+	a_out_p[570]  = a_in_p[692];
+	a_out_p[698]  = a_in_p[693];
+	a_out_p[826]  = a_in_p[694];
+	a_out_p[954]  = a_in_p[695];
+	a_out_p[122]  = a_in_p[696];
+	a_out_p[250]  = a_in_p[697];
+	a_out_p[378]  = a_in_p[698];
+	a_out_p[506]  = a_in_p[699];
+	a_out_p[634]  = a_in_p[700];
+	a_out_p[762]  = a_in_p[701];
+	a_out_p[890]  = a_in_p[702];
+	a_out_p[1018] = a_in_p[703];
+	a_out_p[11]   = a_in_p[704];
+	a_out_p[139]  = a_in_p[705];
+	a_out_p[267]  = a_in_p[706];
+	a_out_p[395]  = a_in_p[707];
+	a_out_p[523]  = a_in_p[708];
+	a_out_p[651]  = a_in_p[709];
+	a_out_p[779]  = a_in_p[710];
+	a_out_p[907]  = a_in_p[711];
+	a_out_p[75]   = a_in_p[712];
+	a_out_p[203]  = a_in_p[713];
+	a_out_p[331]  = a_in_p[714];
+	a_out_p[459]  = a_in_p[715];
+	a_out_p[587]  = a_in_p[716];
+	a_out_p[715]  = a_in_p[717];
+	a_out_p[843]  = a_in_p[718];
+	a_out_p[971]  = a_in_p[719];
+	a_out_p[43]   = a_in_p[720];
+	a_out_p[171]  = a_in_p[721];
+	a_out_p[299]  = a_in_p[722];
+	a_out_p[427]  = a_in_p[723];
+	a_out_p[555]  = a_in_p[724];
+	a_out_p[683]  = a_in_p[725];
+	a_out_p[811]  = a_in_p[726];
+	a_out_p[939]  = a_in_p[727];
+	a_out_p[107]  = a_in_p[728];
+	a_out_p[235]  = a_in_p[729];
+	a_out_p[363]  = a_in_p[730];
+	a_out_p[491]  = a_in_p[731];
+	a_out_p[619]  = a_in_p[732];
+	a_out_p[747]  = a_in_p[733];
+	a_out_p[875]  = a_in_p[734];
+	a_out_p[1003] = a_in_p[735];
+	a_out_p[27]   = a_in_p[736];
+	a_out_p[155]  = a_in_p[737];
+	a_out_p[283]  = a_in_p[738];
+	a_out_p[411]  = a_in_p[739];
+	a_out_p[539]  = a_in_p[740];
+	a_out_p[667]  = a_in_p[741];
+	a_out_p[795]  = a_in_p[742];
+	a_out_p[923]  = a_in_p[743];
+	a_out_p[91]   = a_in_p[744];
+	a_out_p[219]  = a_in_p[745];
+	a_out_p[347]  = a_in_p[746];
+	a_out_p[475]  = a_in_p[747];
+	a_out_p[603]  = a_in_p[748];
+	a_out_p[731]  = a_in_p[749];
+	a_out_p[859]  = a_in_p[750];
+	a_out_p[987]  = a_in_p[751];
+	a_out_p[59]   = a_in_p[752];
+	a_out_p[187]  = a_in_p[753];
+	a_out_p[315]  = a_in_p[754];
+	a_out_p[443]  = a_in_p[755];
+	a_out_p[571]  = a_in_p[756];
+	a_out_p[699]  = a_in_p[757];
+	a_out_p[827]  = a_in_p[758];
+	a_out_p[955]  = a_in_p[759];
+	a_out_p[123]  = a_in_p[760];
+	a_out_p[251]  = a_in_p[761];
+	a_out_p[379]  = a_in_p[762];
+	a_out_p[507]  = a_in_p[763];
+	a_out_p[635]  = a_in_p[764];
+	a_out_p[763]  = a_in_p[765];
+	a_out_p[891]  = a_in_p[766];
+	a_out_p[1019] = a_in_p[767];
+	a_out_p[12]   = a_in_p[768];
+	a_out_p[140]  = a_in_p[769];
+	a_out_p[268]  = a_in_p[770];
+	a_out_p[396]  = a_in_p[771];
+	a_out_p[524]  = a_in_p[772];
+	a_out_p[652]  = a_in_p[773];
+	a_out_p[780]  = a_in_p[774];
+	a_out_p[908]  = a_in_p[775];
+	a_out_p[76]   = a_in_p[776];
+	a_out_p[204]  = a_in_p[777];
+	a_out_p[332]  = a_in_p[778];
+	a_out_p[460]  = a_in_p[779];
+	a_out_p[588]  = a_in_p[780];
+	a_out_p[716]  = a_in_p[781];
+	a_out_p[844]  = a_in_p[782];
+	a_out_p[972]  = a_in_p[783];
+	a_out_p[44]   = a_in_p[784];
+	a_out_p[172]  = a_in_p[785];
+	a_out_p[300]  = a_in_p[786];
+	a_out_p[428]  = a_in_p[787];
+	a_out_p[556]  = a_in_p[788];
+	a_out_p[684]  = a_in_p[789];
+	a_out_p[812]  = a_in_p[790];
+	a_out_p[940]  = a_in_p[791];
+	a_out_p[108]  = a_in_p[792];
+	a_out_p[236]  = a_in_p[793];
+	a_out_p[364]  = a_in_p[794];
+	a_out_p[492]  = a_in_p[795];
+	a_out_p[620]  = a_in_p[796];
+	a_out_p[748]  = a_in_p[797];
+	a_out_p[876]  = a_in_p[798];
+	a_out_p[1004] = a_in_p[799];
+	a_out_p[28]   = a_in_p[800];
+	a_out_p[156]  = a_in_p[801];
+	a_out_p[284]  = a_in_p[802];
+	a_out_p[412]  = a_in_p[803];
+	a_out_p[540]  = a_in_p[804];
+	a_out_p[668]  = a_in_p[805];
+	a_out_p[796]  = a_in_p[806];
+	a_out_p[924]  = a_in_p[807];
+	a_out_p[92]   = a_in_p[808];
+	a_out_p[220]  = a_in_p[809];
+	a_out_p[348]  = a_in_p[810];
+	a_out_p[476]  = a_in_p[811];
+	a_out_p[604]  = a_in_p[812];
+	a_out_p[732]  = a_in_p[813];
+	a_out_p[860]  = a_in_p[814];
+	a_out_p[988]  = a_in_p[815];
+	a_out_p[60]   = a_in_p[816];
+	a_out_p[188]  = a_in_p[817];
+	a_out_p[316]  = a_in_p[818];
+	a_out_p[444]  = a_in_p[819];
+	a_out_p[572]  = a_in_p[820];
+	a_out_p[700]  = a_in_p[821];
+	a_out_p[828]  = a_in_p[822];
+	a_out_p[956]  = a_in_p[823];
+	a_out_p[124]  = a_in_p[824];
+	a_out_p[252]  = a_in_p[825];
+	a_out_p[380]  = a_in_p[826];
+	a_out_p[508]  = a_in_p[827];
+	a_out_p[636]  = a_in_p[828];
+	a_out_p[764]  = a_in_p[829];
+	a_out_p[892]  = a_in_p[830];
+	a_out_p[1020] = a_in_p[831];
+	a_out_p[13]   = a_in_p[832];
+	a_out_p[141]  = a_in_p[833];
+	a_out_p[269]  = a_in_p[834];
+	a_out_p[397]  = a_in_p[835];
+	a_out_p[525]  = a_in_p[836];
+	a_out_p[653]  = a_in_p[837];
+	a_out_p[781]  = a_in_p[838];
+	a_out_p[909]  = a_in_p[839];
+	a_out_p[77]   = a_in_p[840];
+	a_out_p[205]  = a_in_p[841];
+	a_out_p[333]  = a_in_p[842];
+	a_out_p[461]  = a_in_p[843];
+	a_out_p[589]  = a_in_p[844];
+	a_out_p[717]  = a_in_p[845];
+	a_out_p[845]  = a_in_p[846];
+	a_out_p[973]  = a_in_p[847];
+	a_out_p[45]   = a_in_p[848];
+	a_out_p[173]  = a_in_p[849];
+	a_out_p[301]  = a_in_p[850];
+	a_out_p[429]  = a_in_p[851];
+	a_out_p[557]  = a_in_p[852];
+	a_out_p[685]  = a_in_p[853];
+	a_out_p[813]  = a_in_p[854];
+	a_out_p[941]  = a_in_p[855];
+	a_out_p[109]  = a_in_p[856];
+	a_out_p[237]  = a_in_p[857];
+	a_out_p[365]  = a_in_p[858];
+	a_out_p[493]  = a_in_p[859];
+	a_out_p[621]  = a_in_p[860];
+	a_out_p[749]  = a_in_p[861];
+	a_out_p[877]  = a_in_p[862];
+	a_out_p[1005] = a_in_p[863];
+	a_out_p[29]   = a_in_p[864];
+	a_out_p[157]  = a_in_p[865];
+	a_out_p[285]  = a_in_p[866];
+	a_out_p[413]  = a_in_p[867];
+	a_out_p[541]  = a_in_p[868];
+	a_out_p[669]  = a_in_p[869];
+	a_out_p[797]  = a_in_p[870];
+	a_out_p[925]  = a_in_p[871];
+	a_out_p[93]   = a_in_p[872];
+	a_out_p[221]  = a_in_p[873];
+	a_out_p[349]  = a_in_p[874];
+	a_out_p[477]  = a_in_p[875];
+	a_out_p[605]  = a_in_p[876];
+	a_out_p[733]  = a_in_p[877];
+	a_out_p[861]  = a_in_p[878];
+	a_out_p[989]  = a_in_p[879];
+	a_out_p[61]   = a_in_p[880];
+	a_out_p[189]  = a_in_p[881];
+	a_out_p[317]  = a_in_p[882];
+	a_out_p[445]  = a_in_p[883];
+	a_out_p[573]  = a_in_p[884];
+	a_out_p[701]  = a_in_p[885];
+	a_out_p[829]  = a_in_p[886];
+	a_out_p[957]  = a_in_p[887];
+	a_out_p[125]  = a_in_p[888];
+	a_out_p[253]  = a_in_p[889];
+	a_out_p[381]  = a_in_p[890];
+	a_out_p[509]  = a_in_p[891];
+	a_out_p[637]  = a_in_p[892];
+	a_out_p[765]  = a_in_p[893];
+	a_out_p[893]  = a_in_p[894];
+	a_out_p[1021] = a_in_p[895];
+	a_out_p[14]   = a_in_p[896];
+	a_out_p[142]  = a_in_p[897];
+	a_out_p[270]  = a_in_p[898];
+	a_out_p[398]  = a_in_p[899];
+	a_out_p[526]  = a_in_p[900];
+	a_out_p[654]  = a_in_p[901];
+	a_out_p[782]  = a_in_p[902];
+	a_out_p[910]  = a_in_p[903];
+	a_out_p[78]   = a_in_p[904];
+	a_out_p[206]  = a_in_p[905];
+	a_out_p[334]  = a_in_p[906];
+	a_out_p[462]  = a_in_p[907];
+	a_out_p[590]  = a_in_p[908];
+	a_out_p[718]  = a_in_p[909];
+	a_out_p[846]  = a_in_p[910];
+	a_out_p[974]  = a_in_p[911];
+	a_out_p[46]   = a_in_p[912];
+	a_out_p[174]  = a_in_p[913];
+	a_out_p[302]  = a_in_p[914];
+	a_out_p[430]  = a_in_p[915];
+	a_out_p[558]  = a_in_p[916];
+	a_out_p[686]  = a_in_p[917];
+	a_out_p[814]  = a_in_p[918];
+	a_out_p[942]  = a_in_p[919];
+	a_out_p[110]  = a_in_p[920];
+	a_out_p[238]  = a_in_p[921];
+	a_out_p[366]  = a_in_p[922];
+	a_out_p[494]  = a_in_p[923];
+	a_out_p[622]  = a_in_p[924];
+	a_out_p[750]  = a_in_p[925];
+	a_out_p[878]  = a_in_p[926];
+	a_out_p[1006] = a_in_p[927];
+	a_out_p[30]   = a_in_p[928];
+	a_out_p[158]  = a_in_p[929];
+	a_out_p[286]  = a_in_p[930];
+	a_out_p[414]  = a_in_p[931];
+	a_out_p[542]  = a_in_p[932];
+	a_out_p[670]  = a_in_p[933];
+	a_out_p[798]  = a_in_p[934];
+	a_out_p[926]  = a_in_p[935];
+	a_out_p[94]   = a_in_p[936];
+	a_out_p[222]  = a_in_p[937];
+	a_out_p[350]  = a_in_p[938];
+	a_out_p[478]  = a_in_p[939];
+	a_out_p[606]  = a_in_p[940];
+	a_out_p[734]  = a_in_p[941];
+	a_out_p[862]  = a_in_p[942];
+	a_out_p[990]  = a_in_p[943];
+	a_out_p[62]   = a_in_p[944];
+	a_out_p[190]  = a_in_p[945];
+	a_out_p[318]  = a_in_p[946];
+	a_out_p[446]  = a_in_p[947];
+	a_out_p[574]  = a_in_p[948];
+	a_out_p[702]  = a_in_p[949];
+	a_out_p[830]  = a_in_p[950];
+	a_out_p[958]  = a_in_p[951];
+	a_out_p[126]  = a_in_p[952];
+	a_out_p[254]  = a_in_p[953];
+	a_out_p[382]  = a_in_p[954];
+	a_out_p[510]  = a_in_p[955];
+	a_out_p[638]  = a_in_p[956];
+	a_out_p[766]  = a_in_p[957];
+	a_out_p[894]  = a_in_p[958];
+	a_out_p[1022] = a_in_p[959];
+	a_out_p[15]   = a_in_p[960];
+	a_out_p[143]  = a_in_p[961];
+	a_out_p[271]  = a_in_p[962];
+	a_out_p[399]  = a_in_p[963];
+	a_out_p[527]  = a_in_p[964];
+	a_out_p[655]  = a_in_p[965];
+	a_out_p[783]  = a_in_p[966];
+	a_out_p[911]  = a_in_p[967];
+	a_out_p[79]   = a_in_p[968];
+	a_out_p[207]  = a_in_p[969];
+	a_out_p[335]  = a_in_p[970];
+	a_out_p[463]  = a_in_p[971];
+	a_out_p[591]  = a_in_p[972];
+	a_out_p[719]  = a_in_p[973];
+	a_out_p[847]  = a_in_p[974];
+	a_out_p[975]  = a_in_p[975];
+	a_out_p[47]   = a_in_p[976];
+	a_out_p[175]  = a_in_p[977];
+	a_out_p[303]  = a_in_p[978];
+	a_out_p[431]  = a_in_p[979];
+	a_out_p[559]  = a_in_p[980];
+	a_out_p[687]  = a_in_p[981];
+	a_out_p[815]  = a_in_p[982];
+	a_out_p[943]  = a_in_p[983];
+	a_out_p[111]  = a_in_p[984];
+	a_out_p[239]  = a_in_p[985];
+	a_out_p[367]  = a_in_p[986];
+	a_out_p[495]  = a_in_p[987];
+	a_out_p[623]  = a_in_p[988];
+	a_out_p[751]  = a_in_p[989];
+	a_out_p[879]  = a_in_p[990];
+	a_out_p[1007] = a_in_p[991];
+	a_out_p[31]   = a_in_p[992];
+	a_out_p[159]  = a_in_p[993];
+	a_out_p[287]  = a_in_p[994];
+	a_out_p[415]  = a_in_p[995];
+	a_out_p[543]  = a_in_p[996];
+	a_out_p[671]  = a_in_p[997];
+	a_out_p[799]  = a_in_p[998];
+	a_out_p[927]  = a_in_p[999];
+	a_out_p[95]   = a_in_p[1000];
+	a_out_p[223]  = a_in_p[1001];
+	a_out_p[351]  = a_in_p[1002];
+	a_out_p[479]  = a_in_p[1003];
+	a_out_p[607]  = a_in_p[1004];
+	a_out_p[735]  = a_in_p[1005];
+	a_out_p[863]  = a_in_p[1006];
+	a_out_p[991]  = a_in_p[1007];
+	a_out_p[63]   = a_in_p[1008];
+	a_out_p[191]  = a_in_p[1009];
+	a_out_p[319]  = a_in_p[1010];
+	a_out_p[447]  = a_in_p[1011];
+	a_out_p[575]  = a_in_p[1012];
+	a_out_p[703]  = a_in_p[1013];
+	a_out_p[831]  = a_in_p[1014];
+	a_out_p[959]  = a_in_p[1015];
+	a_out_p[127]  = a_in_p[1016];
+	a_out_p[255]  = a_in_p[1017];
+	a_out_p[383]  = a_in_p[1018];
+	a_out_p[511]  = a_in_p[1019];
+	a_out_p[639]  = a_in_p[1020];
+	a_out_p[767]  = a_in_p[1021];
+	a_out_p[895]  = a_in_p[1022];
+	a_out_p[1023] = a_in_p[1023];
+}
+void transpose_o(const uint32_t* __restrict a_in_p, uint32_t* __restrict a_out_p) {
+	a_out_p[0]    = a_in_p[0];
+	a_out_p[128]  = a_in_p[1];
+	a_out_p[256]  = a_in_p[2];
+	a_out_p[384]  = a_in_p[3];
+	a_out_p[512]  = a_in_p[4];
+	a_out_p[640]  = a_in_p[5];
+	a_out_p[768]  = a_in_p[6];
+	a_out_p[896]  = a_in_p[7];
+	a_out_p[64]   = a_in_p[8];
+	a_out_p[192]  = a_in_p[9];
+	a_out_p[320]  = a_in_p[10];
+	a_out_p[448]  = a_in_p[11];
+	a_out_p[576]  = a_in_p[12];
+	a_out_p[704]  = a_in_p[13];
+	a_out_p[832]  = a_in_p[14];
+	a_out_p[960]  = a_in_p[15];
+	a_out_p[32]   = a_in_p[16];
+	a_out_p[160]  = a_in_p[17];
+	a_out_p[288]  = a_in_p[18];
+	a_out_p[416]  = a_in_p[19];
+	a_out_p[544]  = a_in_p[20];
+	a_out_p[672]  = a_in_p[21];
+	a_out_p[800]  = a_in_p[22];
+	a_out_p[928]  = a_in_p[23];
+	a_out_p[96]   = a_in_p[24];
+	a_out_p[224]  = a_in_p[25];
+	a_out_p[352]  = a_in_p[26];
+	a_out_p[480]  = a_in_p[27];
+	a_out_p[608]  = a_in_p[28];
+	a_out_p[736]  = a_in_p[29];
+	a_out_p[864]  = a_in_p[30];
+	a_out_p[992]  = a_in_p[31];
+	a_out_p[16]   = a_in_p[32];
+	a_out_p[144]  = a_in_p[33];
+	a_out_p[272]  = a_in_p[34];
+	a_out_p[400]  = a_in_p[35];
+	a_out_p[528]  = a_in_p[36];
+	a_out_p[656]  = a_in_p[37];
+	a_out_p[784]  = a_in_p[38];
+	a_out_p[912]  = a_in_p[39];
+	a_out_p[80]   = a_in_p[40];
+	a_out_p[208]  = a_in_p[41];
+	a_out_p[336]  = a_in_p[42];
+	a_out_p[464]  = a_in_p[43];
+	a_out_p[592]  = a_in_p[44];
+	a_out_p[720]  = a_in_p[45];
+	a_out_p[848]  = a_in_p[46];
+	a_out_p[976]  = a_in_p[47];
+	a_out_p[48]   = a_in_p[48];
+	a_out_p[176]  = a_in_p[49];
+	a_out_p[304]  = a_in_p[50];
+	a_out_p[432]  = a_in_p[51];
+	a_out_p[560]  = a_in_p[52];
+	a_out_p[688]  = a_in_p[53];
+	a_out_p[816]  = a_in_p[54];
+	a_out_p[944]  = a_in_p[55];
+	a_out_p[112]  = a_in_p[56];
+	a_out_p[240]  = a_in_p[57];
+	a_out_p[368]  = a_in_p[58];
+	a_out_p[496]  = a_in_p[59];
+	a_out_p[624]  = a_in_p[60];
+	a_out_p[752]  = a_in_p[61];
+	a_out_p[880]  = a_in_p[62];
+	a_out_p[1008] = a_in_p[63];
+	a_out_p[1]    = a_in_p[64];
+	a_out_p[129]  = a_in_p[65];
+	a_out_p[257]  = a_in_p[66];
+	a_out_p[385]  = a_in_p[67];
+	a_out_p[513]  = a_in_p[68];
+	a_out_p[641]  = a_in_p[69];
+	a_out_p[769]  = a_in_p[70];
+	a_out_p[897]  = a_in_p[71];
+	a_out_p[65]   = a_in_p[72];
+	a_out_p[193]  = a_in_p[73];
+	a_out_p[321]  = a_in_p[74];
+	a_out_p[449]  = a_in_p[75];
+	a_out_p[577]  = a_in_p[76];
+	a_out_p[705]  = a_in_p[77];
+	a_out_p[833]  = a_in_p[78];
+	a_out_p[961]  = a_in_p[79];
+	a_out_p[33]   = a_in_p[80];
+	a_out_p[161]  = a_in_p[81];
+	a_out_p[289]  = a_in_p[82];
+	a_out_p[417]  = a_in_p[83];
+	a_out_p[545]  = a_in_p[84];
+	a_out_p[673]  = a_in_p[85];
+	a_out_p[801]  = a_in_p[86];
+	a_out_p[929]  = a_in_p[87];
+	a_out_p[97]   = a_in_p[88];
+	a_out_p[225]  = a_in_p[89];
+	a_out_p[353]  = a_in_p[90];
+	a_out_p[481]  = a_in_p[91];
+	a_out_p[609]  = a_in_p[92];
+	a_out_p[737]  = a_in_p[93];
+	a_out_p[865]  = a_in_p[94];
+	a_out_p[993]  = a_in_p[95];
+	a_out_p[17]   = a_in_p[96];
+	a_out_p[145]  = a_in_p[97];
+	a_out_p[273]  = a_in_p[98];
+	a_out_p[401]  = a_in_p[99];
+	a_out_p[529]  = a_in_p[100];
+	a_out_p[657]  = a_in_p[101];
+	a_out_p[785]  = a_in_p[102];
+	a_out_p[913]  = a_in_p[103];
+	a_out_p[81]   = a_in_p[104];
+	a_out_p[209]  = a_in_p[105];
+	a_out_p[337]  = a_in_p[106];
+	a_out_p[465]  = a_in_p[107];
+	a_out_p[593]  = a_in_p[108];
+	a_out_p[721]  = a_in_p[109];
+	a_out_p[849]  = a_in_p[110];
+	a_out_p[977]  = a_in_p[111];
+	a_out_p[49]   = a_in_p[112];
+	a_out_p[177]  = a_in_p[113];
+	a_out_p[305]  = a_in_p[114];
+	a_out_p[433]  = a_in_p[115];
+	a_out_p[561]  = a_in_p[116];
+	a_out_p[689]  = a_in_p[117];
+	a_out_p[817]  = a_in_p[118];
+	a_out_p[945]  = a_in_p[119];
+	a_out_p[113]  = a_in_p[120];
+	a_out_p[241]  = a_in_p[121];
+	a_out_p[369]  = a_in_p[122];
+	a_out_p[497]  = a_in_p[123];
+	a_out_p[625]  = a_in_p[124];
+	a_out_p[753]  = a_in_p[125];
+	a_out_p[881]  = a_in_p[126];
+	a_out_p[1009] = a_in_p[127];
+	a_out_p[2]    = a_in_p[128];
+	a_out_p[130]  = a_in_p[129];
+	a_out_p[258]  = a_in_p[130];
+	a_out_p[386]  = a_in_p[131];
+	a_out_p[514]  = a_in_p[132];
+	a_out_p[642]  = a_in_p[133];
+	a_out_p[770]  = a_in_p[134];
+	a_out_p[898]  = a_in_p[135];
+	a_out_p[66]   = a_in_p[136];
+	a_out_p[194]  = a_in_p[137];
+	a_out_p[322]  = a_in_p[138];
+	a_out_p[450]  = a_in_p[139];
+	a_out_p[578]  = a_in_p[140];
+	a_out_p[706]  = a_in_p[141];
+	a_out_p[834]  = a_in_p[142];
+	a_out_p[962]  = a_in_p[143];
+	a_out_p[34]   = a_in_p[144];
+	a_out_p[162]  = a_in_p[145];
+	a_out_p[290]  = a_in_p[146];
+	a_out_p[418]  = a_in_p[147];
+	a_out_p[546]  = a_in_p[148];
+	a_out_p[674]  = a_in_p[149];
+	a_out_p[802]  = a_in_p[150];
+	a_out_p[930]  = a_in_p[151];
+	a_out_p[98]   = a_in_p[152];
+	a_out_p[226]  = a_in_p[153];
+	a_out_p[354]  = a_in_p[154];
+	a_out_p[482]  = a_in_p[155];
+	a_out_p[610]  = a_in_p[156];
+	a_out_p[738]  = a_in_p[157];
+	a_out_p[866]  = a_in_p[158];
+	a_out_p[994]  = a_in_p[159];
+	a_out_p[18]   = a_in_p[160];
+	a_out_p[146]  = a_in_p[161];
+	a_out_p[274]  = a_in_p[162];
+	a_out_p[402]  = a_in_p[163];
+	a_out_p[530]  = a_in_p[164];
+	a_out_p[658]  = a_in_p[165];
+	a_out_p[786]  = a_in_p[166];
+	a_out_p[914]  = a_in_p[167];
+	a_out_p[82]   = a_in_p[168];
+	a_out_p[210]  = a_in_p[169];
+	a_out_p[338]  = a_in_p[170];
+	a_out_p[466]  = a_in_p[171];
+	a_out_p[594]  = a_in_p[172];
+	a_out_p[722]  = a_in_p[173];
+	a_out_p[850]  = a_in_p[174];
+	a_out_p[978]  = a_in_p[175];
+	a_out_p[50]   = a_in_p[176];
+	a_out_p[178]  = a_in_p[177];
+	a_out_p[306]  = a_in_p[178];
+	a_out_p[434]  = a_in_p[179];
+	a_out_p[562]  = a_in_p[180];
+	a_out_p[690]  = a_in_p[181];
+	a_out_p[818]  = a_in_p[182];
+	a_out_p[946]  = a_in_p[183];
+	a_out_p[114]  = a_in_p[184];
+	a_out_p[242]  = a_in_p[185];
+	a_out_p[370]  = a_in_p[186];
+	a_out_p[498]  = a_in_p[187];
+	a_out_p[626]  = a_in_p[188];
+	a_out_p[754]  = a_in_p[189];
+	a_out_p[882]  = a_in_p[190];
+	a_out_p[1010] = a_in_p[191];
+	a_out_p[3]    = a_in_p[192];
+	a_out_p[131]  = a_in_p[193];
+	a_out_p[259]  = a_in_p[194];
+	a_out_p[387]  = a_in_p[195];
+	a_out_p[515]  = a_in_p[196];
+	a_out_p[643]  = a_in_p[197];
+	a_out_p[771]  = a_in_p[198];
+	a_out_p[899]  = a_in_p[199];
+	a_out_p[67]   = a_in_p[200];
+	a_out_p[195]  = a_in_p[201];
+	a_out_p[323]  = a_in_p[202];
+	a_out_p[451]  = a_in_p[203];
+	a_out_p[579]  = a_in_p[204];
+	a_out_p[707]  = a_in_p[205];
+	a_out_p[835]  = a_in_p[206];
+	a_out_p[963]  = a_in_p[207];
+	a_out_p[35]   = a_in_p[208];
+	a_out_p[163]  = a_in_p[209];
+	a_out_p[291]  = a_in_p[210];
+	a_out_p[419]  = a_in_p[211];
+	a_out_p[547]  = a_in_p[212];
+	a_out_p[675]  = a_in_p[213];
+	a_out_p[803]  = a_in_p[214];
+	a_out_p[931]  = a_in_p[215];
+	a_out_p[99]   = a_in_p[216];
+	a_out_p[227]  = a_in_p[217];
+	a_out_p[355]  = a_in_p[218];
+	a_out_p[483]  = a_in_p[219];
+	a_out_p[611]  = a_in_p[220];
+	a_out_p[739]  = a_in_p[221];
+	a_out_p[867]  = a_in_p[222];
+	a_out_p[995]  = a_in_p[223];
+	a_out_p[19]   = a_in_p[224];
+	a_out_p[147]  = a_in_p[225];
+	a_out_p[275]  = a_in_p[226];
+	a_out_p[403]  = a_in_p[227];
+	a_out_p[531]  = a_in_p[228];
+	a_out_p[659]  = a_in_p[229];
+	a_out_p[787]  = a_in_p[230];
+	a_out_p[915]  = a_in_p[231];
+	a_out_p[83]   = a_in_p[232];
+	a_out_p[211]  = a_in_p[233];
+	a_out_p[339]  = a_in_p[234];
+	a_out_p[467]  = a_in_p[235];
+	a_out_p[595]  = a_in_p[236];
+	a_out_p[723]  = a_in_p[237];
+	a_out_p[851]  = a_in_p[238];
+	a_out_p[979]  = a_in_p[239];
+	a_out_p[51]   = a_in_p[240];
+	a_out_p[179]  = a_in_p[241];
+	a_out_p[307]  = a_in_p[242];
+	a_out_p[435]  = a_in_p[243];
+	a_out_p[563]  = a_in_p[244];
+	a_out_p[691]  = a_in_p[245];
+	a_out_p[819]  = a_in_p[246];
+	a_out_p[947]  = a_in_p[247];
+	a_out_p[115]  = a_in_p[248];
+	a_out_p[243]  = a_in_p[249];
+	a_out_p[371]  = a_in_p[250];
+	a_out_p[499]  = a_in_p[251];
+	a_out_p[627]  = a_in_p[252];
+	a_out_p[755]  = a_in_p[253];
+	a_out_p[883]  = a_in_p[254];
+	a_out_p[1011] = a_in_p[255];
+	a_out_p[4]    = a_in_p[256];
+	a_out_p[132]  = a_in_p[257];
+	a_out_p[260]  = a_in_p[258];
+	a_out_p[388]  = a_in_p[259];
+	a_out_p[516]  = a_in_p[260];
+	a_out_p[644]  = a_in_p[261];
+	a_out_p[772]  = a_in_p[262];
+	a_out_p[900]  = a_in_p[263];
+	a_out_p[68]   = a_in_p[264];
+	a_out_p[196]  = a_in_p[265];
+	a_out_p[324]  = a_in_p[266];
+	a_out_p[452]  = a_in_p[267];
+	a_out_p[580]  = a_in_p[268];
+	a_out_p[708]  = a_in_p[269];
+	a_out_p[836]  = a_in_p[270];
+	a_out_p[964]  = a_in_p[271];
+	a_out_p[36]   = a_in_p[272];
+	a_out_p[164]  = a_in_p[273];
+	a_out_p[292]  = a_in_p[274];
+	a_out_p[420]  = a_in_p[275];
+	a_out_p[548]  = a_in_p[276];
+	a_out_p[676]  = a_in_p[277];
+	a_out_p[804]  = a_in_p[278];
+	a_out_p[932]  = a_in_p[279];
+	a_out_p[100]  = a_in_p[280];
+	a_out_p[228]  = a_in_p[281];
+	a_out_p[356]  = a_in_p[282];
+	a_out_p[484]  = a_in_p[283];
+	a_out_p[612]  = a_in_p[284];
+	a_out_p[740]  = a_in_p[285];
+	a_out_p[868]  = a_in_p[286];
+	a_out_p[996]  = a_in_p[287];
+	a_out_p[20]   = a_in_p[288];
+	a_out_p[148]  = a_in_p[289];
+	a_out_p[276]  = a_in_p[290];
+	a_out_p[404]  = a_in_p[291];
+	a_out_p[532]  = a_in_p[292];
+	a_out_p[660]  = a_in_p[293];
+	a_out_p[788]  = a_in_p[294];
+	a_out_p[916]  = a_in_p[295];
+	a_out_p[84]   = a_in_p[296];
+	a_out_p[212]  = a_in_p[297];
+	a_out_p[340]  = a_in_p[298];
+	a_out_p[468]  = a_in_p[299];
+	a_out_p[596]  = a_in_p[300];
+	a_out_p[724]  = a_in_p[301];
+	a_out_p[852]  = a_in_p[302];
+	a_out_p[980]  = a_in_p[303];
+	a_out_p[52]   = a_in_p[304];
+	a_out_p[180]  = a_in_p[305];
+	a_out_p[308]  = a_in_p[306];
+	a_out_p[436]  = a_in_p[307];
+	a_out_p[564]  = a_in_p[308];
+	a_out_p[692]  = a_in_p[309];
+	a_out_p[820]  = a_in_p[310];
+	a_out_p[948]  = a_in_p[311];
+	a_out_p[116]  = a_in_p[312];
+	a_out_p[244]  = a_in_p[313];
+	a_out_p[372]  = a_in_p[314];
+	a_out_p[500]  = a_in_p[315];
+	a_out_p[628]  = a_in_p[316];
+	a_out_p[756]  = a_in_p[317];
+	a_out_p[884]  = a_in_p[318];
+	a_out_p[1012] = a_in_p[319];
+	a_out_p[5]    = a_in_p[320];
+	a_out_p[133]  = a_in_p[321];
+	a_out_p[261]  = a_in_p[322];
+	a_out_p[389]  = a_in_p[323];
+	a_out_p[517]  = a_in_p[324];
+	a_out_p[645]  = a_in_p[325];
+	a_out_p[773]  = a_in_p[326];
+	a_out_p[901]  = a_in_p[327];
+	a_out_p[69]   = a_in_p[328];
+	a_out_p[197]  = a_in_p[329];
+	a_out_p[325]  = a_in_p[330];
+	a_out_p[453]  = a_in_p[331];
+	a_out_p[581]  = a_in_p[332];
+	a_out_p[709]  = a_in_p[333];
+	a_out_p[837]  = a_in_p[334];
+	a_out_p[965]  = a_in_p[335];
+	a_out_p[37]   = a_in_p[336];
+	a_out_p[165]  = a_in_p[337];
+	a_out_p[293]  = a_in_p[338];
+	a_out_p[421]  = a_in_p[339];
+	a_out_p[549]  = a_in_p[340];
+	a_out_p[677]  = a_in_p[341];
+	a_out_p[805]  = a_in_p[342];
+	a_out_p[933]  = a_in_p[343];
+	a_out_p[101]  = a_in_p[344];
+	a_out_p[229]  = a_in_p[345];
+	a_out_p[357]  = a_in_p[346];
+	a_out_p[485]  = a_in_p[347];
+	a_out_p[613]  = a_in_p[348];
+	a_out_p[741]  = a_in_p[349];
+	a_out_p[869]  = a_in_p[350];
+	a_out_p[997]  = a_in_p[351];
+	a_out_p[21]   = a_in_p[352];
+	a_out_p[149]  = a_in_p[353];
+	a_out_p[277]  = a_in_p[354];
+	a_out_p[405]  = a_in_p[355];
+	a_out_p[533]  = a_in_p[356];
+	a_out_p[661]  = a_in_p[357];
+	a_out_p[789]  = a_in_p[358];
+	a_out_p[917]  = a_in_p[359];
+	a_out_p[85]   = a_in_p[360];
+	a_out_p[213]  = a_in_p[361];
+	a_out_p[341]  = a_in_p[362];
+	a_out_p[469]  = a_in_p[363];
+	a_out_p[597]  = a_in_p[364];
+	a_out_p[725]  = a_in_p[365];
+	a_out_p[853]  = a_in_p[366];
+	a_out_p[981]  = a_in_p[367];
+	a_out_p[53]   = a_in_p[368];
+	a_out_p[181]  = a_in_p[369];
+	a_out_p[309]  = a_in_p[370];
+	a_out_p[437]  = a_in_p[371];
+	a_out_p[565]  = a_in_p[372];
+	a_out_p[693]  = a_in_p[373];
+	a_out_p[821]  = a_in_p[374];
+	a_out_p[949]  = a_in_p[375];
+	a_out_p[117]  = a_in_p[376];
+	a_out_p[245]  = a_in_p[377];
+	a_out_p[373]  = a_in_p[378];
+	a_out_p[501]  = a_in_p[379];
+	a_out_p[629]  = a_in_p[380];
+	a_out_p[757]  = a_in_p[381];
+	a_out_p[885]  = a_in_p[382];
+	a_out_p[1013] = a_in_p[383];
+	a_out_p[6]    = a_in_p[384];
+	a_out_p[134]  = a_in_p[385];
+	a_out_p[262]  = a_in_p[386];
+	a_out_p[390]  = a_in_p[387];
+	a_out_p[518]  = a_in_p[388];
+	a_out_p[646]  = a_in_p[389];
+	a_out_p[774]  = a_in_p[390];
+	a_out_p[902]  = a_in_p[391];
+	a_out_p[70]   = a_in_p[392];
+	a_out_p[198]  = a_in_p[393];
+	a_out_p[326]  = a_in_p[394];
+	a_out_p[454]  = a_in_p[395];
+	a_out_p[582]  = a_in_p[396];
+	a_out_p[710]  = a_in_p[397];
+	a_out_p[838]  = a_in_p[398];
+	a_out_p[966]  = a_in_p[399];
+	a_out_p[38]   = a_in_p[400];
+	a_out_p[166]  = a_in_p[401];
+	a_out_p[294]  = a_in_p[402];
+	a_out_p[422]  = a_in_p[403];
+	a_out_p[550]  = a_in_p[404];
+	a_out_p[678]  = a_in_p[405];
+	a_out_p[806]  = a_in_p[406];
+	a_out_p[934]  = a_in_p[407];
+	a_out_p[102]  = a_in_p[408];
+	a_out_p[230]  = a_in_p[409];
+	a_out_p[358]  = a_in_p[410];
+	a_out_p[486]  = a_in_p[411];
+	a_out_p[614]  = a_in_p[412];
+	a_out_p[742]  = a_in_p[413];
+	a_out_p[870]  = a_in_p[414];
+	a_out_p[998]  = a_in_p[415];
+	a_out_p[22]   = a_in_p[416];
+	a_out_p[150]  = a_in_p[417];
+	a_out_p[278]  = a_in_p[418];
+	a_out_p[406]  = a_in_p[419];
+	a_out_p[534]  = a_in_p[420];
+	a_out_p[662]  = a_in_p[421];
+	a_out_p[790]  = a_in_p[422];
+	a_out_p[918]  = a_in_p[423];
+	a_out_p[86]   = a_in_p[424];
+	a_out_p[214]  = a_in_p[425];
+	a_out_p[342]  = a_in_p[426];
+	a_out_p[470]  = a_in_p[427];
+	a_out_p[598]  = a_in_p[428];
+	a_out_p[726]  = a_in_p[429];
+	a_out_p[854]  = a_in_p[430];
+	a_out_p[982]  = a_in_p[431];
+	a_out_p[54]   = a_in_p[432];
+	a_out_p[182]  = a_in_p[433];
+	a_out_p[310]  = a_in_p[434];
+	a_out_p[438]  = a_in_p[435];
+	a_out_p[566]  = a_in_p[436];
+	a_out_p[694]  = a_in_p[437];
+	a_out_p[822]  = a_in_p[438];
+	a_out_p[950]  = a_in_p[439];
+	a_out_p[118]  = a_in_p[440];
+	a_out_p[246]  = a_in_p[441];
+	a_out_p[374]  = a_in_p[442];
+	a_out_p[502]  = a_in_p[443];
+	a_out_p[630]  = a_in_p[444];
+	a_out_p[758]  = a_in_p[445];
+	a_out_p[886]  = a_in_p[446];
+	a_out_p[1014] = a_in_p[447];
+	a_out_p[7]    = a_in_p[448];
+	a_out_p[135]  = a_in_p[449];
+	a_out_p[263]  = a_in_p[450];
+	a_out_p[391]  = a_in_p[451];
+	a_out_p[519]  = a_in_p[452];
+	a_out_p[647]  = a_in_p[453];
+	a_out_p[775]  = a_in_p[454];
+	a_out_p[903]  = a_in_p[455];
+	a_out_p[71]   = a_in_p[456];
+	a_out_p[199]  = a_in_p[457];
+	a_out_p[327]  = a_in_p[458];
+	a_out_p[455]  = a_in_p[459];
+	a_out_p[583]  = a_in_p[460];
+	a_out_p[711]  = a_in_p[461];
+	a_out_p[839]  = a_in_p[462];
+	a_out_p[967]  = a_in_p[463];
+	a_out_p[39]   = a_in_p[464];
+	a_out_p[167]  = a_in_p[465];
+	a_out_p[295]  = a_in_p[466];
+	a_out_p[423]  = a_in_p[467];
+	a_out_p[551]  = a_in_p[468];
+	a_out_p[679]  = a_in_p[469];
+	a_out_p[807]  = a_in_p[470];
+	a_out_p[935]  = a_in_p[471];
+	a_out_p[103]  = a_in_p[472];
+	a_out_p[231]  = a_in_p[473];
+	a_out_p[359]  = a_in_p[474];
+	a_out_p[487]  = a_in_p[475];
+	a_out_p[615]  = a_in_p[476];
+	a_out_p[743]  = a_in_p[477];
+	a_out_p[871]  = a_in_p[478];
+	a_out_p[999]  = a_in_p[479];
+	a_out_p[23]   = a_in_p[480];
+	a_out_p[151]  = a_in_p[481];
+	a_out_p[279]  = a_in_p[482];
+	a_out_p[407]  = a_in_p[483];
+	a_out_p[535]  = a_in_p[484];
+	a_out_p[663]  = a_in_p[485];
+	a_out_p[791]  = a_in_p[486];
+	a_out_p[919]  = a_in_p[487];
+	a_out_p[87]   = a_in_p[488];
+	a_out_p[215]  = a_in_p[489];
+	a_out_p[343]  = a_in_p[490];
+	a_out_p[471]  = a_in_p[491];
+	a_out_p[599]  = a_in_p[492];
+	a_out_p[727]  = a_in_p[493];
+	a_out_p[855]  = a_in_p[494];
+	a_out_p[983]  = a_in_p[495];
+	a_out_p[55]   = a_in_p[496];
+	a_out_p[183]  = a_in_p[497];
+	a_out_p[311]  = a_in_p[498];
+	a_out_p[439]  = a_in_p[499];
+	a_out_p[567]  = a_in_p[500];
+	a_out_p[695]  = a_in_p[501];
+	a_out_p[823]  = a_in_p[502];
+	a_out_p[951]  = a_in_p[503];
+	a_out_p[119]  = a_in_p[504];
+	a_out_p[247]  = a_in_p[505];
+	a_out_p[375]  = a_in_p[506];
+	a_out_p[503]  = a_in_p[507];
+	a_out_p[631]  = a_in_p[508];
+	a_out_p[759]  = a_in_p[509];
+	a_out_p[887]  = a_in_p[510];
+	a_out_p[1015] = a_in_p[511];
+	a_out_p[8]    = a_in_p[512];
+	a_out_p[136]  = a_in_p[513];
+	a_out_p[264]  = a_in_p[514];
+	a_out_p[392]  = a_in_p[515];
+	a_out_p[520]  = a_in_p[516];
+	a_out_p[648]  = a_in_p[517];
+	a_out_p[776]  = a_in_p[518];
+	a_out_p[904]  = a_in_p[519];
+	a_out_p[72]   = a_in_p[520];
+	a_out_p[200]  = a_in_p[521];
+	a_out_p[328]  = a_in_p[522];
+	a_out_p[456]  = a_in_p[523];
+	a_out_p[584]  = a_in_p[524];
+	a_out_p[712]  = a_in_p[525];
+	a_out_p[840]  = a_in_p[526];
+	a_out_p[968]  = a_in_p[527];
+	a_out_p[40]   = a_in_p[528];
+	a_out_p[168]  = a_in_p[529];
+	a_out_p[296]  = a_in_p[530];
+	a_out_p[424]  = a_in_p[531];
+	a_out_p[552]  = a_in_p[532];
+	a_out_p[680]  = a_in_p[533];
+	a_out_p[808]  = a_in_p[534];
+	a_out_p[936]  = a_in_p[535];
+	a_out_p[104]  = a_in_p[536];
+	a_out_p[232]  = a_in_p[537];
+	a_out_p[360]  = a_in_p[538];
+	a_out_p[488]  = a_in_p[539];
+	a_out_p[616]  = a_in_p[540];
+	a_out_p[744]  = a_in_p[541];
+	a_out_p[872]  = a_in_p[542];
+	a_out_p[1000] = a_in_p[543];
+	a_out_p[24]   = a_in_p[544];
+	a_out_p[152]  = a_in_p[545];
+	a_out_p[280]  = a_in_p[546];
+	a_out_p[408]  = a_in_p[547];
+	a_out_p[536]  = a_in_p[548];
+	a_out_p[664]  = a_in_p[549];
+	a_out_p[792]  = a_in_p[550];
+	a_out_p[920]  = a_in_p[551];
+	a_out_p[88]   = a_in_p[552];
+	a_out_p[216]  = a_in_p[553];
+	a_out_p[344]  = a_in_p[554];
+	a_out_p[472]  = a_in_p[555];
+	a_out_p[600]  = a_in_p[556];
+	a_out_p[728]  = a_in_p[557];
+	a_out_p[856]  = a_in_p[558];
+	a_out_p[984]  = a_in_p[559];
+	a_out_p[56]   = a_in_p[560];
+	a_out_p[184]  = a_in_p[561];
+	a_out_p[312]  = a_in_p[562];
+	a_out_p[440]  = a_in_p[563];
+	a_out_p[568]  = a_in_p[564];
+	a_out_p[696]  = a_in_p[565];
+	a_out_p[824]  = a_in_p[566];
+	a_out_p[952]  = a_in_p[567];
+	a_out_p[120]  = a_in_p[568];
+	a_out_p[248]  = a_in_p[569];
+	a_out_p[376]  = a_in_p[570];
+	a_out_p[504]  = a_in_p[571];
+	a_out_p[632]  = a_in_p[572];
+	a_out_p[760]  = a_in_p[573];
+	a_out_p[888]  = a_in_p[574];
+	a_out_p[1016] = a_in_p[575];
+	a_out_p[9]    = a_in_p[576];
+	a_out_p[137]  = a_in_p[577];
+	a_out_p[265]  = a_in_p[578];
+	a_out_p[393]  = a_in_p[579];
+	a_out_p[521]  = a_in_p[580];
+	a_out_p[649]  = a_in_p[581];
+	a_out_p[777]  = a_in_p[582];
+	a_out_p[905]  = a_in_p[583];
+	a_out_p[73]   = a_in_p[584];
+	a_out_p[201]  = a_in_p[585];
+	a_out_p[329]  = a_in_p[586];
+	a_out_p[457]  = a_in_p[587];
+	a_out_p[585]  = a_in_p[588];
+	a_out_p[713]  = a_in_p[589];
+	a_out_p[841]  = a_in_p[590];
+	a_out_p[969]  = a_in_p[591];
+	a_out_p[41]   = a_in_p[592];
+	a_out_p[169]  = a_in_p[593];
+	a_out_p[297]  = a_in_p[594];
+	a_out_p[425]  = a_in_p[595];
+	a_out_p[553]  = a_in_p[596];
+	a_out_p[681]  = a_in_p[597];
+	a_out_p[809]  = a_in_p[598];
+	a_out_p[937]  = a_in_p[599];
+	a_out_p[105]  = a_in_p[600];
+	a_out_p[233]  = a_in_p[601];
+	a_out_p[361]  = a_in_p[602];
+	a_out_p[489]  = a_in_p[603];
+	a_out_p[617]  = a_in_p[604];
+	a_out_p[745]  = a_in_p[605];
+	a_out_p[873]  = a_in_p[606];
+	a_out_p[1001] = a_in_p[607];
+	a_out_p[25]   = a_in_p[608];
+	a_out_p[153]  = a_in_p[609];
+	a_out_p[281]  = a_in_p[610];
+	a_out_p[409]  = a_in_p[611];
+	a_out_p[537]  = a_in_p[612];
+	a_out_p[665]  = a_in_p[613];
+	a_out_p[793]  = a_in_p[614];
+	a_out_p[921]  = a_in_p[615];
+	a_out_p[89]   = a_in_p[616];
+	a_out_p[217]  = a_in_p[617];
+	a_out_p[345]  = a_in_p[618];
+	a_out_p[473]  = a_in_p[619];
+	a_out_p[601]  = a_in_p[620];
+	a_out_p[729]  = a_in_p[621];
+	a_out_p[857]  = a_in_p[622];
+	a_out_p[985]  = a_in_p[623];
+	a_out_p[57]   = a_in_p[624];
+	a_out_p[185]  = a_in_p[625];
+	a_out_p[313]  = a_in_p[626];
+	a_out_p[441]  = a_in_p[627];
+	a_out_p[569]  = a_in_p[628];
+	a_out_p[697]  = a_in_p[629];
+	a_out_p[825]  = a_in_p[630];
+	a_out_p[953]  = a_in_p[631];
+	a_out_p[121]  = a_in_p[632];
+	a_out_p[249]  = a_in_p[633];
+	a_out_p[377]  = a_in_p[634];
+	a_out_p[505]  = a_in_p[635];
+	a_out_p[633]  = a_in_p[636];
+	a_out_p[761]  = a_in_p[637];
+	a_out_p[889]  = a_in_p[638];
+	a_out_p[1017] = a_in_p[639];
+	a_out_p[10]   = a_in_p[640];
+	a_out_p[138]  = a_in_p[641];
+	a_out_p[266]  = a_in_p[642];
+	a_out_p[394]  = a_in_p[643];
+	a_out_p[522]  = a_in_p[644];
+	a_out_p[650]  = a_in_p[645];
+	a_out_p[778]  = a_in_p[646];
+	a_out_p[906]  = a_in_p[647];
+	a_out_p[74]   = a_in_p[648];
+	a_out_p[202]  = a_in_p[649];
+	a_out_p[330]  = a_in_p[650];
+	a_out_p[458]  = a_in_p[651];
+	a_out_p[586]  = a_in_p[652];
+	a_out_p[714]  = a_in_p[653];
+	a_out_p[842]  = a_in_p[654];
+	a_out_p[970]  = a_in_p[655];
+	a_out_p[42]   = a_in_p[656];
+	a_out_p[170]  = a_in_p[657];
+	a_out_p[298]  = a_in_p[658];
+	a_out_p[426]  = a_in_p[659];
+	a_out_p[554]  = a_in_p[660];
+	a_out_p[682]  = a_in_p[661];
+	a_out_p[810]  = a_in_p[662];
+	a_out_p[938]  = a_in_p[663];
+	a_out_p[106]  = a_in_p[664];
+	a_out_p[234]  = a_in_p[665];
+	a_out_p[362]  = a_in_p[666];
+	a_out_p[490]  = a_in_p[667];
+	a_out_p[618]  = a_in_p[668];
+	a_out_p[746]  = a_in_p[669];
+	a_out_p[874]  = a_in_p[670];
+	a_out_p[1002] = a_in_p[671];
+	a_out_p[26]   = a_in_p[672];
+	a_out_p[154]  = a_in_p[673];
+	a_out_p[282]  = a_in_p[674];
+	a_out_p[410]  = a_in_p[675];
+	a_out_p[538]  = a_in_p[676];
+	a_out_p[666]  = a_in_p[677];
+	a_out_p[794]  = a_in_p[678];
+	a_out_p[922]  = a_in_p[679];
+	a_out_p[90]   = a_in_p[680];
+	a_out_p[218]  = a_in_p[681];
+	a_out_p[346]  = a_in_p[682];
+	a_out_p[474]  = a_in_p[683];
+	a_out_p[602]  = a_in_p[684];
+	a_out_p[730]  = a_in_p[685];
+	a_out_p[858]  = a_in_p[686];
+	a_out_p[986]  = a_in_p[687];
+	a_out_p[58]   = a_in_p[688];
+	a_out_p[186]  = a_in_p[689];
+	a_out_p[314]  = a_in_p[690];
+	a_out_p[442]  = a_in_p[691];
+	a_out_p[570]  = a_in_p[692];
+	a_out_p[698]  = a_in_p[693];
+	a_out_p[826]  = a_in_p[694];
+	a_out_p[954]  = a_in_p[695];
+	a_out_p[122]  = a_in_p[696];
+	a_out_p[250]  = a_in_p[697];
+	a_out_p[378]  = a_in_p[698];
+	a_out_p[506]  = a_in_p[699];
+	a_out_p[634]  = a_in_p[700];
+	a_out_p[762]  = a_in_p[701];
+	a_out_p[890]  = a_in_p[702];
+	a_out_p[1018] = a_in_p[703];
+	a_out_p[11]   = a_in_p[704];
+	a_out_p[139]  = a_in_p[705];
+	a_out_p[267]  = a_in_p[706];
+	a_out_p[395]  = a_in_p[707];
+	a_out_p[523]  = a_in_p[708];
+	a_out_p[651]  = a_in_p[709];
+	a_out_p[779]  = a_in_p[710];
+	a_out_p[907]  = a_in_p[711];
+	a_out_p[75]   = a_in_p[712];
+	a_out_p[203]  = a_in_p[713];
+	a_out_p[331]  = a_in_p[714];
+	a_out_p[459]  = a_in_p[715];
+	a_out_p[587]  = a_in_p[716];
+	a_out_p[715]  = a_in_p[717];
+	a_out_p[843]  = a_in_p[718];
+	a_out_p[971]  = a_in_p[719];
+	a_out_p[43]   = a_in_p[720];
+	a_out_p[171]  = a_in_p[721];
+	a_out_p[299]  = a_in_p[722];
+	a_out_p[427]  = a_in_p[723];
+	a_out_p[555]  = a_in_p[724];
+	a_out_p[683]  = a_in_p[725];
+	a_out_p[811]  = a_in_p[726];
+	a_out_p[939]  = a_in_p[727];
+	a_out_p[107]  = a_in_p[728];
+	a_out_p[235]  = a_in_p[729];
+	a_out_p[363]  = a_in_p[730];
+	a_out_p[491]  = a_in_p[731];
+	a_out_p[619]  = a_in_p[732];
+	a_out_p[747]  = a_in_p[733];
+	a_out_p[875]  = a_in_p[734];
+	a_out_p[1003] = a_in_p[735];
+	a_out_p[27]   = a_in_p[736];
+	a_out_p[155]  = a_in_p[737];
+	a_out_p[283]  = a_in_p[738];
+	a_out_p[411]  = a_in_p[739];
+	a_out_p[539]  = a_in_p[740];
+	a_out_p[667]  = a_in_p[741];
+	a_out_p[795]  = a_in_p[742];
+	a_out_p[923]  = a_in_p[743];
+	a_out_p[91]   = a_in_p[744];
+	a_out_p[219]  = a_in_p[745];
+	a_out_p[347]  = a_in_p[746];
+	a_out_p[475]  = a_in_p[747];
+	a_out_p[603]  = a_in_p[748];
+	a_out_p[731]  = a_in_p[749];
+	a_out_p[859]  = a_in_p[750];
+	a_out_p[987]  = a_in_p[751];
+	a_out_p[59]   = a_in_p[752];
+	a_out_p[187]  = a_in_p[753];
+	a_out_p[315]  = a_in_p[754];
+	a_out_p[443]  = a_in_p[755];
+	a_out_p[571]  = a_in_p[756];
+	a_out_p[699]  = a_in_p[757];
+	a_out_p[827]  = a_in_p[758];
+	a_out_p[955]  = a_in_p[759];
+	a_out_p[123]  = a_in_p[760];
+	a_out_p[251]  = a_in_p[761];
+	a_out_p[379]  = a_in_p[762];
+	a_out_p[507]  = a_in_p[763];
+	a_out_p[635]  = a_in_p[764];
+	a_out_p[763]  = a_in_p[765];
+	a_out_p[891]  = a_in_p[766];
+	a_out_p[1019] = a_in_p[767];
+	a_out_p[12]   = a_in_p[768];
+	a_out_p[140]  = a_in_p[769];
+	a_out_p[268]  = a_in_p[770];
+	a_out_p[396]  = a_in_p[771];
+	a_out_p[524]  = a_in_p[772];
+	a_out_p[652]  = a_in_p[773];
+	a_out_p[780]  = a_in_p[774];
+	a_out_p[908]  = a_in_p[775];
+	a_out_p[76]   = a_in_p[776];
+	a_out_p[204]  = a_in_p[777];
+	a_out_p[332]  = a_in_p[778];
+	a_out_p[460]  = a_in_p[779];
+	a_out_p[588]  = a_in_p[780];
+	a_out_p[716]  = a_in_p[781];
+	a_out_p[844]  = a_in_p[782];
+	a_out_p[972]  = a_in_p[783];
+	a_out_p[44]   = a_in_p[784];
+	a_out_p[172]  = a_in_p[785];
+	a_out_p[300]  = a_in_p[786];
+	a_out_p[428]  = a_in_p[787];
+	a_out_p[556]  = a_in_p[788];
+	a_out_p[684]  = a_in_p[789];
+	a_out_p[812]  = a_in_p[790];
+	a_out_p[940]  = a_in_p[791];
+	a_out_p[108]  = a_in_p[792];
+	a_out_p[236]  = a_in_p[793];
+	a_out_p[364]  = a_in_p[794];
+	a_out_p[492]  = a_in_p[795];
+	a_out_p[620]  = a_in_p[796];
+	a_out_p[748]  = a_in_p[797];
+	a_out_p[876]  = a_in_p[798];
+	a_out_p[1004] = a_in_p[799];
+	a_out_p[28]   = a_in_p[800];
+	a_out_p[156]  = a_in_p[801];
+	a_out_p[284]  = a_in_p[802];
+	a_out_p[412]  = a_in_p[803];
+	a_out_p[540]  = a_in_p[804];
+	a_out_p[668]  = a_in_p[805];
+	a_out_p[796]  = a_in_p[806];
+	a_out_p[924]  = a_in_p[807];
+	a_out_p[92]   = a_in_p[808];
+	a_out_p[220]  = a_in_p[809];
+	a_out_p[348]  = a_in_p[810];
+	a_out_p[476]  = a_in_p[811];
+	a_out_p[604]  = a_in_p[812];
+	a_out_p[732]  = a_in_p[813];
+	a_out_p[860]  = a_in_p[814];
+	a_out_p[988]  = a_in_p[815];
+	a_out_p[60]   = a_in_p[816];
+	a_out_p[188]  = a_in_p[817];
+	a_out_p[316]  = a_in_p[818];
+	a_out_p[444]  = a_in_p[819];
+	a_out_p[572]  = a_in_p[820];
+	a_out_p[700]  = a_in_p[821];
+	a_out_p[828]  = a_in_p[822];
+	a_out_p[956]  = a_in_p[823];
+	a_out_p[124]  = a_in_p[824];
+	a_out_p[252]  = a_in_p[825];
+	a_out_p[380]  = a_in_p[826];
+	a_out_p[508]  = a_in_p[827];
+	a_out_p[636]  = a_in_p[828];
+	a_out_p[764]  = a_in_p[829];
+	a_out_p[892]  = a_in_p[830];
+	a_out_p[1020] = a_in_p[831];
+	a_out_p[13]   = a_in_p[832];
+	a_out_p[141]  = a_in_p[833];
+	a_out_p[269]  = a_in_p[834];
+	a_out_p[397]  = a_in_p[835];
+	a_out_p[525]  = a_in_p[836];
+	a_out_p[653]  = a_in_p[837];
+	a_out_p[781]  = a_in_p[838];
+	a_out_p[909]  = a_in_p[839];
+	a_out_p[77]   = a_in_p[840];
+	a_out_p[205]  = a_in_p[841];
+	a_out_p[333]  = a_in_p[842];
+	a_out_p[461]  = a_in_p[843];
+	a_out_p[589]  = a_in_p[844];
+	a_out_p[717]  = a_in_p[845];
+	a_out_p[845]  = a_in_p[846];
+	a_out_p[973]  = a_in_p[847];
+	a_out_p[45]   = a_in_p[848];
+	a_out_p[173]  = a_in_p[849];
+	a_out_p[301]  = a_in_p[850];
+	a_out_p[429]  = a_in_p[851];
+	a_out_p[557]  = a_in_p[852];
+	a_out_p[685]  = a_in_p[853];
+	a_out_p[813]  = a_in_p[854];
+	a_out_p[941]  = a_in_p[855];
+	a_out_p[109]  = a_in_p[856];
+	a_out_p[237]  = a_in_p[857];
+	a_out_p[365]  = a_in_p[858];
+	a_out_p[493]  = a_in_p[859];
+	a_out_p[621]  = a_in_p[860];
+	a_out_p[749]  = a_in_p[861];
+	a_out_p[877]  = a_in_p[862];
+	a_out_p[1005] = a_in_p[863];
+	a_out_p[29]   = a_in_p[864];
+	a_out_p[157]  = a_in_p[865];
+	a_out_p[285]  = a_in_p[866];
+	a_out_p[413]  = a_in_p[867];
+	a_out_p[541]  = a_in_p[868];
+	a_out_p[669]  = a_in_p[869];
+	a_out_p[797]  = a_in_p[870];
+	a_out_p[925]  = a_in_p[871];
+	a_out_p[93]   = a_in_p[872];
+	a_out_p[221]  = a_in_p[873];
+	a_out_p[349]  = a_in_p[874];
+	a_out_p[477]  = a_in_p[875];
+	a_out_p[605]  = a_in_p[876];
+	a_out_p[733]  = a_in_p[877];
+	a_out_p[861]  = a_in_p[878];
+	a_out_p[989]  = a_in_p[879];
+	a_out_p[61]   = a_in_p[880];
+	a_out_p[189]  = a_in_p[881];
+	a_out_p[317]  = a_in_p[882];
+	a_out_p[445]  = a_in_p[883];
+	a_out_p[573]  = a_in_p[884];
+	a_out_p[701]  = a_in_p[885];
+	a_out_p[829]  = a_in_p[886];
+	a_out_p[957]  = a_in_p[887];
+	a_out_p[125]  = a_in_p[888];
+	a_out_p[253]  = a_in_p[889];
+	a_out_p[381]  = a_in_p[890];
+	a_out_p[509]  = a_in_p[891];
+	a_out_p[637]  = a_in_p[892];
+	a_out_p[765]  = a_in_p[893];
+	a_out_p[893]  = a_in_p[894];
+	a_out_p[1021] = a_in_p[895];
+	a_out_p[14]   = a_in_p[896];
+	a_out_p[142]  = a_in_p[897];
+	a_out_p[270]  = a_in_p[898];
+	a_out_p[398]  = a_in_p[899];
+	a_out_p[526]  = a_in_p[900];
+	a_out_p[654]  = a_in_p[901];
+	a_out_p[782]  = a_in_p[902];
+	a_out_p[910]  = a_in_p[903];
+	a_out_p[78]   = a_in_p[904];
+	a_out_p[206]  = a_in_p[905];
+	a_out_p[334]  = a_in_p[906];
+	a_out_p[462]  = a_in_p[907];
+	a_out_p[590]  = a_in_p[908];
+	a_out_p[718]  = a_in_p[909];
+	a_out_p[846]  = a_in_p[910];
+	a_out_p[974]  = a_in_p[911];
+	a_out_p[46]   = a_in_p[912];
+	a_out_p[174]  = a_in_p[913];
+	a_out_p[302]  = a_in_p[914];
+	a_out_p[430]  = a_in_p[915];
+	a_out_p[558]  = a_in_p[916];
+	a_out_p[686]  = a_in_p[917];
+	a_out_p[814]  = a_in_p[918];
+	a_out_p[942]  = a_in_p[919];
+	a_out_p[110]  = a_in_p[920];
+	a_out_p[238]  = a_in_p[921];
+	a_out_p[366]  = a_in_p[922];
+	a_out_p[494]  = a_in_p[923];
+	a_out_p[622]  = a_in_p[924];
+	a_out_p[750]  = a_in_p[925];
+	a_out_p[878]  = a_in_p[926];
+	a_out_p[1006] = a_in_p[927];
+	a_out_p[30]   = a_in_p[928];
+	a_out_p[158]  = a_in_p[929];
+	a_out_p[286]  = a_in_p[930];
+	a_out_p[414]  = a_in_p[931];
+	a_out_p[542]  = a_in_p[932];
+	a_out_p[670]  = a_in_p[933];
+	a_out_p[798]  = a_in_p[934];
+	a_out_p[926]  = a_in_p[935];
+	a_out_p[94]   = a_in_p[936];
+	a_out_p[222]  = a_in_p[937];
+	a_out_p[350]  = a_in_p[938];
+	a_out_p[478]  = a_in_p[939];
+	a_out_p[606]  = a_in_p[940];
+	a_out_p[734]  = a_in_p[941];
+	a_out_p[862]  = a_in_p[942];
+	a_out_p[990]  = a_in_p[943];
+	a_out_p[62]   = a_in_p[944];
+	a_out_p[190]  = a_in_p[945];
+	a_out_p[318]  = a_in_p[946];
+	a_out_p[446]  = a_in_p[947];
+	a_out_p[574]  = a_in_p[948];
+	a_out_p[702]  = a_in_p[949];
+	a_out_p[830]  = a_in_p[950];
+	a_out_p[958]  = a_in_p[951];
+	a_out_p[126]  = a_in_p[952];
+	a_out_p[254]  = a_in_p[953];
+	a_out_p[382]  = a_in_p[954];
+	a_out_p[510]  = a_in_p[955];
+	a_out_p[638]  = a_in_p[956];
+	a_out_p[766]  = a_in_p[957];
+	a_out_p[894]  = a_in_p[958];
+	a_out_p[1022] = a_in_p[959];
+	a_out_p[15]   = a_in_p[960];
+	a_out_p[143]  = a_in_p[961];
+	a_out_p[271]  = a_in_p[962];
+	a_out_p[399]  = a_in_p[963];
+	a_out_p[527]  = a_in_p[964];
+	a_out_p[655]  = a_in_p[965];
+	a_out_p[783]  = a_in_p[966];
+	a_out_p[911]  = a_in_p[967];
+	a_out_p[79]   = a_in_p[968];
+	a_out_p[207]  = a_in_p[969];
+	a_out_p[335]  = a_in_p[970];
+	a_out_p[463]  = a_in_p[971];
+	a_out_p[591]  = a_in_p[972];
+	a_out_p[719]  = a_in_p[973];
+	a_out_p[847]  = a_in_p[974];
+	a_out_p[975]  = a_in_p[975];
+	a_out_p[47]   = a_in_p[976];
+	a_out_p[175]  = a_in_p[977];
+	a_out_p[303]  = a_in_p[978];
+	a_out_p[431]  = a_in_p[979];
+	a_out_p[559]  = a_in_p[980];
+	a_out_p[687]  = a_in_p[981];
+	a_out_p[815]  = a_in_p[982];
+	a_out_p[943]  = a_in_p[983];
+	a_out_p[111]  = a_in_p[984];
+	a_out_p[239]  = a_in_p[985];
+	a_out_p[367]  = a_in_p[986];
+	a_out_p[495]  = a_in_p[987];
+	a_out_p[623]  = a_in_p[988];
+	a_out_p[751]  = a_in_p[989];
+	a_out_p[879]  = a_in_p[990];
+	a_out_p[1007] = a_in_p[991];
+	a_out_p[31]   = a_in_p[992];
+	a_out_p[159]  = a_in_p[993];
+	a_out_p[287]  = a_in_p[994];
+	a_out_p[415]  = a_in_p[995];
+	a_out_p[543]  = a_in_p[996];
+	a_out_p[671]  = a_in_p[997];
+	a_out_p[799]  = a_in_p[998];
+	a_out_p[927]  = a_in_p[999];
+	a_out_p[95]   = a_in_p[1000];
+	a_out_p[223]  = a_in_p[1001];
+	a_out_p[351]  = a_in_p[1002];
+	a_out_p[479]  = a_in_p[1003];
+	a_out_p[607]  = a_in_p[1004];
+	a_out_p[735]  = a_in_p[1005];
+	a_out_p[863]  = a_in_p[1006];
+	a_out_p[991]  = a_in_p[1007];
+	a_out_p[63]   = a_in_p[1008];
+	a_out_p[191]  = a_in_p[1009];
+	a_out_p[319]  = a_in_p[1010];
+	a_out_p[447]  = a_in_p[1011];
+	a_out_p[575]  = a_in_p[1012];
+	a_out_p[703]  = a_in_p[1013];
+	a_out_p[831]  = a_in_p[1014];
+	a_out_p[959]  = a_in_p[1015];
+	a_out_p[127]  = a_in_p[1016];
+	a_out_p[255]  = a_in_p[1017];
+	a_out_p[383]  = a_in_p[1018];
+	a_out_p[511]  = a_in_p[1019];
+	a_out_p[639]  = a_in_p[1020];
+	a_out_p[767]  = a_in_p[1021];
+	a_out_p[895]  = a_in_p[1022];
+	a_out_p[1023] = a_in_p[1023];
+}
+void transpose_i(const uint64_t* __restrict a_in_p, uint64_t* __restrict a_out_p) {
+	a_out_p[0]    = a_in_p[0];
+	a_out_p[128]  = a_in_p[1];
+	a_out_p[256]  = a_in_p[2];
+	a_out_p[384]  = a_in_p[3];
+	a_out_p[512]  = a_in_p[4];
+	a_out_p[640]  = a_in_p[5];
+	a_out_p[768]  = a_in_p[6];
+	a_out_p[896]  = a_in_p[7];
+	a_out_p[64]   = a_in_p[8];
+	a_out_p[192]  = a_in_p[9];
+	a_out_p[320]  = a_in_p[10];
+	a_out_p[448]  = a_in_p[11];
+	a_out_p[576]  = a_in_p[12];
+	a_out_p[704]  = a_in_p[13];
+	a_out_p[832]  = a_in_p[14];
+	a_out_p[960]  = a_in_p[15];
+	a_out_p[32]   = a_in_p[16];
+	a_out_p[160]  = a_in_p[17];
+	a_out_p[288]  = a_in_p[18];
+	a_out_p[416]  = a_in_p[19];
+	a_out_p[544]  = a_in_p[20];
+	a_out_p[672]  = a_in_p[21];
+	a_out_p[800]  = a_in_p[22];
+	a_out_p[928]  = a_in_p[23];
+	a_out_p[96]   = a_in_p[24];
+	a_out_p[224]  = a_in_p[25];
+	a_out_p[352]  = a_in_p[26];
+	a_out_p[480]  = a_in_p[27];
+	a_out_p[608]  = a_in_p[28];
+	a_out_p[736]  = a_in_p[29];
+	a_out_p[864]  = a_in_p[30];
+	a_out_p[992]  = a_in_p[31];
+	a_out_p[16]   = a_in_p[32];
+	a_out_p[144]  = a_in_p[33];
+	a_out_p[272]  = a_in_p[34];
+	a_out_p[400]  = a_in_p[35];
+	a_out_p[528]  = a_in_p[36];
+	a_out_p[656]  = a_in_p[37];
+	a_out_p[784]  = a_in_p[38];
+	a_out_p[912]  = a_in_p[39];
+	a_out_p[80]   = a_in_p[40];
+	a_out_p[208]  = a_in_p[41];
+	a_out_p[336]  = a_in_p[42];
+	a_out_p[464]  = a_in_p[43];
+	a_out_p[592]  = a_in_p[44];
+	a_out_p[720]  = a_in_p[45];
+	a_out_p[848]  = a_in_p[46];
+	a_out_p[976]  = a_in_p[47];
+	a_out_p[48]   = a_in_p[48];
+	a_out_p[176]  = a_in_p[49];
+	a_out_p[304]  = a_in_p[50];
+	a_out_p[432]  = a_in_p[51];
+	a_out_p[560]  = a_in_p[52];
+	a_out_p[688]  = a_in_p[53];
+	a_out_p[816]  = a_in_p[54];
+	a_out_p[944]  = a_in_p[55];
+	a_out_p[112]  = a_in_p[56];
+	a_out_p[240]  = a_in_p[57];
+	a_out_p[368]  = a_in_p[58];
+	a_out_p[496]  = a_in_p[59];
+	a_out_p[624]  = a_in_p[60];
+	a_out_p[752]  = a_in_p[61];
+	a_out_p[880]  = a_in_p[62];
+	a_out_p[1008] = a_in_p[63];
+	a_out_p[1]    = a_in_p[64];
+	a_out_p[129]  = a_in_p[65];
+	a_out_p[257]  = a_in_p[66];
+	a_out_p[385]  = a_in_p[67];
+	a_out_p[513]  = a_in_p[68];
+	a_out_p[641]  = a_in_p[69];
+	a_out_p[769]  = a_in_p[70];
+	a_out_p[897]  = a_in_p[71];
+	a_out_p[65]   = a_in_p[72];
+	a_out_p[193]  = a_in_p[73];
+	a_out_p[321]  = a_in_p[74];
+	a_out_p[449]  = a_in_p[75];
+	a_out_p[577]  = a_in_p[76];
+	a_out_p[705]  = a_in_p[77];
+	a_out_p[833]  = a_in_p[78];
+	a_out_p[961]  = a_in_p[79];
+	a_out_p[33]   = a_in_p[80];
+	a_out_p[161]  = a_in_p[81];
+	a_out_p[289]  = a_in_p[82];
+	a_out_p[417]  = a_in_p[83];
+	a_out_p[545]  = a_in_p[84];
+	a_out_p[673]  = a_in_p[85];
+	a_out_p[801]  = a_in_p[86];
+	a_out_p[929]  = a_in_p[87];
+	a_out_p[97]   = a_in_p[88];
+	a_out_p[225]  = a_in_p[89];
+	a_out_p[353]  = a_in_p[90];
+	a_out_p[481]  = a_in_p[91];
+	a_out_p[609]  = a_in_p[92];
+	a_out_p[737]  = a_in_p[93];
+	a_out_p[865]  = a_in_p[94];
+	a_out_p[993]  = a_in_p[95];
+	a_out_p[17]   = a_in_p[96];
+	a_out_p[145]  = a_in_p[97];
+	a_out_p[273]  = a_in_p[98];
+	a_out_p[401]  = a_in_p[99];
+	a_out_p[529]  = a_in_p[100];
+	a_out_p[657]  = a_in_p[101];
+	a_out_p[785]  = a_in_p[102];
+	a_out_p[913]  = a_in_p[103];
+	a_out_p[81]   = a_in_p[104];
+	a_out_p[209]  = a_in_p[105];
+	a_out_p[337]  = a_in_p[106];
+	a_out_p[465]  = a_in_p[107];
+	a_out_p[593]  = a_in_p[108];
+	a_out_p[721]  = a_in_p[109];
+	a_out_p[849]  = a_in_p[110];
+	a_out_p[977]  = a_in_p[111];
+	a_out_p[49]   = a_in_p[112];
+	a_out_p[177]  = a_in_p[113];
+	a_out_p[305]  = a_in_p[114];
+	a_out_p[433]  = a_in_p[115];
+	a_out_p[561]  = a_in_p[116];
+	a_out_p[689]  = a_in_p[117];
+	a_out_p[817]  = a_in_p[118];
+	a_out_p[945]  = a_in_p[119];
+	a_out_p[113]  = a_in_p[120];
+	a_out_p[241]  = a_in_p[121];
+	a_out_p[369]  = a_in_p[122];
+	a_out_p[497]  = a_in_p[123];
+	a_out_p[625]  = a_in_p[124];
+	a_out_p[753]  = a_in_p[125];
+	a_out_p[881]  = a_in_p[126];
+	a_out_p[1009] = a_in_p[127];
+	a_out_p[2]    = a_in_p[128];
+	a_out_p[130]  = a_in_p[129];
+	a_out_p[258]  = a_in_p[130];
+	a_out_p[386]  = a_in_p[131];
+	a_out_p[514]  = a_in_p[132];
+	a_out_p[642]  = a_in_p[133];
+	a_out_p[770]  = a_in_p[134];
+	a_out_p[898]  = a_in_p[135];
+	a_out_p[66]   = a_in_p[136];
+	a_out_p[194]  = a_in_p[137];
+	a_out_p[322]  = a_in_p[138];
+	a_out_p[450]  = a_in_p[139];
+	a_out_p[578]  = a_in_p[140];
+	a_out_p[706]  = a_in_p[141];
+	a_out_p[834]  = a_in_p[142];
+	a_out_p[962]  = a_in_p[143];
+	a_out_p[34]   = a_in_p[144];
+	a_out_p[162]  = a_in_p[145];
+	a_out_p[290]  = a_in_p[146];
+	a_out_p[418]  = a_in_p[147];
+	a_out_p[546]  = a_in_p[148];
+	a_out_p[674]  = a_in_p[149];
+	a_out_p[802]  = a_in_p[150];
+	a_out_p[930]  = a_in_p[151];
+	a_out_p[98]   = a_in_p[152];
+	a_out_p[226]  = a_in_p[153];
+	a_out_p[354]  = a_in_p[154];
+	a_out_p[482]  = a_in_p[155];
+	a_out_p[610]  = a_in_p[156];
+	a_out_p[738]  = a_in_p[157];
+	a_out_p[866]  = a_in_p[158];
+	a_out_p[994]  = a_in_p[159];
+	a_out_p[18]   = a_in_p[160];
+	a_out_p[146]  = a_in_p[161];
+	a_out_p[274]  = a_in_p[162];
+	a_out_p[402]  = a_in_p[163];
+	a_out_p[530]  = a_in_p[164];
+	a_out_p[658]  = a_in_p[165];
+	a_out_p[786]  = a_in_p[166];
+	a_out_p[914]  = a_in_p[167];
+	a_out_p[82]   = a_in_p[168];
+	a_out_p[210]  = a_in_p[169];
+	a_out_p[338]  = a_in_p[170];
+	a_out_p[466]  = a_in_p[171];
+	a_out_p[594]  = a_in_p[172];
+	a_out_p[722]  = a_in_p[173];
+	a_out_p[850]  = a_in_p[174];
+	a_out_p[978]  = a_in_p[175];
+	a_out_p[50]   = a_in_p[176];
+	a_out_p[178]  = a_in_p[177];
+	a_out_p[306]  = a_in_p[178];
+	a_out_p[434]  = a_in_p[179];
+	a_out_p[562]  = a_in_p[180];
+	a_out_p[690]  = a_in_p[181];
+	a_out_p[818]  = a_in_p[182];
+	a_out_p[946]  = a_in_p[183];
+	a_out_p[114]  = a_in_p[184];
+	a_out_p[242]  = a_in_p[185];
+	a_out_p[370]  = a_in_p[186];
+	a_out_p[498]  = a_in_p[187];
+	a_out_p[626]  = a_in_p[188];
+	a_out_p[754]  = a_in_p[189];
+	a_out_p[882]  = a_in_p[190];
+	a_out_p[1010] = a_in_p[191];
+	a_out_p[3]    = a_in_p[192];
+	a_out_p[131]  = a_in_p[193];
+	a_out_p[259]  = a_in_p[194];
+	a_out_p[387]  = a_in_p[195];
+	a_out_p[515]  = a_in_p[196];
+	a_out_p[643]  = a_in_p[197];
+	a_out_p[771]  = a_in_p[198];
+	a_out_p[899]  = a_in_p[199];
+	a_out_p[67]   = a_in_p[200];
+	a_out_p[195]  = a_in_p[201];
+	a_out_p[323]  = a_in_p[202];
+	a_out_p[451]  = a_in_p[203];
+	a_out_p[579]  = a_in_p[204];
+	a_out_p[707]  = a_in_p[205];
+	a_out_p[835]  = a_in_p[206];
+	a_out_p[963]  = a_in_p[207];
+	a_out_p[35]   = a_in_p[208];
+	a_out_p[163]  = a_in_p[209];
+	a_out_p[291]  = a_in_p[210];
+	a_out_p[419]  = a_in_p[211];
+	a_out_p[547]  = a_in_p[212];
+	a_out_p[675]  = a_in_p[213];
+	a_out_p[803]  = a_in_p[214];
+	a_out_p[931]  = a_in_p[215];
+	a_out_p[99]   = a_in_p[216];
+	a_out_p[227]  = a_in_p[217];
+	a_out_p[355]  = a_in_p[218];
+	a_out_p[483]  = a_in_p[219];
+	a_out_p[611]  = a_in_p[220];
+	a_out_p[739]  = a_in_p[221];
+	a_out_p[867]  = a_in_p[222];
+	a_out_p[995]  = a_in_p[223];
+	a_out_p[19]   = a_in_p[224];
+	a_out_p[147]  = a_in_p[225];
+	a_out_p[275]  = a_in_p[226];
+	a_out_p[403]  = a_in_p[227];
+	a_out_p[531]  = a_in_p[228];
+	a_out_p[659]  = a_in_p[229];
+	a_out_p[787]  = a_in_p[230];
+	a_out_p[915]  = a_in_p[231];
+	a_out_p[83]   = a_in_p[232];
+	a_out_p[211]  = a_in_p[233];
+	a_out_p[339]  = a_in_p[234];
+	a_out_p[467]  = a_in_p[235];
+	a_out_p[595]  = a_in_p[236];
+	a_out_p[723]  = a_in_p[237];
+	a_out_p[851]  = a_in_p[238];
+	a_out_p[979]  = a_in_p[239];
+	a_out_p[51]   = a_in_p[240];
+	a_out_p[179]  = a_in_p[241];
+	a_out_p[307]  = a_in_p[242];
+	a_out_p[435]  = a_in_p[243];
+	a_out_p[563]  = a_in_p[244];
+	a_out_p[691]  = a_in_p[245];
+	a_out_p[819]  = a_in_p[246];
+	a_out_p[947]  = a_in_p[247];
+	a_out_p[115]  = a_in_p[248];
+	a_out_p[243]  = a_in_p[249];
+	a_out_p[371]  = a_in_p[250];
+	a_out_p[499]  = a_in_p[251];
+	a_out_p[627]  = a_in_p[252];
+	a_out_p[755]  = a_in_p[253];
+	a_out_p[883]  = a_in_p[254];
+	a_out_p[1011] = a_in_p[255];
+	a_out_p[4]    = a_in_p[256];
+	a_out_p[132]  = a_in_p[257];
+	a_out_p[260]  = a_in_p[258];
+	a_out_p[388]  = a_in_p[259];
+	a_out_p[516]  = a_in_p[260];
+	a_out_p[644]  = a_in_p[261];
+	a_out_p[772]  = a_in_p[262];
+	a_out_p[900]  = a_in_p[263];
+	a_out_p[68]   = a_in_p[264];
+	a_out_p[196]  = a_in_p[265];
+	a_out_p[324]  = a_in_p[266];
+	a_out_p[452]  = a_in_p[267];
+	a_out_p[580]  = a_in_p[268];
+	a_out_p[708]  = a_in_p[269];
+	a_out_p[836]  = a_in_p[270];
+	a_out_p[964]  = a_in_p[271];
+	a_out_p[36]   = a_in_p[272];
+	a_out_p[164]  = a_in_p[273];
+	a_out_p[292]  = a_in_p[274];
+	a_out_p[420]  = a_in_p[275];
+	a_out_p[548]  = a_in_p[276];
+	a_out_p[676]  = a_in_p[277];
+	a_out_p[804]  = a_in_p[278];
+	a_out_p[932]  = a_in_p[279];
+	a_out_p[100]  = a_in_p[280];
+	a_out_p[228]  = a_in_p[281];
+	a_out_p[356]  = a_in_p[282];
+	a_out_p[484]  = a_in_p[283];
+	a_out_p[612]  = a_in_p[284];
+	a_out_p[740]  = a_in_p[285];
+	a_out_p[868]  = a_in_p[286];
+	a_out_p[996]  = a_in_p[287];
+	a_out_p[20]   = a_in_p[288];
+	a_out_p[148]  = a_in_p[289];
+	a_out_p[276]  = a_in_p[290];
+	a_out_p[404]  = a_in_p[291];
+	a_out_p[532]  = a_in_p[292];
+	a_out_p[660]  = a_in_p[293];
+	a_out_p[788]  = a_in_p[294];
+	a_out_p[916]  = a_in_p[295];
+	a_out_p[84]   = a_in_p[296];
+	a_out_p[212]  = a_in_p[297];
+	a_out_p[340]  = a_in_p[298];
+	a_out_p[468]  = a_in_p[299];
+	a_out_p[596]  = a_in_p[300];
+	a_out_p[724]  = a_in_p[301];
+	a_out_p[852]  = a_in_p[302];
+	a_out_p[980]  = a_in_p[303];
+	a_out_p[52]   = a_in_p[304];
+	a_out_p[180]  = a_in_p[305];
+	a_out_p[308]  = a_in_p[306];
+	a_out_p[436]  = a_in_p[307];
+	a_out_p[564]  = a_in_p[308];
+	a_out_p[692]  = a_in_p[309];
+	a_out_p[820]  = a_in_p[310];
+	a_out_p[948]  = a_in_p[311];
+	a_out_p[116]  = a_in_p[312];
+	a_out_p[244]  = a_in_p[313];
+	a_out_p[372]  = a_in_p[314];
+	a_out_p[500]  = a_in_p[315];
+	a_out_p[628]  = a_in_p[316];
+	a_out_p[756]  = a_in_p[317];
+	a_out_p[884]  = a_in_p[318];
+	a_out_p[1012] = a_in_p[319];
+	a_out_p[5]    = a_in_p[320];
+	a_out_p[133]  = a_in_p[321];
+	a_out_p[261]  = a_in_p[322];
+	a_out_p[389]  = a_in_p[323];
+	a_out_p[517]  = a_in_p[324];
+	a_out_p[645]  = a_in_p[325];
+	a_out_p[773]  = a_in_p[326];
+	a_out_p[901]  = a_in_p[327];
+	a_out_p[69]   = a_in_p[328];
+	a_out_p[197]  = a_in_p[329];
+	a_out_p[325]  = a_in_p[330];
+	a_out_p[453]  = a_in_p[331];
+	a_out_p[581]  = a_in_p[332];
+	a_out_p[709]  = a_in_p[333];
+	a_out_p[837]  = a_in_p[334];
+	a_out_p[965]  = a_in_p[335];
+	a_out_p[37]   = a_in_p[336];
+	a_out_p[165]  = a_in_p[337];
+	a_out_p[293]  = a_in_p[338];
+	a_out_p[421]  = a_in_p[339];
+	a_out_p[549]  = a_in_p[340];
+	a_out_p[677]  = a_in_p[341];
+	a_out_p[805]  = a_in_p[342];
+	a_out_p[933]  = a_in_p[343];
+	a_out_p[101]  = a_in_p[344];
+	a_out_p[229]  = a_in_p[345];
+	a_out_p[357]  = a_in_p[346];
+	a_out_p[485]  = a_in_p[347];
+	a_out_p[613]  = a_in_p[348];
+	a_out_p[741]  = a_in_p[349];
+	a_out_p[869]  = a_in_p[350];
+	a_out_p[997]  = a_in_p[351];
+	a_out_p[21]   = a_in_p[352];
+	a_out_p[149]  = a_in_p[353];
+	a_out_p[277]  = a_in_p[354];
+	a_out_p[405]  = a_in_p[355];
+	a_out_p[533]  = a_in_p[356];
+	a_out_p[661]  = a_in_p[357];
+	a_out_p[789]  = a_in_p[358];
+	a_out_p[917]  = a_in_p[359];
+	a_out_p[85]   = a_in_p[360];
+	a_out_p[213]  = a_in_p[361];
+	a_out_p[341]  = a_in_p[362];
+	a_out_p[469]  = a_in_p[363];
+	a_out_p[597]  = a_in_p[364];
+	a_out_p[725]  = a_in_p[365];
+	a_out_p[853]  = a_in_p[366];
+	a_out_p[981]  = a_in_p[367];
+	a_out_p[53]   = a_in_p[368];
+	a_out_p[181]  = a_in_p[369];
+	a_out_p[309]  = a_in_p[370];
+	a_out_p[437]  = a_in_p[371];
+	a_out_p[565]  = a_in_p[372];
+	a_out_p[693]  = a_in_p[373];
+	a_out_p[821]  = a_in_p[374];
+	a_out_p[949]  = a_in_p[375];
+	a_out_p[117]  = a_in_p[376];
+	a_out_p[245]  = a_in_p[377];
+	a_out_p[373]  = a_in_p[378];
+	a_out_p[501]  = a_in_p[379];
+	a_out_p[629]  = a_in_p[380];
+	a_out_p[757]  = a_in_p[381];
+	a_out_p[885]  = a_in_p[382];
+	a_out_p[1013] = a_in_p[383];
+	a_out_p[6]    = a_in_p[384];
+	a_out_p[134]  = a_in_p[385];
+	a_out_p[262]  = a_in_p[386];
+	a_out_p[390]  = a_in_p[387];
+	a_out_p[518]  = a_in_p[388];
+	a_out_p[646]  = a_in_p[389];
+	a_out_p[774]  = a_in_p[390];
+	a_out_p[902]  = a_in_p[391];
+	a_out_p[70]   = a_in_p[392];
+	a_out_p[198]  = a_in_p[393];
+	a_out_p[326]  = a_in_p[394];
+	a_out_p[454]  = a_in_p[395];
+	a_out_p[582]  = a_in_p[396];
+	a_out_p[710]  = a_in_p[397];
+	a_out_p[838]  = a_in_p[398];
+	a_out_p[966]  = a_in_p[399];
+	a_out_p[38]   = a_in_p[400];
+	a_out_p[166]  = a_in_p[401];
+	a_out_p[294]  = a_in_p[402];
+	a_out_p[422]  = a_in_p[403];
+	a_out_p[550]  = a_in_p[404];
+	a_out_p[678]  = a_in_p[405];
+	a_out_p[806]  = a_in_p[406];
+	a_out_p[934]  = a_in_p[407];
+	a_out_p[102]  = a_in_p[408];
+	a_out_p[230]  = a_in_p[409];
+	a_out_p[358]  = a_in_p[410];
+	a_out_p[486]  = a_in_p[411];
+	a_out_p[614]  = a_in_p[412];
+	a_out_p[742]  = a_in_p[413];
+	a_out_p[870]  = a_in_p[414];
+	a_out_p[998]  = a_in_p[415];
+	a_out_p[22]   = a_in_p[416];
+	a_out_p[150]  = a_in_p[417];
+	a_out_p[278]  = a_in_p[418];
+	a_out_p[406]  = a_in_p[419];
+	a_out_p[534]  = a_in_p[420];
+	a_out_p[662]  = a_in_p[421];
+	a_out_p[790]  = a_in_p[422];
+	a_out_p[918]  = a_in_p[423];
+	a_out_p[86]   = a_in_p[424];
+	a_out_p[214]  = a_in_p[425];
+	a_out_p[342]  = a_in_p[426];
+	a_out_p[470]  = a_in_p[427];
+	a_out_p[598]  = a_in_p[428];
+	a_out_p[726]  = a_in_p[429];
+	a_out_p[854]  = a_in_p[430];
+	a_out_p[982]  = a_in_p[431];
+	a_out_p[54]   = a_in_p[432];
+	a_out_p[182]  = a_in_p[433];
+	a_out_p[310]  = a_in_p[434];
+	a_out_p[438]  = a_in_p[435];
+	a_out_p[566]  = a_in_p[436];
+	a_out_p[694]  = a_in_p[437];
+	a_out_p[822]  = a_in_p[438];
+	a_out_p[950]  = a_in_p[439];
+	a_out_p[118]  = a_in_p[440];
+	a_out_p[246]  = a_in_p[441];
+	a_out_p[374]  = a_in_p[442];
+	a_out_p[502]  = a_in_p[443];
+	a_out_p[630]  = a_in_p[444];
+	a_out_p[758]  = a_in_p[445];
+	a_out_p[886]  = a_in_p[446];
+	a_out_p[1014] = a_in_p[447];
+	a_out_p[7]    = a_in_p[448];
+	a_out_p[135]  = a_in_p[449];
+	a_out_p[263]  = a_in_p[450];
+	a_out_p[391]  = a_in_p[451];
+	a_out_p[519]  = a_in_p[452];
+	a_out_p[647]  = a_in_p[453];
+	a_out_p[775]  = a_in_p[454];
+	a_out_p[903]  = a_in_p[455];
+	a_out_p[71]   = a_in_p[456];
+	a_out_p[199]  = a_in_p[457];
+	a_out_p[327]  = a_in_p[458];
+	a_out_p[455]  = a_in_p[459];
+	a_out_p[583]  = a_in_p[460];
+	a_out_p[711]  = a_in_p[461];
+	a_out_p[839]  = a_in_p[462];
+	a_out_p[967]  = a_in_p[463];
+	a_out_p[39]   = a_in_p[464];
+	a_out_p[167]  = a_in_p[465];
+	a_out_p[295]  = a_in_p[466];
+	a_out_p[423]  = a_in_p[467];
+	a_out_p[551]  = a_in_p[468];
+	a_out_p[679]  = a_in_p[469];
+	a_out_p[807]  = a_in_p[470];
+	a_out_p[935]  = a_in_p[471];
+	a_out_p[103]  = a_in_p[472];
+	a_out_p[231]  = a_in_p[473];
+	a_out_p[359]  = a_in_p[474];
+	a_out_p[487]  = a_in_p[475];
+	a_out_p[615]  = a_in_p[476];
+	a_out_p[743]  = a_in_p[477];
+	a_out_p[871]  = a_in_p[478];
+	a_out_p[999]  = a_in_p[479];
+	a_out_p[23]   = a_in_p[480];
+	a_out_p[151]  = a_in_p[481];
+	a_out_p[279]  = a_in_p[482];
+	a_out_p[407]  = a_in_p[483];
+	a_out_p[535]  = a_in_p[484];
+	a_out_p[663]  = a_in_p[485];
+	a_out_p[791]  = a_in_p[486];
+	a_out_p[919]  = a_in_p[487];
+	a_out_p[87]   = a_in_p[488];
+	a_out_p[215]  = a_in_p[489];
+	a_out_p[343]  = a_in_p[490];
+	a_out_p[471]  = a_in_p[491];
+	a_out_p[599]  = a_in_p[492];
+	a_out_p[727]  = a_in_p[493];
+	a_out_p[855]  = a_in_p[494];
+	a_out_p[983]  = a_in_p[495];
+	a_out_p[55]   = a_in_p[496];
+	a_out_p[183]  = a_in_p[497];
+	a_out_p[311]  = a_in_p[498];
+	a_out_p[439]  = a_in_p[499];
+	a_out_p[567]  = a_in_p[500];
+	a_out_p[695]  = a_in_p[501];
+	a_out_p[823]  = a_in_p[502];
+	a_out_p[951]  = a_in_p[503];
+	a_out_p[119]  = a_in_p[504];
+	a_out_p[247]  = a_in_p[505];
+	a_out_p[375]  = a_in_p[506];
+	a_out_p[503]  = a_in_p[507];
+	a_out_p[631]  = a_in_p[508];
+	a_out_p[759]  = a_in_p[509];
+	a_out_p[887]  = a_in_p[510];
+	a_out_p[1015] = a_in_p[511];
+	a_out_p[8]    = a_in_p[512];
+	a_out_p[136]  = a_in_p[513];
+	a_out_p[264]  = a_in_p[514];
+	a_out_p[392]  = a_in_p[515];
+	a_out_p[520]  = a_in_p[516];
+	a_out_p[648]  = a_in_p[517];
+	a_out_p[776]  = a_in_p[518];
+	a_out_p[904]  = a_in_p[519];
+	a_out_p[72]   = a_in_p[520];
+	a_out_p[200]  = a_in_p[521];
+	a_out_p[328]  = a_in_p[522];
+	a_out_p[456]  = a_in_p[523];
+	a_out_p[584]  = a_in_p[524];
+	a_out_p[712]  = a_in_p[525];
+	a_out_p[840]  = a_in_p[526];
+	a_out_p[968]  = a_in_p[527];
+	a_out_p[40]   = a_in_p[528];
+	a_out_p[168]  = a_in_p[529];
+	a_out_p[296]  = a_in_p[530];
+	a_out_p[424]  = a_in_p[531];
+	a_out_p[552]  = a_in_p[532];
+	a_out_p[680]  = a_in_p[533];
+	a_out_p[808]  = a_in_p[534];
+	a_out_p[936]  = a_in_p[535];
+	a_out_p[104]  = a_in_p[536];
+	a_out_p[232]  = a_in_p[537];
+	a_out_p[360]  = a_in_p[538];
+	a_out_p[488]  = a_in_p[539];
+	a_out_p[616]  = a_in_p[540];
+	a_out_p[744]  = a_in_p[541];
+	a_out_p[872]  = a_in_p[542];
+	a_out_p[1000] = a_in_p[543];
+	a_out_p[24]   = a_in_p[544];
+	a_out_p[152]  = a_in_p[545];
+	a_out_p[280]  = a_in_p[546];
+	a_out_p[408]  = a_in_p[547];
+	a_out_p[536]  = a_in_p[548];
+	a_out_p[664]  = a_in_p[549];
+	a_out_p[792]  = a_in_p[550];
+	a_out_p[920]  = a_in_p[551];
+	a_out_p[88]   = a_in_p[552];
+	a_out_p[216]  = a_in_p[553];
+	a_out_p[344]  = a_in_p[554];
+	a_out_p[472]  = a_in_p[555];
+	a_out_p[600]  = a_in_p[556];
+	a_out_p[728]  = a_in_p[557];
+	a_out_p[856]  = a_in_p[558];
+	a_out_p[984]  = a_in_p[559];
+	a_out_p[56]   = a_in_p[560];
+	a_out_p[184]  = a_in_p[561];
+	a_out_p[312]  = a_in_p[562];
+	a_out_p[440]  = a_in_p[563];
+	a_out_p[568]  = a_in_p[564];
+	a_out_p[696]  = a_in_p[565];
+	a_out_p[824]  = a_in_p[566];
+	a_out_p[952]  = a_in_p[567];
+	a_out_p[120]  = a_in_p[568];
+	a_out_p[248]  = a_in_p[569];
+	a_out_p[376]  = a_in_p[570];
+	a_out_p[504]  = a_in_p[571];
+	a_out_p[632]  = a_in_p[572];
+	a_out_p[760]  = a_in_p[573];
+	a_out_p[888]  = a_in_p[574];
+	a_out_p[1016] = a_in_p[575];
+	a_out_p[9]    = a_in_p[576];
+	a_out_p[137]  = a_in_p[577];
+	a_out_p[265]  = a_in_p[578];
+	a_out_p[393]  = a_in_p[579];
+	a_out_p[521]  = a_in_p[580];
+	a_out_p[649]  = a_in_p[581];
+	a_out_p[777]  = a_in_p[582];
+	a_out_p[905]  = a_in_p[583];
+	a_out_p[73]   = a_in_p[584];
+	a_out_p[201]  = a_in_p[585];
+	a_out_p[329]  = a_in_p[586];
+	a_out_p[457]  = a_in_p[587];
+	a_out_p[585]  = a_in_p[588];
+	a_out_p[713]  = a_in_p[589];
+	a_out_p[841]  = a_in_p[590];
+	a_out_p[969]  = a_in_p[591];
+	a_out_p[41]   = a_in_p[592];
+	a_out_p[169]  = a_in_p[593];
+	a_out_p[297]  = a_in_p[594];
+	a_out_p[425]  = a_in_p[595];
+	a_out_p[553]  = a_in_p[596];
+	a_out_p[681]  = a_in_p[597];
+	a_out_p[809]  = a_in_p[598];
+	a_out_p[937]  = a_in_p[599];
+	a_out_p[105]  = a_in_p[600];
+	a_out_p[233]  = a_in_p[601];
+	a_out_p[361]  = a_in_p[602];
+	a_out_p[489]  = a_in_p[603];
+	a_out_p[617]  = a_in_p[604];
+	a_out_p[745]  = a_in_p[605];
+	a_out_p[873]  = a_in_p[606];
+	a_out_p[1001] = a_in_p[607];
+	a_out_p[25]   = a_in_p[608];
+	a_out_p[153]  = a_in_p[609];
+	a_out_p[281]  = a_in_p[610];
+	a_out_p[409]  = a_in_p[611];
+	a_out_p[537]  = a_in_p[612];
+	a_out_p[665]  = a_in_p[613];
+	a_out_p[793]  = a_in_p[614];
+	a_out_p[921]  = a_in_p[615];
+	a_out_p[89]   = a_in_p[616];
+	a_out_p[217]  = a_in_p[617];
+	a_out_p[345]  = a_in_p[618];
+	a_out_p[473]  = a_in_p[619];
+	a_out_p[601]  = a_in_p[620];
+	a_out_p[729]  = a_in_p[621];
+	a_out_p[857]  = a_in_p[622];
+	a_out_p[985]  = a_in_p[623];
+	a_out_p[57]   = a_in_p[624];
+	a_out_p[185]  = a_in_p[625];
+	a_out_p[313]  = a_in_p[626];
+	a_out_p[441]  = a_in_p[627];
+	a_out_p[569]  = a_in_p[628];
+	a_out_p[697]  = a_in_p[629];
+	a_out_p[825]  = a_in_p[630];
+	a_out_p[953]  = a_in_p[631];
+	a_out_p[121]  = a_in_p[632];
+	a_out_p[249]  = a_in_p[633];
+	a_out_p[377]  = a_in_p[634];
+	a_out_p[505]  = a_in_p[635];
+	a_out_p[633]  = a_in_p[636];
+	a_out_p[761]  = a_in_p[637];
+	a_out_p[889]  = a_in_p[638];
+	a_out_p[1017] = a_in_p[639];
+	a_out_p[10]   = a_in_p[640];
+	a_out_p[138]  = a_in_p[641];
+	a_out_p[266]  = a_in_p[642];
+	a_out_p[394]  = a_in_p[643];
+	a_out_p[522]  = a_in_p[644];
+	a_out_p[650]  = a_in_p[645];
+	a_out_p[778]  = a_in_p[646];
+	a_out_p[906]  = a_in_p[647];
+	a_out_p[74]   = a_in_p[648];
+	a_out_p[202]  = a_in_p[649];
+	a_out_p[330]  = a_in_p[650];
+	a_out_p[458]  = a_in_p[651];
+	a_out_p[586]  = a_in_p[652];
+	a_out_p[714]  = a_in_p[653];
+	a_out_p[842]  = a_in_p[654];
+	a_out_p[970]  = a_in_p[655];
+	a_out_p[42]   = a_in_p[656];
+	a_out_p[170]  = a_in_p[657];
+	a_out_p[298]  = a_in_p[658];
+	a_out_p[426]  = a_in_p[659];
+	a_out_p[554]  = a_in_p[660];
+	a_out_p[682]  = a_in_p[661];
+	a_out_p[810]  = a_in_p[662];
+	a_out_p[938]  = a_in_p[663];
+	a_out_p[106]  = a_in_p[664];
+	a_out_p[234]  = a_in_p[665];
+	a_out_p[362]  = a_in_p[666];
+	a_out_p[490]  = a_in_p[667];
+	a_out_p[618]  = a_in_p[668];
+	a_out_p[746]  = a_in_p[669];
+	a_out_p[874]  = a_in_p[670];
+	a_out_p[1002] = a_in_p[671];
+	a_out_p[26]   = a_in_p[672];
+	a_out_p[154]  = a_in_p[673];
+	a_out_p[282]  = a_in_p[674];
+	a_out_p[410]  = a_in_p[675];
+	a_out_p[538]  = a_in_p[676];
+	a_out_p[666]  = a_in_p[677];
+	a_out_p[794]  = a_in_p[678];
+	a_out_p[922]  = a_in_p[679];
+	a_out_p[90]   = a_in_p[680];
+	a_out_p[218]  = a_in_p[681];
+	a_out_p[346]  = a_in_p[682];
+	a_out_p[474]  = a_in_p[683];
+	a_out_p[602]  = a_in_p[684];
+	a_out_p[730]  = a_in_p[685];
+	a_out_p[858]  = a_in_p[686];
+	a_out_p[986]  = a_in_p[687];
+	a_out_p[58]   = a_in_p[688];
+	a_out_p[186]  = a_in_p[689];
+	a_out_p[314]  = a_in_p[690];
+	a_out_p[442]  = a_in_p[691];
+	a_out_p[570]  = a_in_p[692];
+	a_out_p[698]  = a_in_p[693];
+	a_out_p[826]  = a_in_p[694];
+	a_out_p[954]  = a_in_p[695];
+	a_out_p[122]  = a_in_p[696];
+	a_out_p[250]  = a_in_p[697];
+	a_out_p[378]  = a_in_p[698];
+	a_out_p[506]  = a_in_p[699];
+	a_out_p[634]  = a_in_p[700];
+	a_out_p[762]  = a_in_p[701];
+	a_out_p[890]  = a_in_p[702];
+	a_out_p[1018] = a_in_p[703];
+	a_out_p[11]   = a_in_p[704];
+	a_out_p[139]  = a_in_p[705];
+	a_out_p[267]  = a_in_p[706];
+	a_out_p[395]  = a_in_p[707];
+	a_out_p[523]  = a_in_p[708];
+	a_out_p[651]  = a_in_p[709];
+	a_out_p[779]  = a_in_p[710];
+	a_out_p[907]  = a_in_p[711];
+	a_out_p[75]   = a_in_p[712];
+	a_out_p[203]  = a_in_p[713];
+	a_out_p[331]  = a_in_p[714];
+	a_out_p[459]  = a_in_p[715];
+	a_out_p[587]  = a_in_p[716];
+	a_out_p[715]  = a_in_p[717];
+	a_out_p[843]  = a_in_p[718];
+	a_out_p[971]  = a_in_p[719];
+	a_out_p[43]   = a_in_p[720];
+	a_out_p[171]  = a_in_p[721];
+	a_out_p[299]  = a_in_p[722];
+	a_out_p[427]  = a_in_p[723];
+	a_out_p[555]  = a_in_p[724];
+	a_out_p[683]  = a_in_p[725];
+	a_out_p[811]  = a_in_p[726];
+	a_out_p[939]  = a_in_p[727];
+	a_out_p[107]  = a_in_p[728];
+	a_out_p[235]  = a_in_p[729];
+	a_out_p[363]  = a_in_p[730];
+	a_out_p[491]  = a_in_p[731];
+	a_out_p[619]  = a_in_p[732];
+	a_out_p[747]  = a_in_p[733];
+	a_out_p[875]  = a_in_p[734];
+	a_out_p[1003] = a_in_p[735];
+	a_out_p[27]   = a_in_p[736];
+	a_out_p[155]  = a_in_p[737];
+	a_out_p[283]  = a_in_p[738];
+	a_out_p[411]  = a_in_p[739];
+	a_out_p[539]  = a_in_p[740];
+	a_out_p[667]  = a_in_p[741];
+	a_out_p[795]  = a_in_p[742];
+	a_out_p[923]  = a_in_p[743];
+	a_out_p[91]   = a_in_p[744];
+	a_out_p[219]  = a_in_p[745];
+	a_out_p[347]  = a_in_p[746];
+	a_out_p[475]  = a_in_p[747];
+	a_out_p[603]  = a_in_p[748];
+	a_out_p[731]  = a_in_p[749];
+	a_out_p[859]  = a_in_p[750];
+	a_out_p[987]  = a_in_p[751];
+	a_out_p[59]   = a_in_p[752];
+	a_out_p[187]  = a_in_p[753];
+	a_out_p[315]  = a_in_p[754];
+	a_out_p[443]  = a_in_p[755];
+	a_out_p[571]  = a_in_p[756];
+	a_out_p[699]  = a_in_p[757];
+	a_out_p[827]  = a_in_p[758];
+	a_out_p[955]  = a_in_p[759];
+	a_out_p[123]  = a_in_p[760];
+	a_out_p[251]  = a_in_p[761];
+	a_out_p[379]  = a_in_p[762];
+	a_out_p[507]  = a_in_p[763];
+	a_out_p[635]  = a_in_p[764];
+	a_out_p[763]  = a_in_p[765];
+	a_out_p[891]  = a_in_p[766];
+	a_out_p[1019] = a_in_p[767];
+	a_out_p[12]   = a_in_p[768];
+	a_out_p[140]  = a_in_p[769];
+	a_out_p[268]  = a_in_p[770];
+	a_out_p[396]  = a_in_p[771];
+	a_out_p[524]  = a_in_p[772];
+	a_out_p[652]  = a_in_p[773];
+	a_out_p[780]  = a_in_p[774];
+	a_out_p[908]  = a_in_p[775];
+	a_out_p[76]   = a_in_p[776];
+	a_out_p[204]  = a_in_p[777];
+	a_out_p[332]  = a_in_p[778];
+	a_out_p[460]  = a_in_p[779];
+	a_out_p[588]  = a_in_p[780];
+	a_out_p[716]  = a_in_p[781];
+	a_out_p[844]  = a_in_p[782];
+	a_out_p[972]  = a_in_p[783];
+	a_out_p[44]   = a_in_p[784];
+	a_out_p[172]  = a_in_p[785];
+	a_out_p[300]  = a_in_p[786];
+	a_out_p[428]  = a_in_p[787];
+	a_out_p[556]  = a_in_p[788];
+	a_out_p[684]  = a_in_p[789];
+	a_out_p[812]  = a_in_p[790];
+	a_out_p[940]  = a_in_p[791];
+	a_out_p[108]  = a_in_p[792];
+	a_out_p[236]  = a_in_p[793];
+	a_out_p[364]  = a_in_p[794];
+	a_out_p[492]  = a_in_p[795];
+	a_out_p[620]  = a_in_p[796];
+	a_out_p[748]  = a_in_p[797];
+	a_out_p[876]  = a_in_p[798];
+	a_out_p[1004] = a_in_p[799];
+	a_out_p[28]   = a_in_p[800];
+	a_out_p[156]  = a_in_p[801];
+	a_out_p[284]  = a_in_p[802];
+	a_out_p[412]  = a_in_p[803];
+	a_out_p[540]  = a_in_p[804];
+	a_out_p[668]  = a_in_p[805];
+	a_out_p[796]  = a_in_p[806];
+	a_out_p[924]  = a_in_p[807];
+	a_out_p[92]   = a_in_p[808];
+	a_out_p[220]  = a_in_p[809];
+	a_out_p[348]  = a_in_p[810];
+	a_out_p[476]  = a_in_p[811];
+	a_out_p[604]  = a_in_p[812];
+	a_out_p[732]  = a_in_p[813];
+	a_out_p[860]  = a_in_p[814];
+	a_out_p[988]  = a_in_p[815];
+	a_out_p[60]   = a_in_p[816];
+	a_out_p[188]  = a_in_p[817];
+	a_out_p[316]  = a_in_p[818];
+	a_out_p[444]  = a_in_p[819];
+	a_out_p[572]  = a_in_p[820];
+	a_out_p[700]  = a_in_p[821];
+	a_out_p[828]  = a_in_p[822];
+	a_out_p[956]  = a_in_p[823];
+	a_out_p[124]  = a_in_p[824];
+	a_out_p[252]  = a_in_p[825];
+	a_out_p[380]  = a_in_p[826];
+	a_out_p[508]  = a_in_p[827];
+	a_out_p[636]  = a_in_p[828];
+	a_out_p[764]  = a_in_p[829];
+	a_out_p[892]  = a_in_p[830];
+	a_out_p[1020] = a_in_p[831];
+	a_out_p[13]   = a_in_p[832];
+	a_out_p[141]  = a_in_p[833];
+	a_out_p[269]  = a_in_p[834];
+	a_out_p[397]  = a_in_p[835];
+	a_out_p[525]  = a_in_p[836];
+	a_out_p[653]  = a_in_p[837];
+	a_out_p[781]  = a_in_p[838];
+	a_out_p[909]  = a_in_p[839];
+	a_out_p[77]   = a_in_p[840];
+	a_out_p[205]  = a_in_p[841];
+	a_out_p[333]  = a_in_p[842];
+	a_out_p[461]  = a_in_p[843];
+	a_out_p[589]  = a_in_p[844];
+	a_out_p[717]  = a_in_p[845];
+	a_out_p[845]  = a_in_p[846];
+	a_out_p[973]  = a_in_p[847];
+	a_out_p[45]   = a_in_p[848];
+	a_out_p[173]  = a_in_p[849];
+	a_out_p[301]  = a_in_p[850];
+	a_out_p[429]  = a_in_p[851];
+	a_out_p[557]  = a_in_p[852];
+	a_out_p[685]  = a_in_p[853];
+	a_out_p[813]  = a_in_p[854];
+	a_out_p[941]  = a_in_p[855];
+	a_out_p[109]  = a_in_p[856];
+	a_out_p[237]  = a_in_p[857];
+	a_out_p[365]  = a_in_p[858];
+	a_out_p[493]  = a_in_p[859];
+	a_out_p[621]  = a_in_p[860];
+	a_out_p[749]  = a_in_p[861];
+	a_out_p[877]  = a_in_p[862];
+	a_out_p[1005] = a_in_p[863];
+	a_out_p[29]   = a_in_p[864];
+	a_out_p[157]  = a_in_p[865];
+	a_out_p[285]  = a_in_p[866];
+	a_out_p[413]  = a_in_p[867];
+	a_out_p[541]  = a_in_p[868];
+	a_out_p[669]  = a_in_p[869];
+	a_out_p[797]  = a_in_p[870];
+	a_out_p[925]  = a_in_p[871];
+	a_out_p[93]   = a_in_p[872];
+	a_out_p[221]  = a_in_p[873];
+	a_out_p[349]  = a_in_p[874];
+	a_out_p[477]  = a_in_p[875];
+	a_out_p[605]  = a_in_p[876];
+	a_out_p[733]  = a_in_p[877];
+	a_out_p[861]  = a_in_p[878];
+	a_out_p[989]  = a_in_p[879];
+	a_out_p[61]   = a_in_p[880];
+	a_out_p[189]  = a_in_p[881];
+	a_out_p[317]  = a_in_p[882];
+	a_out_p[445]  = a_in_p[883];
+	a_out_p[573]  = a_in_p[884];
+	a_out_p[701]  = a_in_p[885];
+	a_out_p[829]  = a_in_p[886];
+	a_out_p[957]  = a_in_p[887];
+	a_out_p[125]  = a_in_p[888];
+	a_out_p[253]  = a_in_p[889];
+	a_out_p[381]  = a_in_p[890];
+	a_out_p[509]  = a_in_p[891];
+	a_out_p[637]  = a_in_p[892];
+	a_out_p[765]  = a_in_p[893];
+	a_out_p[893]  = a_in_p[894];
+	a_out_p[1021] = a_in_p[895];
+	a_out_p[14]   = a_in_p[896];
+	a_out_p[142]  = a_in_p[897];
+	a_out_p[270]  = a_in_p[898];
+	a_out_p[398]  = a_in_p[899];
+	a_out_p[526]  = a_in_p[900];
+	a_out_p[654]  = a_in_p[901];
+	a_out_p[782]  = a_in_p[902];
+	a_out_p[910]  = a_in_p[903];
+	a_out_p[78]   = a_in_p[904];
+	a_out_p[206]  = a_in_p[905];
+	a_out_p[334]  = a_in_p[906];
+	a_out_p[462]  = a_in_p[907];
+	a_out_p[590]  = a_in_p[908];
+	a_out_p[718]  = a_in_p[909];
+	a_out_p[846]  = a_in_p[910];
+	a_out_p[974]  = a_in_p[911];
+	a_out_p[46]   = a_in_p[912];
+	a_out_p[174]  = a_in_p[913];
+	a_out_p[302]  = a_in_p[914];
+	a_out_p[430]  = a_in_p[915];
+	a_out_p[558]  = a_in_p[916];
+	a_out_p[686]  = a_in_p[917];
+	a_out_p[814]  = a_in_p[918];
+	a_out_p[942]  = a_in_p[919];
+	a_out_p[110]  = a_in_p[920];
+	a_out_p[238]  = a_in_p[921];
+	a_out_p[366]  = a_in_p[922];
+	a_out_p[494]  = a_in_p[923];
+	a_out_p[622]  = a_in_p[924];
+	a_out_p[750]  = a_in_p[925];
+	a_out_p[878]  = a_in_p[926];
+	a_out_p[1006] = a_in_p[927];
+	a_out_p[30]   = a_in_p[928];
+	a_out_p[158]  = a_in_p[929];
+	a_out_p[286]  = a_in_p[930];
+	a_out_p[414]  = a_in_p[931];
+	a_out_p[542]  = a_in_p[932];
+	a_out_p[670]  = a_in_p[933];
+	a_out_p[798]  = a_in_p[934];
+	a_out_p[926]  = a_in_p[935];
+	a_out_p[94]   = a_in_p[936];
+	a_out_p[222]  = a_in_p[937];
+	a_out_p[350]  = a_in_p[938];
+	a_out_p[478]  = a_in_p[939];
+	a_out_p[606]  = a_in_p[940];
+	a_out_p[734]  = a_in_p[941];
+	a_out_p[862]  = a_in_p[942];
+	a_out_p[990]  = a_in_p[943];
+	a_out_p[62]   = a_in_p[944];
+	a_out_p[190]  = a_in_p[945];
+	a_out_p[318]  = a_in_p[946];
+	a_out_p[446]  = a_in_p[947];
+	a_out_p[574]  = a_in_p[948];
+	a_out_p[702]  = a_in_p[949];
+	a_out_p[830]  = a_in_p[950];
+	a_out_p[958]  = a_in_p[951];
+	a_out_p[126]  = a_in_p[952];
+	a_out_p[254]  = a_in_p[953];
+	a_out_p[382]  = a_in_p[954];
+	a_out_p[510]  = a_in_p[955];
+	a_out_p[638]  = a_in_p[956];
+	a_out_p[766]  = a_in_p[957];
+	a_out_p[894]  = a_in_p[958];
+	a_out_p[1022] = a_in_p[959];
+	a_out_p[15]   = a_in_p[960];
+	a_out_p[143]  = a_in_p[961];
+	a_out_p[271]  = a_in_p[962];
+	a_out_p[399]  = a_in_p[963];
+	a_out_p[527]  = a_in_p[964];
+	a_out_p[655]  = a_in_p[965];
+	a_out_p[783]  = a_in_p[966];
+	a_out_p[911]  = a_in_p[967];
+	a_out_p[79]   = a_in_p[968];
+	a_out_p[207]  = a_in_p[969];
+	a_out_p[335]  = a_in_p[970];
+	a_out_p[463]  = a_in_p[971];
+	a_out_p[591]  = a_in_p[972];
+	a_out_p[719]  = a_in_p[973];
+	a_out_p[847]  = a_in_p[974];
+	a_out_p[975]  = a_in_p[975];
+	a_out_p[47]   = a_in_p[976];
+	a_out_p[175]  = a_in_p[977];
+	a_out_p[303]  = a_in_p[978];
+	a_out_p[431]  = a_in_p[979];
+	a_out_p[559]  = a_in_p[980];
+	a_out_p[687]  = a_in_p[981];
+	a_out_p[815]  = a_in_p[982];
+	a_out_p[943]  = a_in_p[983];
+	a_out_p[111]  = a_in_p[984];
+	a_out_p[239]  = a_in_p[985];
+	a_out_p[367]  = a_in_p[986];
+	a_out_p[495]  = a_in_p[987];
+	a_out_p[623]  = a_in_p[988];
+	a_out_p[751]  = a_in_p[989];
+	a_out_p[879]  = a_in_p[990];
+	a_out_p[1007] = a_in_p[991];
+	a_out_p[31]   = a_in_p[992];
+	a_out_p[159]  = a_in_p[993];
+	a_out_p[287]  = a_in_p[994];
+	a_out_p[415]  = a_in_p[995];
+	a_out_p[543]  = a_in_p[996];
+	a_out_p[671]  = a_in_p[997];
+	a_out_p[799]  = a_in_p[998];
+	a_out_p[927]  = a_in_p[999];
+	a_out_p[95]   = a_in_p[1000];
+	a_out_p[223]  = a_in_p[1001];
+	a_out_p[351]  = a_in_p[1002];
+	a_out_p[479]  = a_in_p[1003];
+	a_out_p[607]  = a_in_p[1004];
+	a_out_p[735]  = a_in_p[1005];
+	a_out_p[863]  = a_in_p[1006];
+	a_out_p[991]  = a_in_p[1007];
+	a_out_p[63]   = a_in_p[1008];
+	a_out_p[191]  = a_in_p[1009];
+	a_out_p[319]  = a_in_p[1010];
+	a_out_p[447]  = a_in_p[1011];
+	a_out_p[575]  = a_in_p[1012];
+	a_out_p[703]  = a_in_p[1013];
+	a_out_p[831]  = a_in_p[1014];
+	a_out_p[959]  = a_in_p[1015];
+	a_out_p[127]  = a_in_p[1016];
+	a_out_p[255]  = a_in_p[1017];
+	a_out_p[383]  = a_in_p[1018];
+	a_out_p[511]  = a_in_p[1019];
+	a_out_p[639]  = a_in_p[1020];
+	a_out_p[767]  = a_in_p[1021];
+	a_out_p[895]  = a_in_p[1022];
+	a_out_p[1023] = a_in_p[1023];
+}
+void transpose_o(const uint64_t* __restrict a_in_p, uint64_t* __restrict a_out_p) {
+	a_out_p[0]    = a_in_p[0];
+	a_out_p[128]  = a_in_p[1];
+	a_out_p[256]  = a_in_p[2];
+	a_out_p[384]  = a_in_p[3];
+	a_out_p[512]  = a_in_p[4];
+	a_out_p[640]  = a_in_p[5];
+	a_out_p[768]  = a_in_p[6];
+	a_out_p[896]  = a_in_p[7];
+	a_out_p[64]   = a_in_p[8];
+	a_out_p[192]  = a_in_p[9];
+	a_out_p[320]  = a_in_p[10];
+	a_out_p[448]  = a_in_p[11];
+	a_out_p[576]  = a_in_p[12];
+	a_out_p[704]  = a_in_p[13];
+	a_out_p[832]  = a_in_p[14];
+	a_out_p[960]  = a_in_p[15];
+	a_out_p[32]   = a_in_p[16];
+	a_out_p[160]  = a_in_p[17];
+	a_out_p[288]  = a_in_p[18];
+	a_out_p[416]  = a_in_p[19];
+	a_out_p[544]  = a_in_p[20];
+	a_out_p[672]  = a_in_p[21];
+	a_out_p[800]  = a_in_p[22];
+	a_out_p[928]  = a_in_p[23];
+	a_out_p[96]   = a_in_p[24];
+	a_out_p[224]  = a_in_p[25];
+	a_out_p[352]  = a_in_p[26];
+	a_out_p[480]  = a_in_p[27];
+	a_out_p[608]  = a_in_p[28];
+	a_out_p[736]  = a_in_p[29];
+	a_out_p[864]  = a_in_p[30];
+	a_out_p[992]  = a_in_p[31];
+	a_out_p[16]   = a_in_p[32];
+	a_out_p[144]  = a_in_p[33];
+	a_out_p[272]  = a_in_p[34];
+	a_out_p[400]  = a_in_p[35];
+	a_out_p[528]  = a_in_p[36];
+	a_out_p[656]  = a_in_p[37];
+	a_out_p[784]  = a_in_p[38];
+	a_out_p[912]  = a_in_p[39];
+	a_out_p[80]   = a_in_p[40];
+	a_out_p[208]  = a_in_p[41];
+	a_out_p[336]  = a_in_p[42];
+	a_out_p[464]  = a_in_p[43];
+	a_out_p[592]  = a_in_p[44];
+	a_out_p[720]  = a_in_p[45];
+	a_out_p[848]  = a_in_p[46];
+	a_out_p[976]  = a_in_p[47];
+	a_out_p[48]   = a_in_p[48];
+	a_out_p[176]  = a_in_p[49];
+	a_out_p[304]  = a_in_p[50];
+	a_out_p[432]  = a_in_p[51];
+	a_out_p[560]  = a_in_p[52];
+	a_out_p[688]  = a_in_p[53];
+	a_out_p[816]  = a_in_p[54];
+	a_out_p[944]  = a_in_p[55];
+	a_out_p[112]  = a_in_p[56];
+	a_out_p[240]  = a_in_p[57];
+	a_out_p[368]  = a_in_p[58];
+	a_out_p[496]  = a_in_p[59];
+	a_out_p[624]  = a_in_p[60];
+	a_out_p[752]  = a_in_p[61];
+	a_out_p[880]  = a_in_p[62];
+	a_out_p[1008] = a_in_p[63];
+	a_out_p[1]    = a_in_p[64];
+	a_out_p[129]  = a_in_p[65];
+	a_out_p[257]  = a_in_p[66];
+	a_out_p[385]  = a_in_p[67];
+	a_out_p[513]  = a_in_p[68];
+	a_out_p[641]  = a_in_p[69];
+	a_out_p[769]  = a_in_p[70];
+	a_out_p[897]  = a_in_p[71];
+	a_out_p[65]   = a_in_p[72];
+	a_out_p[193]  = a_in_p[73];
+	a_out_p[321]  = a_in_p[74];
+	a_out_p[449]  = a_in_p[75];
+	a_out_p[577]  = a_in_p[76];
+	a_out_p[705]  = a_in_p[77];
+	a_out_p[833]  = a_in_p[78];
+	a_out_p[961]  = a_in_p[79];
+	a_out_p[33]   = a_in_p[80];
+	a_out_p[161]  = a_in_p[81];
+	a_out_p[289]  = a_in_p[82];
+	a_out_p[417]  = a_in_p[83];
+	a_out_p[545]  = a_in_p[84];
+	a_out_p[673]  = a_in_p[85];
+	a_out_p[801]  = a_in_p[86];
+	a_out_p[929]  = a_in_p[87];
+	a_out_p[97]   = a_in_p[88];
+	a_out_p[225]  = a_in_p[89];
+	a_out_p[353]  = a_in_p[90];
+	a_out_p[481]  = a_in_p[91];
+	a_out_p[609]  = a_in_p[92];
+	a_out_p[737]  = a_in_p[93];
+	a_out_p[865]  = a_in_p[94];
+	a_out_p[993]  = a_in_p[95];
+	a_out_p[17]   = a_in_p[96];
+	a_out_p[145]  = a_in_p[97];
+	a_out_p[273]  = a_in_p[98];
+	a_out_p[401]  = a_in_p[99];
+	a_out_p[529]  = a_in_p[100];
+	a_out_p[657]  = a_in_p[101];
+	a_out_p[785]  = a_in_p[102];
+	a_out_p[913]  = a_in_p[103];
+	a_out_p[81]   = a_in_p[104];
+	a_out_p[209]  = a_in_p[105];
+	a_out_p[337]  = a_in_p[106];
+	a_out_p[465]  = a_in_p[107];
+	a_out_p[593]  = a_in_p[108];
+	a_out_p[721]  = a_in_p[109];
+	a_out_p[849]  = a_in_p[110];
+	a_out_p[977]  = a_in_p[111];
+	a_out_p[49]   = a_in_p[112];
+	a_out_p[177]  = a_in_p[113];
+	a_out_p[305]  = a_in_p[114];
+	a_out_p[433]  = a_in_p[115];
+	a_out_p[561]  = a_in_p[116];
+	a_out_p[689]  = a_in_p[117];
+	a_out_p[817]  = a_in_p[118];
+	a_out_p[945]  = a_in_p[119];
+	a_out_p[113]  = a_in_p[120];
+	a_out_p[241]  = a_in_p[121];
+	a_out_p[369]  = a_in_p[122];
+	a_out_p[497]  = a_in_p[123];
+	a_out_p[625]  = a_in_p[124];
+	a_out_p[753]  = a_in_p[125];
+	a_out_p[881]  = a_in_p[126];
+	a_out_p[1009] = a_in_p[127];
+	a_out_p[2]    = a_in_p[128];
+	a_out_p[130]  = a_in_p[129];
+	a_out_p[258]  = a_in_p[130];
+	a_out_p[386]  = a_in_p[131];
+	a_out_p[514]  = a_in_p[132];
+	a_out_p[642]  = a_in_p[133];
+	a_out_p[770]  = a_in_p[134];
+	a_out_p[898]  = a_in_p[135];
+	a_out_p[66]   = a_in_p[136];
+	a_out_p[194]  = a_in_p[137];
+	a_out_p[322]  = a_in_p[138];
+	a_out_p[450]  = a_in_p[139];
+	a_out_p[578]  = a_in_p[140];
+	a_out_p[706]  = a_in_p[141];
+	a_out_p[834]  = a_in_p[142];
+	a_out_p[962]  = a_in_p[143];
+	a_out_p[34]   = a_in_p[144];
+	a_out_p[162]  = a_in_p[145];
+	a_out_p[290]  = a_in_p[146];
+	a_out_p[418]  = a_in_p[147];
+	a_out_p[546]  = a_in_p[148];
+	a_out_p[674]  = a_in_p[149];
+	a_out_p[802]  = a_in_p[150];
+	a_out_p[930]  = a_in_p[151];
+	a_out_p[98]   = a_in_p[152];
+	a_out_p[226]  = a_in_p[153];
+	a_out_p[354]  = a_in_p[154];
+	a_out_p[482]  = a_in_p[155];
+	a_out_p[610]  = a_in_p[156];
+	a_out_p[738]  = a_in_p[157];
+	a_out_p[866]  = a_in_p[158];
+	a_out_p[994]  = a_in_p[159];
+	a_out_p[18]   = a_in_p[160];
+	a_out_p[146]  = a_in_p[161];
+	a_out_p[274]  = a_in_p[162];
+	a_out_p[402]  = a_in_p[163];
+	a_out_p[530]  = a_in_p[164];
+	a_out_p[658]  = a_in_p[165];
+	a_out_p[786]  = a_in_p[166];
+	a_out_p[914]  = a_in_p[167];
+	a_out_p[82]   = a_in_p[168];
+	a_out_p[210]  = a_in_p[169];
+	a_out_p[338]  = a_in_p[170];
+	a_out_p[466]  = a_in_p[171];
+	a_out_p[594]  = a_in_p[172];
+	a_out_p[722]  = a_in_p[173];
+	a_out_p[850]  = a_in_p[174];
+	a_out_p[978]  = a_in_p[175];
+	a_out_p[50]   = a_in_p[176];
+	a_out_p[178]  = a_in_p[177];
+	a_out_p[306]  = a_in_p[178];
+	a_out_p[434]  = a_in_p[179];
+	a_out_p[562]  = a_in_p[180];
+	a_out_p[690]  = a_in_p[181];
+	a_out_p[818]  = a_in_p[182];
+	a_out_p[946]  = a_in_p[183];
+	a_out_p[114]  = a_in_p[184];
+	a_out_p[242]  = a_in_p[185];
+	a_out_p[370]  = a_in_p[186];
+	a_out_p[498]  = a_in_p[187];
+	a_out_p[626]  = a_in_p[188];
+	a_out_p[754]  = a_in_p[189];
+	a_out_p[882]  = a_in_p[190];
+	a_out_p[1010] = a_in_p[191];
+	a_out_p[3]    = a_in_p[192];
+	a_out_p[131]  = a_in_p[193];
+	a_out_p[259]  = a_in_p[194];
+	a_out_p[387]  = a_in_p[195];
+	a_out_p[515]  = a_in_p[196];
+	a_out_p[643]  = a_in_p[197];
+	a_out_p[771]  = a_in_p[198];
+	a_out_p[899]  = a_in_p[199];
+	a_out_p[67]   = a_in_p[200];
+	a_out_p[195]  = a_in_p[201];
+	a_out_p[323]  = a_in_p[202];
+	a_out_p[451]  = a_in_p[203];
+	a_out_p[579]  = a_in_p[204];
+	a_out_p[707]  = a_in_p[205];
+	a_out_p[835]  = a_in_p[206];
+	a_out_p[963]  = a_in_p[207];
+	a_out_p[35]   = a_in_p[208];
+	a_out_p[163]  = a_in_p[209];
+	a_out_p[291]  = a_in_p[210];
+	a_out_p[419]  = a_in_p[211];
+	a_out_p[547]  = a_in_p[212];
+	a_out_p[675]  = a_in_p[213];
+	a_out_p[803]  = a_in_p[214];
+	a_out_p[931]  = a_in_p[215];
+	a_out_p[99]   = a_in_p[216];
+	a_out_p[227]  = a_in_p[217];
+	a_out_p[355]  = a_in_p[218];
+	a_out_p[483]  = a_in_p[219];
+	a_out_p[611]  = a_in_p[220];
+	a_out_p[739]  = a_in_p[221];
+	a_out_p[867]  = a_in_p[222];
+	a_out_p[995]  = a_in_p[223];
+	a_out_p[19]   = a_in_p[224];
+	a_out_p[147]  = a_in_p[225];
+	a_out_p[275]  = a_in_p[226];
+	a_out_p[403]  = a_in_p[227];
+	a_out_p[531]  = a_in_p[228];
+	a_out_p[659]  = a_in_p[229];
+	a_out_p[787]  = a_in_p[230];
+	a_out_p[915]  = a_in_p[231];
+	a_out_p[83]   = a_in_p[232];
+	a_out_p[211]  = a_in_p[233];
+	a_out_p[339]  = a_in_p[234];
+	a_out_p[467]  = a_in_p[235];
+	a_out_p[595]  = a_in_p[236];
+	a_out_p[723]  = a_in_p[237];
+	a_out_p[851]  = a_in_p[238];
+	a_out_p[979]  = a_in_p[239];
+	a_out_p[51]   = a_in_p[240];
+	a_out_p[179]  = a_in_p[241];
+	a_out_p[307]  = a_in_p[242];
+	a_out_p[435]  = a_in_p[243];
+	a_out_p[563]  = a_in_p[244];
+	a_out_p[691]  = a_in_p[245];
+	a_out_p[819]  = a_in_p[246];
+	a_out_p[947]  = a_in_p[247];
+	a_out_p[115]  = a_in_p[248];
+	a_out_p[243]  = a_in_p[249];
+	a_out_p[371]  = a_in_p[250];
+	a_out_p[499]  = a_in_p[251];
+	a_out_p[627]  = a_in_p[252];
+	a_out_p[755]  = a_in_p[253];
+	a_out_p[883]  = a_in_p[254];
+	a_out_p[1011] = a_in_p[255];
+	a_out_p[4]    = a_in_p[256];
+	a_out_p[132]  = a_in_p[257];
+	a_out_p[260]  = a_in_p[258];
+	a_out_p[388]  = a_in_p[259];
+	a_out_p[516]  = a_in_p[260];
+	a_out_p[644]  = a_in_p[261];
+	a_out_p[772]  = a_in_p[262];
+	a_out_p[900]  = a_in_p[263];
+	a_out_p[68]   = a_in_p[264];
+	a_out_p[196]  = a_in_p[265];
+	a_out_p[324]  = a_in_p[266];
+	a_out_p[452]  = a_in_p[267];
+	a_out_p[580]  = a_in_p[268];
+	a_out_p[708]  = a_in_p[269];
+	a_out_p[836]  = a_in_p[270];
+	a_out_p[964]  = a_in_p[271];
+	a_out_p[36]   = a_in_p[272];
+	a_out_p[164]  = a_in_p[273];
+	a_out_p[292]  = a_in_p[274];
+	a_out_p[420]  = a_in_p[275];
+	a_out_p[548]  = a_in_p[276];
+	a_out_p[676]  = a_in_p[277];
+	a_out_p[804]  = a_in_p[278];
+	a_out_p[932]  = a_in_p[279];
+	a_out_p[100]  = a_in_p[280];
+	a_out_p[228]  = a_in_p[281];
+	a_out_p[356]  = a_in_p[282];
+	a_out_p[484]  = a_in_p[283];
+	a_out_p[612]  = a_in_p[284];
+	a_out_p[740]  = a_in_p[285];
+	a_out_p[868]  = a_in_p[286];
+	a_out_p[996]  = a_in_p[287];
+	a_out_p[20]   = a_in_p[288];
+	a_out_p[148]  = a_in_p[289];
+	a_out_p[276]  = a_in_p[290];
+	a_out_p[404]  = a_in_p[291];
+	a_out_p[532]  = a_in_p[292];
+	a_out_p[660]  = a_in_p[293];
+	a_out_p[788]  = a_in_p[294];
+	a_out_p[916]  = a_in_p[295];
+	a_out_p[84]   = a_in_p[296];
+	a_out_p[212]  = a_in_p[297];
+	a_out_p[340]  = a_in_p[298];
+	a_out_p[468]  = a_in_p[299];
+	a_out_p[596]  = a_in_p[300];
+	a_out_p[724]  = a_in_p[301];
+	a_out_p[852]  = a_in_p[302];
+	a_out_p[980]  = a_in_p[303];
+	a_out_p[52]   = a_in_p[304];
+	a_out_p[180]  = a_in_p[305];
+	a_out_p[308]  = a_in_p[306];
+	a_out_p[436]  = a_in_p[307];
+	a_out_p[564]  = a_in_p[308];
+	a_out_p[692]  = a_in_p[309];
+	a_out_p[820]  = a_in_p[310];
+	a_out_p[948]  = a_in_p[311];
+	a_out_p[116]  = a_in_p[312];
+	a_out_p[244]  = a_in_p[313];
+	a_out_p[372]  = a_in_p[314];
+	a_out_p[500]  = a_in_p[315];
+	a_out_p[628]  = a_in_p[316];
+	a_out_p[756]  = a_in_p[317];
+	a_out_p[884]  = a_in_p[318];
+	a_out_p[1012] = a_in_p[319];
+	a_out_p[5]    = a_in_p[320];
+	a_out_p[133]  = a_in_p[321];
+	a_out_p[261]  = a_in_p[322];
+	a_out_p[389]  = a_in_p[323];
+	a_out_p[517]  = a_in_p[324];
+	a_out_p[645]  = a_in_p[325];
+	a_out_p[773]  = a_in_p[326];
+	a_out_p[901]  = a_in_p[327];
+	a_out_p[69]   = a_in_p[328];
+	a_out_p[197]  = a_in_p[329];
+	a_out_p[325]  = a_in_p[330];
+	a_out_p[453]  = a_in_p[331];
+	a_out_p[581]  = a_in_p[332];
+	a_out_p[709]  = a_in_p[333];
+	a_out_p[837]  = a_in_p[334];
+	a_out_p[965]  = a_in_p[335];
+	a_out_p[37]   = a_in_p[336];
+	a_out_p[165]  = a_in_p[337];
+	a_out_p[293]  = a_in_p[338];
+	a_out_p[421]  = a_in_p[339];
+	a_out_p[549]  = a_in_p[340];
+	a_out_p[677]  = a_in_p[341];
+	a_out_p[805]  = a_in_p[342];
+	a_out_p[933]  = a_in_p[343];
+	a_out_p[101]  = a_in_p[344];
+	a_out_p[229]  = a_in_p[345];
+	a_out_p[357]  = a_in_p[346];
+	a_out_p[485]  = a_in_p[347];
+	a_out_p[613]  = a_in_p[348];
+	a_out_p[741]  = a_in_p[349];
+	a_out_p[869]  = a_in_p[350];
+	a_out_p[997]  = a_in_p[351];
+	a_out_p[21]   = a_in_p[352];
+	a_out_p[149]  = a_in_p[353];
+	a_out_p[277]  = a_in_p[354];
+	a_out_p[405]  = a_in_p[355];
+	a_out_p[533]  = a_in_p[356];
+	a_out_p[661]  = a_in_p[357];
+	a_out_p[789]  = a_in_p[358];
+	a_out_p[917]  = a_in_p[359];
+	a_out_p[85]   = a_in_p[360];
+	a_out_p[213]  = a_in_p[361];
+	a_out_p[341]  = a_in_p[362];
+	a_out_p[469]  = a_in_p[363];
+	a_out_p[597]  = a_in_p[364];
+	a_out_p[725]  = a_in_p[365];
+	a_out_p[853]  = a_in_p[366];
+	a_out_p[981]  = a_in_p[367];
+	a_out_p[53]   = a_in_p[368];
+	a_out_p[181]  = a_in_p[369];
+	a_out_p[309]  = a_in_p[370];
+	a_out_p[437]  = a_in_p[371];
+	a_out_p[565]  = a_in_p[372];
+	a_out_p[693]  = a_in_p[373];
+	a_out_p[821]  = a_in_p[374];
+	a_out_p[949]  = a_in_p[375];
+	a_out_p[117]  = a_in_p[376];
+	a_out_p[245]  = a_in_p[377];
+	a_out_p[373]  = a_in_p[378];
+	a_out_p[501]  = a_in_p[379];
+	a_out_p[629]  = a_in_p[380];
+	a_out_p[757]  = a_in_p[381];
+	a_out_p[885]  = a_in_p[382];
+	a_out_p[1013] = a_in_p[383];
+	a_out_p[6]    = a_in_p[384];
+	a_out_p[134]  = a_in_p[385];
+	a_out_p[262]  = a_in_p[386];
+	a_out_p[390]  = a_in_p[387];
+	a_out_p[518]  = a_in_p[388];
+	a_out_p[646]  = a_in_p[389];
+	a_out_p[774]  = a_in_p[390];
+	a_out_p[902]  = a_in_p[391];
+	a_out_p[70]   = a_in_p[392];
+	a_out_p[198]  = a_in_p[393];
+	a_out_p[326]  = a_in_p[394];
+	a_out_p[454]  = a_in_p[395];
+	a_out_p[582]  = a_in_p[396];
+	a_out_p[710]  = a_in_p[397];
+	a_out_p[838]  = a_in_p[398];
+	a_out_p[966]  = a_in_p[399];
+	a_out_p[38]   = a_in_p[400];
+	a_out_p[166]  = a_in_p[401];
+	a_out_p[294]  = a_in_p[402];
+	a_out_p[422]  = a_in_p[403];
+	a_out_p[550]  = a_in_p[404];
+	a_out_p[678]  = a_in_p[405];
+	a_out_p[806]  = a_in_p[406];
+	a_out_p[934]  = a_in_p[407];
+	a_out_p[102]  = a_in_p[408];
+	a_out_p[230]  = a_in_p[409];
+	a_out_p[358]  = a_in_p[410];
+	a_out_p[486]  = a_in_p[411];
+	a_out_p[614]  = a_in_p[412];
+	a_out_p[742]  = a_in_p[413];
+	a_out_p[870]  = a_in_p[414];
+	a_out_p[998]  = a_in_p[415];
+	a_out_p[22]   = a_in_p[416];
+	a_out_p[150]  = a_in_p[417];
+	a_out_p[278]  = a_in_p[418];
+	a_out_p[406]  = a_in_p[419];
+	a_out_p[534]  = a_in_p[420];
+	a_out_p[662]  = a_in_p[421];
+	a_out_p[790]  = a_in_p[422];
+	a_out_p[918]  = a_in_p[423];
+	a_out_p[86]   = a_in_p[424];
+	a_out_p[214]  = a_in_p[425];
+	a_out_p[342]  = a_in_p[426];
+	a_out_p[470]  = a_in_p[427];
+	a_out_p[598]  = a_in_p[428];
+	a_out_p[726]  = a_in_p[429];
+	a_out_p[854]  = a_in_p[430];
+	a_out_p[982]  = a_in_p[431];
+	a_out_p[54]   = a_in_p[432];
+	a_out_p[182]  = a_in_p[433];
+	a_out_p[310]  = a_in_p[434];
+	a_out_p[438]  = a_in_p[435];
+	a_out_p[566]  = a_in_p[436];
+	a_out_p[694]  = a_in_p[437];
+	a_out_p[822]  = a_in_p[438];
+	a_out_p[950]  = a_in_p[439];
+	a_out_p[118]  = a_in_p[440];
+	a_out_p[246]  = a_in_p[441];
+	a_out_p[374]  = a_in_p[442];
+	a_out_p[502]  = a_in_p[443];
+	a_out_p[630]  = a_in_p[444];
+	a_out_p[758]  = a_in_p[445];
+	a_out_p[886]  = a_in_p[446];
+	a_out_p[1014] = a_in_p[447];
+	a_out_p[7]    = a_in_p[448];
+	a_out_p[135]  = a_in_p[449];
+	a_out_p[263]  = a_in_p[450];
+	a_out_p[391]  = a_in_p[451];
+	a_out_p[519]  = a_in_p[452];
+	a_out_p[647]  = a_in_p[453];
+	a_out_p[775]  = a_in_p[454];
+	a_out_p[903]  = a_in_p[455];
+	a_out_p[71]   = a_in_p[456];
+	a_out_p[199]  = a_in_p[457];
+	a_out_p[327]  = a_in_p[458];
+	a_out_p[455]  = a_in_p[459];
+	a_out_p[583]  = a_in_p[460];
+	a_out_p[711]  = a_in_p[461];
+	a_out_p[839]  = a_in_p[462];
+	a_out_p[967]  = a_in_p[463];
+	a_out_p[39]   = a_in_p[464];
+	a_out_p[167]  = a_in_p[465];
+	a_out_p[295]  = a_in_p[466];
+	a_out_p[423]  = a_in_p[467];
+	a_out_p[551]  = a_in_p[468];
+	a_out_p[679]  = a_in_p[469];
+	a_out_p[807]  = a_in_p[470];
+	a_out_p[935]  = a_in_p[471];
+	a_out_p[103]  = a_in_p[472];
+	a_out_p[231]  = a_in_p[473];
+	a_out_p[359]  = a_in_p[474];
+	a_out_p[487]  = a_in_p[475];
+	a_out_p[615]  = a_in_p[476];
+	a_out_p[743]  = a_in_p[477];
+	a_out_p[871]  = a_in_p[478];
+	a_out_p[999]  = a_in_p[479];
+	a_out_p[23]   = a_in_p[480];
+	a_out_p[151]  = a_in_p[481];
+	a_out_p[279]  = a_in_p[482];
+	a_out_p[407]  = a_in_p[483];
+	a_out_p[535]  = a_in_p[484];
+	a_out_p[663]  = a_in_p[485];
+	a_out_p[791]  = a_in_p[486];
+	a_out_p[919]  = a_in_p[487];
+	a_out_p[87]   = a_in_p[488];
+	a_out_p[215]  = a_in_p[489];
+	a_out_p[343]  = a_in_p[490];
+	a_out_p[471]  = a_in_p[491];
+	a_out_p[599]  = a_in_p[492];
+	a_out_p[727]  = a_in_p[493];
+	a_out_p[855]  = a_in_p[494];
+	a_out_p[983]  = a_in_p[495];
+	a_out_p[55]   = a_in_p[496];
+	a_out_p[183]  = a_in_p[497];
+	a_out_p[311]  = a_in_p[498];
+	a_out_p[439]  = a_in_p[499];
+	a_out_p[567]  = a_in_p[500];
+	a_out_p[695]  = a_in_p[501];
+	a_out_p[823]  = a_in_p[502];
+	a_out_p[951]  = a_in_p[503];
+	a_out_p[119]  = a_in_p[504];
+	a_out_p[247]  = a_in_p[505];
+	a_out_p[375]  = a_in_p[506];
+	a_out_p[503]  = a_in_p[507];
+	a_out_p[631]  = a_in_p[508];
+	a_out_p[759]  = a_in_p[509];
+	a_out_p[887]  = a_in_p[510];
+	a_out_p[1015] = a_in_p[511];
+	a_out_p[8]    = a_in_p[512];
+	a_out_p[136]  = a_in_p[513];
+	a_out_p[264]  = a_in_p[514];
+	a_out_p[392]  = a_in_p[515];
+	a_out_p[520]  = a_in_p[516];
+	a_out_p[648]  = a_in_p[517];
+	a_out_p[776]  = a_in_p[518];
+	a_out_p[904]  = a_in_p[519];
+	a_out_p[72]   = a_in_p[520];
+	a_out_p[200]  = a_in_p[521];
+	a_out_p[328]  = a_in_p[522];
+	a_out_p[456]  = a_in_p[523];
+	a_out_p[584]  = a_in_p[524];
+	a_out_p[712]  = a_in_p[525];
+	a_out_p[840]  = a_in_p[526];
+	a_out_p[968]  = a_in_p[527];
+	a_out_p[40]   = a_in_p[528];
+	a_out_p[168]  = a_in_p[529];
+	a_out_p[296]  = a_in_p[530];
+	a_out_p[424]  = a_in_p[531];
+	a_out_p[552]  = a_in_p[532];
+	a_out_p[680]  = a_in_p[533];
+	a_out_p[808]  = a_in_p[534];
+	a_out_p[936]  = a_in_p[535];
+	a_out_p[104]  = a_in_p[536];
+	a_out_p[232]  = a_in_p[537];
+	a_out_p[360]  = a_in_p[538];
+	a_out_p[488]  = a_in_p[539];
+	a_out_p[616]  = a_in_p[540];
+	a_out_p[744]  = a_in_p[541];
+	a_out_p[872]  = a_in_p[542];
+	a_out_p[1000] = a_in_p[543];
+	a_out_p[24]   = a_in_p[544];
+	a_out_p[152]  = a_in_p[545];
+	a_out_p[280]  = a_in_p[546];
+	a_out_p[408]  = a_in_p[547];
+	a_out_p[536]  = a_in_p[548];
+	a_out_p[664]  = a_in_p[549];
+	a_out_p[792]  = a_in_p[550];
+	a_out_p[920]  = a_in_p[551];
+	a_out_p[88]   = a_in_p[552];
+	a_out_p[216]  = a_in_p[553];
+	a_out_p[344]  = a_in_p[554];
+	a_out_p[472]  = a_in_p[555];
+	a_out_p[600]  = a_in_p[556];
+	a_out_p[728]  = a_in_p[557];
+	a_out_p[856]  = a_in_p[558];
+	a_out_p[984]  = a_in_p[559];
+	a_out_p[56]   = a_in_p[560];
+	a_out_p[184]  = a_in_p[561];
+	a_out_p[312]  = a_in_p[562];
+	a_out_p[440]  = a_in_p[563];
+	a_out_p[568]  = a_in_p[564];
+	a_out_p[696]  = a_in_p[565];
+	a_out_p[824]  = a_in_p[566];
+	a_out_p[952]  = a_in_p[567];
+	a_out_p[120]  = a_in_p[568];
+	a_out_p[248]  = a_in_p[569];
+	a_out_p[376]  = a_in_p[570];
+	a_out_p[504]  = a_in_p[571];
+	a_out_p[632]  = a_in_p[572];
+	a_out_p[760]  = a_in_p[573];
+	a_out_p[888]  = a_in_p[574];
+	a_out_p[1016] = a_in_p[575];
+	a_out_p[9]    = a_in_p[576];
+	a_out_p[137]  = a_in_p[577];
+	a_out_p[265]  = a_in_p[578];
+	a_out_p[393]  = a_in_p[579];
+	a_out_p[521]  = a_in_p[580];
+	a_out_p[649]  = a_in_p[581];
+	a_out_p[777]  = a_in_p[582];
+	a_out_p[905]  = a_in_p[583];
+	a_out_p[73]   = a_in_p[584];
+	a_out_p[201]  = a_in_p[585];
+	a_out_p[329]  = a_in_p[586];
+	a_out_p[457]  = a_in_p[587];
+	a_out_p[585]  = a_in_p[588];
+	a_out_p[713]  = a_in_p[589];
+	a_out_p[841]  = a_in_p[590];
+	a_out_p[969]  = a_in_p[591];
+	a_out_p[41]   = a_in_p[592];
+	a_out_p[169]  = a_in_p[593];
+	a_out_p[297]  = a_in_p[594];
+	a_out_p[425]  = a_in_p[595];
+	a_out_p[553]  = a_in_p[596];
+	a_out_p[681]  = a_in_p[597];
+	a_out_p[809]  = a_in_p[598];
+	a_out_p[937]  = a_in_p[599];
+	a_out_p[105]  = a_in_p[600];
+	a_out_p[233]  = a_in_p[601];
+	a_out_p[361]  = a_in_p[602];
+	a_out_p[489]  = a_in_p[603];
+	a_out_p[617]  = a_in_p[604];
+	a_out_p[745]  = a_in_p[605];
+	a_out_p[873]  = a_in_p[606];
+	a_out_p[1001] = a_in_p[607];
+	a_out_p[25]   = a_in_p[608];
+	a_out_p[153]  = a_in_p[609];
+	a_out_p[281]  = a_in_p[610];
+	a_out_p[409]  = a_in_p[611];
+	a_out_p[537]  = a_in_p[612];
+	a_out_p[665]  = a_in_p[613];
+	a_out_p[793]  = a_in_p[614];
+	a_out_p[921]  = a_in_p[615];
+	a_out_p[89]   = a_in_p[616];
+	a_out_p[217]  = a_in_p[617];
+	a_out_p[345]  = a_in_p[618];
+	a_out_p[473]  = a_in_p[619];
+	a_out_p[601]  = a_in_p[620];
+	a_out_p[729]  = a_in_p[621];
+	a_out_p[857]  = a_in_p[622];
+	a_out_p[985]  = a_in_p[623];
+	a_out_p[57]   = a_in_p[624];
+	a_out_p[185]  = a_in_p[625];
+	a_out_p[313]  = a_in_p[626];
+	a_out_p[441]  = a_in_p[627];
+	a_out_p[569]  = a_in_p[628];
+	a_out_p[697]  = a_in_p[629];
+	a_out_p[825]  = a_in_p[630];
+	a_out_p[953]  = a_in_p[631];
+	a_out_p[121]  = a_in_p[632];
+	a_out_p[249]  = a_in_p[633];
+	a_out_p[377]  = a_in_p[634];
+	a_out_p[505]  = a_in_p[635];
+	a_out_p[633]  = a_in_p[636];
+	a_out_p[761]  = a_in_p[637];
+	a_out_p[889]  = a_in_p[638];
+	a_out_p[1017] = a_in_p[639];
+	a_out_p[10]   = a_in_p[640];
+	a_out_p[138]  = a_in_p[641];
+	a_out_p[266]  = a_in_p[642];
+	a_out_p[394]  = a_in_p[643];
+	a_out_p[522]  = a_in_p[644];
+	a_out_p[650]  = a_in_p[645];
+	a_out_p[778]  = a_in_p[646];
+	a_out_p[906]  = a_in_p[647];
+	a_out_p[74]   = a_in_p[648];
+	a_out_p[202]  = a_in_p[649];
+	a_out_p[330]  = a_in_p[650];
+	a_out_p[458]  = a_in_p[651];
+	a_out_p[586]  = a_in_p[652];
+	a_out_p[714]  = a_in_p[653];
+	a_out_p[842]  = a_in_p[654];
+	a_out_p[970]  = a_in_p[655];
+	a_out_p[42]   = a_in_p[656];
+	a_out_p[170]  = a_in_p[657];
+	a_out_p[298]  = a_in_p[658];
+	a_out_p[426]  = a_in_p[659];
+	a_out_p[554]  = a_in_p[660];
+	a_out_p[682]  = a_in_p[661];
+	a_out_p[810]  = a_in_p[662];
+	a_out_p[938]  = a_in_p[663];
+	a_out_p[106]  = a_in_p[664];
+	a_out_p[234]  = a_in_p[665];
+	a_out_p[362]  = a_in_p[666];
+	a_out_p[490]  = a_in_p[667];
+	a_out_p[618]  = a_in_p[668];
+	a_out_p[746]  = a_in_p[669];
+	a_out_p[874]  = a_in_p[670];
+	a_out_p[1002] = a_in_p[671];
+	a_out_p[26]   = a_in_p[672];
+	a_out_p[154]  = a_in_p[673];
+	a_out_p[282]  = a_in_p[674];
+	a_out_p[410]  = a_in_p[675];
+	a_out_p[538]  = a_in_p[676];
+	a_out_p[666]  = a_in_p[677];
+	a_out_p[794]  = a_in_p[678];
+	a_out_p[922]  = a_in_p[679];
+	a_out_p[90]   = a_in_p[680];
+	a_out_p[218]  = a_in_p[681];
+	a_out_p[346]  = a_in_p[682];
+	a_out_p[474]  = a_in_p[683];
+	a_out_p[602]  = a_in_p[684];
+	a_out_p[730]  = a_in_p[685];
+	a_out_p[858]  = a_in_p[686];
+	a_out_p[986]  = a_in_p[687];
+	a_out_p[58]   = a_in_p[688];
+	a_out_p[186]  = a_in_p[689];
+	a_out_p[314]  = a_in_p[690];
+	a_out_p[442]  = a_in_p[691];
+	a_out_p[570]  = a_in_p[692];
+	a_out_p[698]  = a_in_p[693];
+	a_out_p[826]  = a_in_p[694];
+	a_out_p[954]  = a_in_p[695];
+	a_out_p[122]  = a_in_p[696];
+	a_out_p[250]  = a_in_p[697];
+	a_out_p[378]  = a_in_p[698];
+	a_out_p[506]  = a_in_p[699];
+	a_out_p[634]  = a_in_p[700];
+	a_out_p[762]  = a_in_p[701];
+	a_out_p[890]  = a_in_p[702];
+	a_out_p[1018] = a_in_p[703];
+	a_out_p[11]   = a_in_p[704];
+	a_out_p[139]  = a_in_p[705];
+	a_out_p[267]  = a_in_p[706];
+	a_out_p[395]  = a_in_p[707];
+	a_out_p[523]  = a_in_p[708];
+	a_out_p[651]  = a_in_p[709];
+	a_out_p[779]  = a_in_p[710];
+	a_out_p[907]  = a_in_p[711];
+	a_out_p[75]   = a_in_p[712];
+	a_out_p[203]  = a_in_p[713];
+	a_out_p[331]  = a_in_p[714];
+	a_out_p[459]  = a_in_p[715];
+	a_out_p[587]  = a_in_p[716];
+	a_out_p[715]  = a_in_p[717];
+	a_out_p[843]  = a_in_p[718];
+	a_out_p[971]  = a_in_p[719];
+	a_out_p[43]   = a_in_p[720];
+	a_out_p[171]  = a_in_p[721];
+	a_out_p[299]  = a_in_p[722];
+	a_out_p[427]  = a_in_p[723];
+	a_out_p[555]  = a_in_p[724];
+	a_out_p[683]  = a_in_p[725];
+	a_out_p[811]  = a_in_p[726];
+	a_out_p[939]  = a_in_p[727];
+	a_out_p[107]  = a_in_p[728];
+	a_out_p[235]  = a_in_p[729];
+	a_out_p[363]  = a_in_p[730];
+	a_out_p[491]  = a_in_p[731];
+	a_out_p[619]  = a_in_p[732];
+	a_out_p[747]  = a_in_p[733];
+	a_out_p[875]  = a_in_p[734];
+	a_out_p[1003] = a_in_p[735];
+	a_out_p[27]   = a_in_p[736];
+	a_out_p[155]  = a_in_p[737];
+	a_out_p[283]  = a_in_p[738];
+	a_out_p[411]  = a_in_p[739];
+	a_out_p[539]  = a_in_p[740];
+	a_out_p[667]  = a_in_p[741];
+	a_out_p[795]  = a_in_p[742];
+	a_out_p[923]  = a_in_p[743];
+	a_out_p[91]   = a_in_p[744];
+	a_out_p[219]  = a_in_p[745];
+	a_out_p[347]  = a_in_p[746];
+	a_out_p[475]  = a_in_p[747];
+	a_out_p[603]  = a_in_p[748];
+	a_out_p[731]  = a_in_p[749];
+	a_out_p[859]  = a_in_p[750];
+	a_out_p[987]  = a_in_p[751];
+	a_out_p[59]   = a_in_p[752];
+	a_out_p[187]  = a_in_p[753];
+	a_out_p[315]  = a_in_p[754];
+	a_out_p[443]  = a_in_p[755];
+	a_out_p[571]  = a_in_p[756];
+	a_out_p[699]  = a_in_p[757];
+	a_out_p[827]  = a_in_p[758];
+	a_out_p[955]  = a_in_p[759];
+	a_out_p[123]  = a_in_p[760];
+	a_out_p[251]  = a_in_p[761];
+	a_out_p[379]  = a_in_p[762];
+	a_out_p[507]  = a_in_p[763];
+	a_out_p[635]  = a_in_p[764];
+	a_out_p[763]  = a_in_p[765];
+	a_out_p[891]  = a_in_p[766];
+	a_out_p[1019] = a_in_p[767];
+	a_out_p[12]   = a_in_p[768];
+	a_out_p[140]  = a_in_p[769];
+	a_out_p[268]  = a_in_p[770];
+	a_out_p[396]  = a_in_p[771];
+	a_out_p[524]  = a_in_p[772];
+	a_out_p[652]  = a_in_p[773];
+	a_out_p[780]  = a_in_p[774];
+	a_out_p[908]  = a_in_p[775];
+	a_out_p[76]   = a_in_p[776];
+	a_out_p[204]  = a_in_p[777];
+	a_out_p[332]  = a_in_p[778];
+	a_out_p[460]  = a_in_p[779];
+	a_out_p[588]  = a_in_p[780];
+	a_out_p[716]  = a_in_p[781];
+	a_out_p[844]  = a_in_p[782];
+	a_out_p[972]  = a_in_p[783];
+	a_out_p[44]   = a_in_p[784];
+	a_out_p[172]  = a_in_p[785];
+	a_out_p[300]  = a_in_p[786];
+	a_out_p[428]  = a_in_p[787];
+	a_out_p[556]  = a_in_p[788];
+	a_out_p[684]  = a_in_p[789];
+	a_out_p[812]  = a_in_p[790];
+	a_out_p[940]  = a_in_p[791];
+	a_out_p[108]  = a_in_p[792];
+	a_out_p[236]  = a_in_p[793];
+	a_out_p[364]  = a_in_p[794];
+	a_out_p[492]  = a_in_p[795];
+	a_out_p[620]  = a_in_p[796];
+	a_out_p[748]  = a_in_p[797];
+	a_out_p[876]  = a_in_p[798];
+	a_out_p[1004] = a_in_p[799];
+	a_out_p[28]   = a_in_p[800];
+	a_out_p[156]  = a_in_p[801];
+	a_out_p[284]  = a_in_p[802];
+	a_out_p[412]  = a_in_p[803];
+	a_out_p[540]  = a_in_p[804];
+	a_out_p[668]  = a_in_p[805];
+	a_out_p[796]  = a_in_p[806];
+	a_out_p[924]  = a_in_p[807];
+	a_out_p[92]   = a_in_p[808];
+	a_out_p[220]  = a_in_p[809];
+	a_out_p[348]  = a_in_p[810];
+	a_out_p[476]  = a_in_p[811];
+	a_out_p[604]  = a_in_p[812];
+	a_out_p[732]  = a_in_p[813];
+	a_out_p[860]  = a_in_p[814];
+	a_out_p[988]  = a_in_p[815];
+	a_out_p[60]   = a_in_p[816];
+	a_out_p[188]  = a_in_p[817];
+	a_out_p[316]  = a_in_p[818];
+	a_out_p[444]  = a_in_p[819];
+	a_out_p[572]  = a_in_p[820];
+	a_out_p[700]  = a_in_p[821];
+	a_out_p[828]  = a_in_p[822];
+	a_out_p[956]  = a_in_p[823];
+	a_out_p[124]  = a_in_p[824];
+	a_out_p[252]  = a_in_p[825];
+	a_out_p[380]  = a_in_p[826];
+	a_out_p[508]  = a_in_p[827];
+	a_out_p[636]  = a_in_p[828];
+	a_out_p[764]  = a_in_p[829];
+	a_out_p[892]  = a_in_p[830];
+	a_out_p[1020] = a_in_p[831];
+	a_out_p[13]   = a_in_p[832];
+	a_out_p[141]  = a_in_p[833];
+	a_out_p[269]  = a_in_p[834];
+	a_out_p[397]  = a_in_p[835];
+	a_out_p[525]  = a_in_p[836];
+	a_out_p[653]  = a_in_p[837];
+	a_out_p[781]  = a_in_p[838];
+	a_out_p[909]  = a_in_p[839];
+	a_out_p[77]   = a_in_p[840];
+	a_out_p[205]  = a_in_p[841];
+	a_out_p[333]  = a_in_p[842];
+	a_out_p[461]  = a_in_p[843];
+	a_out_p[589]  = a_in_p[844];
+	a_out_p[717]  = a_in_p[845];
+	a_out_p[845]  = a_in_p[846];
+	a_out_p[973]  = a_in_p[847];
+	a_out_p[45]   = a_in_p[848];
+	a_out_p[173]  = a_in_p[849];
+	a_out_p[301]  = a_in_p[850];
+	a_out_p[429]  = a_in_p[851];
+	a_out_p[557]  = a_in_p[852];
+	a_out_p[685]  = a_in_p[853];
+	a_out_p[813]  = a_in_p[854];
+	a_out_p[941]  = a_in_p[855];
+	a_out_p[109]  = a_in_p[856];
+	a_out_p[237]  = a_in_p[857];
+	a_out_p[365]  = a_in_p[858];
+	a_out_p[493]  = a_in_p[859];
+	a_out_p[621]  = a_in_p[860];
+	a_out_p[749]  = a_in_p[861];
+	a_out_p[877]  = a_in_p[862];
+	a_out_p[1005] = a_in_p[863];
+	a_out_p[29]   = a_in_p[864];
+	a_out_p[157]  = a_in_p[865];
+	a_out_p[285]  = a_in_p[866];
+	a_out_p[413]  = a_in_p[867];
+	a_out_p[541]  = a_in_p[868];
+	a_out_p[669]  = a_in_p[869];
+	a_out_p[797]  = a_in_p[870];
+	a_out_p[925]  = a_in_p[871];
+	a_out_p[93]   = a_in_p[872];
+	a_out_p[221]  = a_in_p[873];
+	a_out_p[349]  = a_in_p[874];
+	a_out_p[477]  = a_in_p[875];
+	a_out_p[605]  = a_in_p[876];
+	a_out_p[733]  = a_in_p[877];
+	a_out_p[861]  = a_in_p[878];
+	a_out_p[989]  = a_in_p[879];
+	a_out_p[61]   = a_in_p[880];
+	a_out_p[189]  = a_in_p[881];
+	a_out_p[317]  = a_in_p[882];
+	a_out_p[445]  = a_in_p[883];
+	a_out_p[573]  = a_in_p[884];
+	a_out_p[701]  = a_in_p[885];
+	a_out_p[829]  = a_in_p[886];
+	a_out_p[957]  = a_in_p[887];
+	a_out_p[125]  = a_in_p[888];
+	a_out_p[253]  = a_in_p[889];
+	a_out_p[381]  = a_in_p[890];
+	a_out_p[509]  = a_in_p[891];
+	a_out_p[637]  = a_in_p[892];
+	a_out_p[765]  = a_in_p[893];
+	a_out_p[893]  = a_in_p[894];
+	a_out_p[1021] = a_in_p[895];
+	a_out_p[14]   = a_in_p[896];
+	a_out_p[142]  = a_in_p[897];
+	a_out_p[270]  = a_in_p[898];
+	a_out_p[398]  = a_in_p[899];
+	a_out_p[526]  = a_in_p[900];
+	a_out_p[654]  = a_in_p[901];
+	a_out_p[782]  = a_in_p[902];
+	a_out_p[910]  = a_in_p[903];
+	a_out_p[78]   = a_in_p[904];
+	a_out_p[206]  = a_in_p[905];
+	a_out_p[334]  = a_in_p[906];
+	a_out_p[462]  = a_in_p[907];
+	a_out_p[590]  = a_in_p[908];
+	a_out_p[718]  = a_in_p[909];
+	a_out_p[846]  = a_in_p[910];
+	a_out_p[974]  = a_in_p[911];
+	a_out_p[46]   = a_in_p[912];
+	a_out_p[174]  = a_in_p[913];
+	a_out_p[302]  = a_in_p[914];
+	a_out_p[430]  = a_in_p[915];
+	a_out_p[558]  = a_in_p[916];
+	a_out_p[686]  = a_in_p[917];
+	a_out_p[814]  = a_in_p[918];
+	a_out_p[942]  = a_in_p[919];
+	a_out_p[110]  = a_in_p[920];
+	a_out_p[238]  = a_in_p[921];
+	a_out_p[366]  = a_in_p[922];
+	a_out_p[494]  = a_in_p[923];
+	a_out_p[622]  = a_in_p[924];
+	a_out_p[750]  = a_in_p[925];
+	a_out_p[878]  = a_in_p[926];
+	a_out_p[1006] = a_in_p[927];
+	a_out_p[30]   = a_in_p[928];
+	a_out_p[158]  = a_in_p[929];
+	a_out_p[286]  = a_in_p[930];
+	a_out_p[414]  = a_in_p[931];
+	a_out_p[542]  = a_in_p[932];
+	a_out_p[670]  = a_in_p[933];
+	a_out_p[798]  = a_in_p[934];
+	a_out_p[926]  = a_in_p[935];
+	a_out_p[94]   = a_in_p[936];
+	a_out_p[222]  = a_in_p[937];
+	a_out_p[350]  = a_in_p[938];
+	a_out_p[478]  = a_in_p[939];
+	a_out_p[606]  = a_in_p[940];
+	a_out_p[734]  = a_in_p[941];
+	a_out_p[862]  = a_in_p[942];
+	a_out_p[990]  = a_in_p[943];
+	a_out_p[62]   = a_in_p[944];
+	a_out_p[190]  = a_in_p[945];
+	a_out_p[318]  = a_in_p[946];
+	a_out_p[446]  = a_in_p[947];
+	a_out_p[574]  = a_in_p[948];
+	a_out_p[702]  = a_in_p[949];
+	a_out_p[830]  = a_in_p[950];
+	a_out_p[958]  = a_in_p[951];
+	a_out_p[126]  = a_in_p[952];
+	a_out_p[254]  = a_in_p[953];
+	a_out_p[382]  = a_in_p[954];
+	a_out_p[510]  = a_in_p[955];
+	a_out_p[638]  = a_in_p[956];
+	a_out_p[766]  = a_in_p[957];
+	a_out_p[894]  = a_in_p[958];
+	a_out_p[1022] = a_in_p[959];
+	a_out_p[15]   = a_in_p[960];
+	a_out_p[143]  = a_in_p[961];
+	a_out_p[271]  = a_in_p[962];
+	a_out_p[399]  = a_in_p[963];
+	a_out_p[527]  = a_in_p[964];
+	a_out_p[655]  = a_in_p[965];
+	a_out_p[783]  = a_in_p[966];
+	a_out_p[911]  = a_in_p[967];
+	a_out_p[79]   = a_in_p[968];
+	a_out_p[207]  = a_in_p[969];
+	a_out_p[335]  = a_in_p[970];
+	a_out_p[463]  = a_in_p[971];
+	a_out_p[591]  = a_in_p[972];
+	a_out_p[719]  = a_in_p[973];
+	a_out_p[847]  = a_in_p[974];
+	a_out_p[975]  = a_in_p[975];
+	a_out_p[47]   = a_in_p[976];
+	a_out_p[175]  = a_in_p[977];
+	a_out_p[303]  = a_in_p[978];
+	a_out_p[431]  = a_in_p[979];
+	a_out_p[559]  = a_in_p[980];
+	a_out_p[687]  = a_in_p[981];
+	a_out_p[815]  = a_in_p[982];
+	a_out_p[943]  = a_in_p[983];
+	a_out_p[111]  = a_in_p[984];
+	a_out_p[239]  = a_in_p[985];
+	a_out_p[367]  = a_in_p[986];
+	a_out_p[495]  = a_in_p[987];
+	a_out_p[623]  = a_in_p[988];
+	a_out_p[751]  = a_in_p[989];
+	a_out_p[879]  = a_in_p[990];
+	a_out_p[1007] = a_in_p[991];
+	a_out_p[31]   = a_in_p[992];
+	a_out_p[159]  = a_in_p[993];
+	a_out_p[287]  = a_in_p[994];
+	a_out_p[415]  = a_in_p[995];
+	a_out_p[543]  = a_in_p[996];
+	a_out_p[671]  = a_in_p[997];
+	a_out_p[799]  = a_in_p[998];
+	a_out_p[927]  = a_in_p[999];
+	a_out_p[95]   = a_in_p[1000];
+	a_out_p[223]  = a_in_p[1001];
+	a_out_p[351]  = a_in_p[1002];
+	a_out_p[479]  = a_in_p[1003];
+	a_out_p[607]  = a_in_p[1004];
+	a_out_p[735]  = a_in_p[1005];
+	a_out_p[863]  = a_in_p[1006];
+	a_out_p[991]  = a_in_p[1007];
+	a_out_p[63]   = a_in_p[1008];
+	a_out_p[191]  = a_in_p[1009];
+	a_out_p[319]  = a_in_p[1010];
+	a_out_p[447]  = a_in_p[1011];
+	a_out_p[575]  = a_in_p[1012];
+	a_out_p[703]  = a_in_p[1013];
+	a_out_p[831]  = a_in_p[1014];
+	a_out_p[959]  = a_in_p[1015];
+	a_out_p[127]  = a_in_p[1016];
+	a_out_p[255]  = a_in_p[1017];
+	a_out_p[383]  = a_in_p[1018];
+	a_out_p[511]  = a_in_p[1019];
+	a_out_p[639]  = a_in_p[1020];
+	a_out_p[767]  = a_in_p[1021];
+	a_out_p[895]  = a_in_p[1022];
+	a_out_p[1023] = a_in_p[1023];
+}
+}}} // namespace generated::transpose::fallback::scalar
+// NOLINTEND
diff --git a/fastlanes/src/unrsum.cpp b/fastlanes/src/unrsum.cpp
new file mode 100644
index 0000000..9e92327
--- /dev/null
+++ b/fastlanes/src/unrsum.cpp
@@ -0,0 +1,523 @@
+// generated!
+// NOLINTBEGIN
+#include "fls_gen/macros.hpp"
+#include "fls_gen/unrsum/unrsum.hpp"
+namespace generated { namespace unrsum::fallback { namespace scalar {
+// Per-lane difference of consecutive rows, with indices 0..1023 viewed as 8 rows of 128 lanes.
+//
+// For every lane in [0, 128):
+//   a_out_p[lane]             = 0
+//   a_out_p[128 * row + lane] = a_in_p[128 * row + lane] - a_in_p[128 * (row - 1) + lane]   (row = 1..7)
+// The difference wraps modulo 2^8 because it is stored back into a uint8_t.
+//
+// a_in_p:  source values; indices 0..1023 are read.
+// a_out_p: destination; indices 0..1023 are written.
+//
+// Every input element is loaded before the output element with the same index is stored, so
+// a_out_p == a_in_p works (unrsum_inplace calls it that way).
+//
+// The unrolled generator output additionally loaded a_in_p[128 * 8 + lane] (index >= 1024, one row
+// past the last row written here) into a register that was never read afterwards. That dead load
+// is not performed any more, so nothing beyond index 1023 is touched.
+// NOTE(review): the name suggests this undoes a running sum - confirm against the matching encoder.
+void unrsum(const uint8_t* a_in_p, uint8_t* a_out_p) {
+	const int n_lanes    = 128;
+	const int n_rows     = 8;
+	const int row_stride = 128;
+	for (int lane = 0; lane < n_lanes; ++lane) {
+		// Row 0 has no predecessor: remember its input value, emit 0.
+		uint8_t prev   = a_in_p[lane];
+		a_out_p[lane] = 0;
+		for (int row = 1; row < n_rows; ++row) {
+			const int     pos = (row_stride * row) + lane;
+			const uint8_t cur = a_in_p[pos]; // load before the store below (in-place safety)
+			a_out_p[pos]      = cur - prev;  // wraps modulo 2^8 on conversion to uint8_t
+			prev              = cur;
+		}
+	}
+}
+// In-place variant: runs unrsum with a_in_p as both the source and the destination buffer.
+void unrsum_inplace(uint8_t* a_in_p) {
+	const uint8_t* src = a_in_p;
+	unrsum(src, a_in_p);
+}
+// Per-lane chained difference over indices 0..1023, processed as 64 lanes of 16 elements each.
+//
+// For every lane in [0, 64) the elements are visited in this order:
+//   lane + 0, lane + 128, ..., lane + 896,   then   lane + 64, lane + 192, ..., lane + 960
+// The first visited element receives 0; every later one receives
+//   (its input value) - (input value of the element visited just before it).
+// The chain carries over from the first group into the second, so
+//   a_out_p[lane + 64] = a_in_p[lane + 64] - a_in_p[lane + 896].
+// Differences wrap modulo 2^16 because they are stored back into a uint16_t.
+//
+// a_in_p:  source values; indices 0..1023 are read.
+// a_out_p: destination; indices 0..1023 are written.
+//
+// Every input element is loaded before the output element with the same index is stored, so
+// a_out_p == a_in_p works (unrsum_inplace calls it that way).
+//
+// The unrolled generator output additionally loaded a_in_p[128 * 8 + lane] (index >= 1024) into a
+// register that was overwritten before being read. That dead load is not performed any more, so
+// nothing beyond index 1023 is touched.
+// NOTE(review): the name suggests this undoes a running sum - confirm against the matching encoder.
+void unrsum(const uint16_t* a_in_p, uint16_t* a_out_p) {
+	const int n_lanes       = 64;
+	const int n_rows        = 8;
+	const int row_stride    = 128;
+	const int n_groups      = 2;
+	const int group_base[2] = {0, 64}; // visiting order of the 8-row groups within a lane
+	for (int lane = 0; lane < n_lanes; ++lane) {
+		// The very first element of the lane has no predecessor: remember it, emit 0.
+		uint16_t prev  = a_in_p[lane];
+		a_out_p[lane] = 0;
+		for (int group = 0; group < n_groups; ++group) {
+			// Row 0 of group 0 was handled above; later groups start at their row 0.
+			for (int row = (group == 0) ? 1 : 0; row < n_rows; ++row) {
+				const int      pos = group_base[group] + (row_stride * row) + lane;
+				const uint16_t cur = a_in_p[pos]; // load before the store below (in-place safety)
+				a_out_p[pos]       = cur - prev;  // wraps modulo 2^16 on conversion to uint16_t
+				prev               = cur;
+			}
+		}
+	}
+}
+// In-place variant: runs unrsum with a_in_p as both the source and the destination buffer.
+void unrsum_inplace(uint16_t* a_in_p) {
+	const uint16_t* src = a_in_p;
+	unrsum(src, a_in_p);
+}
+// Per-lane chained difference over indices 0..1023, processed as 32 lanes of 32 elements each.
+//
+// For every lane in [0, 32) the elements are visited in four groups of 8 rows, in this order:
+//   lane +  0 + 128 * row   (row = 0..7)
+//   lane + 64 + 128 * row   (row = 0..7)
+//   lane + 32 + 128 * row   (row = 0..7)
+//   lane + 96 + 128 * row   (row = 0..7)
+// The first visited element receives 0; every later one receives
+//   (its input value) - (input value of the element visited just before it).
+// The chain carries over between groups, e.g.
+//   a_out_p[lane + 64] = a_in_p[lane + 64] - a_in_p[lane + 896].
+// Differences wrap modulo 2^32 (unsigned arithmetic).
+//
+// a_in_p:  source values; indices 0..1023 are read.
+// a_out_p: destination; indices 0..1023 are written.
+//
+// Every input element is loaded before the output element with the same index is stored, so
+// a_out_p == a_in_p works (unrsum_inplace calls it that way).
+//
+// The unrolled generator output additionally loaded a_in_p[128 * 8 + lane] (index >= 1024) into a
+// register that was overwritten before being read. That dead load is not performed any more, so
+// nothing beyond index 1023 is touched.
+// NOTE(review): the name suggests this undoes a running sum - confirm against the matching encoder.
+void unrsum(const uint32_t* a_in_p, uint32_t* a_out_p) {
+	const int n_lanes       = 32;
+	const int n_rows        = 8;
+	const int row_stride    = 128;
+	const int n_groups      = 4;
+	const int group_base[4] = {0, 64, 32, 96}; // visiting order of the 8-row groups within a lane
+	for (int lane = 0; lane < n_lanes; ++lane) {
+		// The very first element of the lane has no predecessor: remember it, emit 0.
+		uint32_t prev  = a_in_p[lane];
+		a_out_p[lane] = 0;
+		for (int group = 0; group < n_groups; ++group) {
+			// Row 0 of group 0 was handled above; later groups start at their row 0.
+			for (int row = (group == 0) ? 1 : 0; row < n_rows; ++row) {
+				const int      pos = group_base[group] + (row_stride * row) + lane;
+				const uint32_t cur = a_in_p[pos]; // load before the store below (in-place safety)
+				a_out_p[pos]       = cur - prev;  // unsigned subtraction, wraps modulo 2^32
+				prev               = cur;
+			}
+		}
+	}
+}
+// In-place variant: runs unrsum with a_in_p as both the source and the destination buffer.
+void unrsum_inplace(uint32_t* a_in_p) {
+	const uint32_t* src = a_in_p;
+	unrsum(src, a_in_p);
+}
+void unrsum(const uint64_t* a_in_p, uint64_t* a_out_p) {
+	[[maybe_unused]] auto       out = reinterpret_cast<uint64_t*>(a_out_p);
+	[[maybe_unused]] const auto in  = reinterpret_cast<const uint64_t*>(a_in_p);
+	[[maybe_unused]] uint64_t   register_0_0;
+	[[maybe_unused]] uint64_t   register_0_1;
+	[[maybe_unused]] uint64_t   tmp_0;
+	for (int i = 0; i < 16; ++i) {
+		register_0_0                  = in[(0 * 16) + (i * 1) + (128 * 0)];
+		register_0_1                  = in[(0 * 16) + (i * 1) + (128 * 1)];
+		out[(0 * 16) + (i * 1) + (0)] = 0;
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[(0 * 16) + (i * 1) + (128 * 1)];
+		register_0_1                  = in[(0 * 16) + (i * 1) + (128 * 2)];
+		out[(i * 1) + (0 * 16) + 128] = tmp_0;
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[(0 * 16) + (i * 1) + (128 * 2)];
+		register_0_1                  = in[(0 * 16) + (i * 1) + (128 * 3)];
+		out[(i * 1) + (0 * 16) + 256] = tmp_0;
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[(0 * 16) + (i * 1) + (128 * 3)];
+		register_0_1                  = in[(0 * 16) + (i * 1) + (128 * 4)];
+		out[(i * 1) + (0 * 16) + 384] = tmp_0;
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[(0 * 16) + (i * 1) + (128 * 4)];
+		register_0_1                  = in[(0 * 16) + (i * 1) + (128 * 5)];
+		out[(i * 1) + (0 * 16) + 512] = tmp_0;
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[(0 * 16) + (i * 1) + (128 * 5)];
+		register_0_1                  = in[(0 * 16) + (i * 1) + (128 * 6)];
+		out[(i * 1) + (0 * 16) + 640] = tmp_0;
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[(0 * 16) + (i * 1) + (128 * 6)];
+		register_0_1                  = in[(0 * 16) + (i * 1) + (128 * 7)];
+		out[(i * 1) + (0 * 16) + 768] = tmp_0;
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[(0 * 16) + (i * 1) + (128 * 7)];
+		register_0_1                  = in[(0 * 16) + (i * 1) + (128 * 8)];
+		out[(i * 1) + (0 * 16) + 896] = tmp_0;
+		register_0_1                  = in[64 + (128 * 0) + i];
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[64 + (128 * 0) + i];
+		a_out_p[64 + (128 * 0) + i]   = tmp_0;
+		register_0_1                  = in[64 + (128 * 1) + i];
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[64 + (128 * 1) + i];
+		a_out_p[64 + (128 * 1) + i]   = tmp_0;
+		register_0_1                  = in[64 + (128 * 2) + i];
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[64 + (128 * 2) + i];
+		a_out_p[64 + (128 * 2) + i]   = tmp_0;
+		register_0_1                  = in[64 + (128 * 3) + i];
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[64 + (128 * 3) + i];
+		a_out_p[64 + (128 * 3) + i]   = tmp_0;
+		register_0_1                  = in[64 + (128 * 4) + i];
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[64 + (128 * 4) + i];
+		a_out_p[64 + (128 * 4) + i]   = tmp_0;
+		register_0_1                  = in[64 + (128 * 5) + i];
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[64 + (128 * 5) + i];
+		a_out_p[64 + (128 * 5) + i]   = tmp_0;
+		register_0_1                  = in[64 + (128 * 6) + i];
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[64 + (128 * 6) + i];
+		a_out_p[64 + (128 * 6) + i]   = tmp_0;
+		register_0_1                  = in[64 + (128 * 7) + i];
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[64 + (128 * 7) + i];
+		a_out_p[64 + (128 * 7) + i]   = tmp_0;
+		register_0_1                  = in[32 + (128 * 0) + i];
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[32 + (128 * 0) + i];
+		a_out_p[32 + (128 * 0) + i]   = tmp_0;
+		register_0_1                  = in[32 + (128 * 1) + i];
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[32 + (128 * 1) + i];
+		a_out_p[32 + (128 * 1) + i]   = tmp_0;
+		register_0_1                  = in[32 + (128 * 2) + i];
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[32 + (128 * 2) + i];
+		a_out_p[32 + (128 * 2) + i]   = tmp_0;
+		register_0_1                  = in[32 + (128 * 3) + i];
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[32 + (128 * 3) + i];
+		a_out_p[32 + (128 * 3) + i]   = tmp_0;
+		register_0_1                  = in[32 + (128 * 4) + i];
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[32 + (128 * 4) + i];
+		a_out_p[32 + (128 * 4) + i]   = tmp_0;
+		register_0_1                  = in[32 + (128 * 5) + i];
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[32 + (128 * 5) + i];
+		a_out_p[32 + (128 * 5) + i]   = tmp_0;
+		register_0_1                  = in[32 + (128 * 6) + i];
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[32 + (128 * 6) + i];
+		a_out_p[32 + (128 * 6) + i]   = tmp_0;
+		register_0_1                  = in[32 + (128 * 7) + i];
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[32 + (128 * 7) + i];
+		a_out_p[32 + (128 * 7) + i]   = tmp_0;
+		register_0_1                  = in[96 + (128 * 0) + i];
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[96 + (128 * 0) + i];
+		a_out_p[96 + (128 * 0) + i]   = tmp_0;
+		register_0_1                  = in[96 + (128 * 1) + i];
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[96 + (128 * 1) + i];
+		a_out_p[96 + (128 * 1) + i]   = tmp_0;
+		register_0_1                  = in[96 + (128 * 2) + i];
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[96 + (128 * 2) + i];
+		a_out_p[96 + (128 * 2) + i]   = tmp_0;
+		register_0_1                  = in[96 + (128 * 3) + i];
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[96 + (128 * 3) + i];
+		a_out_p[96 + (128 * 3) + i]   = tmp_0;
+		register_0_1                  = in[96 + (128 * 4) + i];
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[96 + (128 * 4) + i];
+		a_out_p[96 + (128 * 4) + i]   = tmp_0;
+		register_0_1                  = in[96 + (128 * 5) + i];
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[96 + (128 * 5) + i];
+		a_out_p[96 + (128 * 5) + i]   = tmp_0;
+		register_0_1                  = in[96 + (128 * 6) + i];
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[96 + (128 * 6) + i];
+		a_out_p[96 + (128 * 6) + i]   = tmp_0;
+		register_0_1                  = in[96 + (128 * 7) + i];
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[96 + (128 * 7) + i];
+		a_out_p[96 + (128 * 7) + i]   = tmp_0;
+		register_0_1                  = in[16 + (128 * 0) + i];
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[16 + (128 * 0) + i];
+		a_out_p[16 + (128 * 0) + i]   = tmp_0;
+		register_0_1                  = in[16 + (128 * 1) + i];
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[16 + (128 * 1) + i];
+		a_out_p[16 + (128 * 1) + i]   = tmp_0;
+		register_0_1                  = in[16 + (128 * 2) + i];
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[16 + (128 * 2) + i];
+		a_out_p[16 + (128 * 2) + i]   = tmp_0;
+		register_0_1                  = in[16 + (128 * 3) + i];
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[16 + (128 * 3) + i];
+		a_out_p[16 + (128 * 3) + i]   = tmp_0;
+		register_0_1                  = in[16 + (128 * 4) + i];
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[16 + (128 * 4) + i];
+		a_out_p[16 + (128 * 4) + i]   = tmp_0;
+		register_0_1                  = in[16 + (128 * 5) + i];
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[16 + (128 * 5) + i];
+		a_out_p[16 + (128 * 5) + i]   = tmp_0;
+		register_0_1                  = in[16 + (128 * 6) + i];
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[16 + (128 * 6) + i];
+		a_out_p[16 + (128 * 6) + i]   = tmp_0;
+		register_0_1                  = in[16 + (128 * 7) + i];
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[16 + (128 * 7) + i];
+		a_out_p[16 + (128 * 7) + i]   = tmp_0;
+		register_0_1                  = in[80 + (128 * 0) + i];
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[80 + (128 * 0) + i];
+		a_out_p[80 + (128 * 0) + i]   = tmp_0;
+		register_0_1                  = in[80 + (128 * 1) + i];
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[80 + (128 * 1) + i];
+		a_out_p[80 + (128 * 1) + i]   = tmp_0;
+		register_0_1                  = in[80 + (128 * 2) + i];
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[80 + (128 * 2) + i];
+		a_out_p[80 + (128 * 2) + i]   = tmp_0;
+		register_0_1                  = in[80 + (128 * 3) + i];
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[80 + (128 * 3) + i];
+		a_out_p[80 + (128 * 3) + i]   = tmp_0;
+		register_0_1                  = in[80 + (128 * 4) + i];
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[80 + (128 * 4) + i];
+		a_out_p[80 + (128 * 4) + i]   = tmp_0;
+		register_0_1                  = in[80 + (128 * 5) + i];
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[80 + (128 * 5) + i];
+		a_out_p[80 + (128 * 5) + i]   = tmp_0;
+		register_0_1                  = in[80 + (128 * 6) + i];
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[80 + (128 * 6) + i];
+		a_out_p[80 + (128 * 6) + i]   = tmp_0;
+		register_0_1                  = in[80 + (128 * 7) + i];
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[80 + (128 * 7) + i];
+		a_out_p[80 + (128 * 7) + i]   = tmp_0;
+		register_0_1                  = in[48 + (128 * 0) + i];
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[48 + (128 * 0) + i];
+		a_out_p[48 + (128 * 0) + i]   = tmp_0;
+		register_0_1                  = in[48 + (128 * 1) + i];
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[48 + (128 * 1) + i];
+		a_out_p[48 + (128 * 1) + i]   = tmp_0;
+		register_0_1                  = in[48 + (128 * 2) + i];
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[48 + (128 * 2) + i];
+		a_out_p[48 + (128 * 2) + i]   = tmp_0;
+		register_0_1                  = in[48 + (128 * 3) + i];
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[48 + (128 * 3) + i];
+		a_out_p[48 + (128 * 3) + i]   = tmp_0;
+		register_0_1                  = in[48 + (128 * 4) + i];
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[48 + (128 * 4) + i];
+		a_out_p[48 + (128 * 4) + i]   = tmp_0;
+		register_0_1                  = in[48 + (128 * 5) + i];
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[48 + (128 * 5) + i];
+		a_out_p[48 + (128 * 5) + i]   = tmp_0;
+		register_0_1                  = in[48 + (128 * 6) + i];
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[48 + (128 * 6) + i];
+		a_out_p[48 + (128 * 6) + i]   = tmp_0;
+		register_0_1                  = in[48 + (128 * 7) + i];
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[48 + (128 * 7) + i];
+		a_out_p[48 + (128 * 7) + i]   = tmp_0;
+		register_0_1                  = in[112 + (128 * 0) + i];
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[112 + (128 * 0) + i];
+		a_out_p[112 + (128 * 0) + i]  = tmp_0;
+		register_0_1                  = in[112 + (128 * 1) + i];
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[112 + (128 * 1) + i];
+		a_out_p[112 + (128 * 1) + i]  = tmp_0;
+		register_0_1                  = in[112 + (128 * 2) + i];
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[112 + (128 * 2) + i];
+		a_out_p[112 + (128 * 2) + i]  = tmp_0;
+		register_0_1                  = in[112 + (128 * 3) + i];
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[112 + (128 * 3) + i];
+		a_out_p[112 + (128 * 3) + i]  = tmp_0;
+		register_0_1                  = in[112 + (128 * 4) + i];
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[112 + (128 * 4) + i];
+		a_out_p[112 + (128 * 4) + i]  = tmp_0;
+		register_0_1                  = in[112 + (128 * 5) + i];
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[112 + (128 * 5) + i];
+		a_out_p[112 + (128 * 5) + i]  = tmp_0;
+		register_0_1                  = in[112 + (128 * 6) + i];
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[112 + (128 * 6) + i];
+		a_out_p[112 + (128 * 6) + i]  = tmp_0;
+		register_0_1                  = in[112 + (128 * 7) + i];
+		tmp_0                         = register_0_1 - register_0_0;
+		register_0_0                  = in[112 + (128 * 7) + i];
+		a_out_p[112 + (128 * 7) + i]  = tmp_0;
+	}
+}
+void unrsum_inplace(uint64_t* a_in_p) { const uint64_t* src_p = a_in_p; unrsum(src_p, a_in_p); } // in-place: one buffer is both input and output
+}}} // namespace generated::unrsum::fallback::scalar
+// NOLINTEND
diff --git a/include/cub/test/CMakeLists.txt b/include/cub/test/CMakeLists.txt
new file mode 100644
index 0000000..bba5b6c
--- /dev/null
+++ b/include/cub/test/CMakeLists.txt
@@ -0,0 +1,367 @@
+if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+  # NVBugs 200770766: under nvc++ every catch2 test is forced into its own executable.
+  set(CUB_SEPARATE_CATCH2 ON)
+else()
+  option(CUB_SEPARATE_CATCH2
+    "Build each catch2 test as a separate executable."
+    OFF
+  )
+endif()
+
+include("${CUB_SOURCE_DIR}/cmake/CPM.cmake")
+CPMAddPackage("gh:catchorg/Catch2@2.13.9")
+
+option(METAL_BUILD_DOC "Build Metal documentation." OFF) # option(<var> "<help>" [value])
+option(METAL_BUILD_EXAMPLES "Build Metal examples." OFF)
+option(METAL_BUILD_TESTS "Build Metal tests." OFF)
+CPMAddPackage("gh:brunocodutra/metal@2.1.4")
+
+find_package(CUDAToolkit REQUIRED)
+
+# Some tests always build with RDC, so make sure that the sm_XX flags are
+# compatible. See note in CubCudaConfig.cmake.
+# TODO once we're using CUDA_ARCHITECTURES, we can set up non-rdc fallback
+# tests to build for non-rdc arches. But for now, all files in a given directory
+# must build with the same `CMAKE_CUDA_FLAGS` due to CMake constraints around
+# how CUDA_FLAGS works.
+set(CMAKE_CUDA_FLAGS "${CUB_CUDA_FLAGS_BASE} ${CUB_CUDA_FLAGS_RDC}")
+
+# cub_get_test_params reads the file `src`, extracts its `// %PARAM%` comments,
+# and fills `labels_var` with a list of `label1_value1.label2_value2...`
+# strings and `defs_var` with the matching `DEF1=value1:DEF2=value2...`
+# strings: one entry per element of the cartesian product of all values.
+# Both outputs are empty when `src` contains no %PARAM% comments.
+# See the README.md file in this directory for background info.
+function(cub_get_test_params src labels_var defs_var)
+  file(READ "${src}" file_data)
+  set(param_regex "//[ ]+%PARAM%[ ]+([^ ]+)[ ]+([^ ]+)[ ]+([^\n]*)") # groups: definition, label, values
+
+  string(REGEX MATCHALL
+    "${param_regex}"
+    matches
+    "${file_data}"
+  )
+
+  set(variant_labels)
+  set(variant_defs)
+
+  foreach(match IN LISTS matches)
+    string(REGEX MATCH
+      "${param_regex}"
+      unused
+      "${match}"
+    ) # re-match this one comment so CMAKE_MATCH_1..3 hold its capture groups
+
+    set(def ${CMAKE_MATCH_1})
+    set(label ${CMAKE_MATCH_2})
+    set(values "${CMAKE_MATCH_3}")
+    string(REPLACE ":" ";" values "${values}") # colon-separated values -> CMake list
+
+    # Build lists of test name suffixes (labels) and preprocessor definitions
+    # (defs) containing the cartesian product of all param values:
+    if (NOT variant_labels)
+      foreach(value IN LISTS values)
+        list(APPEND variant_labels ${label}_${value})
+      endforeach()
+    else()
+      set(tmp_labels)
+      foreach(old_label IN LISTS variant_labels)
+        foreach(value IN LISTS values)
+          list(APPEND tmp_labels ${old_label}.${label}_${value})
+        endforeach()
+      endforeach()
+      set(variant_labels "${tmp_labels}")
+    endif()
+
+    if (NOT variant_defs)
+      foreach(value IN LISTS values)
+        list(APPEND variant_defs ${def}=${value})
+      endforeach()
+    else()
+      set(tmp_defs)
+      foreach(old_def IN LISTS variant_defs)
+        foreach(value IN LISTS values)
+          list(APPEND tmp_defs ${old_def}:${def}=${value}) # ':' joins the defs of one variant
+        endforeach()
+      endforeach()
+      set(variant_defs "${tmp_defs}")
+    endif()
+  endforeach()
+
+  set(${labels_var} "${variant_labels}" PARENT_SCOPE)
+  set(${defs_var} "${variant_defs}" PARENT_SCOPE)
+endfunction()
+
+# Create one `<prefix>.tests` meta target per configuration in CUB_TARGETS:
+foreach(cub_target IN LISTS CUB_TARGETS)
+  cub_get_target_property(config_prefix ${cub_target} PREFIX)
+  set(config_meta_target ${config_prefix}.tests)
+  add_custom_target(${config_meta_target})
+  add_dependencies(${config_prefix}.all ${config_meta_target}) # `<prefix>.all` (defined elsewhere) also builds the tests
+endforeach()
+
+file(GLOB test_srcs
+  RELATIVE "${CUB_SOURCE_DIR}/test"
+  CONFIGURE_DEPENDS
+  test_*.cu
+  catch2_test_*.cu
+) # test_srcs: matching sources as paths relative to ${CUB_SOURCE_DIR}/test
+
+## cub_is_catch2_test
+#
+# Sets `result_var` in the caller's scope to TRUE if `test_src` contains the
+# substring "catch2_test_" (anywhere in the string), and to FALSE otherwise.
+function(cub_is_catch2_test result_var test_src)
+  string(FIND "${test_src}" "catch2_test_" idx) # idx is -1 when the substring is absent
+  if (idx EQUAL -1)
+    set(${result_var} FALSE PARENT_SCOPE)
+  else()
+    set(${result_var} TRUE PARENT_SCOPE)
+  endif()
+endfunction()
+
+## cub_add_test
+#
+# Add a test target and register it with ctest. Catch2 tests become object
+# libraries linked into one per-config runner unless CUB_SEPARATE_CATCH2 is set.
+# target_name_var: Variable name to overwrite with the name of the test
+#   target. Useful for post-processing target information.
+# test_name: The name of the test minus "<config_prefix>.test." For example,
+#   test_foo.cu or catch2_test_foo.cu will be "foo", and a parametrized
+#   variant of it will be "foo.<label>".
+# test_src: The source file that implements the test.
+# cub_target: The reference cub target with configuration information.
+#
+function(cub_add_test target_name_var test_name test_src cub_target)
+  cub_get_target_property(config_prefix ${cub_target} PREFIX)
+
+  cub_is_catch2_test(is_catch2_test "${test_src}")
+
+  # The actual name of the test's target:
+  set(test_target ${config_prefix}.test.${test_name})
+  set(${target_name_var} ${test_target} PARENT_SCOPE) # report the target name to the caller
+
+  set(config_meta_target ${config_prefix}.tests)
+
+  if (is_catch2_test)
+    # Per config helper library:
+    set(config_c2h_target ${config_prefix}.test.catch2_helper)
+    if (NOT TARGET ${config_c2h_target}) # created once per config, on first use
+      add_library(${config_c2h_target} STATIC c2h/generators.cu)
+      set_property(TARGET ${config_c2h_target}
+        PROPERTY POSITION_INDEPENDENT_CODE ON
+      )
+
+      if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+        target_link_options(${config_c2h_target} PRIVATE "-cuda")
+        target_compile_options(${config_c2h_target} PRIVATE "-fPIC")
+      endif()
+
+      target_include_directories(${config_c2h_target}
+        PUBLIC "${CUB_SOURCE_DIR}/test"
+      )
+
+      cub_clone_target_properties(${config_c2h_target} ${cub_target})
+      target_link_libraries(${config_c2h_target} PRIVATE CUDA::curand ${cub_target})
+
+      if (CUB_IN_THRUST)
+        thrust_fix_clang_nvcc_build_for(${config_c2h_target})
+      endif()
+
+      if (CUB_ENABLE_TESTS_WITH_RDC)
+        cub_enable_rdc_for_cuda_target(${config_c2h_target})
+      endif()
+    endif() # config_c2h_target
+
+    if (CUB_SEPARATE_CATCH2)
+      add_executable(${test_target} "${test_src}")
+      target_compile_definitions(${test_target} PRIVATE "CUB_CONFIG_MAIN") # NOTE(review): presumably makes the source provide main() - confirm
+      add_dependencies(${config_meta_target} ${test_target})
+
+      add_test(NAME ${test_target} COMMAND "$<TARGET_FILE:${test_target}>")
+    else() # Not CUB_SEPARATE_CATCH2
+      # Per config catch2 runner
+      set(config_c2run_target ${config_prefix}.catch2_test)
+      if (NOT TARGET ${config_c2run_target}) # created once per config, on first use
+        add_executable(${config_c2run_target} catch2_runner.cu)
+        target_link_libraries(${config_c2run_target} PRIVATE
+          ${cub_target}
+          ${config_c2h_target}
+          Metal
+          Catch2::Catch2
+        )
+        cub_clone_target_properties(${config_c2run_target} ${cub_target})
+        add_dependencies(${config_meta_target} ${config_c2run_target})
+        target_include_directories(${config_c2run_target} PRIVATE
+          "${CUB_SOURCE_DIR}/test"
+        )
+        if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+          target_link_options(${config_c2run_target} PRIVATE "-cuda")
+        endif()
+
+        if (CUB_IN_THRUST)
+          thrust_fix_clang_nvcc_build_for(${config_c2run_target})
+        endif()
+
+        add_test(NAME ${config_c2run_target}
+          COMMAND "$<TARGET_FILE:${config_c2run_target}>"
+        )
+      endif() # per config catch2 runner
+
+      add_library(${test_target} OBJECT "${test_src}") # compiled as objects only; no ctest entry of its own
+      target_link_libraries(${config_c2run_target} PRIVATE ${test_target}) # objects end up in the shared runner
+    endif() # CUB_SEPARATE_CATCH2
+
+    if (CUB_IN_THRUST)
+      thrust_fix_clang_nvcc_build_for(${test_target})
+    endif()
+
+    target_link_libraries(${test_target} PRIVATE
+      ${cub_target}
+      ${config_c2h_target}
+      Metal
+      Catch2::Catch2
+    )
+    cub_clone_target_properties(${test_target} ${cub_target})
+    target_include_directories(${test_target}
+      PUBLIC "${CUB_SOURCE_DIR}/test"
+    )
+  else() # Not catch2:
+    # Related target names:
+    set(test_meta_target cub.all.test.${test_name})
+
+    add_executable(${test_target} "${test_src}")
+    target_link_libraries(${test_target} ${cub_target})
+    cub_clone_target_properties(${test_target} ${cub_target})
+    target_include_directories(${test_target} PRIVATE "${CUB_SOURCE_DIR}/test")
+    target_compile_definitions(${test_target} PRIVATE CUB_DEBUG_HOST_ASSERTIONS)
+
+    if (CUB_IN_THRUST)
+      thrust_fix_clang_nvcc_build_for(${test_target})
+    endif()
+
+    # Add to the active configuration's meta target
+    add_dependencies(${config_meta_target} ${test_target})
+
+    # Meta target that builds tests with this name for all configurations:
+    if (NOT TARGET ${test_meta_target})
+      add_custom_target(${test_meta_target})
+    endif()
+    add_dependencies(${test_meta_target} ${test_target})
+
+    add_test(NAME ${test_target} COMMAND "$<TARGET_FILE:${test_target}>")
+  endif() # Not catch2 test
+endfunction()
+
+# Sets out_var to 1 if the label contains the substring "cdp_" (i.e. the test
+# has cdp variants, whether or not CDP is enabled in this one), else to 0.
+function(_cub_has_cdp_variant out_var label)
+  string(FIND "${label}" "cdp_" idx) # idx is -1 when the substring is absent
+  if (idx EQUAL -1)
+    set(${out_var} 0 PARENT_SCOPE)
+  else()
+    set(${out_var} 1 PARENT_SCOPE)
+  endif()
+endfunction()
+
+# Sets out_var to 1 if the label contains the substring "cdp_1", i.e. cdp is
+# explicitly requested for this variant; sets it to 0 otherwise.
+function(_cub_is_cdp_enabled_variant out_var label)
+  string(FIND "${label}" "cdp_1" idx) # idx is -1 when the substring is absent
+  if (idx EQUAL -1)
+    set(${out_var} 0 PARENT_SCOPE)
+  else()
+    set(${out_var} 1 PARENT_SCOPE)
+  endif()
+endfunction()
+
+foreach (test_src IN LISTS test_srcs)
+  get_filename_component(test_name "${test_src}" NAME_WE)
+  string(REGEX REPLACE "^catch2_test_" "" test_name "${test_name}") # strip the filename prefix, whichever applies
+  string(REGEX REPLACE "^test_" "" test_name "${test_name}")
+
+  cub_get_test_params("${test_src}" variant_labels variant_defs)
+  list(LENGTH variant_labels num_variants)
+
+  # Subtract 1 to support the inclusive endpoint of foreach(...RANGE...):
+  math(EXPR range_end "${num_variants} - 1") # -1 without variants; only used when num_variants > 0
+
+  # Verbose output:
+  if (num_variants GREATER 0)
+    message(VERBOSE "Detected ${num_variants} variants of test '${test_src}':")
+    foreach(var_idx RANGE ${range_end})
+      math(EXPR i "${var_idx} + 1")
+      list(GET variant_labels ${var_idx} label)
+      list(GET variant_defs ${var_idx} defs)
+      message(VERBOSE "  ${i}: ${test_name} ${label} ${defs}")
+    endforeach()
+  endif()
+
+  foreach(cub_target IN LISTS CUB_TARGETS)
+    cub_get_target_property(config_prefix ${cub_target} PREFIX)
+
+    if (num_variants EQUAL 0)
+      # Only one version of this test.
+      cub_add_test(test_target ${test_name} "${test_src}" ${cub_target})
+      if (CUB_ENABLE_TESTS_WITH_RDC)
+        cub_enable_rdc_for_cuda_target(${test_target})
+      endif()
+    else() # has variants:
+      # Meta target to build all parametrizations of the current test for the
+      # current CUB_TARGET config
+      set(variant_meta_target ${config_prefix}.test.${test_name}.all)
+      if (NOT TARGET ${variant_meta_target})
+        add_custom_target(${variant_meta_target})
+      endif()
+
+      # Meta target to build all parametrizations of the current test for all
+      # CUB_TARGET configs
+      set(cub_variant_meta_target cub.all.test.${test_name}.all)
+      if (NOT TARGET ${cub_variant_meta_target})
+        add_custom_target(${cub_variant_meta_target})
+      endif()
+
+      # Generate multiple tests, one per variant.
+      # See `cub_get_test_params` for details.
+      foreach(var_idx RANGE ${range_end})
+        list(GET variant_labels ${var_idx} label)
+        list(GET variant_defs ${var_idx} defs)
+        string(REPLACE ":" ";" defs "${defs}") # `A=1:B=2` -> list of separate definitions
+        # A unique index per variant:
+        list(APPEND defs VAR_IDX=${var_idx})
+
+        # Check if the test has explicit CDP variants:
+        _cub_has_cdp_variant(explicit_cdp "${label}")
+        _cub_is_cdp_enabled_variant(enable_cdp "${label}")
+
+        if (enable_cdp)
+          if (NOT CUB_ENABLE_TESTS_WITH_RDC)
+            continue() # cdp_1 variants are not generated at all when RDC testing is off
+          endif()
+        endif()
+
+        cub_add_test(test_target
+          ${test_name}.${label}
+          "${test_src}"
+          ${cub_target}
+        )
+        add_dependencies(${variant_meta_target} ${test_target})
+        add_dependencies(${cub_variant_meta_target} ${test_target})
+        target_compile_definitions(${test_target} PRIVATE ${defs})
+
+        # Enable RDC if the test either:
+        # 1. Explicitly requests it (cdp_1 label)
+        # 2. Does not have an explicit CDP variant (no cdp_0 or cdp_1) but
+        #    RDC testing is globally enabled.
+        #
+        # Tests that explicitly request no cdp (cdp_0 label) should never enable
+        # RDC.
+        if (enable_cdp OR ((NOT explicit_cdp) AND CUB_ENABLE_TESTS_WITH_RDC))
+          cub_enable_rdc_for_cuda_target(${test_target})
+        endif()
+      endforeach() # Variant
+    endif() # Has variants
+  endforeach() # CUB targets
+endforeach() # Source file
+
+add_subdirectory(cmake) # configure the `cmake/` subdirectory of this test directory
diff --git a/include/cub/test/README.md b/include/cub/test/README.md
new file mode 100644
index 0000000..81891f4
--- /dev/null
+++ b/include/cub/test/README.md
@@ -0,0 +1,125 @@
+# Test Parametrization
+
+Some of CUB's tests are very slow to build and are capable of exhausting RAM
+during compilation/linking. To avoid such issues, large tests are split into
+multiple executables to take advantage of parallel computation and reduce memory
+usage.
+
+CUB facilitates this by checking for special `%PARAM%` comments in each test's
+source code, and then uses this information to generate multiple executables
+with different configurations.
+
+## Using `%PARAM%`
+
+The `%PARAM%` hint provides an automated method of generating multiple test
+executables from a single source file. To use it, add one or more special
+comments to the test source file:
+
+```cpp
+// %PARAM% [definition] [label] [values]
+```
+
+CMake will parse the source file and extract these comments, using them to
+generate multiple test executables for the full cartesian product of values.
+
+- `definition` will be used as a preprocessor definition name. By convention,
+  these begin with `TEST_`.
+- `label` is a short, human-readable label that will be used in the test
+  executable's name to identify the test variant.
+- `values` is a colon-separated list of values used during test generation. Only
+  numeric values have been tested.
+
+## Special Labels
+
+### CDP / RDC Testing
+
+If a `label` is `cdp`, it is assumed that the parameter is used to explicitly
+test variants built with and without CDP support. The `values` for such a
+parameter must be `0:1`, with `0` indicating CDP disabled (RDC off) and `1`
+indicating CDP enabled (RDC on).
+
+Tests without a variant labeled `cdp` will only enable RDC if the CMake variable
+`CUB_ENABLE_TESTS_WITH_RDC` is true; if it is false, `cdp_1` variants are skipped.
+
+## Example
+
+For example, if `test_baz.cu` contains the following lines:
+
+```cpp
+// %PARAM% TEST_FOO foo 0:1:2
+// %PARAM% TEST_CDP cdp 0:1
+```
+
+Six executables and CTest targets will be generated with unique definitions
+(only c++17 targets shown):
+
+| Executable Name                  | Preprocessor Definitions    | RDC State |
+|----------------------------------|-----------------------------|-----------|
+| `cub.cpp17.test.baz.foo_0.cdp_0` | `-DTEST_FOO=0 -DTEST_CDP=0` | Disabled  |
+| `cub.cpp17.test.baz.foo_0.cdp_1` | `-DTEST_FOO=0 -DTEST_CDP=1` | Enabled   |
+| `cub.cpp17.test.baz.foo_1.cdp_0` | `-DTEST_FOO=1 -DTEST_CDP=0` | Disabled  |
+| `cub.cpp17.test.baz.foo_1.cdp_1` | `-DTEST_FOO=1 -DTEST_CDP=1` | Enabled   |
+| `cub.cpp17.test.baz.foo_2.cdp_0` | `-DTEST_FOO=2 -DTEST_CDP=0` | Disabled  |
+| `cub.cpp17.test.baz.foo_2.cdp_1` | `-DTEST_FOO=2 -DTEST_CDP=1` | Enabled   |
+
+## Changing `%PARAM%` Hints
+
+Since CMake does not automatically reconfigure the build when source files are
+modified, CMake will need to be rerun manually whenever the `%PARAM%` comments
+change.
+
+## Building and Running Split Tests
+
+CMake will generate individual build and test targets for each test variant, and
+also provides build "metatargets" that compile all variants of a given test.
+
+The variants follow the usual naming convention for CUB's tests, but include a
+suffix that differentiates them (e.g. `.foo_X.cdp_Y` in the example above).
+
+### Individual Test Variants
+
+Continuing with the `test_baz.cu` example, the test variant that uses
+`-DTEST_FOO=1 -DTEST_CDP=1` can be built and run alone:
+
+```bash
+# Build a single variant:
+make cub.cpp17.test.baz.foo_1.cdp_1
+
+# Run a single variant
+bin/cub.cpp17.test.baz.foo_1.cdp_1
+
+# Run a single variant using CTest regex:
+ctest -R cub\.cpp17\.test\.baz\.foo_1\.cdp_1
+```
+
+### All Variants of a Test
+
+Using a metatarget and the proper regex, all variants of a test can be built and
+executed without listing all variants explicitly:
+
+```bash
+# Build all variants using the `.all` metatarget
+make cub.cpp17.test.baz.all
+
+# Run all variants:
+ctest -R cub\.cpp17\.test\.baz\.
+```
+
+## Debugging
+
+Running CMake with `--log-level=VERBOSE` will print out extra information about
+all detected test variants.
+
+## Additional Info
+
+Ideally, only parameters that directly influence kernel template instantiations
+should be split out in this way. If changing a parameter doesn't change the
+kernel template type, the same kernel will be compiled into multiple
+executables. This defeats the purpose of splitting up the test since the
+compiler will generate redundant code across the new split executables.
+
+The best candidate parameters for splitting are input value types, rather than
+integral parameters like BLOCK_THREADS, etc. Splitting by value type allows more
+infrastructure (data generation, validation) to be reused. Splitting other
+parameters can cause build times to increase since type-related infrastructure
+has to be rebuilt for each test variant.
diff --git a/include/cub/test/bfloat16.h b/include/cub/test/bfloat16.h
new file mode 100644
index 0000000..aa9474d
--- /dev/null
+++ b/include/cub/test/bfloat16.h
@@ -0,0 +1,249 @@
+/******************************************************************************
+ * Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#pragma once
+
+/**
+ * \file
+ * Utilities for interacting with the opaque CUDA __nv_bfloat16 type
+ */
+
+#include <stdint.h>
+#include <cuda_bf16.h>
+#include <iosfwd>
+
+#include <cub/util_type.cuh>
+
+#ifdef __GNUC__
+// There's a ton of type-punning going on in this file.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#endif
+
+
+/******************************************************************************
+ * bfloat16_t
+ ******************************************************************************/
+
+/**
+ * Host-based bfloat16 data type compatible and convertible with __nv_bfloat16
+ */
+struct bfloat16_t
+{
+    uint16_t __x; ///< Raw bits: the upper 16 bits of the matching 32-bit float
+
+    /// Constructor from __nv_bfloat16 (copies the 16-bit representation)
+    __host__ __device__ __forceinline__
+    bfloat16_t(const __nv_bfloat16 &other)
+    {
+        __x = reinterpret_cast<const uint16_t&>(other);
+    }
+
+    /// Constructor from integer (converted through float)
+    __host__ __device__ __forceinline__
+    bfloat16_t(int a)
+    {
+        *this = bfloat16_t(float(a));
+    }
+
+    /// Constructor from std::size_t (converted through float)
+    __host__ __device__ __forceinline__
+    bfloat16_t(std::size_t a)
+    {
+        *this = bfloat16_t(float(a));
+    }
+
+    /// Default constructor (leaves the storage uninitialized)
+    bfloat16_t() = default;
+
+    /// Constructor from float: round-to-nearest-even, NaN becomes 0x7FFF
+    __host__ __device__ __forceinline__
+    bfloat16_t(float a)
+    {
+        // Reference:
+        // https://github.com/pytorch/pytorch/blob/44cc873fba5e5ffc4d4d4eef3bd370b653ce1ce1/c10/util/BFloat16.h#L51
+        uint16_t ir;
+        if (a != a) { // true only when a is NaN
+            ir = UINT16_C(0x7FFF);
+        } else {
+            union {
+                uint32_t U32;
+                float F32;
+            };
+
+            F32 = a;
+            uint32_t rounding_bias = ((U32 >> 16) & 1) + UINT32_C(0x7FFF);
+            ir = static_cast<uint16_t>((U32 + rounding_bias) >> 16);
+        }
+        this->__x = ir;
+    }
+
+    /// Cast to __nv_bfloat16 (reinterprets the 16-bit storage)
+    __host__ __device__ __forceinline__
+    operator __nv_bfloat16() const
+    {
+        return reinterpret_cast<const __nv_bfloat16&>(__x);
+    }
+
+    /// Cast to float (the raw bits become the upper half of the float)
+    __host__ __device__ __forceinline__
+    operator float() const
+    {
+        float f = 0;
+        uint32_t *p = reinterpret_cast<uint32_t *>(&f);
+        *p = uint32_t(__x) << 16;
+        return f;
+    }
+
+
+    /// Get raw storage
+    __host__ __device__ __forceinline__
+    uint16_t raw() const
+    {
+        return this->__x;
+    }
+
+    /// Equality (bitwise comparison of the raw storage)
+    __host__ __device__ __forceinline__
+    bool operator ==(const bfloat16_t &other) const
+    {
+        return (this->__x == other.__x);
+    }
+
+    /// Inequality (bitwise comparison of the raw storage)
+    __host__ __device__ __forceinline__
+    bool operator !=(const bfloat16_t &other) const
+    {
+        return (this->__x != other.__x);
+    }
+
+    /// Assignment by sum (computed in float, then rounded back)
+    __host__ __device__ __forceinline__
+    bfloat16_t& operator +=(const bfloat16_t &rhs)
+    {
+        *this = bfloat16_t(float(*this) + float(rhs));
+        return *this;
+    }
+
+    /// Multiply (const-qualified so that const operands can be multiplied)
+    __host__ __device__ __forceinline__
+    bfloat16_t operator*(const bfloat16_t &other) const
+    {
+        return bfloat16_t(float(*this) * float(other));
+    }
+
+    /// Add (const-qualified so that const operands can be added)
+    __host__ __device__ __forceinline__
+    bfloat16_t operator+(const bfloat16_t &other) const
+    {
+        return bfloat16_t(float(*this) + float(other));
+    }
+
+    /// Less-than (compared as float)
+    __host__ __device__ __forceinline__
+    bool operator<(const bfloat16_t &other) const
+    {
+        return float(*this) < float(other);
+    }
+
+    /// Less-than-equal (compared as float)
+    __host__ __device__ __forceinline__
+    bool operator<=(const bfloat16_t &other) const
+    {
+        return float(*this) <= float(other);
+    }
+
+    /// Greater-than (compared as float)
+    __host__ __device__ __forceinline__
+    bool operator>(const bfloat16_t &other) const
+    {
+        return float(*this) > float(other);
+    }
+
+    /// Greater-than-equal (compared as float)
+    __host__ __device__ __forceinline__
+    bool operator>=(const bfloat16_t &other) const
+    {
+        return float(*this) >= float(other);
+    }
+
+    /// numeric_traits<bfloat16_t>::max (0x7F7F, the largest finite value)
+    __host__ __device__ __forceinline__
+    static bfloat16_t (max)() {
+        uint16_t max_word = 0x7F7F;
+        return reinterpret_cast<bfloat16_t&>(max_word);
+    }
+
+    /// numeric_traits<bfloat16_t>::lowest (0xFF7F, the most negative finite value)
+    __host__ __device__ __forceinline__
+    static bfloat16_t lowest() {
+        uint16_t lowest_word = 0xFF7F;
+        return reinterpret_cast<bfloat16_t&>(lowest_word);
+    }
+};
+
+
+/******************************************************************************
+ * I/O stream overloads
+ ******************************************************************************/
+
+/// Insert formatted \p bfloat16_t (printed as its float value) into \p out.
+/// Marked inline: a function defined in a header must not break the ODR.
+inline std::ostream& operator<<(std::ostream &out, const bfloat16_t &x)
+{
+    return out << static_cast<float>(x);
+}
+
+
+/// Insert formatted \p __nv_bfloat16 into \p out (inline to respect the ODR)
+inline std::ostream& operator<<(std::ostream &out, const __nv_bfloat16 &x)
+{
+    return out << bfloat16_t(x);
+}
+
+
+/******************************************************************************
+ * Traits overloads
+ ******************************************************************************/
+
+template <> // FpLimits specialization for the host-side bfloat16_t type
+struct CUB_NS_QUALIFIER::FpLimits<bfloat16_t>
+{
+    static __host__ __device__ __forceinline__ bfloat16_t Max() { return bfloat16_t::max(); }
+    // Max() above forwards to bfloat16_t::max(); Lowest() forwards to bfloat16_t::lowest().
+    static __host__ __device__ __forceinline__ bfloat16_t Lowest() { return bfloat16_t::lowest(); }
+};
+
+template <> // NumericTraits specialization: floating-point category, unsigned short bit type
+struct CUB_NS_QUALIFIER::NumericTraits<bfloat16_t>
+    : CUB_NS_QUALIFIER::
+        BaseTraits<FLOATING_POINT, true, false, unsigned short, bfloat16_t>
+{}; // NOTE(review): the two bool arguments are presumably BaseTraits' PRIMITIVE/NULL_TYPE flags - confirm
+
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif
diff --git a/include/cub/test/c2h/custom_type.cuh b/include/cub/test/c2h/custom_type.cuh
new file mode 100644
index 0000000..b805133
--- /dev/null
+++ b/include/cub/test/c2h/custom_type.cuh
@@ -0,0 +1,200 @@
+/******************************************************************************
+* Copyright (c) 2011-2022, NVIDIA CORPORATION.  All rights reserved.
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions are met:
+*     * Redistributions of source code must retain the above copyright
+*       notice, this list of conditions and the following disclaimer.
+*     * Redistributions in binary form must reproduce the above copyright
+*       notice, this list of conditions and the following disclaimer in the
+*       documentation and/or other materials provided with the distribution.
+*     * Neither the name of the NVIDIA CORPORATION nor the
+*       names of its contributors may be used to endorse or promote products
+*       derived from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+******************************************************************************/
+
+#pragma once
+
+#include <limits>
+#include <memory>
+#include <ostream>
+
+#include <thrust/device_vector.h>
+
+namespace c2h
+{
+
+struct custom_type_state_t // key/value payload that custom_type_t inherits
+{
+  std::size_t key{}; // value-initialized, i.e. 0 by default
+  std::size_t val{}; // value-initialized, i.e. 0 by default
+};
+
+template <template<typename> class... Policies>
+class custom_type_t : public custom_type_state_t
+                    , public Policies<custom_type_t<Policies...>>...
+{
+  // State comes from custom_type_state_t; operators come from the CRTP policy bases.
+public:
+  /// Streams the object as "{ key, val }".
+  friend __host__ std::ostream &operator<<(std::ostream &os, const custom_type_t &self)
+  {
+    os << "{ " << self.key << ", " << self.val << " }";
+    return os;
+  }
+};
+
+template <class CustomType>
+class less_comparable_t
+{
+  // CRTP policy adding operator<, which orders by `key` alone. The CUDA compiler
+  // follows the IA64 ABI for class layout, MSVC does not; hence this member.
+  char workaround_msvc;
+
+public:
+  __host__ __device__ bool operator<(const CustomType& other) const
+  {
+    return other.key > static_cast<const CustomType&>(*this).key;
+  }
+};
+
+template <class CustomType>
+class greater_comparable_t
+{
+  // CRTP policy adding operator>, which orders by `key` alone. The CUDA compiler
+  // follows the IA64 ABI for class layout, MSVC does not; hence this member.
+  char workaround_msvc;
+
+public:
+  __host__ __device__ bool operator>(const CustomType& other) const
+  {
+    return other.key < static_cast<const CustomType&>(*this).key;
+  }
+};
+
+template <class CustomType>
+class lexicographical_less_comparable_t
+{
+  // CRTP policy adding operator< on (key, val). The CUDA compiler follows the
+  // IA64 ABI for class layout, MSVC does not; hence this member.
+  char workaround_msvc;
+
+public:
+  __host__ __device__ bool operator<(const CustomType &other) const
+  {
+    const CustomType &lhs = static_cast<const CustomType &>(*this);
+    return lhs.key < other.key || (lhs.key == other.key && lhs.val < other.val);
+  }
+};
+
+template <class CustomType>
+class lexicographical_greater_comparable_t
+{
+  // CRTP policy adding operator> on (key, val). The CUDA compiler follows the
+  // IA64 ABI for class layout, MSVC does not; hence this member.
+  char workaround_msvc;
+
+public:
+  __host__ __device__ bool operator>(const CustomType &other) const
+  {
+    const CustomType &lhs = static_cast<const CustomType &>(*this);
+    return lhs.key > other.key || (lhs.key == other.key && lhs.val > other.val);
+  }
+};
+
+template <class CustomType>
+class equal_comparable_t
+{
+  // CRTP policy adding operator== over both fields. The CUDA compiler follows
+  // the IA64 ABI for class layout, MSVC does not; hence this member.
+  char workaround_msvc;
+
+public:
+  __host__ __device__ bool operator==(const CustomType& other) const
+  {
+    const CustomType& lhs = static_cast<const CustomType&>(*this);
+    // Equal only when key and val both match.
+    const bool same_key = lhs.key == other.key;
+    return same_key && lhs.val == other.val;
+  }
+};
+
+template <class CustomType>
+class subtractable_t
+{
+  // CRTP policy adding a field-wise operator-. The CUDA compiler follows the
+  // IA64 ABI for class layout, MSVC does not; hence this member.
+  char workaround_msvc;
+
+public:
+  __host__ __device__ CustomType operator-(const CustomType& other) const
+  {
+    const CustomType& lhs = static_cast<const CustomType&>(*this);
+
+    // Unsigned subtraction: wraps around instead of going negative.
+    CustomType diff{};
+    diff.key = lhs.key - other.key;
+    diff.val = lhs.val - other.val;
+
+    return diff;
+  }
+};
+
+template <class CustomType>
+class accumulateable_t
+{
+  // CRTP policy adding a field-wise operator+. The CUDA compiler follows the
+  // IA64 ABI for class layout, MSVC does not; hence this member.
+  char workaround_msvc;
+
+public:
+  __host__ __device__ CustomType operator+(const CustomType& other) const
+  {
+    const CustomType& lhs = static_cast<const CustomType&>(*this);
+
+    // Unsigned addition: wraps around on overflow.
+    CustomType sum{};
+    sum.key = lhs.key + other.key;
+    sum.val = lhs.val + other.val;
+
+    return sum;
+  }
+};
+
+} // c2h
+
+namespace std {
+  template<template<typename> class... Policies>
+  class numeric_limits<c2h::custom_type_t<Policies...>>
+  {
+  public:
+     /// Object whose key and val both hold the largest std::size_t.
+     static c2h::custom_type_t<Policies...> max()
+     {
+       c2h::custom_type_t<Policies...> result;
+       result.key = result.val = std::numeric_limits<std::size_t>::max();
+       return result;
+     }
+
+     /// Object whose key and val both hold the lowest std::size_t.
+     static c2h::custom_type_t<Policies...> lowest()
+     {
+       c2h::custom_type_t<Policies...> result;
+       result.key = result.val = std::numeric_limits<std::size_t>::lowest();
+       return result;
+     }
+  };
+}
+
diff --git a/include/cub/test/c2h/generators.cu b/include/cub/test/c2h/generators.cu
new file mode 100644
index 0000000..0c21056
--- /dev/null
+++ b/include/cub/test/c2h/generators.cu
@@ -0,0 +1,417 @@
+/******************************************************************************
+* Copyright (c) 2011-2022, NVIDIA CORPORATION.  All rights reserved.
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions are met:
+*     * Redistributions of source code must retain the above copyright
+*       notice, this list of conditions and the following disclaimer.
+*     * Redistributions in binary form must reproduce the above copyright
+*       notice, this list of conditions and the following disclaimer in the
+*       documentation and/or other materials provided with the distribution.
+*     * Neither the name of the NVIDIA CORPORATION nor the
+*       names of its contributors may be used to endorse or promote products
+*       derived from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+******************************************************************************/
+
+#define C2H_EXPORTS
+
+#include <thrust/distance.h>
+#include <thrust/execution_policy.h>
+#include <thrust/for_each.h>
+#include <thrust/tabulate.h>
+
+#include <cstdint>
+
+#include <c2h/custom_type.cuh>
+#include <c2h/generators.cuh>
+#include <curand.h>
+#include <fill_striped.cuh>
+
+namespace c2h
+{
+
+// Random-data generator built on cuRAND; obtain the shared object via instance().
+class generator_t
+{
+  generator_t();
+public:
+  // Non-copyable: a copy's destructor would destroy the same m_gen a second time.
+  generator_t(const generator_t &) = delete;
+  generator_t &operator=(const generator_t &) = delete;
+  static generator_t &instance();
+  ~generator_t();
+
+  // Seeded fill. For floating-point T the default `min` is the smallest positive normal value.
+  template <typename T>
+  void operator()(seed_t seed,
+                  thrust::device_vector<T> &data,
+                  T min = std::numeric_limits<T>::min(),
+                  T max = std::numeric_limits<T>::max());
+
+  // Modulo fill: element i becomes i % modulo.
+  template <typename T>
+  void operator()(modulo_t modulo, thrust::device_vector<T> &data);
+
+  float* distribution();                      // raw pointer into m_distribution
+  curandGenerator_t &gen() { return m_gen; }  // the underlying cuRAND handle
+  // Re-seeds m_gen, resizes m_distribution to num_items and fills it with samples.
+  float* prepare_random_generator(seed_t seed, std::size_t num_items);
+
+private:
+  curandGenerator_t m_gen;
+  thrust::device_vector<float> m_distribution;
+};
+
+
+template <typename T>
+struct random_to_item_t
+{
+  float m_min; // lower bound, narrowed to float
+  float m_max; // upper bound, narrowed to float
+  // Bounds are kept as float, so wide integer or double bounds lose precision.
+  __host__ __device__ random_to_item_t(T min, T max)
+      : m_min(static_cast<float>(min))
+      , m_max(static_cast<float>(max))
+  {}
+  // Linear interpolation: 0 yields m_min, 1 yields (approximately) m_max.
+  __device__ T operator()(float random_value)
+  {
+    return static_cast<T>(m_min + random_value * (m_max - m_min));
+  }
+};
+
+template <typename T, int VecItem>
+struct random_to_vec_item_t;
+// Specializes random_to_vec_item_t<T, VEC_ITEM>: maps m_in[idx] into component VEC_FIELD of m_out[idx].
+#define RANDOM_TO_VEC_ITEM_SPEC(VEC_ITEM, VEC_FIELD)                           \
+  template <typename T>                                                        \
+  struct random_to_vec_item_t<T, VEC_ITEM>                                     \
+  {                                                                            \
+    __device__ void operator()(std::size_t idx)                                \
+    {                                                                          \
+      auto min             = m_min.VEC_FIELD;                                  \
+      auto max             = m_max.VEC_FIELD;                                  \
+      m_out[idx].VEC_FIELD = random_to_item_t<decltype(min)>(min,              \
+                                                             max)(m_in[idx]);  \
+    }                                                                          \
+    random_to_vec_item_t(T min, T max, float *in, T *out)                      \
+        : m_min(min)                                                           \
+        , m_max(max)                                                           \
+        , m_in(in)                                                             \
+        , m_out(out)                                                           \
+    {}                                                                         \
+    T m_min;                                                                   \
+    T m_max;                                                                   \
+    float *m_in{};                                                             \
+    T *m_out{};                                                                \
+  }
+
+RANDOM_TO_VEC_ITEM_SPEC(0, x);
+RANDOM_TO_VEC_ITEM_SPEC(1, y);
+RANDOM_TO_VEC_ITEM_SPEC(2, z);
+RANDOM_TO_VEC_ITEM_SPEC(3, w);
+
+generator_t::generator_t() // creates m_gen; any status returned by cuRAND is discarded
+{
+  curandCreateGenerator(&m_gen, CURAND_RNG_PSEUDO_DEFAULT);
+}
+
+generator_t::~generator_t() // releases m_gen; any status returned by cuRAND is discarded
+{
+  curandDestroyGenerator(m_gen);
+}
+
+float* generator_t::distribution() // raw pointer to the first float of m_distribution
+{
+  return thrust::raw_pointer_cast(m_distribution.data());
+}
+
+// Re-seeds the generator, sizes the sample buffer to num_items floats, fills it
+// with one batch of uniform samples and returns the buffer's raw pointer.
+float *generator_t::prepare_random_generator(seed_t seed, std::size_t num_items)
+{
+  curandSetPseudoRandomGeneratorSeed(m_gen, seed.get());
+  m_distribution.resize(num_items);
+
+  float *const d_samples = thrust::raw_pointer_cast(m_distribution.data());
+  curandGenerateUniform(m_gen, d_samples, num_items);
+
+  return d_samples;
+}
+
+// Device functor for custom types: converts sample idx to std::size_t and
+// stores it in `key` (SetKeys == true) or `val` (SetKeys == false).
+template <bool SetKeys>
+struct random_to_custom_t
+{
+  static constexpr std::size_t m_max_key = std::numeric_limits<std::size_t>::max();
+
+  __device__ void operator()(std::size_t idx)
+  {
+    // Elements are located by byte stride because their concrete type is not
+    // known here; this assumes the custom_type_state_t part sits at offset 0.
+    char *element_bytes = m_out + idx * m_element_size;
+    custom_type_state_t *state =
+      reinterpret_cast<custom_type_state_t *>(element_bytes);
+
+    // NOTE(review): a sample of exactly 1.0f would scale to a value that no
+    // longer fits in std::size_t; confirm the range the sampler can produce.
+    const float scaled = static_cast<float>(m_max_key) * m_in[idx];
+    const std::size_t sample = static_cast<std::size_t>(scaled);
+
+    std::size_t &field = SetKeys ? state->key : state->val;
+    field = sample;
+  }
+
+  // Captures the sample buffer, the output bytes and the element stride.
+  random_to_custom_t(float *in,
+                     char *out,
+                     std::size_t element_size)
+    : m_in(in)
+    , m_out(out)
+    , m_element_size(element_size)
+  {}
+
+  float *m_in{};                // uniform samples, one per element
+  char *m_out{};                // output array viewed as raw bytes
+  std::size_t m_element_size{}; // byte stride between consecutive elements
+};
+
+// Seeded fill: draws data.size() uniform floats, then maps each one to a T.
+template <class T>
+void generator_t::operator()(seed_t seed, thrust::device_vector<T> &data, T min, T max)
+{
+  const std::size_t num_items = data.size();
+  prepare_random_generator(seed, num_items);
+
+  // m_distribution was just resized to num_items, so both ranges have equal length.
+  random_to_item_t<T> to_item(min, max);
+  thrust::transform(m_distribution.begin(),
+                    m_distribution.end(),
+                    data.begin(), to_item);
+}
+
+// Tabulate functor: element i receives (i mod n) converted to T.
+template <typename T>
+struct count_to_item_t
+{
+  std::size_t n; // modulus; NOTE(review): n == 0 would divide by zero - confirm callers
+
+  count_to_item_t(std::size_t n) : n(n) {}
+
+  template <typename CounterT>
+  __device__ T operator()(CounterT id)
+  {
+    const std::size_t position = static_cast<std::size_t>(id);
+    return static_cast<T>(position % n);
+  }
+};
+
+template <typename T> // modulo fill: data[i] = i % mod
+void generator_t::operator()(modulo_t mod, thrust::device_vector<T> &data)
+{
+  count_to_item_t<T> index_mod_n{mod.get()};
+  thrust::tabulate(data.begin(), data.end(), index_mod_n);
+}
+
+
+generator_t& generator_t::instance() // accessor for the single shared generator
+{
+  static generator_t generator; // function-local static: constructed on first call
+  return generator;
+}
+
+namespace detail
+{
+
+// Type-erased fill for arrays of custom types: `elements` objects of
+// `element_size` bytes each, starting at d_out. min/max are accepted but unused.
+void gen(seed_t seed,
+         char* d_out,
+         custom_type_state_t /* min */,
+         custom_type_state_t /* max */,
+         std::size_t elements,
+         std::size_t element_size)
+{
+  generator_t& rng = generator_t::instance();
+
+  thrust::counting_iterator<std::size_t> first(0);
+  thrust::counting_iterator<std::size_t> last(elements);
+
+  // Pass 1: seed the generator, draw one batch of samples, write every key.
+  float *d_samples = rng.prepare_random_generator(seed, elements);
+  random_to_custom_t<true> write_keys{d_samples, d_out, element_size};
+  thrust::for_each(thrust::device,
+                   first, last,
+                   write_keys);
+
+  // Pass 2: refill the same buffer with the next batch (no re-seeding), then
+  // write every val.
+  curandGenerateUniform(rng.gen(), rng.distribution(), elements);
+  random_to_custom_t<false> write_vals{d_samples, d_out, element_size};
+  thrust::for_each(thrust::device,
+                   first, last,
+                   write_vals);
+}
+
+}
+
+
+// Public entry point for seeded fills; defers to the shared generator_t.
+template <typename T>
+void gen(seed_t seed, thrust::device_vector<T> &data, T min, T max)
+{
+  generator_t &shared = generator_t::instance();
+
+  shared(seed, data, min, max);
+}
+
+template <typename T> // public entry point for modulo fills
+void gen(modulo_t mod, thrust::device_vector<T> &data)
+{
+  generator_t &shared = generator_t::instance();
+  shared(mod, data);
+}
+// Explicit instantiation of the seeded gen<TYPE>() overload.
+#define INSTANTIATE_RND(TYPE) \
+template \
+void gen<TYPE>( \
+    seed_t, \
+    thrust::device_vector<TYPE> &data, \
+    TYPE min, \
+    TYPE max)
+// Explicit instantiation of the modulo gen<TYPE>() overload.
+#define INSTANTIATE_MOD(TYPE) \
+template \
+void gen<TYPE>( \
+    modulo_t, \
+    thrust::device_vector<TYPE> &data)
+// Instantiates both overloads for TYPE; the caller supplies the final semicolon.
+#define INSTANTIATE(TYPE) \
+  INSTANTIATE_RND(TYPE); \
+  INSTANTIATE_MOD(TYPE)
+
+INSTANTIATE(std::uint8_t);
+INSTANTIATE(std::uint16_t);
+INSTANTIATE(std::uint32_t);
+INSTANTIATE(std::uint64_t);
+
+INSTANTIATE(std::int8_t);
+INSTANTIATE(std::int16_t);
+INSTANTIATE(std::int32_t);
+INSTANTIATE(std::int64_t);
+
+INSTANTIATE(float);
+INSTANTIATE(double);
+
+// Fills the components of a vector type one at a time, from index VecItem
+// down to 0; the <T, -1> specialization ends the compile-time recursion.
+template <typename T, int VecItem>
+struct vec_gen_helper_t;
+
+template <typename T>
+struct vec_gen_helper_t<T, -1>
+{
+  static void gen(thrust::device_vector<T> &, T, T) {} // nothing left to fill
+};
+
+template <typename T, int VecItem>
+struct vec_gen_helper_t
+{
+  static void gen(thrust::device_vector<T> &data, T min, T max)
+  {
+    const std::size_t num_items = data.size();
+    generator_t& rng = generator_t::instance();
+
+    // Draw a fresh batch of samples into the shared sample buffer. The buffer
+    // is not resized here; the caller is expected to have sized it already.
+    float *d_samples = rng.distribution();
+    curandGenerateUniform(rng.gen(), d_samples, num_items);
+
+    // Write component VecItem of every element.
+    T *d_items = thrust::raw_pointer_cast(data.data());
+    thrust::counting_iterator<std::size_t> first(0);
+    thrust::counting_iterator<std::size_t> last(num_items);
+    random_to_vec_item_t<T, VecItem> fill_component{min, max,
+                                                    d_samples, d_items};
+    thrust::for_each(thrust::device, first, last, fill_component);
+
+    // Continue with the next lower component.
+    vec_gen_helper_t<T, VecItem - 1>::gen(data, min, max);
+  }
+};
+
+// Seeded gen() specialization for vector type TYPE##SIZE: seeds and sizes the sample buffer, then fills components SIZE-1 down to 0.
+#define VEC_SPECIALIZATION(TYPE, SIZE) \
+template<> void gen<TYPE##SIZE>(seed_t seed, \
+                                thrust::device_vector<TYPE##SIZE> &data, \
+                                TYPE##SIZE min, \
+                                TYPE##SIZE max) \
+{ \
+  generator_t& generator = generator_t::instance(); \
+  generator.prepare_random_generator(seed, data.size()); \
+  vec_gen_helper_t<TYPE##SIZE, SIZE - 1>::gen(data, min, max); \
+}
+
+VEC_SPECIALIZATION(int, 2);
+VEC_SPECIALIZATION(long, 2);
+VEC_SPECIALIZATION(longlong, 2);
+VEC_SPECIALIZATION(longlong, 4);
+
+VEC_SPECIALIZATION(char, 2);
+VEC_SPECIALIZATION(char, 4);
+
+VEC_SPECIALIZATION(short, 2);
+
+VEC_SPECIALIZATION(double, 2);
+
+VEC_SPECIALIZATION(uchar, 3);
+
+VEC_SPECIALIZATION(ulonglong, 4);
+
+template <typename VecType, typename Type>
+struct vec_gen_t
+{
+  std::size_t n; // modulus applied to the element index
+  scalar_to_vec_t<VecType> convert; // declared outside this file; presumably builds a VecType from one scalar - confirm
+  // Stores the modulus; `convert` is default-constructed.
+  vec_gen_t(std::size_t n)
+      : n(n)
+  {}
+  // Tabulate functor: casts the index to Type, reduces it modulo n, then hands the result to `convert`.
+  template <typename CounterT>
+  __device__ VecType operator()(CounterT id)
+  {
+    return convert(static_cast<Type>(id) % n);
+  }
+};
+// Modulo gen() specialization for vector type VEC_TYPE: tabulates `data` through vec_gen_t<VEC_TYPE, SCALAR_TYPE>.
+#define VEC_GEN_MOD_SPECIALIZATION(VEC_TYPE, SCALAR_TYPE)                                          \
+  template <>                                                                                      \
+  void gen<VEC_TYPE>(modulo_t mod, thrust::device_vector<VEC_TYPE> & data)                         \
+  {                                                                                                \
+    thrust::tabulate(data.begin(), data.end(), vec_gen_t<VEC_TYPE, SCALAR_TYPE>{mod.get()});       \
+  }
+
+VEC_GEN_MOD_SPECIALIZATION(short2, short);
+
+VEC_GEN_MOD_SPECIALIZATION(uchar3, unsigned char);
+
+VEC_GEN_MOD_SPECIALIZATION(ulonglong4, unsigned long long);
+
+VEC_GEN_MOD_SPECIALIZATION(ushort4, unsigned short);
+
+} // c2h
+
diff --git a/include/cub/test/c2h/generators.cuh b/include/cub/test/c2h/generators.cuh
new file mode 100644
index 0000000..305af50
--- /dev/null
+++ b/include/cub/test/c2h/generators.cuh
@@ -0,0 +1,103 @@
+/******************************************************************************
+* Copyright (c) 2011-2022, NVIDIA CORPORATION.  All rights reserved.
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions are met:
+*     * Redistributions of source code must retain the above copyright
+*       notice, this list of conditions and the following disclaimer.
+*     * Redistributions in binary form must reproduce the above copyright
+*       notice, this list of conditions and the following disclaimer in the
+*       documentation and/or other materials provided with the distribution.
+*     * Neither the name of the NVIDIA CORPORATION nor the
+*       names of its contributors may be used to endorse or promote products
+*       derived from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+******************************************************************************/
+
+#pragma once
+
+#include <limits>
+
+#include <thrust/device_vector.h>
+
+#include <c2h/custom_type.cuh>
+
+namespace c2h
+{
+
+namespace detail
+{
+
+template <class T> // strong-typedef helper: wraps one T so distinct wrappers can select distinct overloads
+class value_wrapper_t
+{
+  T m_val{}; // the wrapped value
+  // Both constructors are explicit, so plain numbers never convert implicitly.
+public:
+  explicit value_wrapper_t(T val) : m_val(val) {}
+  explicit value_wrapper_t(int val) : m_val(static_cast<T>(val)) {} // lets int literals be used directly
+  T get() const { return m_val; } // returns a copy of the wrapped value
+};
+
+}
+
+class seed_t : public detail::value_wrapper_t<unsigned long long int> // strong type for an RNG seed
+{
+  using value_wrapper_t::value_wrapper_t; // inherit both explicit constructors
+};
+
+class modulo_t : public detail::value_wrapper_t<std::size_t> // strong type for a modulus
+{
+  using value_wrapper_t::value_wrapper_t; // inherit both explicit constructors
+};
+
+namespace detail
+{
+// Type-erased backend of the custom_type_t gen() overload: `data` is the array as raw bytes, `element_size` the size of one element.
+void gen(seed_t seed,
+         char* data,
+         c2h::custom_type_state_t min,
+         c2h::custom_type_state_t max,
+         std::size_t elements,
+         std::size_t element_size);
+
+}
+
+// Seeded fill for vectors of custom_type_t: erases the element type and
+// forwards to detail::gen, which works on raw bytes plus an element size.
+template <template <typename> class... Ps>
+void gen(
+  seed_t seed,
+  thrust::device_vector<c2h::custom_type_t<Ps...>> &data,
+  c2h::custom_type_t<Ps...> min = std::numeric_limits<c2h::custom_type_t<Ps...>>::lowest(),
+  c2h::custom_type_t<Ps...> max = std::numeric_limits<c2h::custom_type_t<Ps...>>::max())
+{
+  using item_t = c2h::custom_type_t<Ps...>;
+
+  item_t *d_items = thrust::raw_pointer_cast(data.data());
+  char *d_bytes   = reinterpret_cast<char *>(d_items);
+  detail::gen(seed, d_bytes, min, max, data.size(), sizeof(item_t));
+}
+
+template <typename T> // seeded fill; for floating-point T the default `min` is the smallest positive normal value
+void gen(seed_t seed,
+         thrust::device_vector<T> &data,
+         T min = std::numeric_limits<T>::min(),
+         T max = std::numeric_limits<T>::max());
+// Modulo fill: element i of `data` becomes i % mod.
+template <typename T>
+void gen(modulo_t mod, thrust::device_vector<T> &data);
+
+} // c2h
+
diff --git a/include/cub/test/catch2_runner.cu b/include/cub/test/catch2_runner.cu
new file mode 100644
index 0000000..f3bb503
--- /dev/null
+++ b/include/cub/test/catch2_runner.cu
@@ -0,0 +1,3 @@
+#define CUB_CONFIG_MAIN
+#include "catch2_test_helper.h"
+
diff --git a/include/cub/test/catch2_test_block_adjacent_difference.cu b/include/cub/test/catch2_test_block_adjacent_difference.cu
new file mode 100644
index 0000000..af5a0be
--- /dev/null
+++ b/include/cub/test/catch2_test_block_adjacent_difference.cu
@@ -0,0 +1,425 @@
+/******************************************************************************
+ * Copyright (c) 2011-2022, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#include <cub/block/block_adjacent_difference.cuh>
+
+#include <thrust/host_vector.h>
+
+// Has to go after all cub headers. Otherwise, this test won't catch unused
+// variables in cub kernels.
+#include "catch2_test_helper.h"
+
+// Test kernel: every thread loads ItemsPerThread consecutive items of `data`,
+// applies `action` to them, and stores the result back to the same positions.
+template <int ThreadsInBlock,
+          int ItemsPerThread,
+          class T,
+          class ActionT>
+__global__ void block_adj_diff_kernel(T *data, ActionT action, bool in_place)
+{
+  using block_adjacent_difference_t =
+    cub::BlockAdjacentDifference<T, ThreadsInBlock>;
+  using temp_storage_t = typename block_adjacent_difference_t::TempStorage;
+  __shared__ temp_storage_t temp_storage;
+
+  T thread_in[ItemsPerThread];
+  T thread_out[ItemsPerThread];
+
+  const int thread_offset = static_cast<int>(threadIdx.x) * ItemsPerThread;
+
+  for (int item = 0; item < ItemsPerThread; item++)
+  {
+    thread_in[item] = data[thread_offset + item];
+  }
+  __syncthreads();
+
+  block_adjacent_difference_t adj_diff(temp_storage);
+  if (in_place)
+  {
+    // In-place mode: the same array is passed as both input and output.
+    action(adj_diff, thread_in, thread_in);
+    for (int item = 0; item < ItemsPerThread; item++)
+    {
+      data[thread_offset + item] = thread_in[item];
+    }
+  }
+  else
+  {
+    // Out-of-place mode: results are produced in a separate per-thread array.
+    action(adj_diff, thread_in, thread_out);
+    for (int item = 0; item < ItemsPerThread; item++)
+    {
+      data[thread_offset + item] = thread_out[item];
+    }
+  }
+}
+
+// Stateless difference functor (lhs - rhs) callable on host and device.
+template <class T>
+struct custom_difference_t
+{
+  // const-qualified: callable on const objects and through const references.
+  __host__ __device__ T operator()(const T &lhs, const T &rhs) const
+  { return lhs - rhs; }
+};
+
+template <bool ReadLeft> // true: SubtractLeft, false: SubtractRight
+struct base_op_t // kernel action: three-argument Subtract call (input, output, difference functor)
+{
+  template <int ItemsPerThread, typename T, typename BlockAdjDiff>
+  __device__ void operator()(BlockAdjDiff &block_adj_diff,
+                             T (&input)[ItemsPerThread],
+                             T (&output)[ItemsPerThread]) const
+  {
+    if (ReadLeft) // template constant, so the same branch is taken on every call
+    {
+      block_adj_diff.SubtractLeft(input, output, custom_difference_t<T>{});
+    }
+    else
+    {
+      block_adj_diff.SubtractRight(input, output, custom_difference_t<T>{});
+    }
+  }
+};
+
+template <bool ReadLeft> // true: SubtractLeftPartialTile, false: SubtractRightPartialTile
+struct last_tile_op_t // kernel action that forwards a valid-item count to the PartialTile calls
+{
+  int m_valid_items{}; // passed as the last argument of the PartialTile calls
+
+  __host__ last_tile_op_t(int valid_items)
+    : m_valid_items(valid_items)
+  {}
+
+  template <int ITEMS_PER_THREAD,
+            typename T,
+            typename BlockAdjDiff>
+  __device__ void operator()(BlockAdjDiff &block_adj_diff,
+                             T (&input)[ITEMS_PER_THREAD],
+                             T (&output)[ITEMS_PER_THREAD]) const
+  {
+    custom_difference_t<T> diff{};
+
+    if (ReadLeft) // template constant, so the same branch is taken on every call
+    {
+      block_adj_diff.SubtractLeftPartialTile(input,
+                                             output,
+                                             diff,
+                                             m_valid_items);
+    }
+    else
+    {
+      block_adj_diff.SubtractRightPartialTile(input,
+                                              output,
+                                              diff,
+                                              m_valid_items);
+    }
+  }
+};
+
+template <class T, // T: element type, also the type of the stored neighbour value
+          bool ReadLeft>
+struct middle_tile_op_t // kernel action that passes a neighbouring tile value to SubtractLeft/SubtractRight
+{
+  T m_neighbour_tile_value; // passed as the last argument of the Subtract calls
+
+  __host__ middle_tile_op_t(T neighbour_tile_value)
+    : m_neighbour_tile_value(neighbour_tile_value)
+  {}
+
+  template <int ITEMS_PER_THREAD, typename BlockAdjDiff>
+  __device__ void operator()(BlockAdjDiff &block_adj_diff,
+                             T (&input)[ITEMS_PER_THREAD],
+                             T (&output)[ITEMS_PER_THREAD]) const
+  {
+    custom_difference_t<T> diff{};
+
+    if (ReadLeft) // template constant, so the same branch is taken on every call
+    {
+      block_adj_diff.SubtractLeft(input,
+                                  output,
+                                  diff,
+                                  m_neighbour_tile_value);
+    }
+    else
+    {
+      block_adj_diff.SubtractRight(input,
+                                   output,
+                                   diff,
+                                   m_neighbour_tile_value);
+    }
+  }
+};
+
+template <typename T> // T: element type, also the type of the stored neighbour value
+struct last_tile_with_pred_op_t // kernel action: SubtractLeftPartialTile with a valid-item count and a neighbour value
+{
+  int m_valid_items; // fourth argument of the SubtractLeftPartialTile call
+  T m_neighbour_tile_value; // fifth argument of the SubtractLeftPartialTile call
+
+  __host__ last_tile_with_pred_op_t(
+      int valid_items,
+      T neighbour_tile_value)
+    : m_valid_items(valid_items)
+    , m_neighbour_tile_value(neighbour_tile_value)
+  {
+  }
+
+  template <int ITEMS_PER_THREAD, typename BlockAdjDiff>
+  __device__ void operator()(BlockAdjDiff &block_adj_diff, // only the left-reading variant is exercised here
+                             T (&input)[ITEMS_PER_THREAD],
+                             T (&output)[ITEMS_PER_THREAD]) const
+  {
+    custom_difference_t<T> diff{};
+    block_adj_diff.SubtractLeftPartialTile(input,
+                                           output,
+                                           diff,
+                                           m_valid_items,
+                                           m_neighbour_tile_value);
+  }
+};
+
+template <int ItemsPerThread, int ThreadsInBlock, class T, class ActionT>
+void block_adj_diff(thrust::device_vector<T> &data, // host-side launcher for block_adj_diff_kernel
+                    bool in_place,
+                    ActionT action)
+{
+  block_adj_diff_kernel<ThreadsInBlock, ItemsPerThread, T, ActionT> // note: the kernel lists ThreadsInBlock first
+    <<<1, ThreadsInBlock>>>(thrust::raw_pointer_cast(data.data()), // a single block of ThreadsInBlock threads
+                            action,
+                            in_place);
+  // Check the launch first, then wait for the kernel and check its completion status.
+  REQUIRE(cudaSuccess == cudaPeekAtLastError());
+  REQUIRE(cudaSuccess == cudaDeviceSynchronize());
+}
+
+// Host reference: replaces each of the first valid_items elements with its
+// difference to the left (ReadLeft) or right neighbour inside that range.
+template <bool ReadLeft, class T>
+void host_adj_diff(thrust::host_vector<T> &h_data, int valid_items)
+{
+  custom_difference_t<T> subtract{};
+  if (!ReadLeft)
+  {
+    for (int pos = 0; pos + 1 < valid_items; ++pos) // right neighbour not yet overwritten
+    {
+      h_data[pos] = subtract(h_data[pos], h_data[pos + 1]);
+    }
+  }
+  else
+  {
+    for (int pos = valid_items - 1; pos >= 1; --pos) // left neighbour not yet overwritten
+    {
+      h_data[pos] = subtract(h_data[pos], h_data[pos - 1]);
+    }
+  }
+}
+
+template <bool ReadLeft, class T>
+void host_adj_diff(thrust::host_vector<T> &h_data,
+                   int valid_items,
+                   T neighbour_value)
+{
+  // Host reference with a neighbouring tile value: run the two-argument version,
+  // then subtract neighbour_value from the boundary element it left untouched.
+  host_adj_diff<ReadLeft>(h_data, valid_items);
+  if (valid_items == 0)
+  {
+    return; // no valid items, so there is no boundary element to adjust
+  }
+  custom_difference_t<T> subtract{};
+  if (!ReadLeft)
+  {
+    const int last = valid_items - 1;
+    h_data[last] = subtract(h_data[last], neighbour_value);
+  }
+  else
+  {
+    h_data[0] = subtract(h_data[0], neighbour_value);
+  }
+}
+
+// %PARAM% THREADS_IN_BLOCK bs 64:256
+
+using key_types =
+  c2h::type_list<std::uint16_t, std::int32_t, std::int64_t>;
+  // The lists below are passed to the CUB_TEST invocations in this file.
+using threads_in_block = c2h::enum_type_list<int, THREADS_IN_BLOCK>; // NOTE(review): THREADS_IN_BLOCK is not defined in this file - presumably supplied via the %PARAM% line; confirm
+using items_per_thread = c2h::enum_type_list<int, 1, 2, 10, 15>;
+using directions = c2h::enum_type_list<bool, false, true>; // read_left values: false and true
+using left_only = c2h::enum_type_list<bool, true>; // read_left value: true only
+
+template <class TestType>
+struct params_t // unpacks TestType; indices follow the order of the lists given to CUB_TEST
+{
+  using key_t = typename c2h::get<0, TestType>;
+  // Entries 1..3 are compile-time values read through ::value.
+  static constexpr int items_per_thread = c2h::get<1, TestType>::value;
+  static constexpr int threads_in_block = c2h::get<2, TestType>::value;
+  static constexpr int tile_size = items_per_thread * threads_in_block; // items handled by the single block
+  static constexpr bool read_left = c2h::get<3, TestType>::value;
+};
+
+CUB_TEST("Block adjacent difference works with full tiles",
+         "[adjacent difference][block]",
+         key_types,
+         items_per_thread,
+         threads_in_block,
+         directions)
+{
+  using params = params_t<TestType>;
+  using key_t = typename params::key_t;
+  // One full tile of generated keys on the device.
+  thrust::device_vector<key_t> d_data(params::tile_size);
+  c2h::gen(CUB_SEED(10), d_data);
+
+  const bool in_place = GENERATE(false, true); // covers both kernel paths
+  // Expected result, computed on a host copy taken before the kernel runs.
+  thrust::host_vector<key_t> h_data = d_data;
+  host_adj_diff<params::read_left>(h_data, params::tile_size);
+  // Device result; the kernel writes it back into d_data.
+  block_adj_diff<params::items_per_thread, params::threads_in_block>(
+    d_data,
+    in_place,
+    base_op_t<params::read_left>{});
+
+  REQUIRE(h_data == d_data);
+}
+
+CUB_TEST("Block adjacent difference works with last tiles",
+         "[adjacent difference][block]",
+         key_types,
+         items_per_thread,
+         threads_in_block,
+         directions)
+{
+  using params = params_t<TestType>;
+  using key_t = typename params::key_t;
+  // The device buffer always holds a full tile; valid_items limits what is processed.
+  thrust::device_vector<key_t> d_data(params::tile_size);
+  c2h::gen(CUB_SEED(10), d_data);
+
+  const bool in_place = GENERATE(false, true); // covers both kernel paths
+  const int valid_items = GENERATE_COPY(take(10, random(0, params::tile_size))); // ten values from random(0, tile_size)
+  // Expected result, computed on a host copy taken before the kernel runs.
+  thrust::host_vector<key_t> h_data = d_data;
+  host_adj_diff<params::read_left>(h_data, valid_items);
+  // Device result; the kernel writes it back into d_data.
+  block_adj_diff<params::items_per_thread, params::threads_in_block>(
+    d_data,
+    in_place,
+    last_tile_op_t<params::read_left>{valid_items});
+
+  REQUIRE(h_data == d_data);
+}
+
+CUB_TEST("Block adjacent difference works with single tiles",
+         "[adjacent difference][block]",
+         key_types,
+         items_per_thread,
+         threads_in_block,
+         left_only)
+{
+  using params = params_t<TestType>;
+  using key_t = typename params::key_t;
+  // The device buffer always holds a full tile; valid_items limits what is processed.
+  thrust::device_vector<key_t> d_data(params::tile_size);
+  c2h::gen(CUB_SEED(10), d_data);
+
+  const bool in_place = GENERATE(false, true); // covers both kernel paths
+  const int valid_items = GENERATE_COPY(take(10, random(0, params::tile_size))); // ten values from random(0, tile_size)
+  constexpr bool read_left = true; // last_tile_with_pred_op_t only calls the left variant
+
+  thrust::host_vector<key_t> h_data = d_data;
+  key_t neighbour_value = h_data[h_data.size() / 2]; // middle element reused as the neighbour value
+  // Expected result on the host copy.
+  host_adj_diff<read_left>(h_data, valid_items, neighbour_value);
+  // Device result; the kernel writes it back into d_data.
+  block_adj_diff<params::items_per_thread, params::threads_in_block>(
+    d_data,
+    in_place,
+    last_tile_with_pred_op_t<key_t>{valid_items, neighbour_value});
+
+  REQUIRE(h_data == d_data);
+}
+
+CUB_TEST("Block adjacent difference works with middle tiles",
+         "[adjacent difference][block]",
+         key_types,
+         items_per_thread,
+         threads_in_block,
+         directions)
+{
+  using params = params_t<TestType>;
+  using key_t = typename params::key_t;
+  // One full tile of generated keys on the device.
+  thrust::device_vector<key_t> d_data(params::tile_size);
+  c2h::gen(CUB_SEED(10), d_data);
+
+  const bool in_place = GENERATE(false, true); // covers both kernel paths
+
+  thrust::host_vector<key_t> h_data = d_data;
+  key_t neighbour_value = h_data[h_data.size() / 2]; // middle element reused as the neighbour value
+  // Expected result on the host copy.
+  host_adj_diff<params::read_left>(h_data, params::tile_size, neighbour_value);
+  // Device result; the kernel writes it back into d_data.
+  block_adj_diff<params::items_per_thread, params::threads_in_block>(
+    d_data,
+    in_place,
+    middle_tile_op_t<key_t, params::read_left>{neighbour_value});
+
+  REQUIRE(h_data == d_data);
+}
+
+CUB_TEST("Block adjacent difference supports custom types",
+         "[adjacent difference][block]",
+         threads_in_block)
+{
+  using key_t = c2h::custom_type_t<c2h::equal_comparable_t,
+                                   c2h::subtractable_t>;
+  // Only the block size varies here; the remaining parameters are fixed.
+  constexpr int items_per_thread = 2;
+  constexpr int threads_in_block = c2h::get<0, TestType>::value;
+  constexpr int tile_size = threads_in_block * items_per_thread;
+  constexpr bool read_left = true;
+  const bool in_place = true;
+
+  thrust::device_vector<key_t> d_data(tile_size);
+  c2h::gen(CUB_SEED(10), d_data);
+  // Expected result, computed on a host copy taken before the kernel runs.
+  thrust::host_vector<key_t> h_data = d_data;
+  host_adj_diff<read_left>(h_data, tile_size);
+  // Device result; the kernel writes it back into d_data.
+  block_adj_diff<items_per_thread, threads_in_block>(d_data,
+                                                     in_place,
+                                                     base_op_t<read_left>{});
+
+  REQUIRE(h_data == d_data);
+}
+
+// TODO Test different input/output types
+
diff --git a/include/cub/test/catch2_test_block_histogram.cu b/include/cub/test/catch2_test_block_histogram.cu
new file mode 100644
index 0000000..b9f13a5
--- /dev/null
+++ b/include/cub/test/catch2_test_block_histogram.cu
@@ -0,0 +1,216 @@
+/******************************************************************************
+* Copyright (c) 2011-2022, NVIDIA CORPORATION.  All rights reserved.
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions are met:
+*     * Redistributions of source code must retain the above copyright
+*       notice, this list of conditions and the following disclaimer.
+*     * Redistributions in binary form must reproduce the above copyright
+*       notice, this list of conditions and the following disclaimer in the
+*       documentation and/or other materials provided with the distribution.
+*     * Neither the name of the NVIDIA CORPORATION nor the
+*       names of its contributors may be used to endorse or promote products
+*       derived from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+******************************************************************************/
+
+/******************************************************************************
+ * Test of BlockHistogram utilities
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <limits>
+#include <string>
+
+#include <cub/block/block_histogram.cuh>
+
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+
+// Has to go after all cub headers. Otherwise, this test won't catch unused
+// variables in cub kernels.
+#include "catch2_test_helper.h"
+
+template <int BINS, // note: BINS comes first here but fourth in cub::BlockHistogram
+          int BLOCK_THREADS,
+          int ITEMS_PER_THREAD,
+          cub::BlockHistogramAlgorithm ALGORITHM,
+          typename T,
+          typename HistoCounter>
+__global__ void block_histogram_kernel(T *d_samples, HistoCounter *d_histogram)
+{
+  // Specialize BlockHistogram for this block size, item count, bin count and algorithm
+  using block_histogram_t =
+    cub::BlockHistogram<T, BLOCK_THREADS, ITEMS_PER_THREAD, BINS, ALGORITHM>;
+
+  // Temporary storage for BlockHistogram, placed in shared memory
+  __shared__ typename block_histogram_t::TempStorage temp_storage;
+
+  // Per-thread samples, read from d_samples with LoadDirectStriped
+  T data[ITEMS_PER_THREAD];
+  cub::LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_samples, data);
+
+  // Compute the histogram, writing the counts straight into the caller's d_histogram buffer
+  block_histogram_t(temp_storage).Histogram(data, d_histogram);
+}
+
+template <int ItemsPerThread,
+          int ThreadsInBlock,
+          int Bins,
+          cub::BlockHistogramAlgorithm Algorithm,
+          typename SampleT>
+void block_histogram(thrust::device_vector<SampleT> &d_samples, // host-side launcher for block_histogram_kernel
+                     thrust::device_vector<int> &d_histogram)
+{
+  block_histogram_kernel<Bins, ThreadsInBlock, ItemsPerThread, Algorithm> // T and HistoCounter are deduced from the pointers
+    <<<1, ThreadsInBlock>>>(thrust::raw_pointer_cast(d_samples.data()), // a single block of ThreadsInBlock threads
+                            thrust::raw_pointer_cast(d_histogram.data()));
+  // Check the launch first, then wait for the kernel and check its completion status.
+  REQUIRE(cudaSuccess == cudaPeekAtLastError());
+  REQUIRE(cudaSuccess == cudaDeviceSynchronize());
+}
+
+// %PARAM% TEST_BINS bins 32:256:1024
+
+using types = c2h::type_list<std::uint8_t, std::uint16_t>; // sample types under test
+using threads_in_block = c2h::enum_type_list<int, 32, 96, 128>;
+using items_per_thread = c2h::enum_type_list<int, 1, 5>;
+using bins             = c2h::enum_type_list<int, TEST_BINS>; // NOTE(review): TEST_BINS is not defined in this file - presumably supplied via the %PARAM% line; confirm
+using algorithms = c2h::enum_type_list<cub::BlockHistogramAlgorithm,
+                                       cub::BLOCK_HISTO_SORT,
+                                       cub::BLOCK_HISTO_ATOMIC>;
+
+template <class TestType>
+struct params_t // unpacks TestType; indices follow the order of the lists given to CUB_TEST
+{
+  using sample_t = typename c2h::get<0, TestType>;
+  // Entries 1..4 are compile-time values read through ::value.
+  static constexpr int items_per_thread = c2h::get<1, TestType>::value;
+  static constexpr int threads_in_block = c2h::get<2, TestType>::value;
+  static constexpr int bins = c2h::get<3, TestType>::value;
+  static constexpr int num_samples = threads_in_block * items_per_thread; // samples handled by the single block
+  static constexpr cub::BlockHistogramAlgorithm algorithm =
+    c2h::get<4, TestType>::value;
+};
+
+CUB_TEST("Block histogram can be computed with uniform input",
+         "[histogram][block]",
+         types,
+         items_per_thread,
+         threads_in_block,
+         bins,
+         algorithms)
+{
+  using params   = params_t<TestType>;
+  using sample_t = typename params::sample_t;
+  // Every sample holds the same value, so a single bin receives all samples.
+  const sample_t uniform_value =
+    static_cast<sample_t>(GENERATE_COPY(take(10, random(0, params::bins - 1)))); // the int is narrowed to sample_t
+  // Expected histogram: num_samples in the bin of uniform_value, zero elsewhere.
+  thrust::host_vector<sample_t> h_samples(params::num_samples, uniform_value);
+  thrust::host_vector<int> h_reference(params::bins);
+  h_reference[static_cast<std::size_t>(uniform_value)] = params::num_samples;
+
+  // Device copies of the samples and a histogram buffer with one counter per bin
+  thrust::device_vector<sample_t> d_samples = h_samples;
+  thrust::device_vector<int> d_histogram(params::bins);
+
+  // Run kernel
+  block_histogram<params::items_per_thread,
+                  params::threads_in_block,
+                  params::bins,
+                  params::algorithm>(d_samples, d_histogram);
+
+  REQUIRE(h_reference == d_histogram);
+}
+
+// Host reference histogram: counts[s] is the number of samples equal to s.
+template <typename SampleT>
+thrust::host_vector<int>
+compute_host_reference(int bins, const thrust::host_vector<SampleT> &h_samples)
+{
+  thrust::host_vector<int> counts(bins);
+  for (std::size_t i = 0; i < h_samples.size(); ++i)
+  {
+    counts[h_samples[i]] += 1;
+  }
+  return counts;
+}
+
+CUB_TEST("Block histogram can be computed with modulo input",
+         "[histogram][block]",
+         types,
+         items_per_thread,
+         threads_in_block,
+         bins,
+         algorithms)
+{
+  using params   = params_t<TestType>;
+  using sample_t = typename params::sample_t;
+
+  // Histogram buffer with one counter per bin, and the sample buffer
+  thrust::device_vector<int> d_histogram(params::bins);
+  thrust::device_vector<sample_t> d_samples(params::num_samples);
+  // Samples come from the modulo_t overload of c2h::gen, constructed from the bin count.
+  c2h::gen(c2h::modulo_t{params::bins}, d_samples);
+  // Expected histogram, computed on a host copy of the samples.
+  thrust::host_vector<sample_t> h_samples = d_samples;
+  auto h_reference = compute_host_reference(params::bins, h_samples);
+
+  // Run kernel
+  block_histogram<params::items_per_thread,
+                  params::threads_in_block,
+                  params::bins,
+                  params::algorithm>(d_samples, d_histogram);
+
+  REQUIRE(h_reference == d_histogram);
+}
+
+CUB_TEST("Block histogram can be computed with random input",
+         "[histogram][block]",
+         types,
+         items_per_thread,
+         threads_in_block,
+         bins,
+         algorithms)
+{
+  using params   = params_t<TestType>;
+  using sample_t = typename params::sample_t;
+
+  // Histogram buffer with one counter per bin, and the sample buffer
+  thrust::device_vector<int> d_histogram(params::bins);
+  thrust::device_vector<sample_t> d_samples(params::num_samples);
+
+  const sample_t min_bin = static_cast<sample_t>(0);
+  const sample_t max_bin = // the smaller of sample_t's maximum and the last bin index
+    static_cast<sample_t>(
+      std::min(static_cast<std::int32_t>(std::numeric_limits<sample_t>::max()),
+               static_cast<std::int32_t>(params::bins - 1)));
+  // Seeded generation with min_bin and max_bin passed as the bounds.
+  c2h::gen(CUB_SEED(10), d_samples, min_bin, max_bin);
+  // Expected histogram, computed on a host copy of the samples.
+  thrust::host_vector<sample_t> h_samples = d_samples;
+  auto h_reference = compute_host_reference(params::bins, h_samples);
+
+  // Run kernel
+  block_histogram<params::items_per_thread,
+                  params::threads_in_block,
+                  params::bins,
+                  params::algorithm>(d_samples, d_histogram);
+
+  REQUIRE(h_reference == d_histogram);
+}
+
diff --git a/include/cub/test/catch2_test_block_load.cu b/include/cub/test/catch2_test_block_load.cu
new file mode 100644
index 0000000..7bd0e5b
--- /dev/null
+++ b/include/cub/test/catch2_test_block_load.cu
@@ -0,0 +1,326 @@
+/******************************************************************************
+ * Copyright (c) 2011-2022, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#include <cub/block/block_load.cuh>
+#include <cub/iterator/cache_modified_input_iterator.cuh>
+#include <cub/util_allocator.cuh>
+
+// Has to go after all cub headers. Otherwise, this test won't catch unused
+// variables in cub kernels.
+#include "catch2_test_helper.h"
+
+template <int ItemsPerThread,
+          int ThreadsInBlock,
+          cub::BlockLoadAlgorithm LoadAlgorithm>
+struct output_idx // primary template: used for every algorithm except BLOCK_LOAD_STRIPED
+{
+  static __device__ int get(int item) // index of this thread's item-th element
+  {
+    return static_cast<int>(threadIdx.x) * ItemsPerThread + item; // each thread owns ItemsPerThread consecutive indices
+  }
+};
+
+template <int ItemsPerThread, int ThreadsInBlock>
+struct output_idx<ItemsPerThread, // partial specialization for BLOCK_LOAD_STRIPED
+                  ThreadsInBlock,
+                  cub::BlockLoadAlgorithm::BLOCK_LOAD_STRIPED>
+{
+  static __device__ int get(int item) // index of this thread's item-th element
+  {
+    return static_cast<int>(threadIdx.x) + ThreadsInBlock * item; // a thread's successive items are ThreadsInBlock apart
+  }
+};
+
+template <typename InputIteratorT,
+          typename OutputIteratorT,
+          int ItemsPerThread,
+          int ThreadsInBlock,
+          cub::BlockLoadAlgorithm LoadAlgorithm>
+__global__ void kernel(std::integral_constant<bool, true>, // overload selected when block_load() finds sufficient_resources true
+                       InputIteratorT input,
+                       OutputIteratorT output,
+                       int num_items)
+{
+  using input_t      = cub::detail::value_t<InputIteratorT>;
+  using block_load_t = cub::BlockLoad<input_t,
+                                      ThreadsInBlock,
+                                      ItemsPerThread,
+                                      LoadAlgorithm>;
+  using storage_t =
+    typename block_load_t::TempStorage;
+  // Temporary storage for BlockLoad, placed in shared memory.
+  __shared__ storage_t storage;
+  block_load_t block_load(storage);
+
+  input_t data[ItemsPerThread];
+  // A full tile uses the unguarded Load; otherwise num_items is passed to the guarded overload.
+  if (ItemsPerThread * ThreadsInBlock == num_items)
+  {
+    block_load.Load(input, data);
+  }
+  else
+  {
+    block_load.Load(input, data, num_items);
+  }
+  // Write each loaded item to the index output_idx computes for it.
+  for (int i = 0; i < ItemsPerThread; i++)
+  {
+    const int idx =
+      output_idx<ItemsPerThread, ThreadsInBlock, LoadAlgorithm>::get(i);
+
+    if (idx < num_items) // skip indices at or beyond num_items
+    {
+      output[idx] = data[i];
+    }
+  }
+}
+
+template <typename InputIteratorT,
+          typename OutputIteratorT,
+          int ItemsPerThread,
+          int ThreadsInBlock,
+          cub::BlockLoadAlgorithm /* LoadAlgorithm */>
+__global__ void kernel(std::integral_constant<bool, false>, // overload selected when block_load() finds sufficient_resources false
+                       InputIteratorT input,
+                       OutputIteratorT output,
+                       int num_items)
+{
+  for (int i = 0; i < ItemsPerThread; i++)
+  {
+    const int idx =
+      output_idx<ItemsPerThread,
+                 ThreadsInBlock,
+                 cub::BlockLoadAlgorithm::BLOCK_LOAD_DIRECT>::get(i);
+    // Plain element-wise copy; cub::BlockLoad is not used in this overload.
+    if (idx < num_items)
+    {
+      output[idx] = input[idx];
+    }
+  }
+}
+
+template <int ItemsPerThread,
+          int ThreadsInBlock,
+          cub::BlockLoadAlgorithm LoadAlgorithm,
+          typename InputIteratorT,
+          typename OutputIteratorT>
+void block_load( // host-side launcher: picks one of the two kernel overloads at compile time
+    InputIteratorT input,
+    OutputIteratorT output,
+    int num_items)
+{
+  using input_t = cub::detail::value_t<InputIteratorT>;
+  using block_load_t = cub::BlockLoad<input_t,
+                                      ThreadsInBlock,
+                                      ItemsPerThread,
+                                      LoadAlgorithm>;
+  using storage_t = typename block_load_t::TempStorage;
+  constexpr bool sufficient_resources = sizeof(storage_t) <= 1024 * 48; // 48 KiB; NOTE(review): presumably the static shared-memory limit per block - confirm
+  // The integral_constant tag argument selects the BlockLoad kernel or the plain-copy fallback.
+  kernel<InputIteratorT,
+         OutputIteratorT,
+         ItemsPerThread,
+         ThreadsInBlock,
+         LoadAlgorithm><<<1, ThreadsInBlock>>>(
+             std::integral_constant<bool, sufficient_resources>{},
+             input, output, num_items);
+  // Check the launch first, then wait for the kernel and check its completion status.
+  REQUIRE( cudaSuccess == cudaPeekAtLastError() );
+  REQUIRE( cudaSuccess == cudaDeviceSynchronize() );
+}
+
+
+// %PARAM% IPT it 1:11
+
+using types = c2h::type_list<std::uint8_t, std::int32_t, std::int64_t>;
+using vec_types = c2h::type_list<long2, double2>;
+  // Block-size lists used by the tests below.
+using even_threads_in_block = c2h::enum_type_list<int, 32, 128>;
+using odd_threads_in_block = c2h::enum_type_list<int, 15, 65>;
+using a_block_size = c2h::enum_type_list<int, 256>;
+
+using items_per_thread = c2h::enum_type_list<int, IPT>; // NOTE(review): IPT is not defined in this file - presumably supplied via the %PARAM% line; confirm
+using load_algorithm   = c2h::enum_type_list<
+  cub::BlockLoadAlgorithm,
+  cub::BlockLoadAlgorithm::BLOCK_LOAD_DIRECT,
+  cub::BlockLoadAlgorithm::BLOCK_LOAD_STRIPED,
+  cub::BlockLoadAlgorithm::BLOCK_LOAD_VECTORIZE,
+  cub::BlockLoadAlgorithm::BLOCK_LOAD_TRANSPOSE,
+  cub::BlockLoadAlgorithm::BLOCK_LOAD_WARP_TRANSPOSE, // the two WARP_TRANSPOSE entries are absent from odd_load_algorithm
+  cub::BlockLoadAlgorithm::BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED>;
+  // Algorithm list paired with odd_threads_in_block in the odd-size test.
+using odd_load_algorithm = c2h::enum_type_list<
+  cub::BlockLoadAlgorithm,
+  cub::BlockLoadAlgorithm::BLOCK_LOAD_DIRECT,
+  cub::BlockLoadAlgorithm::BLOCK_LOAD_STRIPED,
+  cub::BlockLoadAlgorithm::BLOCK_LOAD_VECTORIZE,
+  cub::BlockLoadAlgorithm::BLOCK_LOAD_TRANSPOSE>;
+
+template <class TestType>
+struct params_t // unpacks TestType; indices follow the order of the lists given to CUB_TEST
+{
+  using type = typename c2h::get<0, TestType>;
+  // Entries 1..3 are compile-time values read through ::value.
+  static constexpr int items_per_thread = c2h::get<1, TestType>::value;
+  static constexpr int threads_in_block = c2h::get<2, TestType>::value;
+  static constexpr int tile_size = items_per_thread * threads_in_block; // items handled by the single block
+  static constexpr cub::BlockLoadAlgorithm load_algorithm = // algorithm under test
+    c2h::get<3, TestType>::value;
+};
+
+CUB_TEST("Block load works with even block sizes",
+         "[load][block]",
+         types,
+         items_per_thread,
+         even_threads_in_block,
+         load_algorithm)
+{
+  using params = params_t<TestType>;
+  using type = typename params::type;
+  // Input length is one of ten values from random(0, tile_size).
+  thrust::device_vector<type> d_input(
+    GENERATE_COPY(take(10, random(0, params::tile_size))));
+  c2h::gen(CUB_SEED(10), d_input);
+
+  thrust::device_vector<type> d_output(d_input.size());
+  // The kernel copies input to output through BlockLoad; both must match afterwards.
+  block_load<params::items_per_thread,
+             params::threads_in_block,
+             params::load_algorithm>(thrust::raw_pointer_cast(d_input.data()),
+                                     thrust::raw_pointer_cast(d_output.data()),
+                                     static_cast<int>(d_input.size()));
+
+  REQUIRE( d_input == d_output );
+}
+
+CUB_TEST("Block load works with odd block sizes",
+         "[load][block]",
+         types,
+         items_per_thread,
+         odd_threads_in_block,
+         odd_load_algorithm)
+{
+  using params = params_t<TestType>;
+  using type = typename params::type;
+  // Input length is one of ten values from random(0, tile_size).
+  thrust::device_vector<type> d_input(
+    GENERATE_COPY(take(10, random(0, params::tile_size))));
+  c2h::gen(CUB_SEED(10), d_input);
+
+  thrust::device_vector<type> d_output(d_input.size());
+  // The kernel copies input to output through BlockLoad; both must match afterwards.
+  block_load<params::items_per_thread,
+             params::threads_in_block,
+             params::load_algorithm>(thrust::raw_pointer_cast(d_input.data()),
+                                     thrust::raw_pointer_cast(d_output.data()),
+                                     static_cast<int>(d_input.size()));
+
+  REQUIRE( d_input == d_output );
+}
+
+CUB_TEST("Block load works with vector types",
+         "[load][block]",
+         vec_types,
+         items_per_thread,
+         a_block_size,
+         load_algorithm)
+{
+  using params = params_t<TestType>;
+  using type = typename params::type;
+  // Input length is one of ten values from random(0, tile_size).
+  thrust::device_vector<type> d_input(
+    GENERATE_COPY(take(10, random(0, params::tile_size))));
+  c2h::gen(CUB_SEED(10), d_input);
+
+  thrust::device_vector<type> d_output(d_input.size());
+  // The kernel copies input to output through BlockLoad; both must match afterwards.
+  block_load<params::items_per_thread,
+             params::threads_in_block,
+             params::load_algorithm>(thrust::raw_pointer_cast(d_input.data()),
+                                     thrust::raw_pointer_cast(d_output.data()),
+                                     static_cast<int>(d_input.size()));
+
+  REQUIRE( d_input == d_output );
+}
+
+CUB_TEST("Block load works with custom types",
+         "[load][block]",
+         items_per_thread,
+         load_algorithm)
+{
+  using type = c2h::custom_type_t<c2h::equal_comparable_t>;
+  constexpr int items_per_thread = c2h::get<0, TestType>::value;
+  constexpr int threads_in_block = 64; // block size is fixed in this test
+  constexpr int tile_size = items_per_thread * threads_in_block;
+  static constexpr cub::BlockLoadAlgorithm load_algorithm = // algorithm under test
+    c2h::get<1, TestType>::value;
+  // Input length is one of ten values from random(0, tile_size).
+  thrust::device_vector<type> d_input(
+    GENERATE_COPY(take(10, random(0, tile_size))));
+  c2h::gen(CUB_SEED(10), d_input);
+
+  thrust::device_vector<type> d_output(d_input.size());
+  // The kernel copies input to output through BlockLoad; both must match afterwards.
+  block_load<items_per_thread,
+             threads_in_block,
+             load_algorithm>(thrust::raw_pointer_cast(d_input.data()),
+                             thrust::raw_pointer_cast(d_output.data()),
+                             static_cast<int>(d_input.size()));
+
+  REQUIRE( d_input == d_output );
+}
+
+CUB_TEST("Block load works with caching iterators",
+         "[load][block]",
+         items_per_thread,
+         load_algorithm)
+{
+  using type = int;
+  constexpr int items_per_thread = c2h::get<0, TestType>::value;
+  constexpr int threads_in_block = 64; // block size is fixed in this test
+  constexpr int tile_size = items_per_thread * threads_in_block;
+  static constexpr cub::BlockLoadAlgorithm load_algorithm = // algorithm under test
+    c2h::get<1, TestType>::value;
+  // Input length is one of ten values from random(0, tile_size).
+  thrust::device_vector<type> d_input(
+    GENERATE_COPY(take(10, random(0, tile_size))));
+  c2h::gen(CUB_SEED(10), d_input);
+  // Wrap the raw input pointer so the kernel reads through a CacheModifiedInputIterator.
+  cub::CacheModifiedInputIterator<cub::CacheLoadModifier::LOAD_DEFAULT, type> in(
+    thrust::raw_pointer_cast(d_input.data()));
+
+  thrust::device_vector<type> d_output(d_input.size());
+  // The kernel copies input to output through BlockLoad; both must match afterwards.
+  block_load<items_per_thread,
+             threads_in_block,
+             load_algorithm>(in,
+                             thrust::raw_pointer_cast(d_output.data()),
+                             static_cast<int>(d_input.size()));
+
+  REQUIRE( d_input == d_output );
+}
+
diff --git a/include/cub/test/catch2_test_block_merge_sort.cu b/include/cub/test/catch2_test_block_merge_sort.cu
new file mode 100644
index 0000000..ab61a39
--- /dev/null
+++ b/include/cub/test/catch2_test_block_merge_sort.cu
@@ -0,0 +1,520 @@
+/******************************************************************************
+ * Copyright (c) 2011-2022, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Test of BlockMergeSort utilities
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <thrust/host_vector.h>
+#include <thrust/sort.h>
+
+#include <algorithm>
+
+#include <cub/block/block_merge_sort.cuh>
+
+// Has to go after all cub headers. Otherwise, this test won't catch unused
+// variables in cub kernels.
+#include "catch2_test_helper.h"
+
+// Host/device comparator that forwards to operator<. The call operator is
+// const so the functor can also be invoked through const objects/references.
+struct CustomLess
+{
+  template <typename DataType>
+  __device__ __host__ bool operator()(const DataType &lhs, const DataType &rhs) const
+  { return lhs < rhs; }
+};
+
+template <int ThreadsInBlock,
+          int ItemsPerThread,
+          class KeyT,
+          class ActionT>
+__global__ void block_merge_sort_kernel(
+    KeyT *data,
+    int valid_items,
+    KeyT oob_default,
+    ActionT action)
+{
+  // Keys-only test kernel. Indexing uses threadIdx.x only, so every block of a
+  // launch would work on the same range of `data`.
+  //   data        - keys, overwritten in place; thread t owns ItemsPerThread consecutive slots
+  //   valid_items - number of leading slots of `data` that are read and written back
+  //   oob_default - passed through to `action` unchanged
+  //   action      - invoked as action(sort, items, valid_items, oob_default)
+  using block_sort_t = cub::BlockMergeSort<KeyT, ThreadsInBlock, ItemsPerThread>;
+
+  __shared__ typename block_sort_t::TempStorage temp_storage;
+
+  const int first_idx = static_cast<int>(threadIdx.x) * ItemsPerThread;
+
+  // Slots at or beyond valid_items start out as value-initialized keys.
+  KeyT items[ItemsPerThread];
+  for (int i = 0; i < ItemsPerThread; ++i)
+  {
+    const int src = first_idx + i;
+    if (src < valid_items) { items[i] = data[src]; }
+    else                   { items[i] = KeyT(); }
+  }
+  __syncthreads();
+
+  block_sort_t sorter(temp_storage);
+  action(sorter, items, valid_items, oob_default);
+
+  // Write back the valid slots only. Indices grow with i, so stopping at the
+  // first out-of-range slot is equivalent to skipping each one individually.
+  for (int i = 0; i < ItemsPerThread && first_idx + i < valid_items; ++i)
+  {
+    data[first_idx + i] = items[i];
+  }
+}
+
+template <int ThreadsInBlock,
+          int ItemsPerThread,
+          class KeyT,
+          class ValueT,
+          class ActionT>
+__global__ void block_merge_sort_kernel(
+    KeyT *keys,
+    ValueT *vals,
+    int valid_items,
+    KeyT oob_default,
+    ActionT action)
+{
+  // Key/value test kernel. Indexing uses threadIdx.x only, so every block of a
+  // launch would work on the same range of `keys` / `vals`.
+  //   keys, vals  - overwritten in place; thread t owns ItemsPerThread consecutive slots
+  //   valid_items - number of leading slots that are read and written back
+  //   oob_default - passed through to `action` unchanged
+  using block_sort_t =
+    cub::BlockMergeSort<KeyT, ThreadsInBlock, ItemsPerThread, ValueT>;
+
+  __shared__ typename block_sort_t::TempStorage temp_storage;
+
+  const int first_idx = static_cast<int>(threadIdx.x) * ItemsPerThread;
+
+  // Slots at or beyond valid_items start out value-initialized.
+  KeyT key_items[ItemsPerThread];
+  ValueT val_items[ItemsPerThread];
+  for (int i = 0; i < ItemsPerThread; ++i)
+  {
+    const int src       = first_idx + i;
+    const bool in_range = src < valid_items;
+    key_items[i] = in_range ? keys[src] : KeyT{};
+    val_items[i] = in_range ? vals[src] : ValueT{};
+  }
+  __syncthreads();
+
+  block_sort_t sorter(temp_storage);
+  action(sorter, key_items, val_items, valid_items, oob_default);
+
+  // Write back the valid slots only. Indices grow with i, so stopping at the
+  // first out-of-range slot is equivalent to skipping each one individually.
+  for (int i = 0; i < ItemsPerThread && first_idx + i < valid_items; ++i)
+  {
+    keys[first_idx + i] = key_items[i];
+    vals[first_idx + i] = val_items[i];
+  }
+}
+
+struct stable_sort_keys_partial_tile_t
+{
+  // Keys-only StableSort call that forwards the valid_items / oob_default arguments.
+  template <class BlockMergeSortT, class KeyT, class DefaultT>
+  __device__ void operator()(BlockMergeSortT &sort, KeyT &thread_data,
+                             int valid_items, DefaultT oob_default) const
+  {
+    CustomLess less{};
+    sort.StableSort(thread_data, less, valid_items, oob_default);
+  }
+};
+
+struct stable_sort_pairs_partial_tile_t
+{
+  // Key/value StableSort call that forwards the valid_items / oob_default
+  // arguments. KeyT and ValueT are deduced from whatever the kernel passes in
+  // as its per-thread key and value storage.
+  template <class BlockMergeSortT, class KeyT, class ValueT, class DefaultT>
+  __device__ void operator()(BlockMergeSortT &sort,
+                             KeyT &thread_keys,
+                             ValueT &thread_vals,
+                             int valid_items,
+                             DefaultT oob_default) const
+  {
+    CustomLess less{};
+    sort.StableSort(thread_keys, thread_vals, less, valid_items, oob_default);
+  }
+};
+
+struct stable_sort_pairs_full_tile_t
+{
+  // Key/value StableSort call without guard arguments; the two trailing
+  // parameters are accepted but ignored so the call shape matches the partial-tile functor.
+  template <class BlockMergeSortT, class KeyT, class ValueT, class DefaultT>
+  __device__ void operator()(BlockMergeSortT &sort, KeyT &thread_keys, ValueT &thread_vals,
+                             int /* valid_items */, DefaultT /* oob_default */) const
+  {
+    CustomLess less{};
+    sort.StableSort(thread_keys, thread_vals, less);
+  }
+};
+
+struct stable_sort_keys_full_tile_t
+{
+  // Keys-only StableSort call without guard arguments; the trailing parameters are ignored.
+  template <class BlockMergeSortT, class KeyT, class DefaultT>
+  __device__ void operator()(BlockMergeSortT &sort, KeyT &thread_keys,
+                             int /* valid_items */, DefaultT /* oob_default */) const
+  {
+    CustomLess less{};
+    sort.StableSort(thread_keys, less);
+  }
+};
+
+template <int ItemsPerThread,
+          int ThreadsInBlock,
+          class KeyT,
+          class ActionT>
+void block_merge_sort(thrust::device_vector<KeyT> &keys, ActionT action)
+{
+  // Runs the keys-only kernel as a single block over `keys`, in place. Note that
+  // the kernel takes its two integer template arguments in the opposite order.
+  KeyT *d_keys        = thrust::raw_pointer_cast(keys.data());
+  const int num_items = static_cast<int>(keys.size());
+  const KeyT oob_key  = std::numeric_limits<KeyT>::max();
+
+  block_merge_sort_kernel<ThreadsInBlock, ItemsPerThread>
+    <<<1, ThreadsInBlock>>>(d_keys, num_items, oob_key, action);
+
+  REQUIRE( cudaSuccess == cudaPeekAtLastError() );
+  REQUIRE( cudaSuccess == cudaDeviceSynchronize() );
+}
+
+template <int ItemsPerThread,
+          int ThreadsInBlock,
+          class KeyT,
+          class ValueT,
+          class ActionT>
+void block_merge_sort(thrust::device_vector<KeyT> &keys,
+                      thrust::device_vector<ValueT> &vals, ActionT action)
+{
+  // Runs the key/value kernel as a single block, in place. The item count is
+  // taken from keys.size(); vals.size() is not checked here.
+  KeyT *d_keys        = thrust::raw_pointer_cast(keys.data());
+  ValueT *d_vals      = thrust::raw_pointer_cast(vals.data());
+  const int num_items = static_cast<int>(keys.size());
+  const KeyT oob_key  = std::numeric_limits<KeyT>::max();
+
+  block_merge_sort_kernel<ThreadsInBlock, ItemsPerThread>
+    <<<1, ThreadsInBlock>>>(d_keys, d_vals, num_items, oob_key, action);
+
+  REQUIRE( cudaSuccess == cudaPeekAtLastError() );
+  REQUIRE( cudaSuccess == cudaDeviceSynchronize() );
+}
+
+// %PARAM% THREADS_IN_BLOCK bs 64:256
+// Type-list axes for the tests below; THREADS_IN_BLOCK is not defined in this file (presumably set by the build - confirm).
+using key_types = c2h::type_list<std::int32_t, std::int64_t>;
+using threads_in_block = c2h::enum_type_list<int, THREADS_IN_BLOCK>;
+using items_per_thread = c2h::enum_type_list<int, 1, 2, 10, 15>;
+
+
+template <class TestType>
+struct params_t // named view of one (key type, items per thread, threads per block) combination
+{
+  using key_t = typename c2h::get<0, TestType>;
+  // Indices 1 and 2 follow the order in which the tests list items_per_thread and threads_in_block.
+  static constexpr int items_per_thread = c2h::get<1, TestType>::value;
+  static constexpr int threads_in_block = c2h::get<2, TestType>::value;
+  static constexpr int tile_size = items_per_thread * threads_in_block;
+};
+
+
+CUB_TEST("Block merge sort can sort keys in partial tiles",
+         "[merge sort][block]",
+         key_types,
+         items_per_thread,
+         threads_in_block)
+{
+  using cfg   = params_t<TestType>;
+  using key_t = typename cfg::key_t;
+
+  // Ten key counts drawn from random(0, tile_size); contents come from c2h::gen.
+  thrust::device_vector<key_t> d_keys(
+    GENERATE_COPY(take(10, random(0, cfg::tile_size))));
+  c2h::gen(CUB_SEED(10), d_keys);
+
+  // Host expectation: a stable sort of a copy of the keys with the same comparator.
+  thrust::host_vector<key_t> h_reference = d_keys;
+  key_t *ref_begin = thrust::raw_pointer_cast(h_reference.data());
+  std::stable_sort(ref_begin, ref_begin + h_reference.size(), CustomLess{});
+
+  // Device side: StableSort overload that takes valid_items and an out-of-bounds key.
+  block_merge_sort<cfg::items_per_thread, cfg::threads_in_block>(
+    d_keys,
+    stable_sort_keys_partial_tile_t{});
+
+  REQUIRE( h_reference == d_keys );
+}
+
+CUB_TEST("Block merge sort can sort keys in full tiles",
+         "[merge sort][block]",
+         key_types,
+         items_per_thread,
+         threads_in_block)
+{
+  using cfg   = params_t<TestType>;
+  using key_t = typename cfg::key_t;
+
+  // Exactly tile_size generated keys, i.e. every thread slot is occupied.
+  thrust::device_vector<key_t> d_keys(cfg::tile_size);
+  c2h::gen(CUB_SEED(10), d_keys);
+
+  // Host expectation: a stable sort of a copy of the keys with the same comparator.
+  thrust::host_vector<key_t> h_reference = d_keys;
+  key_t *ref_begin = thrust::raw_pointer_cast(h_reference.data());
+  std::stable_sort(ref_begin, ref_begin + h_reference.size(), CustomLess{});
+
+  // Device side: StableSort overload without valid_items / out-of-bounds arguments.
+  block_merge_sort<cfg::items_per_thread, cfg::threads_in_block>(
+    d_keys,
+    stable_sort_keys_full_tile_t{});
+
+  REQUIRE( h_reference == d_keys );
+}
+
+CUB_TEST("Block merge sort can sort pairs in partial tiles",
+         "[merge sort][block]",
+         key_types,
+         items_per_thread,
+         threads_in_block)
+{
+  using cfg     = params_t<TestType>;
+  using key_t   = typename cfg::key_t;
+  using value_t = key_t;
+  using pair_t  = std::pair<key_t, value_t>;
+
+  // Ten pair counts drawn from random(0, tile_size); keys and values are generated separately.
+  thrust::device_vector<key_t> d_keys(
+    GENERATE_COPY(take(10, random(0, cfg::tile_size))));
+  thrust::device_vector<value_t> d_vals(d_keys.size());
+  c2h::gen(CUB_SEED(5), d_keys);
+  c2h::gen(CUB_SEED(5), d_vals);
+
+  thrust::host_vector<key_t> h_keys   = d_keys;
+  thrust::host_vector<value_t> h_vals = d_vals;
+
+  // Host expectation: zip keys with values, stable-sort the pairs by key only,
+  // then unzip the result back into h_keys / h_vals.
+  thrust::host_vector<pair_t> h_ref(d_keys.size());
+  for (std::size_t i = 0; i < h_ref.size(); ++i)
+  {
+    h_ref[i] = pair_t(h_keys[i], h_vals[i]);
+  }
+
+  pair_t *ref_begin   = thrust::raw_pointer_cast(h_ref.data());
+  const auto key_less = [](pair_t l, pair_t r) -> bool { return l.first < r.first; };
+  std::stable_sort(ref_begin, ref_begin + h_ref.size(), key_less);
+
+  for (std::size_t i = 0; i < h_ref.size(); ++i)
+  {
+    h_keys[i] = h_ref[i].first;
+    h_vals[i] = h_ref[i].second;
+  }
+
+  // Device side: key/value StableSort overload taking valid_items and an out-of-bounds key.
+  block_merge_sort<cfg::items_per_thread, cfg::threads_in_block>(
+    d_keys,
+    d_vals,
+    stable_sort_pairs_partial_tile_t{});
+
+  // Values are compared exactly, so the order among equal keys matters.
+  REQUIRE( h_keys == d_keys );
+  REQUIRE( h_vals == d_vals );
+}
+
+CUB_TEST("Block merge sort can sort pairs in full tiles",
+         "[merge sort][block]",
+         key_types,
+         items_per_thread,
+         threads_in_block)
+{
+  using cfg     = params_t<TestType>;
+  using key_t   = typename cfg::key_t;
+  using value_t = key_t;
+  using pair_t  = std::pair<key_t, value_t>;
+
+  // Exactly tile_size pairs; keys and values are generated separately.
+  thrust::device_vector<key_t> d_keys(cfg::tile_size);
+  thrust::device_vector<value_t> d_vals(d_keys.size());
+  c2h::gen(CUB_SEED(5), d_keys);
+  c2h::gen(CUB_SEED(5), d_vals);
+
+  thrust::host_vector<key_t> h_keys   = d_keys;
+  thrust::host_vector<value_t> h_vals = d_vals;
+
+  // Host expectation: zip keys with values, stable-sort the pairs by key only,
+  // then unzip the result back into h_keys / h_vals.
+  thrust::host_vector<pair_t> h_ref(d_keys.size());
+  for (std::size_t i = 0; i < h_ref.size(); ++i)
+  {
+    h_ref[i] = pair_t(h_keys[i], h_vals[i]);
+  }
+
+  pair_t *ref_begin   = thrust::raw_pointer_cast(h_ref.data());
+  const auto key_less = [](pair_t l, pair_t r) -> bool { return l.first < r.first; };
+  std::stable_sort(ref_begin, ref_begin + h_ref.size(), key_less);
+
+  for (std::size_t i = 0; i < h_ref.size(); ++i)
+  {
+    h_keys[i] = h_ref[i].first;
+    h_vals[i] = h_ref[i].second;
+  }
+
+  // Device side: key/value StableSort overload without guard arguments.
+  block_merge_sort<cfg::items_per_thread, cfg::threads_in_block>(
+    d_keys,
+    d_vals,
+    stable_sort_pairs_full_tile_t{});
+
+  // Values are compared exactly, so the order among equal keys matters.
+  REQUIRE( h_keys == d_keys );
+  REQUIRE( h_vals == d_vals );
+}
+
+
+CUB_TEST("Block merge sort can sort pairs with mixed types",
+         "[merge sort][block]",
+         threads_in_block)
+{
+  using key_t   = std::int32_t;
+  using value_t = std::int64_t;
+  using pair_t  = std::pair<key_t, value_t>;
+
+  // Fixed two items per thread; only the block size varies between instances.
+  constexpr int ipt        = 2;
+  constexpr int block_size = c2h::get<0, TestType>::value;
+  constexpr int num_items  = ipt * block_size;
+
+  thrust::device_vector<key_t> d_keys(num_items);
+  thrust::device_vector<value_t> d_vals(d_keys.size());
+  c2h::gen(CUB_SEED(5), d_keys);
+  c2h::gen(CUB_SEED(5), d_vals);
+
+  thrust::host_vector<key_t> h_keys   = d_keys;
+  thrust::host_vector<value_t> h_vals = d_vals;
+
+  // Host expectation: zip keys with values, stable-sort the pairs by key only,
+  // then unzip the result back into h_keys / h_vals.
+  thrust::host_vector<pair_t> h_ref(d_keys.size());
+  for (std::size_t i = 0; i < h_ref.size(); ++i)
+  {
+    h_ref[i] = pair_t(h_keys[i], h_vals[i]);
+  }
+
+  pair_t *ref_begin   = thrust::raw_pointer_cast(h_ref.data());
+  const auto key_less = [](pair_t l, pair_t r) -> bool { return l.first < r.first; };
+  std::stable_sort(ref_begin, ref_begin + h_ref.size(), key_less);
+
+  for (std::size_t i = 0; i < h_ref.size(); ++i)
+  {
+    h_keys[i] = h_ref[i].first;
+    h_vals[i] = h_ref[i].second;
+  }
+
+  // Device side: key/value StableSort overload without guard arguments.
+  block_merge_sort<ipt, block_size>(
+    d_keys,
+    d_vals,
+    stable_sort_pairs_full_tile_t{});
+
+  // Values are compared exactly, so the order among equal keys matters.
+  REQUIRE( h_keys == d_keys );
+  REQUIRE( h_vals == d_vals );
+}
+
+CUB_TEST("Block merge sort can sort large tiles",
+         "[merge sort][block]",
+         threads_in_block)
+{
+  using key_t = std::uint16_t;
+
+  constexpr int ipt = 2;
+
+  // The parameterized block size is only used as a switch here: values below
+  // 256 select 512 threads per block, everything else selects 1024.
+  constexpr int param_block_size = c2h::get<0, TestType>::value;
+  constexpr int block_size       = (param_block_size >= 256) ? 1024 : 512;
+  constexpr int num_items        = block_size * ipt;
+
+  // Exactly one item per thread slot of the enlarged block.
+  thrust::device_vector<key_t> d_keys(num_items);
+  c2h::gen(CUB_SEED(10), d_keys);
+
+  // Host expectation: a stable sort of a copy of the keys with the same comparator.
+  thrust::host_vector<key_t> h_reference = d_keys;
+  key_t *ref_begin = thrust::raw_pointer_cast(h_reference.data());
+  std::stable_sort(ref_begin, ref_begin + h_reference.size(), CustomLess{});
+
+  // Device side: keys-only StableSort overload without guard arguments.
+  block_merge_sort<ipt, block_size>(
+    d_keys,
+    stable_sort_keys_full_tile_t{});
+
+  // Device result is compared against the host-sorted copy.
+  REQUIRE( h_reference == d_keys );
+}
+
+CUB_TEST("Block merge sort is stable",
+         "[merge sort][block]",
+         threads_in_block)
+{
+  // NOTE(review): presumably equality of this type sees state its ordering ignores - confirm in c2h.
+  using key_t = c2h::custom_type_t<c2h::less_comparable_t, c2h::equal_comparable_t>;
+
+  constexpr int ipt        = 2;
+  constexpr int block_size = c2h::get<0, TestType>::value;
+  constexpr int num_items  = block_size * ipt;
+
+  thrust::device_vector<key_t> d_keys(num_items);
+  c2h::gen(CUB_SEED(10), d_keys);
+
+  // Host expectation: std::stable_sort of a copy of the keys with the same comparator.
+  thrust::host_vector<key_t> h_reference = d_keys;
+  key_t *ref_begin = thrust::raw_pointer_cast(h_reference.data());
+  std::stable_sort(ref_begin, ref_begin + h_reference.size(), CustomLess{});
+
+  // Device side: keys-only StableSort overload without guard arguments.
+  block_merge_sort<ipt, block_size>(
+    d_keys,
+    stable_sort_keys_full_tile_t{});
+
+  REQUIRE( h_reference == d_keys );
+}
+
diff --git a/include/cub/test/catch2_test_block_radix_sort.cu b/include/cub/test/catch2_test_block_radix_sort.cu
new file mode 100644
index 0000000..28b4d28
--- /dev/null
+++ b/include/cub/test/catch2_test_block_radix_sort.cu
@@ -0,0 +1,394 @@
+/******************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#include "catch2_test_block_radix_sort.cuh"
+
+#include <algorithm>
+#include <type_traits>
+#include <utility>
+
+// Has to go after all cub headers. Otherwise, this test won't catch unused
+// variables in cub kernels.
+#include "catch2_test_helper.h"
+
+// %PARAM% TEST_MEMOIZE mem 0:1
+// %PARAM% TEST_ALGORITHM alg 0:1
+// %PARAM% TEST_IPT ipt 1:11
+// %PARAM% TEST_THREADS_IN_BLOCK ipt 32:160
+
+using types = c2h::type_list<std::uint8_t, std::uint16_t, std::uint32_t, std::uint64_t>; // key types of the keys-only tests
+using no_value_types = c2h::type_list<cub::NullType>; // fills the value-type slot in tests that do not read it
+// Key types of the pair tests; value_types is the value axis of the "mixed pairs" tests.
+using key_types = c2h::type_list<std::int8_t, std::int16_t, std::int32_t, std::int64_t, float, double>;
+using value_types = c2h::type_list<std::int8_t, c2h::custom_type_t<c2h::equal_comparable_t>>;
+// Integer/bool axes. The TEST_* macros are not defined in this file (presumably injected by the build - confirm).
+using threads_in_block = c2h::enum_type_list<int, TEST_THREADS_IN_BLOCK>;
+using items_per_thread = c2h::enum_type_list<int, TEST_IPT>;
+using radix_bits = c2h::enum_type_list<int, 1, 5>;
+using memoize = c2h::enum_type_list<bool, TEST_MEMOIZE>;
+// TEST_ALGORITHM selects the single block-scan algorithm that ends up in the `algorithm` axis.
+#if TEST_ALGORITHM == 0
+using algorithm = c2h::enum_type_list<cub::BlockScanAlgorithm, 
+      cub::BlockScanAlgorithm::BLOCK_SCAN_RAKING>;
+#else
+using algorithm = c2h::enum_type_list<cub::BlockScanAlgorithm, 
+      cub::BlockScanAlgorithm::BLOCK_SCAN_WARP_SCANS>; 
+#endif
+// Shared-memory bank-size axes: the keys-only tests use both sizes, the pair tests only the four-byte one.
+using shmem_config = c2h::enum_type_list<cudaSharedMemConfig,
+                                         cudaSharedMemBankSizeFourByte,
+                                         cudaSharedMemBankSizeEightByte>;
+
+using shmem_config_4 =
+  c2h::enum_type_list<cudaSharedMemConfig, cudaSharedMemBankSizeFourByte>;
+
+template <class TestType>
+struct params_t // named view of one combination of the eight type-list axes passed to CUB_TEST
+{
+  using key_type = typename c2h::get<0, TestType>;
+  using value_type = typename c2h::get<1, TestType>;
+  // Indices 2..7 follow the axis order used by the tests: items, threads, radix bits, memoize, algorithm, shmem config.
+  static constexpr int items_per_thread = c2h::get<2, TestType>::value;
+  static constexpr int threads_in_block = c2h::get<3, TestType>::value;
+  static constexpr int tile_size = items_per_thread * threads_in_block;
+  static constexpr int radix_bits = c2h::get<4, TestType>::value;
+  static constexpr bool memoize = c2h::get<5, TestType>::value;
+  static constexpr cub::BlockScanAlgorithm algorithm = 
+    c2h::get<6, TestType>::value;
+  static constexpr cudaSharedMemConfig shmem_config = 
+    c2h::get<7, TestType>::value;
+};
+
+CUB_TEST("Block radix sort can sort keys",
+         "[radix][sort][block]",
+         types,
+         no_value_types,
+         items_per_thread,
+         threads_in_block,
+         radix_bits,
+         memoize,
+         algorithm,
+         shmem_config)
+{
+  using cfg   = params_t<TestType>;
+  using key_t = typename cfg::key_type;
+
+  // tile_size generated input keys and an equally sized output buffer.
+  thrust::device_vector<key_t> d_output(cfg::tile_size);
+  thrust::device_vector<key_t> d_input(cfg::tile_size);
+  c2h::gen(CUB_SEED(2), d_input);
+
+  // Generated bit range (end_bit is drawn from random(begin_bit, key_bits)) and layout flag.
+  const int key_bits  = static_cast<int>(sizeof(key_t) * 8);
+  const int begin_bit = GENERATE_COPY(take(2, random(0, key_bits)));
+  const int end_bit   = GENERATE_COPY(take(2, random(begin_bit, key_bits)));
+  const bool striped  = GENERATE_COPY(false, true);
+  const bool is_descending = false;
+
+  block_radix_sort<cfg::items_per_thread,
+                   cfg::threads_in_block,
+                   cfg::radix_bits,
+                   cfg::memoize,
+                   cfg::algorithm,
+                   cfg::shmem_config>(sort_op_t{},
+                                      thrust::raw_pointer_cast(d_input.data()),
+                                      thrust::raw_pointer_cast(d_output.data()),
+                                      begin_bit,
+                                      end_bit,
+                                      striped);
+
+  // Expectation comes from radix_sort_reference over d_input with the same bit range.
+  thrust::host_vector<key_t> h_reference =
+    radix_sort_reference(d_input, is_descending, begin_bit, end_bit);
+  INFO( "striped = " << striped );
+  REQUIRE( h_reference == d_output );
+}
+
+CUB_TEST("Block radix sort can sort keys in descending order",
+         "[radix][sort][block]",
+         types,
+         no_value_types,
+         items_per_thread,
+         threads_in_block,
+         radix_bits,
+         memoize,
+         algorithm,
+         shmem_config)
+{
+  using cfg   = params_t<TestType>;
+  using key_t = typename cfg::key_type;
+
+  // tile_size generated input keys and an equally sized output buffer.
+  thrust::device_vector<key_t> d_output(cfg::tile_size);
+  thrust::device_vector<key_t> d_input(cfg::tile_size);
+  c2h::gen(CUB_SEED(2), d_input);
+
+  const int key_bits  = static_cast<int>(sizeof(key_t) * 8);
+  const int begin_bit = GENERATE_COPY(take(2, random(0, key_bits)));
+  const int end_bit   = GENERATE_COPY(take(2, random(begin_bit, key_bits)));
+  const bool striped  = GENERATE_COPY(false, true);
+  const bool is_descending = true;
+
+  block_radix_sort<cfg::items_per_thread,
+                   cfg::threads_in_block,
+                   cfg::radix_bits,
+                   cfg::memoize,
+                   cfg::algorithm,
+                   cfg::shmem_config>(descending_sort_op_t{},
+                                      thrust::raw_pointer_cast(d_input.data()),
+                                      thrust::raw_pointer_cast(d_output.data()),
+                                      begin_bit,
+                                      end_bit,
+                                      striped);
+
+  thrust::host_vector<key_t> h_reference =
+    radix_sort_reference(d_input, is_descending, begin_bit, end_bit);
+  // Log the generated layout flag on failure, as the ascending keys test does.
+  INFO( "striped = " << striped );
+  REQUIRE( h_reference == d_output );
+}
+
+CUB_TEST("Block radix sort can sort pairs",
+         "[radix][sort][block]",
+         key_types,
+         no_value_types,
+         items_per_thread,
+         threads_in_block,
+         radix_bits,
+         memoize,
+         algorithm,
+         shmem_config_4)
+{
+  using cfg        = params_t<TestType>;
+  using key_type   = typename cfg::key_type;
+  using value_type = key_type;
+
+  // Values use the key type here; inputs are generated, outputs only sized.
+  thrust::device_vector<key_type> d_output_keys(cfg::tile_size);
+  thrust::device_vector<value_type> d_output_values(cfg::tile_size);
+  thrust::device_vector<key_type> d_input_keys(cfg::tile_size);
+  thrust::device_vector<value_type> d_input_values(cfg::tile_size);
+  c2h::gen(CUB_SEED(2), d_input_keys);
+  c2h::gen(CUB_SEED(2), d_input_values);
+
+  // Generated bit range (end_bit is drawn from random(begin_bit, key_bits)) and layout flag.
+  const int key_bits  = static_cast<int>(sizeof(key_type) * 8);
+  const int begin_bit = GENERATE_COPY(take(2, random(0, key_bits)));
+  const int end_bit   = GENERATE_COPY(take(2, random(begin_bit, key_bits)));
+  const bool striped  = GENERATE_COPY(false, true);
+  const bool is_descending = false;
+
+  block_radix_sort<cfg::items_per_thread,
+                   cfg::threads_in_block,
+                   cfg::radix_bits,
+                   cfg::memoize,
+                   cfg::algorithm,
+                   cfg::shmem_config>(sort_pairs_op_t{},
+                                      thrust::raw_pointer_cast(d_input_keys.data()),
+                                      thrust::raw_pointer_cast(d_input_values.data()),
+                                      thrust::raw_pointer_cast(d_output_keys.data()),
+                                      thrust::raw_pointer_cast(d_output_values.data()),
+                                      begin_bit,
+                                      end_bit,
+                                      striped);
+
+  // Expectation: radix_sort_reference returns the (keys, values) pair for the same bit range.
+  std::pair<thrust::host_vector<key_type>, thrust::host_vector<value_type>>
+    h_reference =
+      radix_sort_reference(d_input_keys, d_input_values, is_descending, begin_bit, end_bit);
+
+  // Log the generated layout flag on failure, as the ascending keys test does.
+  INFO( "striped = " << striped );
+  REQUIRE( h_reference.first == d_output_keys );
+  REQUIRE( h_reference.second == d_output_values );
+}
+
+CUB_TEST("Block radix sort can sort pairs in descending order",
+         "[radix][sort][block]",
+         key_types,
+         no_value_types,
+         items_per_thread,
+         threads_in_block,
+         radix_bits,
+         memoize,
+         algorithm,
+         shmem_config_4)
+{
+  using cfg        = params_t<TestType>;
+  using key_type   = typename cfg::key_type;
+  using value_type = key_type;
+
+  // Values use the key type here; inputs are generated, outputs only sized.
+  thrust::device_vector<key_type> d_output_keys(cfg::tile_size);
+  thrust::device_vector<value_type> d_output_values(cfg::tile_size);
+  thrust::device_vector<key_type> d_input_keys(cfg::tile_size);
+  thrust::device_vector<value_type> d_input_values(cfg::tile_size);
+  c2h::gen(CUB_SEED(2), d_input_keys);
+  c2h::gen(CUB_SEED(2), d_input_values);
+
+  // Generated bit range (end_bit is drawn from random(begin_bit, key_bits)) and layout flag.
+  const int key_bits  = static_cast<int>(sizeof(key_type) * 8);
+  const int begin_bit = GENERATE_COPY(take(2, random(0, key_bits)));
+  const int end_bit   = GENERATE_COPY(take(2, random(begin_bit, key_bits)));
+  const bool striped  = GENERATE_COPY(false, true);
+  const bool is_descending = true;
+
+  block_radix_sort<cfg::items_per_thread,
+                   cfg::threads_in_block,
+                   cfg::radix_bits,
+                   cfg::memoize,
+                   cfg::algorithm,
+                   cfg::shmem_config>(descending_sort_pairs_op_t{},
+                                      thrust::raw_pointer_cast(d_input_keys.data()),
+                                      thrust::raw_pointer_cast(d_input_values.data()),
+                                      thrust::raw_pointer_cast(d_output_keys.data()),
+                                      thrust::raw_pointer_cast(d_output_values.data()),
+                                      begin_bit,
+                                      end_bit,
+                                      striped);
+
+  // Expectation: radix_sort_reference returns the (keys, values) pair for the same bit range.
+  std::pair<thrust::host_vector<key_type>, thrust::host_vector<value_type>>
+    h_reference =
+      radix_sort_reference(d_input_keys, d_input_values, is_descending, begin_bit, end_bit);
+
+  // Log the generated layout flag on failure, as the ascending keys test does.
+  INFO( "striped = " << striped );
+  REQUIRE( h_reference.first == d_output_keys );
+  REQUIRE( h_reference.second == d_output_values );
+}
+
+CUB_TEST("Block radix sort can sort mixed pairs",
+         "[radix][sort][block]",
+         key_types,
+         value_types,
+         items_per_thread,
+         threads_in_block,
+         radix_bits,
+         memoize,
+         algorithm,
+         shmem_config_4)
+{
+  using cfg        = params_t<TestType>;
+  using key_type   = typename cfg::key_type;
+  using value_type = typename cfg::value_type;
+
+  // Key and value types vary independently; inputs are generated, outputs only sized.
+  thrust::device_vector<key_type> d_output_keys(cfg::tile_size);
+  thrust::device_vector<value_type> d_output_values(cfg::tile_size);
+  thrust::device_vector<key_type> d_input_keys(cfg::tile_size);
+  thrust::device_vector<value_type> d_input_values(cfg::tile_size);
+  c2h::gen(CUB_SEED(2), d_input_keys);
+  c2h::gen(CUB_SEED(2), d_input_values);
+
+  // Generated bit range (end_bit is drawn from random(begin_bit, key_bits)) and layout flag.
+  const int key_bits  = static_cast<int>(sizeof(key_type) * 8);
+  const int begin_bit = GENERATE_COPY(take(2, random(0, key_bits)));
+  const int end_bit   = GENERATE_COPY(take(2, random(begin_bit, key_bits)));
+  const bool striped  = GENERATE_COPY(false, true);
+  const bool is_descending = false;
+
+  block_radix_sort<cfg::items_per_thread,
+                   cfg::threads_in_block,
+                   cfg::radix_bits,
+                   cfg::memoize,
+                   cfg::algorithm,
+                   cfg::shmem_config>(sort_pairs_op_t{},
+                                      thrust::raw_pointer_cast(d_input_keys.data()),
+                                      thrust::raw_pointer_cast(d_input_values.data()),
+                                      thrust::raw_pointer_cast(d_output_keys.data()),
+                                      thrust::raw_pointer_cast(d_output_values.data()),
+                                      begin_bit,
+                                      end_bit,
+                                      striped);
+
+  // Expectation: radix_sort_reference returns the (keys, values) pair for the same bit range.
+  std::pair<thrust::host_vector<key_type>, thrust::host_vector<value_type>>
+    h_reference =
+      radix_sort_reference(d_input_keys, d_input_values, is_descending, begin_bit, end_bit);
+
+  // Log the generated layout flag on failure, as the ascending keys test does.
+  INFO( "striped = " << striped );
+  REQUIRE( h_reference.first == d_output_keys );
+  REQUIRE( h_reference.second == d_output_values );
+}
+
+CUB_TEST("Block radix sort can sort mixed pairs in descending order",
+         "[radix][sort][block]",
+         key_types,
+         value_types,
+         items_per_thread,
+         threads_in_block,
+         radix_bits,
+         memoize,
+         algorithm,
+         shmem_config_4)
+{
+  using cfg        = params_t<TestType>;
+  using key_type   = typename cfg::key_type;
+  using value_type = typename cfg::value_type;
+
+  // Key and value types vary independently; inputs are generated, outputs only sized.
+  thrust::device_vector<key_type> d_output_keys(cfg::tile_size);
+  thrust::device_vector<value_type> d_output_values(cfg::tile_size);
+  thrust::device_vector<key_type> d_input_keys(cfg::tile_size);
+  thrust::device_vector<value_type> d_input_values(cfg::tile_size);
+  c2h::gen(CUB_SEED(2), d_input_keys);
+  c2h::gen(CUB_SEED(2), d_input_values);
+
+  // Generated bit range (end_bit is drawn from random(begin_bit, key_bits)) and layout flag.
+  const int key_bits  = static_cast<int>(sizeof(key_type) * 8);
+  const int begin_bit = GENERATE_COPY(take(2, random(0, key_bits)));
+  const int end_bit   = GENERATE_COPY(take(2, random(begin_bit, key_bits)));
+  const bool striped  = GENERATE_COPY(false, true);
+  const bool is_descending = true;
+
+  block_radix_sort<cfg::items_per_thread,
+                   cfg::threads_in_block,
+                   cfg::radix_bits,
+                   cfg::memoize,
+                   cfg::algorithm,
+                   cfg::shmem_config>(descending_sort_pairs_op_t{},
+                                      thrust::raw_pointer_cast(d_input_keys.data()),
+                                      thrust::raw_pointer_cast(d_input_values.data()),
+                                      thrust::raw_pointer_cast(d_output_keys.data()),
+                                      thrust::raw_pointer_cast(d_output_values.data()),
+                                      begin_bit,
+                                      end_bit,
+                                      striped);
+
+  // Expectation: radix_sort_reference returns the (keys, values) pair for the same bit range.
+  std::pair<thrust::host_vector<key_type>, thrust::host_vector<value_type>>
+    h_reference =
+      radix_sort_reference(d_input_keys, d_input_values, is_descending, begin_bit, end_bit);
+
+  // Log the generated layout flag on failure, as the ascending keys test does.
+  INFO( "striped = " << striped );
+  REQUIRE( h_reference.first == d_output_keys );
+  REQUIRE( h_reference.second == d_output_values );
+}
+
diff --git a/include/cub/test/catch2_test_block_radix_sort.cuh b/include/cub/test/catch2_test_block_radix_sort.cuh
new file mode 100644
index 0000000..a0be706
--- /dev/null
+++ b/include/cub/test/catch2_test_block_radix_sort.cuh
@@ -0,0 +1,457 @@
+/******************************************************************************
+ * Copyright (c) 2011-2022, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#pragma once
+
+#include <cub/block/block_radix_sort.cuh>
+
+#include <thrust/gather.h>
+#include <thrust/host_vector.h>
+#include <thrust/sequence.h>
+
+// Has to go after all cub headers. Otherwise, this test won't catch unused
+// variables in cub kernels.
+#include "catch2_test_helper.h"
+
+
+// Keys-only test kernel. Every thread loads `ItemsPerThread` consecutive keys
+// from `input`, hands them to `action` together with a `cub::BlockRadixSort`
+// instance, and stores the result to `output`.
+//
+// `striped` selects both the `action` overload (via `cub::Int2Type<1>` /
+// `cub::Int2Type<0>`) and the store pattern: `threadIdx.x + ThreadsInBlock * i`
+// when true, `threadIdx.x * ItemsPerThread + i` otherwise.
+template <typename InputIteratorT,
+          typename OutputIteratorT,
+          typename ActionT,
+          int ItemsPerThread,
+          int ThreadsInBlock,
+          int RadixBits,
+          bool Memoize,
+          cub::BlockScanAlgorithm Algorithm,
+          cudaSharedMemConfig ShmemConfig>
+__global__ void kernel(
+    ActionT action,
+    InputIteratorT input,
+    OutputIteratorT output,
+    int begin_bit,
+    int end_bit,
+    bool striped)
+{
+  using key_t    = cub::detail::value_t<InputIteratorT>;
+  using sorter_t = cub::BlockRadixSort<key_t,
+                                       ThreadsInBlock,
+                                       ItemsPerThread,
+                                       cub::NullType,
+                                       RadixBits,
+                                       Memoize,
+                                       Algorithm,
+                                       ShmemConfig>;
+
+  __shared__ typename sorter_t::TempStorage temp_storage;
+
+  const unsigned int tid          = threadIdx.x;
+  const unsigned int blocked_base = tid * ItemsPerThread;
+
+  // Load this thread's consecutive run of keys
+  key_t thread_keys[ItemsPerThread];
+  for (int item = 0; item < ItemsPerThread; ++item)
+  {
+    thread_keys[item] = input[blocked_base + item];
+  }
+
+  sorter_t sorter(temp_storage);
+
+  if (!striped)
+  {
+    action(sorter, thread_keys, begin_bit, end_bit, cub::Int2Type<0>{});
+
+    for (int item = 0; item < ItemsPerThread; ++item)
+    {
+      output[blocked_base + item] = thread_keys[item];
+    }
+    return;
+  }
+
+  action(sorter, thread_keys, begin_bit, end_bit, cub::Int2Type<1>{});
+
+  for (int item = 0; item < ItemsPerThread; ++item)
+  {
+    output[tid + ThreadsInBlock * item] = thread_keys[item];
+  }
+}
+
+// Host-side driver for the keys-only test kernel: sets the requested shared
+// memory configuration, launches a single block of `ThreadsInBlock` threads and
+// requires that neither the launch nor the device synchronization reports an
+// error.
+template <int ItemsPerThread,
+          int ThreadsInBlock,
+          int RadixBits,
+          bool Memoize,
+          cub::BlockScanAlgorithm Algorithm,
+          cudaSharedMemConfig ShmemConfig,
+          typename InputIteratorT,
+          typename OutputIteratorT,
+          typename ActionT>
+void block_radix_sort(
+    ActionT action,
+    InputIteratorT input,
+    OutputIteratorT output,
+    int begin_bit,
+    int end_bit,
+    bool striped)
+{
+  const dim3 grid_dim(1);
+  const dim3 block_dim(ThreadsInBlock);
+
+  cudaDeviceSetSharedMemConfig(ShmemConfig);
+
+  kernel<InputIteratorT,
+         OutputIteratorT,
+         ActionT,
+         ItemsPerThread,
+         ThreadsInBlock,
+         RadixBits,
+         Memoize,
+         Algorithm,
+         ShmemConfig><<<grid_dim, block_dim>>>(action,
+                                               input,
+                                               output,
+                                               begin_bit,
+                                               end_bit,
+                                               striped);
+
+  REQUIRE( cudaSuccess == cudaPeekAtLastError() );
+  REQUIRE( cudaSuccess == cudaDeviceSynchronize() );
+}
+
+// Key-value test kernel. Every thread loads `ItemsPerThread` consecutive keys
+// and values, hands them to `action` together with a `cub::BlockRadixSort`
+// instance, and stores both arrays to the output iterators.
+//
+// `striped` selects both the `action` overload (via `cub::Int2Type<1>` /
+// `cub::Int2Type<0>`) and the store pattern: `threadIdx.x + ThreadsInBlock * i`
+// when true, `threadIdx.x * ItemsPerThread + i` otherwise.
+template <typename InputKeyIteratorT,
+          typename InputValueIteratorT,
+          typename OutputKeyIteratorT,
+          typename OutputValueIteratorT,
+          typename ActionT,
+          int ItemsPerThread,
+          int ThreadsInBlock,
+          int RadixBits,
+          bool Memoize,
+          cub::BlockScanAlgorithm Algorithm,
+          cudaSharedMemConfig ShmemConfig>
+__global__ void kernel(
+    ActionT action,
+    InputKeyIteratorT input_keys,
+    InputValueIteratorT input_values,
+    OutputKeyIteratorT output_keys,
+    OutputValueIteratorT output_values,
+    int begin_bit,
+    int end_bit,
+    bool striped)
+{
+  using key_t    = cub::detail::value_t<InputKeyIteratorT>;
+  using value_t  = cub::detail::value_t<InputValueIteratorT>;
+  using sorter_t = cub::BlockRadixSort<key_t,
+                                       ThreadsInBlock,
+                                       ItemsPerThread,
+                                       value_t,
+                                       RadixBits,
+                                       Memoize,
+                                       Algorithm,
+                                       ShmemConfig>;
+
+  __shared__ typename sorter_t::TempStorage temp_storage;
+
+  const unsigned int tid          = threadIdx.x;
+  const unsigned int blocked_base = tid * ItemsPerThread;
+
+  // Load this thread's consecutive run of keys and values
+  key_t thread_keys[ItemsPerThread];
+  value_t thread_values[ItemsPerThread];
+  for (int item = 0; item < ItemsPerThread; ++item)
+  {
+    thread_keys[item]   = input_keys[blocked_base + item];
+    thread_values[item] = input_values[blocked_base + item];
+  }
+
+  sorter_t sorter(temp_storage);
+
+  if (!striped)
+  {
+    action(sorter,
+           thread_keys,
+           thread_values,
+           begin_bit,
+           end_bit,
+           cub::Int2Type<0>{});
+
+    for (int item = 0; item < ItemsPerThread; ++item)
+    {
+      output_keys[blocked_base + item]   = thread_keys[item];
+      output_values[blocked_base + item] = thread_values[item];
+    }
+    return;
+  }
+
+  action(sorter,
+         thread_keys,
+         thread_values,
+         begin_bit,
+         end_bit,
+         cub::Int2Type<1>{});
+
+  for (int item = 0; item < ItemsPerThread; ++item)
+  {
+    output_keys[tid + ThreadsInBlock * item]   = thread_keys[item];
+    output_values[tid + ThreadsInBlock * item] = thread_values[item];
+  }
+}
+
+// Host-side driver for the key-value test kernel: sets the requested shared
+// memory configuration, launches a single block of `ThreadsInBlock` threads and
+// requires that neither the launch nor the device synchronization reports an
+// error.
+template <int ItemsPerThread,
+          int ThreadsInBlock,
+          int RadixBits,
+          bool Memoize,
+          cub::BlockScanAlgorithm Algorithm,
+          cudaSharedMemConfig ShmemConfig,
+          typename InputKeyIteratorT,
+          typename InputValueIteratorT,
+          typename OutputKeyIteratorT,
+          typename OutputValueIteratorT,
+          typename ActionT>
+void block_radix_sort(
+    ActionT action,
+    InputKeyIteratorT input_keys,
+    InputValueIteratorT input_values,
+    OutputKeyIteratorT output_keys,
+    OutputValueIteratorT output_values,
+    int begin_bit,
+    int end_bit,
+    bool striped)
+{
+  const dim3 grid_dim(1);
+  const dim3 block_dim(ThreadsInBlock);
+
+  cudaDeviceSetSharedMemConfig(ShmemConfig);
+
+  kernel<InputKeyIteratorT,
+         InputValueIteratorT,
+         OutputKeyIteratorT,
+         OutputValueIteratorT,
+         ActionT,
+         ItemsPerThread,
+         ThreadsInBlock,
+         RadixBits,
+         Memoize,
+         Algorithm,
+         ShmemConfig>
+    <<<grid_dim, block_dim>>>(action,
+                              input_keys,
+                              input_values,
+                              output_keys,
+                              output_values,
+                              begin_bit,
+                              end_bit,
+                              striped);
+
+  REQUIRE( cudaSuccess == cudaPeekAtLastError() );
+  REQUIRE( cudaSuccess == cudaDeviceSynchronize() );
+}
+
+// Keys-only sort action. The trailing `cub::Int2Type` tag picks the member
+// function: `<0>` forwards to `Sort`, `<1>` to `SortBlockedToStriped`.
+struct sort_op_t
+{
+  template <class SorterT, class ItemsT>
+  __device__ void operator()(SorterT &sorter,
+                             ItemsT &items,
+                             int begin_bit,
+                             int end_bit,
+                             cub::Int2Type<0> /* not striped */)
+  {
+    sorter.Sort(items, begin_bit, end_bit);
+  }
+
+  template <class SorterT, class ItemsT>
+  __device__ void operator()(SorterT &sorter,
+                             ItemsT &items,
+                             int begin_bit,
+                             int end_bit,
+                             cub::Int2Type<1> /* striped */)
+  {
+    sorter.SortBlockedToStriped(items, begin_bit, end_bit);
+  }
+};
+
+// Keys-only descending sort action. The trailing `cub::Int2Type` tag picks the
+// member function: `<0>` forwards to `SortDescending`, `<1>` to
+// `SortDescendingBlockedToStriped`.
+struct descending_sort_op_t
+{
+  template <class SorterT, class ItemsT>
+  __device__ void operator()(SorterT &sorter,
+                             ItemsT &items,
+                             int begin_bit,
+                             int end_bit,
+                             cub::Int2Type<0> /* not striped */)
+  {
+    sorter.SortDescending(items, begin_bit, end_bit);
+  }
+
+  template <class SorterT, class ItemsT>
+  __device__ void operator()(SorterT &sorter,
+                             ItemsT &items,
+                             int begin_bit,
+                             int end_bit,
+                             cub::Int2Type<1> /* striped */)
+  {
+    sorter.SortDescendingBlockedToStriped(items, begin_bit, end_bit);
+  }
+};
+
+// Key-value sort action. The trailing `cub::Int2Type` tag picks the member
+// function: `<0>` forwards to `Sort`, `<1>` to `SortBlockedToStriped`.
+struct sort_pairs_op_t
+{
+  template <class SorterT, class ItemKeysT, class ItemValuesT>
+  __device__ void operator()(SorterT &sorter,
+                             ItemKeysT &item_keys,
+                             ItemValuesT &item_values,
+                             int begin_bit,
+                             int end_bit,
+                             cub::Int2Type<0> /* not striped */)
+  {
+    sorter.Sort(item_keys, item_values, begin_bit, end_bit);
+  }
+
+  template <class SorterT, class ItemKeysT, class ItemValuesT>
+  __device__ void operator()(SorterT &sorter,
+                             ItemKeysT &item_keys,
+                             ItemValuesT &item_values,
+                             int begin_bit,
+                             int end_bit,
+                             cub::Int2Type<1> /* striped */)
+  {
+    sorter.SortBlockedToStriped(item_keys, item_values, begin_bit, end_bit);
+  }
+};
+
+// Key-value descending sort action. The trailing `cub::Int2Type` tag picks the
+// member function: `<0>` forwards to `SortDescending`, `<1>` to
+// `SortDescendingBlockedToStriped`.
+struct descending_sort_pairs_op_t
+{
+  template <class SorterT, class ItemKeysT, class ItemValuesT>
+  __device__ void operator()(SorterT &sorter,
+                             ItemKeysT &item_keys,
+                             ItemValuesT &item_values,
+                             int begin_bit,
+                             int end_bit,
+                             cub::Int2Type<0> /* not striped */)
+  {
+    sorter.SortDescending(item_keys, item_values, begin_bit, end_bit);
+  }
+
+  template <class SorterT, class ItemKeysT, class ItemValuesT>
+  __device__ void operator()(SorterT &sorter,
+                             ItemKeysT &item_keys,
+                             ItemValuesT &item_values,
+                             int begin_bit,
+                             int end_bit,
+                             cub::Int2Type<1> /* striped */)
+  {
+    sorter.SortDescendingBlockedToStriped(item_keys,
+                                          item_values,
+                                          begin_bit,
+                                          end_bit);
+  }
+};
+
+// Returns a copy of `h_keys` in which every bit outside `[begin_bit, end_bit)`
+// has been cleared. When the range covers the whole key the copy is returned
+// unmodified.
+//
+// Each key's bytes are copied into a zero-initialized `unsigned long long`,
+// masked, and copied back, so `KeyT` must not be wider than that scratch word.
+// Callers are expected to pass `0 <= begin_bit <= end_bit`.
+template <class KeyT>
+thrust::host_vector<KeyT>
+get_striped_keys(const thrust::host_vector<KeyT> &h_keys,
+                 int begin_bit,
+                 int end_bit)
+{
+  static_assert(sizeof(KeyT) <= sizeof(unsigned long long),
+                "KeyT must fit into the 64-bit scratch word used for masking");
+
+  thrust::host_vector<KeyT> h_striped_keys(h_keys);
+
+  const int key_bits = static_cast<int>(sizeof(KeyT) * 8);
+  if ((begin_bit <= 0) && (end_bit >= key_bits))
+  {
+    // Full bit range requested: nothing to strip
+    return h_striped_keys;
+  }
+
+  const int word_bits = static_cast<int>(sizeof(unsigned long long) * 8);
+  const int num_bits  = end_bit - begin_bit;
+
+  // Build the mask once. Shifting a 64-bit value by 64 or more is undefined,
+  // which the unguarded expression hit for `begin_bit == 64` (an empty range at
+  // the top of a 64-bit key); such a range selects no bits, so the mask is 0.
+  unsigned long long mask = 0;
+  if ((num_bits > 0) && (begin_bit < word_bits))
+  {
+    const unsigned long long low_bits =
+      (num_bits >= word_bits) ? ~0ULL : ((1ULL << num_bits) - 1);
+    mask = low_bits << begin_bit;
+  }
+
+  KeyT *h_striped_keys_data = thrust::raw_pointer_cast(h_striped_keys.data());
+
+  for (std::size_t i = 0; i < h_striped_keys.size(); i++)
+  {
+    unsigned long long base = 0;
+    memcpy(&base, h_striped_keys_data + i, sizeof(KeyT));
+    base &= mask;
+    memcpy(h_striped_keys_data + i, &base, sizeof(KeyT));
+  }
+
+  return h_striped_keys;
+}
+
+// Computes the permutation that stably sorts `h_keys` when only the bits in
+// `[begin_bit, end_bit)` are compared. Element `i` of the result is the index
+// in `h_keys` of the key that belongs at position `i`; keys that compare equal
+// keep their input order.
+template <class KeyT>
+thrust::host_vector<std::size_t>
+get_permutation(const thrust::host_vector<KeyT> &h_keys,
+                bool is_descending,
+                int begin_bit,
+                int end_bit)
+{
+  const thrust::host_vector<KeyT> h_masked_keys =
+    get_striped_keys(h_keys, begin_bit, end_bit);
+
+  thrust::host_vector<std::size_t> h_order(h_keys.size());
+  thrust::sequence(h_order.begin(), h_order.end());
+
+  if (is_descending)
+  {
+    std::stable_sort(h_order.begin(),
+                     h_order.end(),
+                     [&](std::size_t lhs, std::size_t rhs) {
+                       return h_masked_keys[lhs] > h_masked_keys[rhs];
+                     });
+  }
+  else
+  {
+    std::stable_sort(h_order.begin(),
+                     h_order.end(),
+                     [&](std::size_t lhs, std::size_t rhs) {
+                       return h_masked_keys[lhs] < h_masked_keys[rhs];
+                     });
+  }
+
+  return h_order;
+}
+
+// Host reference for the keys-only sort: copies `d_keys` to the host and
+// returns them reordered by the permutation from `get_permutation`.
+template <class KeyT>
+thrust::host_vector<KeyT>
+radix_sort_reference(const thrust::device_vector<KeyT> &d_keys,
+                     bool is_descending,
+                     int begin_bit,
+                     int end_bit)
+{
+  const thrust::host_vector<KeyT> h_keys(d_keys);
+  const thrust::host_vector<std::size_t> h_order =
+    get_permutation(h_keys, is_descending, begin_bit, end_bit);
+
+  thrust::host_vector<KeyT> h_sorted(d_keys.size());
+  thrust::gather(h_order.cbegin(),
+                 h_order.cend(),
+                 h_keys.cbegin(),
+                 h_sorted.begin());
+
+  return h_sorted;
+}
+
+// Host reference for the key-value sort: copies keys and values to the host
+// and returns both reordered by the permutation that `get_permutation` derives
+// from the keys. `first` holds the keys, `second` the values.
+template <class KeyT, class ValueT>
+std::pair<thrust::host_vector<KeyT>, thrust::host_vector<ValueT>>
+radix_sort_reference(const thrust::device_vector<KeyT> &d_keys,
+                     const thrust::device_vector<ValueT> &d_values,
+                     bool is_descending,
+                     int begin_bit,
+                     int end_bit)
+{
+  using result_t =
+    std::pair<thrust::host_vector<KeyT>, thrust::host_vector<ValueT>>;
+
+  const thrust::host_vector<KeyT> h_keys(d_keys);
+  const thrust::host_vector<ValueT> h_values(d_values);
+  const thrust::host_vector<std::size_t> h_order =
+    get_permutation(h_keys, is_descending, begin_bit, end_bit);
+
+  // Both outputs are sized by the number of keys
+  result_t sorted;
+  sorted.first.resize(d_keys.size());
+  sorted.second.resize(d_keys.size());
+
+  // Gather keys and values in one pass through zipped iterators
+  thrust::gather(h_order.cbegin(),
+                 h_order.cend(),
+                 thrust::make_zip_iterator(h_keys.cbegin(), h_values.cbegin()),
+                 thrust::make_zip_iterator(sorted.first.begin(), sorted.second.begin()));
+
+  return sorted;
+}
diff --git a/include/cub/test/catch2_test_block_radix_sort_custom.cu b/include/cub/test/catch2_test_block_radix_sort_custom.cu
new file mode 100644
index 0000000..cf80ef6
--- /dev/null
+++ b/include/cub/test/catch2_test_block_radix_sort_custom.cu
@@ -0,0 +1,1060 @@
+/******************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#undef NDEBUG
+#include <cassert>
+
+#include "catch2_test_block_radix_sort.cuh"
+#include "cub/block/radix_rank_sort_operations.cuh"
+
+#include <algorithm>
+#include <type_traits>
+#include <utility>
+
+// example-begin custom-type
+// Aggregate key type used by the custom-type sorting examples in this file.
+struct custom_t
+{
+  float f;
+  int unused; // set to 42 by the two-argument constructor
+  long long int lli;
+
+  // Leaves all members uninitialized
+  custom_t() = default;
+  // Device-only constructor taking the `f` and `lli` members
+  __device__ custom_t(float f, long long int lli)
+      : f(f)
+      , unused(42)
+      , lli(lli)
+  {}
+};
+
+// Device-side equality for `custom_t`: compares `f` and `lli` only; the
+// `unused` member is not part of the comparison.
+static __device__ bool operator==(const custom_t &lhs, const custom_t &rhs)
+{
+  return lhs.f == rhs.f && lhs.lli == rhs.lli;
+}
+
+// Decomposer passed to the `cub::BlockRadixSort` calls in this file: returns a
+// tuple of references to the `f` and `lli` members of a key, in that order.
+// The `unused` member is not exposed.
+struct decomposer_t
+{
+  __device__ ::cuda::std::tuple<float &, long long int &> //
+  operator()(custom_t &key) const
+  {
+    return {key.f, key.lli};
+  }
+};
+// example-end custom-type
+
+// Example/test kernel: sorts six `custom_t` keys (2 threads x 3 keys, indexed
+// by `threadIdx.x`) with `Sort` and `decomposer_t`, then asserts on the device
+// that each thread holds its row of `expected_output`.
+__global__ void sort_keys()
+{
+  // example-begin keys
+  // Specialize `cub::BlockRadixSort` for a 1D block of 2 threads owning 3 keys each
+  using block_radix_sort_t = cub::BlockRadixSort<custom_t, 2, 3>;
+
+  // Allocate shared memory for `cub::BlockRadixSort`
+  __shared__ block_radix_sort_t::TempStorage temp_storage;
+
+  // Obtain a segment of consecutive items that are blocked across threads
+  custom_t thread_keys[2][3] = //
+    {{
+       // thread 0 keys
+       {+2.5, 4}, //
+       {-2.5, 0}, //
+       {+1.1, 3}, //
+     },
+     {
+       // thread 1 keys
+       {+0.0, 1}, //
+       {-0.0, 2}, //
+       {+3.7, 5}  //
+     }};
+
+  // Collectively sort the keys
+  block_radix_sort_t(temp_storage).Sort(thread_keys[threadIdx.x], decomposer_t{});
+
+  custom_t expected_output[2][3] = //
+    {{
+       // thread 0 expected keys
+       {-2.5, 0}, //
+       {+0.0, 1}, //
+       {-0.0, 2}  //
+     },
+     {
+       // thread 1 expected keys
+       {+1.1, 3}, //
+       {+2.5, 4}, //
+       {+3.7, 5}  //
+     }};
+  // example-end keys
+
+  // Each thread checks only its own three keys
+  assert(thread_keys[threadIdx.x][0] == expected_output[threadIdx.x][0]);
+  assert(thread_keys[threadIdx.x][1] == expected_output[threadIdx.x][1]);
+  assert(thread_keys[threadIdx.x][2] == expected_output[threadIdx.x][2]);
+}
+
+// Example/test kernel: sorts two `custom_t` keys (2 threads x 1 key, indexed by
+// `threadIdx.x`) with `Sort` restricted to bits [60, 68) of the decomposed key,
+// then asserts on the device that each thread holds its expected key.
+__global__ void sort_keys_bits()
+{
+  // example-begin keys-bits
+  // Specialize `cub::BlockRadixSort` for a 1D block of 2 threads owning 1 key each
+  using block_radix_sort_t = cub::BlockRadixSort<custom_t, 2, 1>;
+
+  // Allocate shared memory for `cub::BlockRadixSort`
+  __shared__ block_radix_sort_t::TempStorage temp_storage;
+
+  // Obtain a segment of consecutive items that are blocked across threads
+  custom_t thread_keys[2][1] = //
+    {{
+       {24.2, 1ll << 61} // thread 0 keys
+     },
+     {
+       {42.4, 1ll << 60} // thread 1 keys
+     }};
+
+  const int begin_bit = sizeof(long long int) * 8 - 4; // 60
+  const int end_bit = sizeof(long long int) * 8 + 4; // 68
+
+  // Decomposition orders the bits as follows:
+  //
+  //                    <------------- fp32 -----------> <------ int64 ------>
+  // decompose(in[0]) = 01000001110000011001100110011010 00100000000000...0000
+  // decompose(in[1]) = 01000010001010011001100110011010 00010000000000...0000
+  //                    <-----------  higher bits  /  lower bits  ----------->
+  //
+  // The bit subrange `[60, 68)` specifies differentiating key bits:
+  //
+  //                    <------------- fp32 -----------> <------ int64 ------>
+  // decompose(in[0]) = xxxxxxxxxxxxxxxxxxxxxxxxxxxx1010 0010xxxxxxxxxx...xxxx
+  // decompose(in[1]) = xxxxxxxxxxxxxxxxxxxxxxxxxxxx1010 0001xxxxxxxxxx...xxxx
+  //                    <-----------  higher bits  /  lower bits  ----------->
+
+  // Collectively sort the keys
+  block_radix_sort_t(temp_storage)
+    .Sort(thread_keys[threadIdx.x], decomposer_t{}, begin_bit, end_bit);
+
+  // One expected key per thread, matching the shape of `thread_keys`
+  custom_t expected_output[2][1] = //
+    {{
+       {42.4, 1ll << 60} // thread 0 expected keys
+     },
+     {
+       {24.2, 1ll << 61} // thread 1 expected keys
+     }};
+  // example-end keys-bits
+
+  assert(thread_keys[threadIdx.x][0] == expected_output[threadIdx.x][0]);
+}
+
+// Example/test kernel: sorts six `custom_t` keys (2 threads x 3 keys, indexed
+// by `threadIdx.x`) with `SortDescending` and `decomposer_t`, then asserts on
+// the device that each thread holds its row of `expected_output`.
+__global__ void sort_keys_descending()
+{
+  // example-begin keys-descending
+  // Specialize `cub::BlockRadixSort` for a 1D block of 2 threads owning 3 keys each
+  using block_radix_sort_t = cub::BlockRadixSort<custom_t, 2, 3>;
+
+  // Allocate shared memory for `cub::BlockRadixSort`
+  __shared__ block_radix_sort_t::TempStorage temp_storage;
+
+  // Obtain a segment of consecutive items that are blocked across threads
+  custom_t thread_keys[2][3] = //
+    {{
+       // thread 0 keys
+       {+1.1, 2}, //
+       {+2.5, 1}, //
+       {-0.0, 4}, //
+     },
+     {
+       // thread 1 keys
+       {+0.0, 3}, //
+       {-2.5, 5}, //
+       {+3.7, 0}  //
+     }};
+
+  // Collectively sort the keys
+  block_radix_sort_t(temp_storage).SortDescending(thread_keys[threadIdx.x], decomposer_t{});
+
+  custom_t expected_output[2][3] = //
+    {{
+       // thread 0 expected keys
+       {+3.7, 0}, //
+       {+2.5, 1}, //
+       {+1.1, 2}, //
+     },
+     {
+       // thread 1 expected keys
+       {-0.0, 4}, //
+       {+0.0, 3}, //
+       {-2.5, 5}  //
+     }};
+  // example-end keys-descending
+
+  // Each thread checks only its own three keys
+  assert(thread_keys[threadIdx.x][0] == expected_output[threadIdx.x][0]);
+  assert(thread_keys[threadIdx.x][1] == expected_output[threadIdx.x][1]);
+  assert(thread_keys[threadIdx.x][2] == expected_output[threadIdx.x][2]);
+}
+
+// Example/test kernel: sorts two `custom_t` keys (2 threads x 1 key, indexed by
+// `threadIdx.x`) with `SortDescending` restricted to bits [60, 68) of the
+// decomposed key, then asserts on the device that each thread holds its
+// expected key.
+__global__ void sort_keys_descending_bits()
+{
+  // example-begin keys-descending-bits
+  // Specialize `cub::BlockRadixSort` for a 1D block of 2 threads owning 1 key each
+  using block_radix_sort_t = cub::BlockRadixSort<custom_t, 2, 1>;
+
+  // Allocate shared memory for `cub::BlockRadixSort`
+  __shared__ block_radix_sort_t::TempStorage temp_storage;
+
+  // Obtain a segment of consecutive items that are blocked across threads
+  custom_t thread_keys[2][1] = //
+    {{
+       {42.4, 1ll << 60} // thread 0 keys
+     },
+     {
+       {24.2, 1ll << 61} // thread 1 keys
+     }};
+
+  const int begin_bit = sizeof(long long int) * 8 - 4; // 60
+  const int end_bit = sizeof(long long int) * 8 + 4; // 68
+
+  // Decomposition orders the bits as follows:
+  //
+  //                    <------------- fp32 -----------> <------ int64 ------>
+  // decompose(in[0]) = 01000010001010011001100110011010 00010000000000...0000
+  // decompose(in[1]) = 01000001110000011001100110011010 00100000000000...0000
+  //                    <-----------  higher bits  /  lower bits  ----------->
+  //
+  // The bit subrange `[60, 68)` specifies differentiating key bits:
+  //
+  //                    <------------- fp32 -----------> <------ int64 ------>
+  // decompose(in[0]) = xxxxxxxxxxxxxxxxxxxxxxxxxxxx1010 0001xxxxxxxxxx...xxxx
+  // decompose(in[1]) = xxxxxxxxxxxxxxxxxxxxxxxxxxxx1010 0010xxxxxxxxxx...xxxx
+  //                    <-----------  higher bits  /  lower bits  ----------->
+
+  // Collectively sort the keys
+  block_radix_sort_t(temp_storage)
+    .SortDescending(thread_keys[threadIdx.x], decomposer_t{}, begin_bit, end_bit);
+
+  // One expected key per thread, matching the shape of `thread_keys`
+  custom_t expected_output[2][1] = //
+    {{
+       {24.2, 1ll << 61} // thread 0 expected keys
+     },
+     {
+       {42.4, 1ll << 60} // thread 1 expected keys
+     }};
+  // example-end keys-descending-bits
+
+  assert(thread_keys[threadIdx.x][0] == expected_output[threadIdx.x][0]);
+}
+
+// Example/test kernel: sorts six `custom_t` keys with their `int` values
+// (2 threads x 3 items, indexed by `threadIdx.x`) with `Sort` and
+// `decomposer_t`, then asserts on the device that each thread holds its rows
+// of `expected_keys` and `expected_values`.
+__global__ void sort_pairs()
+{
+  // example-begin pairs
+  // Specialize `cub::BlockRadixSort` for a 1D block of 2 threads owning 3 keys and values each
+  using block_radix_sort_t = cub::BlockRadixSort<custom_t, 2, 3, int>;
+
+  // Allocate shared memory for `cub::BlockRadixSort`
+  __shared__ block_radix_sort_t::TempStorage temp_storage;
+
+  // Obtain a segment of consecutive items that are blocked across threads
+  custom_t thread_keys[2][3] = //
+    {{
+       // thread 0 keys
+       {+2.5, 4}, //
+       {-2.5, 0}, //
+       {+1.1, 3}, //
+     },
+     {
+       // thread 1 keys
+       {+0.0, 1}, //
+       {-0.0, 2}, //
+       {+3.7, 5}  //
+     }};
+
+  int thread_values[2][3] = //
+    {{4, 0, 3},             // thread 0 values
+     {1, 2, 5}};            // thread 1 values
+
+  // Collectively sort the keys
+  block_radix_sort_t(temp_storage)
+    .Sort(thread_keys[threadIdx.x], thread_values[threadIdx.x], decomposer_t{});
+
+  custom_t expected_keys[2][3] = //
+    {{
+       // thread 0 expected keys
+       {-2.5, 0}, //
+       {+0.0, 1}, //
+       {-0.0, 2}  //
+     },
+     {
+       // thread 1 expected keys
+       {+1.1, 3}, //
+       {+2.5, 4}, //
+       {+3.7, 5}  //
+     }};
+
+  int expected_values[2][3] = //
+    {{0, 1, 2},  // thread 0 expected values
+     {3, 4, 5}}; // thread 1 expected values
+  // example-end pairs
+
+  // Each thread checks only its own three keys ...
+  assert(thread_keys[threadIdx.x][0] == expected_keys[threadIdx.x][0]);
+  assert(thread_keys[threadIdx.x][1] == expected_keys[threadIdx.x][1]);
+  assert(thread_keys[threadIdx.x][2] == expected_keys[threadIdx.x][2]);
+
+  // ... and the values that travelled with them
+  assert(thread_values[threadIdx.x][0] == expected_values[threadIdx.x][0]);
+  assert(thread_values[threadIdx.x][1] == expected_values[threadIdx.x][1]);
+  assert(thread_values[threadIdx.x][2] == expected_values[threadIdx.x][2]);
+}
+
+// Example/test kernel: sorts two `custom_t` keys with their `int` values
+// (2 threads x 1 item, indexed by `threadIdx.x`) with `Sort` restricted to
+// bits [60, 68) of the decomposed key, then asserts on the device that each
+// thread holds its expected key and value.
+__global__ void sort_pairs_bits()
+{
+  // example-begin pairs-bits
+  // Specialize `cub::BlockRadixSort` for a 1D block of 2 threads owning 1 key and value each
+  using block_radix_sort_t = cub::BlockRadixSort<custom_t, 2, 1, int>;
+
+  // Allocate shared memory for `cub::BlockRadixSort`
+  __shared__ block_radix_sort_t::TempStorage temp_storage;
+
+  // Obtain a segment of consecutive items that are blocked across threads
+  custom_t thread_keys[2][1] = //
+    {{
+       {24.2, 1ll << 61} // thread 0 keys
+     },
+     {
+       {42.4, 1ll << 60} // thread 1 keys
+     }};
+
+  int thread_values[2][1] = //
+    {{1},  // thread 0 values
+     {0}}; // thread 1 values
+
+  const int begin_bit = sizeof(long long int) * 8 - 4; // 60
+  const int end_bit = sizeof(long long int) * 8 + 4; // 68
+
+  // Decomposition orders the bits as follows:
+  //
+  //                    <------------- fp32 -----------> <------ int64 ------>
+  // decompose(in[0]) = 01000001110000011001100110011010 00100000000000...0000
+  // decompose(in[1]) = 01000010001010011001100110011010 00010000000000...0000
+  //                    <-----------  higher bits  /  lower bits  ----------->
+  //
+  // The bit subrange `[60, 68)` specifies differentiating key bits:
+  //
+  //                    <------------- fp32 -----------> <------ int64 ------>
+  // decompose(in[0]) = xxxxxxxxxxxxxxxxxxxxxxxxxxxx1010 0010xxxxxxxxxx...xxxx
+  // decompose(in[1]) = xxxxxxxxxxxxxxxxxxxxxxxxxxxx1010 0001xxxxxxxxxx...xxxx
+  //                    <-----------  higher bits  /  lower bits  ----------->
+
+  // Collectively sort the keys
+  block_radix_sort_t(temp_storage)
+    .Sort(thread_keys[threadIdx.x], thread_values[threadIdx.x], decomposer_t{}, begin_bit, end_bit);
+
+  // One expected key per thread, matching the shape of `thread_keys`
+  custom_t expected_keys[2][1] = //
+    {{
+       {42.4, 1ll << 60} // thread 0 expected keys
+     },
+     {
+       {24.2, 1ll << 61} // thread 1 expected keys
+     }};
+
+  int expected_values[2][1] = //
+    {{0},  // thread 0 expected values
+     {1}}; // thread 1 expected values
+  // example-end pairs-bits
+
+  assert(thread_keys[threadIdx.x][0] == expected_keys[threadIdx.x][0]);
+  assert(thread_values[threadIdx.x][0] == expected_values[threadIdx.x][0]);
+}
+
+// Example/test kernel: sorts six `custom_t` keys with their `int` values
+// (2 threads x 3 items, indexed by `threadIdx.x`) with `SortDescending` and
+// `decomposer_t`, then asserts on the device that each thread holds its rows
+// of `expected_keys` and `expected_values`.
+__global__ void sort_pairs_descending()
+{
+  // example-begin pairs-descending
+  // Specialize `cub::BlockRadixSort` for a 1D block of 2 threads owning 3 keys and values each
+  using block_radix_sort_t = cub::BlockRadixSort<custom_t, 2, 3, int>;
+
+  // Allocate shared memory for `cub::BlockRadixSort`
+  __shared__ block_radix_sort_t::TempStorage temp_storage;
+
+  // Obtain a segment of consecutive items that are blocked across threads
+  custom_t thread_keys[2][3] = //
+    {{
+       // thread 0 keys
+       {+1.1, 2}, //
+       {+2.5, 1}, //
+       {-0.0, 4}, //
+     },
+     {
+       // thread 1 keys
+       {+0.0, 3}, //
+       {-2.5, 5}, //
+       {+3.7, 0}  //
+     }};
+
+  int thread_values[2][3] = //
+    {{2, 1, 4},  // thread 0 values
+     {3, 5, 0}}; // thread 1 values
+
+  // Collectively sort the keys
+  block_radix_sort_t(temp_storage)
+    .SortDescending(thread_keys[threadIdx.x], thread_values[threadIdx.x], decomposer_t{});
+
+  custom_t expected_keys[2][3] = //
+    {{
+       // thread 0 expected keys
+       {+3.7, 0}, //
+       {+2.5, 1}, //
+       {+1.1, 2}, //
+     },
+     {
+       // thread 1 expected keys
+       {-0.0, 4}, //
+       {+0.0, 3}, //
+       {-2.5, 5}  //
+     }};
+
+  int expected_values[2][3] = //
+    {{0, 1, 2},  // thread 0 expected values
+     {4, 3, 5}}; // thread 1 expected values
+  // example-end pairs-descending
+
+  // Each thread checks only its own three keys ...
+  assert(thread_keys[threadIdx.x][0] == expected_keys[threadIdx.x][0]);
+  assert(thread_keys[threadIdx.x][1] == expected_keys[threadIdx.x][1]);
+  assert(thread_keys[threadIdx.x][2] == expected_keys[threadIdx.x][2]);
+
+  // ... and the values that travelled with them
+  assert(thread_values[threadIdx.x][0] == expected_values[threadIdx.x][0]);
+  assert(thread_values[threadIdx.x][1] == expected_values[threadIdx.x][1]);
+  assert(thread_values[threadIdx.x][2] == expected_values[threadIdx.x][2]);
+}
+
+// Example/test kernel: sorts two `custom_t` keys with their `int` values
+// (2 threads x 1 item, indexed by `threadIdx.x`) with `SortDescending`
+// restricted to bits [60, 68) of the decomposed key, then asserts on the
+// device that each thread holds its expected key and value.
+__global__ void sort_pairs_descending_bits()
+{
+  // example-begin pairs-descending-bits
+  // Specialize `cub::BlockRadixSort` for a 1D block of 2 threads owning 1 key and value each
+  using block_radix_sort_t = cub::BlockRadixSort<custom_t, 2, 1, int>;
+
+  // Allocate shared memory for `cub::BlockRadixSort`
+  __shared__ block_radix_sort_t::TempStorage temp_storage;
+
+  // Obtain a segment of consecutive items that are blocked across threads
+  custom_t thread_keys[2][1] = //
+    {{
+       {42.4, 1ll << 60} // thread 0 keys
+     },
+     {
+       {24.2, 1ll << 61} // thread 1 keys
+     }};
+
+  int thread_values[2][1] = //
+    {{1},  // thread 0 values
+     {0}}; // thread 1 values
+
+  const int begin_bit = sizeof(long long int) * 8 - 4; // 60
+  const int end_bit = sizeof(long long int) * 8 + 4; // 68
+
+  // Decomposition orders the bits as follows:
+  //
+  //                    <------------- fp32 -----------> <------ int64 ------>
+  // decompose(in[0]) = 01000010001010011001100110011010 00010000000000...0000
+  // decompose(in[1]) = 01000001110000011001100110011010 00100000000000...0000
+  //                    <-----------  higher bits  /  lower bits  ----------->
+  //
+  // The bit subrange `[60, 68)` specifies differentiating key bits:
+  //
+  //                    <------------- fp32 -----------> <------ int64 ------>
+  // decompose(in[0]) = xxxxxxxxxxxxxxxxxxxxxxxxxxxx1010 0001xxxxxxxxxx...xxxx
+  // decompose(in[1]) = xxxxxxxxxxxxxxxxxxxxxxxxxxxx1010 0010xxxxxxxxxx...xxxx
+  //                    <-----------  higher bits  /  lower bits  ----------->
+
+  // Collectively sort the keys
+  block_radix_sort_t(temp_storage)
+    .SortDescending(thread_keys[threadIdx.x],
+                    thread_values[threadIdx.x],
+                    decomposer_t{},
+                    begin_bit,
+                    end_bit);
+
+  // One expected key per thread, matching the shape of `thread_keys`
+  custom_t expected_output[2][1] = //
+    {{
+       {24.2, 1ll << 61} // thread 0 expected keys
+     },
+     {
+       {42.4, 1ll << 60} // thread 1 expected keys
+     }};
+
+  int expected_values[2][1] = //
+    {{0},  // thread 0 expected values
+     {1}}; // thread 1 expected values
+  // example-end pairs-descending-bits
+
+  assert(thread_keys[threadIdx.x][0] == expected_output[threadIdx.x][0]);
+  assert(thread_values[threadIdx.x][0] == expected_values[threadIdx.x][0]);
+}
+
+// Documentation example with a self-check: ascending blocked-to-striped sort of `custom_t` keys.
+__global__ void sort_keys_blocked_to_striped()
+{
+  // example-begin keys-striped
+  // `cub::BlockRadixSort` configured for a 1D block of 2 threads that own 3 keys each
+  using sorter_t = cub::BlockRadixSort<custom_t, 2, 3>;
+
+  // Shared-memory scratch space required by the collective
+  __shared__ sorter_t::TempStorage smem;
+
+  // Keys in blocked arrangement: row `t` lists the consecutive items of thread `t`
+  custom_t input_keys[2][3] = //
+    {{ // thread 0 keys
+       {+2.5, 4}, //
+       {-2.5, 0}, //
+       {+1.1, 3}, //
+     },
+     { // thread 1 keys
+       {+0.0, 1}, //
+       {-0.0, 2}, //
+       {+3.7, 5}  //
+     }};
+
+  // Sort across the whole block; every thread passes in (and gets back) its own row
+  custom_t(&my_keys)[3] = input_keys[threadIdx.x];
+  sorter_t(smem).SortBlockedToStriped(my_keys, decomposer_t{});
+
+  // Per-thread contents expected after the sort
+  custom_t sorted_keys[2][3] = //
+    {{ // thread 0 expected keys
+       {-2.5, 0}, //
+       {-0.0, 2}, //
+       {+2.5, 4}  //
+     },
+     { // thread 1 expected keys
+       {+0.0, 1}, //
+       {+1.1, 3}, //
+       {+3.7, 5}  //
+     }};
+  // example-end keys-striped
+
+  for (int item = 0; item < 3; ++item)
+  {
+    assert(my_keys[item] == sorted_keys[threadIdx.x][item]);
+  }
+}
+
+// Documentation example that doubles as a test: ascending blocked-to-striped sort of `custom_t`
+// keys comparing only the decomposed key bits [60, 68). Expects a launch with 2 threads.
+__global__ void sort_keys_blocked_to_striped_bits()
+{
+  // example-begin keys-striped-bits
+  // Specialize `cub::BlockRadixSort` for a 1D block of 2 threads owning 2 keys each
+  using block_radix_sort_t = cub::BlockRadixSort<custom_t, 2, 2>;
+
+  // Allocate shared memory for `cub::BlockRadixSort`
+  __shared__ block_radix_sort_t::TempStorage temp_storage;
+
+  // Obtain a segment of consecutive items that are blocked across threads
+  custom_t thread_keys[2][2] = //
+    {{ // thread 0 keys
+       {24.2, 1ll << 62},
+       {42.4, 1ll << 61}
+     },
+     { // thread 1 keys
+       {42.4, 1ll << 60},
+       {24.2, 1ll << 59}
+     }};
+
+  const int begin_bit = sizeof(long long int) * 8 - 4; // 60
+  const int end_bit = sizeof(long long int) * 8 + 4; // 68
+
+  // Decomposition orders the bits as follows:
+  //                    <------------- fp32 -----------> <------ int64 ------>
+  // decompose(in[0]) = 01000001110000011001100110011010 01000000000000...0000
+  // decompose(in[1]) = 01000010001010011001100110011010 00100000000000...0000
+  // decompose(in[2]) = 01000010001010011001100110011010 00010000000000...0000
+  // decompose(in[3]) = 01000001110000011001100110011010 00001000000000...0000
+  //                    <-----------  higher bits  /  lower bits  ----------->
+  //
+  // The bit subrange `[60, 68)` specifies differentiating key bits:
+  //                    <------------- fp32 -----------> <------ int64 ------>
+  // decompose(in[0]) = xxxxxxxxxxxxxxxxxxxxxxxxxxxx1010 0100xxxxxxxxxx...xxxx
+  // decompose(in[1]) = xxxxxxxxxxxxxxxxxxxxxxxxxxxx1010 0010xxxxxxxxxx...xxxx
+  // decompose(in[2]) = xxxxxxxxxxxxxxxxxxxxxxxxxxxx1010 0001xxxxxxxxxx...xxxx
+  // decompose(in[3]) = xxxxxxxxxxxxxxxxxxxxxxxxxxxx1010 0000xxxxxxxxxx...xxxx
+  //                    <-----------  higher bits  /  lower bits  ----------->
+
+  // Collectively sort the keys by the bits in `[begin_bit, end_bit)`
+  block_radix_sort_t(temp_storage)
+    .SortBlockedToStriped(thread_keys[threadIdx.x], decomposer_t{}, begin_bit, end_bit);
+
+  custom_t expected_output[2][2] = //
+    {{ // thread 0 expected keys
+       {24.2, 1ll << 59},
+       {42.4, 1ll << 61}
+     },
+     { // thread 1 expected keys
+       {42.4, 1ll << 60},
+       {24.2, 1ll << 62}
+     }};
+  // example-end keys-striped-bits
+
+  assert(thread_keys[threadIdx.x][0] == expected_output[threadIdx.x][0]);
+  assert(thread_keys[threadIdx.x][1] == expected_output[threadIdx.x][1]);
+}
+
+// Documentation example with a self-check: ascending blocked-to-striped key-value sort.
+__global__ void sort_pairs_blocked_to_striped()
+{
+  // example-begin pairs-striped
+  // `cub::BlockRadixSort` configured for a 1D block of 2 threads that own 3 key-value pairs each
+  using sorter_t = cub::BlockRadixSort<custom_t, 2, 3, int>;
+
+  // Shared-memory scratch space required by the collective
+  __shared__ sorter_t::TempStorage smem;
+
+  // Keys in blocked arrangement: row `t` lists the consecutive items of thread `t`
+  custom_t input_keys[2][3] = //
+    {{ // thread 0 keys
+       {+2.5, 4}, //
+       {-2.5, 0}, //
+       {+1.1, 3}, //
+     },
+     { // thread 1 keys
+       {+0.0, 1}, //
+       {-0.0, 2}, //
+       {+3.7, 5}  //
+     }};
+
+  int input_values[2][3] = //
+    {{4, 0, 3},  // thread 0 values
+     {1, 2, 5}}; // thread 1 values
+
+  // Sort keys together with their values; every thread passes in its own rows
+  custom_t(&my_keys)[3] = input_keys[threadIdx.x];
+  int(&my_values)[3]    = input_values[threadIdx.x];
+  sorter_t(smem).SortBlockedToStriped(my_keys, my_values, decomposer_t{});
+
+  custom_t sorted_keys[2][3] = //
+    {{ // thread 0 expected keys
+       {-2.5, 0}, //
+       {-0.0, 2}, //
+       {+2.5, 4}  //
+     },
+     { // thread 1 expected keys
+       {+0.0, 1}, //
+       {+1.1, 3}, //
+       {+3.7, 5}  //
+     }};
+
+  int sorted_values[2][3] = //
+    {{0, 2, 4},  // thread 0 values
+     {1, 3, 5}}; // thread 1 values
+  // example-end pairs-striped
+
+  for (int item = 0; item < 3; ++item)
+  {
+    assert(my_keys[item] == sorted_keys[threadIdx.x][item]);
+  }
+
+  for (int item = 0; item < 3; ++item)
+  {
+    assert(my_values[item] == sorted_values[threadIdx.x][item]);
+  }
+}
+
+// Documentation example that doubles as a test: ascending blocked-to-striped key-value sort of
+// `custom_t` keys comparing only the decomposed key bits [60, 68). Expects a launch with 2 threads.
+__global__ void sort_pairs_blocked_to_striped_bits()
+{
+  // example-begin pairs-striped-bits
+  // Specialize `cub::BlockRadixSort` for a 1D block of 2 threads owning 2 keys and values each
+  using block_radix_sort_t = cub::BlockRadixSort<custom_t, 2, 2, int>;
+
+  // Allocate shared memory for `cub::BlockRadixSort`
+  __shared__ block_radix_sort_t::TempStorage temp_storage;
+
+  // Obtain a segment of consecutive items that are blocked across threads
+  custom_t thread_keys[2][2] = //
+    {{ // thread 0 keys
+       {24.2, 1ll << 62},
+       {42.4, 1ll << 61}
+     },
+     { // thread 1 keys
+       {42.4, 1ll << 60},
+       {24.2, 1ll << 59}
+     }};
+
+  int thread_values[2][2] = //
+    {{3, 2},  // thread 0 values
+     {1, 0}}; // thread 1 values
+
+  const int begin_bit = sizeof(long long int) * 8 - 4; // 60
+  const int end_bit = sizeof(long long int) * 8 + 4; // 68
+
+  // Decomposition orders the bits as follows:
+  //                    <------------- fp32 -----------> <------ int64 ------>
+  // decompose(in[0]) = 01000001110000011001100110011010 01000000000000...0000
+  // decompose(in[1]) = 01000010001010011001100110011010 00100000000000...0000
+  // decompose(in[2]) = 01000010001010011001100110011010 00010000000000...0000
+  // decompose(in[3]) = 01000001110000011001100110011010 00001000000000...0000
+  //                    <-----------  higher bits  /  lower bits  ----------->
+  //
+  // The bit subrange `[60, 68)` specifies differentiating key bits:
+  //                    <------------- fp32 -----------> <------ int64 ------>
+  // decompose(in[0]) = xxxxxxxxxxxxxxxxxxxxxxxxxxxx1010 0100xxxxxxxxxx...xxxx
+  // decompose(in[1]) = xxxxxxxxxxxxxxxxxxxxxxxxxxxx1010 0010xxxxxxxxxx...xxxx
+  // decompose(in[2]) = xxxxxxxxxxxxxxxxxxxxxxxxxxxx1010 0001xxxxxxxxxx...xxxx
+  // decompose(in[3]) = xxxxxxxxxxxxxxxxxxxxxxxxxxxx1010 0000xxxxxxxxxx...xxxx
+  //                    <-----------  higher bits  /  lower bits  ----------->
+
+  // Collectively sort the keys and values by the key bits in `[begin_bit, end_bit)`
+  block_radix_sort_t(temp_storage)
+    .SortBlockedToStriped(thread_keys[threadIdx.x],
+                          thread_values[threadIdx.x],
+                          decomposer_t{},
+                          begin_bit,
+                          end_bit);
+
+  custom_t expected_output[2][2] = //
+    {{ // thread 0 expected keys
+       {24.2, 1ll << 59},
+       {42.4, 1ll << 61}
+     },
+     { // thread 1 expected keys
+       {42.4, 1ll << 60},
+       {24.2, 1ll << 62}
+     }};
+
+  int expected_values[2][2] = //
+    {{0, 2},  // thread 0 values
+     {1, 3}}; // thread 1 values
+  // example-end pairs-striped-bits
+
+  assert(thread_keys[threadIdx.x][0] == expected_output[threadIdx.x][0]);
+  assert(thread_keys[threadIdx.x][1] == expected_output[threadIdx.x][1]);
+
+  assert(thread_values[threadIdx.x][0] == expected_values[threadIdx.x][0]);
+  assert(thread_values[threadIdx.x][1] == expected_values[threadIdx.x][1]);
+}
+
+// Documentation example with a self-check: descending blocked-to-striped sort of `custom_t` keys.
+__global__ void sort_keys_descending_blocked_to_striped()
+{
+  // example-begin keys-striped-descending
+  // `cub::BlockRadixSort` configured for a 1D block of 2 threads that own 3 keys each
+  using sorter_t = cub::BlockRadixSort<custom_t, 2, 3>;
+
+  // Shared-memory scratch space required by the collective
+  __shared__ sorter_t::TempStorage smem;
+
+  // Keys in blocked arrangement: row `t` lists the consecutive items of thread `t`
+  custom_t input_keys[2][3] = //
+    {{ // thread 0 keys
+       {+1.1, 2}, //
+       {+2.5, 1}, //
+       {-0.0, 4}, //
+     },
+     { // thread 1 keys
+       {+0.0, 3}, //
+       {-2.5, 5}, //
+       {+3.7, 0}  //
+     }};
+
+  // Sort across the whole block; every thread passes in (and gets back) its own row
+  custom_t(&my_keys)[3] = input_keys[threadIdx.x];
+  sorter_t(smem).SortDescendingBlockedToStriped(my_keys, decomposer_t{});
+
+  // Per-thread contents expected after the sort
+  custom_t sorted_keys[2][3] = //
+    {{ // thread 0 expected keys
+       {+3.7, 0}, //
+       {+1.1, 2}, //
+       {+0.0, 3}  //
+     },
+     { // thread 1 expected keys
+       {+2.5, 1}, //
+       {-0.0, 4}, //
+       {-2.5, 5}  //
+     }};
+  // example-end keys-striped-descending
+
+  // Every thread compares the row it received with its expected row
+  for (int item = 0; item < 3; ++item)
+  {
+    assert(my_keys[item] == sorted_keys[threadIdx.x][item]);
+  }
+}
+
+// Documentation example with a self-check: descending blocked-to-striped sort over key bits [60, 68).
+__global__ void sort_keys_descending_blocked_to_striped_bits()
+{
+  // example-begin keys-striped-descending-bits
+  // `cub::BlockRadixSort` configured for a 1D block of 2 threads that own 2 keys each
+  using sorter_t = cub::BlockRadixSort<custom_t, 2, 2>;
+
+  // Shared-memory scratch space required by the collective
+  __shared__ sorter_t::TempStorage smem;
+
+  // Keys in blocked arrangement: row `t` lists the consecutive items of thread `t`
+  custom_t input_keys[2][2] = //
+    {{ // thread 0 keys
+       {24.2, 1ll << 62},
+       {42.4, 1ll << 61}
+     },
+     { // thread 1 keys
+       {42.4, 1ll << 60},
+       {24.2, 1ll << 59}
+     }};
+
+  const int begin_bit = static_cast<int>(sizeof(long long int) * 8) - 4; // 60
+  const int end_bit   = static_cast<int>(sizeof(long long int) * 8) + 4; // 68
+
+  // Bit layout after decomposition:
+  //                    <------------- fp32 -----------> <------ int64 ------>
+  // decompose(in[0]) = 01000001110000011001100110011010 01000000000000...0000
+  // decompose(in[1]) = 01000010001010011001100110011010 00100000000000...0000
+  // decompose(in[2]) = 01000010001010011001100110011010 00010000000000...0000
+  // decompose(in[3]) = 01000001110000011001100110011010 00001000000000...0000
+  //                    <-----------  higher bits  /  lower bits  ----------->
+  //
+  // Only the bit subrange `[60, 68)` differentiates the keys:
+  //                    <------------- fp32 -----------> <------ int64 ------>
+  // decompose(in[0]) = xxxxxxxxxxxxxxxxxxxxxxxxxxxx1010 0100xxxxxxxxxx...xxxx
+  // decompose(in[1]) = xxxxxxxxxxxxxxxxxxxxxxxxxxxx1010 0010xxxxxxxxxx...xxxx
+  // decompose(in[2]) = xxxxxxxxxxxxxxxxxxxxxxxxxxxx1010 0001xxxxxxxxxx...xxxx
+  // decompose(in[3]) = xxxxxxxxxxxxxxxxxxxxxxxxxxxx1010 0000xxxxxxxxxx...xxxx
+  //                    <-----------  higher bits  /  lower bits  ----------->
+
+  // Sort across the whole block, restricted to the selected bit range
+  custom_t(&my_keys)[2] = input_keys[threadIdx.x];
+  sorter_t(smem).SortDescendingBlockedToStriped(my_keys, decomposer_t{}, begin_bit, end_bit);
+
+  custom_t sorted_keys[2][2] = //
+    {{ // thread 0 expected keys
+       {24.2, 1ll << 62},
+       {42.4, 1ll << 60}
+     },
+     { // thread 1 expected keys
+       {42.4, 1ll << 61},
+       {24.2, 1ll << 59}
+     }};
+  // example-end keys-striped-descending-bits
+
+  // Every thread compares the row it received with its expected row
+  assert(my_keys[0] == sorted_keys[threadIdx.x][0]);
+  assert(my_keys[1] == sorted_keys[threadIdx.x][1]);
+}
+
+// Documentation example with a self-check: descending blocked-to-striped key-value sort.
+__global__ void sort_pairs_descending_blocked_to_striped()
+{
+  // example-begin pairs-striped-descending
+  // `cub::BlockRadixSort` configured for a 1D block of 2 threads that own 3 key-value pairs each
+  using sorter_t = cub::BlockRadixSort<custom_t, 2, 3, int>;
+
+  // Shared-memory scratch space required by the collective
+  __shared__ sorter_t::TempStorage smem;
+
+  // Keys in blocked arrangement: row `t` lists the consecutive items of thread `t`
+  custom_t input_keys[2][3] = //
+    {{ // thread 0 keys
+       {+1.1, 2}, //
+       {+2.5, 1}, //
+       {-0.0, 4}, //
+     },
+     { // thread 1 keys
+       {+0.0, 3}, //
+       {-2.5, 5}, //
+       {+3.7, 0}  //
+     }};
+
+  int input_values[2][3] = //
+    {{2, 1, 4},  // thread 0 values
+     {3, 5, 0}}; // thread 1 values
+
+  // Sort keys together with their values; every thread passes in its own rows
+  custom_t(&my_keys)[3] = input_keys[threadIdx.x];
+  int(&my_values)[3]    = input_values[threadIdx.x];
+  sorter_t(smem).SortDescendingBlockedToStriped(my_keys, my_values, decomposer_t{});
+
+  // Per-thread contents expected after the sort
+  custom_t sorted_keys[2][3] = //
+    {{ // thread 0 expected keys
+       {+3.7, 0}, //
+       {+1.1, 2}, //
+       {+0.0, 3}  //
+     },
+     { // thread 1 expected keys
+       {+2.5, 1}, //
+       {-0.0, 4}, //
+       {-2.5, 5}  //
+     }};
+
+  int sorted_values[2][3] = //
+    {{0, 2, 3},  // thread 0 values
+     {1, 4, 5}}; // thread 1 values
+  // example-end pairs-striped-descending
+
+  // Keys are checked first, then the values that travelled with them
+  for (int item = 0; item < 3; ++item)
+  {
+    assert(my_keys[item] == sorted_keys[threadIdx.x][item]);
+  }
+
+  for (int item = 0; item < 3; ++item)
+  {
+    assert(my_values[item] == sorted_values[threadIdx.x][item]);
+  }
+}
+
+// Documentation example with a self-check: descending blocked-to-striped pair sort over key bits [60, 68).
+__global__ void sort_pairs_descending_blocked_to_striped_bits()
+{
+  // example-begin pairs-striped-descending-bits
+  // `cub::BlockRadixSort` configured for a 1D block of 2 threads that own 2 key-value pairs each
+  using sorter_t = cub::BlockRadixSort<custom_t, 2, 2, int>;
+
+  // Shared-memory scratch space required by the collective
+  __shared__ sorter_t::TempStorage smem;
+
+  // Keys in blocked arrangement: row `t` lists the consecutive items of thread `t`
+  custom_t input_keys[2][2] = //
+    {{ // thread 0 keys
+       {24.2, 1ll << 62},
+       {42.4, 1ll << 61}
+     },
+     { // thread 1 keys
+       {42.4, 1ll << 60},
+       {24.2, 1ll << 59}
+     }};
+
+  int input_values[2][2] = //
+    {{3, 2},  // thread 0 values
+     {1, 0}}; // thread 1 values
+
+  const int begin_bit = static_cast<int>(sizeof(long long int) * 8) - 4; // 60
+  const int end_bit   = static_cast<int>(sizeof(long long int) * 8) + 4; // 68
+
+  // Bit layout after decomposition:
+  //
+  //                    <------------- fp32 -----------> <------ int64 ------>
+  // decompose(in[0]) = 01000001110000011001100110011010 01000000000000...0000
+  // decompose(in[1]) = 01000010001010011001100110011010 00100000000000...0000
+  // decompose(in[2]) = 01000010001010011001100110011010 00010000000000...0000
+  // decompose(in[3]) = 01000001110000011001100110011010 00001000000000...0000
+  //                    <-----------  higher bits  /  lower bits  ----------->
+  //
+  // Only the bit subrange `[60, 68)` differentiates the keys:
+  //
+  //                    <------------- fp32 -----------> <------ int64 ------>
+  // decompose(in[0]) = xxxxxxxxxxxxxxxxxxxxxxxxxxxx1010 0100xxxxxxxxxx...xxxx
+  // decompose(in[1]) = xxxxxxxxxxxxxxxxxxxxxxxxxxxx1010 0010xxxxxxxxxx...xxxx
+  // decompose(in[2]) = xxxxxxxxxxxxxxxxxxxxxxxxxxxx1010 0001xxxxxxxxxx...xxxx
+  // decompose(in[3]) = xxxxxxxxxxxxxxxxxxxxxxxxxxxx1010 0000xxxxxxxxxx...xxxx
+  //                    <-----------  higher bits  /  lower bits  ----------->
+
+  // Sort keys together with their values, restricted to the selected bit range
+  custom_t(&my_keys)[2] = input_keys[threadIdx.x];
+  int(&my_values)[2]    = input_values[threadIdx.x];
+  sorter_t(smem).SortDescendingBlockedToStriped(
+    my_keys, my_values, decomposer_t{}, begin_bit, end_bit);
+
+  custom_t sorted_keys[2][2] = //
+    {{ // thread 0 expected keys
+       {24.2, 1ll << 62},
+       {42.4, 1ll << 60}
+     },
+     { // thread 1 expected keys
+       {42.4, 1ll << 61},
+       {24.2, 1ll << 59}
+     }};
+
+  int sorted_values[2][2] = //
+    {{3, 1},  // thread 0 values
+     {2, 0}}; // thread 1 values
+  // example-end pairs-striped-descending-bits
+
+  // Keys are checked first, then the values that travelled with them
+  assert(my_keys[0] == sorted_keys[threadIdx.x][0]);
+  assert(my_keys[1] == sorted_keys[threadIdx.x][1]);
+
+  assert(my_values[0] == sorted_values[threadIdx.x][0]);
+  assert(my_values[1] == sorted_values[threadIdx.x][1]);
+}
+
+TEST_CASE("Block radix sort works in some corner cases", "[radix][sort][block]")
+{
+  sort_keys<<<1, 2>>>();                           // every example kernel runs as 1 block of 2 threads
+  REQUIRE(cudaGetLastError() == cudaSuccess);      // the launch itself was accepted
+  REQUIRE(cudaDeviceSynchronize() == cudaSuccess); // the kernel ran to completion without error
+
+  sort_keys_bits<<<1, 2>>>();
+  REQUIRE(cudaGetLastError() == cudaSuccess);
+  REQUIRE(cudaDeviceSynchronize() == cudaSuccess);
+
+  sort_keys_descending<<<1, 2>>>();
+  REQUIRE(cudaGetLastError() == cudaSuccess);
+  REQUIRE(cudaDeviceSynchronize() == cudaSuccess);
+
+  sort_keys_descending_bits<<<1, 2>>>();
+  REQUIRE(cudaGetLastError() == cudaSuccess);
+  REQUIRE(cudaDeviceSynchronize() == cudaSuccess);
+
+  sort_pairs<<<1, 2>>>();
+  REQUIRE(cudaGetLastError() == cudaSuccess);
+  REQUIRE(cudaDeviceSynchronize() == cudaSuccess);
+
+  sort_pairs_bits<<<1, 2>>>();
+  REQUIRE(cudaGetLastError() == cudaSuccess);
+  REQUIRE(cudaDeviceSynchronize() == cudaSuccess);
+
+  sort_pairs_descending<<<1, 2>>>();
+  REQUIRE(cudaGetLastError() == cudaSuccess);
+  REQUIRE(cudaDeviceSynchronize() == cudaSuccess);
+
+  sort_pairs_descending_bits<<<1, 2>>>();
+  REQUIRE(cudaGetLastError() == cudaSuccess);
+  REQUIRE(cudaDeviceSynchronize() == cudaSuccess);
+
+  sort_keys_blocked_to_striped<<<1, 2>>>();
+  REQUIRE(cudaGetLastError() == cudaSuccess);
+  REQUIRE(cudaDeviceSynchronize() == cudaSuccess);
+
+  sort_keys_blocked_to_striped_bits<<<1, 2>>>();
+  REQUIRE(cudaGetLastError() == cudaSuccess);
+  REQUIRE(cudaDeviceSynchronize() == cudaSuccess);
+
+  sort_pairs_blocked_to_striped<<<1, 2>>>();
+  REQUIRE(cudaGetLastError() == cudaSuccess);
+  REQUIRE(cudaDeviceSynchronize() == cudaSuccess);
+
+  sort_pairs_blocked_to_striped_bits<<<1, 2>>>();
+  REQUIRE(cudaGetLastError() == cudaSuccess);
+  REQUIRE(cudaDeviceSynchronize() == cudaSuccess);
+
+  sort_keys_descending_blocked_to_striped<<<1, 2>>>();
+  REQUIRE(cudaGetLastError() == cudaSuccess);
+  REQUIRE(cudaDeviceSynchronize() == cudaSuccess);
+
+  sort_keys_descending_blocked_to_striped_bits<<<1, 2>>>();
+  REQUIRE(cudaGetLastError() == cudaSuccess);
+  REQUIRE(cudaDeviceSynchronize() == cudaSuccess);
+
+  sort_pairs_descending_blocked_to_striped<<<1, 2>>>();
+  REQUIRE(cudaGetLastError() == cudaSuccess);
+  REQUIRE(cudaDeviceSynchronize() == cudaSuccess);
+
+  sort_pairs_descending_blocked_to_striped_bits<<<1, 2>>>();
+  REQUIRE(cudaGetLastError() == cudaSuccess);
+  REQUIRE(cudaDeviceSynchronize() == cudaSuccess);
+}
diff --git a/include/cub/test/catch2_test_block_reduce.cu b/include/cub/test/catch2_test_block_reduce.cu
new file mode 100644
index 0000000..f2ce06d
--- /dev/null
+++ b/include/cub/test/catch2_test_block_reduce.cu
@@ -0,0 +1,362 @@
+/******************************************************************************
+ * Copyright (c) 2011-2022, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#include <cub/block/block_reduce.cuh>
+
+#include <thrust/host_vector.h>
+
+#include <numeric>
+
+// Has to go after all cub headers. Otherwise, this test won't catch unused
+// variables in cub kernels.
+#include "catch2_test_helper.h"
+
+template <cub::BlockReduceAlgorithm Algorithm,
+          int ItemsPerThread,
+          int BlockDimX,
+          int BlockDimY,
+          int BlockDimZ,
+          class T,
+          class ActionT>
+__global__ void block_reduce_kernel(T *in, T *out, int valid_items, ActionT action)
+{
+  // Each thread loads ItemsPerThread consecutive elements of `in` (slots at or beyond
+  // `valid_items` become `T()`), `action` reduces them, and thread 0 stores the result in `out[0]`.
+  using reducer_t = cub::BlockReduce<T, BlockDimX, Algorithm, BlockDimY, BlockDimZ>;
+
+  __shared__ typename reducer_t::TempStorage smem;
+
+  const int linear_tid = static_cast<int>(cub::RowMajorTid(BlockDimX, BlockDimY, BlockDimZ));
+  const int first_idx  = linear_tid * ItemsPerThread;
+
+  T items[ItemsPerThread];
+  for (int slot = 0, src = first_idx; slot < ItemsPerThread; ++slot, ++src)
+  {
+    items[slot] = (src < valid_items) ? in[src] : T();
+  }
+  __syncthreads();
+
+  reducer_t reducer(smem);
+  T block_result = action(reducer, items, valid_items);
+
+  // Only the value held by the thread with linear id 0 is written out.
+  if (linear_tid != 0)
+  {
+    return;
+  }
+  out[0] = block_result;
+}
+
+template <cub::BlockReduceAlgorithm Algorithm,
+          int ItemsPerThread,
+          int BlockDimX,
+          int BlockDimY,
+          int BlockDimZ,
+          class T,
+          class ActionT>
+void block_reduce(thrust::device_vector<T> &in, thrust::device_vector<T> &out, ActionT action)
+{
+  // Launches one block of BlockDimX x BlockDimY x BlockDimZ threads; `in.size()` is the valid item count.
+  T *d_in_ptr  = thrust::raw_pointer_cast(in.data());
+  T *d_out_ptr = thrust::raw_pointer_cast(out.data());
+  const dim3 threads(BlockDimX, BlockDimY, BlockDimZ);
+
+  block_reduce_kernel<Algorithm, ItemsPerThread, BlockDimX, BlockDimY, BlockDimZ, T, ActionT>
+    <<<1, threads>>>(d_in_ptr, d_out_ptr, static_cast<int>(in.size()), action);
+
+  REQUIRE(cudaPeekAtLastError() == cudaSuccess);
+  REQUIRE(cudaDeviceSynchronize() == cudaSuccess);
+}
+
+// Partial-tile sum: passes this thread's first item together with `valid_items` to `reduce.Sum`.
+struct sum_partial_tile_op_t
+{
+  template <int N, class ReducerT, class ValueT>
+  __device__ ValueT operator()(ReducerT &reduce, ValueT (&thread_data)[N], int valid_items) const
+  {
+    ValueT &head = thread_data[0];
+    return reduce.Sum(head, valid_items);
+  }
+};
+
+// Full-tile sum: hands the whole per-thread array to `reduce.Sum`; `valid_items` is unused.
+struct sum_full_tile_op_t
+{
+  template <int N, class ReducerT, class ValueT>
+  __device__ ValueT operator()(ReducerT &reduce, ValueT (&thread_data)[N], int /* unused */) const
+  {
+    ValueT(&all_items)[N] = thread_data;
+    return reduce.Sum(all_items);
+  }
+};
+
+struct max_partial_tile_op_t // partial-tile maximum: first item + `valid_items` go to `reduce.Reduce`
+{
+  template <int N, class ReducerT, class ValueT>
+  __device__ ValueT operator()(ReducerT &reduce, ValueT (&thread_data)[N], int valid_items) const
+  {
+    return reduce.Reduce(thread_data[0], cub::Max(), valid_items);
+  }
+};
+
+// Full-tile maximum: reduces the whole per-thread array with `cub::Max`; `valid_items` is unused.
+struct max_full_tile_op_t
+{
+  template <int N, class ReducerT, class ValueT>
+  __device__ ValueT operator()(ReducerT &reduce, ValueT (&thread_data)[N], int /* unused */) const
+  {
+    cub::Max max_op{};
+    return reduce.Reduce(thread_data, max_op);
+  }
+};
+
+using types = c2h::type_list<std::uint8_t, std::uint16_t, std::int32_t, std::int64_t, float, double>;
+using vec_types = c2h::type_list<ulonglong4, uchar3, short2>; // element types of the "vec types" test
+
+// %PARAM% TEST_DIM_X dimx 7:32:65:128
+// %PARAM% TEST_DIM_YZ dimyz 1:2
+
+using block_dim_xs           = c2h::enum_type_list<int, TEST_DIM_X>;  // block extent along x
+using block_dim_yzs          = c2h::enum_type_list<int, TEST_DIM_YZ>; // one extent used for both y and z
+using items_per_thread       = c2h::enum_type_list<int, 1, 4>;        // used by the full-tile tests
+using single_item_per_thread = c2h::enum_type_list<int, 1>;           // used by the partial-tile tests
+using algorithm =
+  c2h::enum_type_list<cub::BlockReduceAlgorithm,
+                      cub::BlockReduceAlgorithm::BLOCK_REDUCE_RAKING,
+                      cub::BlockReduceAlgorithm::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY,
+                      cub::BlockReduceAlgorithm::BLOCK_REDUCE_WARP_REDUCTIONS>;
+
+template <class TestType>
+struct params_t
+{
+  // Unpacks one test combination: <type, items per thread, block dim x, block dim y (= z), algorithm>.
+  using type = c2h::get<0, TestType>;
+
+  static constexpr cub::BlockReduceAlgorithm algorithm = c2h::get<4, TestType>::value;
+  static constexpr int items_per_thread = c2h::get<1, TestType>::value;
+  static constexpr int block_dim_x      = c2h::get<2, TestType>::value;
+  static constexpr int block_dim_y      = c2h::get<3, TestType>::value;
+  static constexpr int block_dim_z      = block_dim_y;
+  static constexpr int tile_size = block_dim_x * block_dim_y * block_dim_z * items_per_thread;
+};
+
+CUB_TEST("Block reduce works with sum",
+         "[reduce][block]",
+         types,
+         items_per_thread,
+         block_dim_xs,
+         block_dim_yzs,
+         algorithm)
+{
+  using params = params_t<TestType>;
+  using type   = typename params::type;
+
+  // Input sized to exactly one full tile; a single output slot receives the block aggregate.
+  thrust::device_vector<type> d_in(params::tile_size);
+  thrust::device_vector<type> d_out(1);
+  c2h::gen(CUB_SEED(10), d_in);
+
+  thrust::host_vector<type> h_in = d_in;
+  type expected_sum              = h_in[0];
+  for (auto it = h_in.begin() + 1; it != h_in.end(); ++it)
+  {
+    expected_sum = static_cast<type>(expected_sum + *it);
+  }
+  thrust::host_vector<type> h_reference(1, expected_sum);
+
+  block_reduce<params::algorithm, params::items_per_thread, params::block_dim_x,
+               params::block_dim_y, params::block_dim_z, type>(d_in, d_out, sum_full_tile_op_t{});
+
+  REQUIRE_APPROX_EQ(h_reference, d_out);
+}
+
+CUB_TEST("Block reduce works with sum in partial tiles",
+         "[reduce][block]",
+         types,
+         single_item_per_thread,
+         block_dim_xs,
+         block_dim_yzs,
+         algorithm)
+{
+  using params = params_t<TestType>;
+  using type   = typename params::type;
+
+  // The input length comes from `random(1, params::tile_size)`, so the tile may be only partly filled.
+  thrust::device_vector<type> d_out(1);
+  thrust::device_vector<type> d_in(GENERATE_COPY(take(2, random(1, params::tile_size))));
+  c2h::gen(CUB_SEED(10), d_in);
+
+  // Host reference folded in `type`; kept in a `thrust::host_vector` like the sibling tests.
+  thrust::host_vector<type> h_in = d_in;
+  thrust::host_vector<type> h_reference(
+    1,
+    std::accumulate(h_in.begin() + 1, h_in.end(), h_in[0], [](const type &lhs, const type &rhs) {
+      return static_cast<type>(lhs + rhs);
+    }));
+
+  block_reduce<params::algorithm, params::items_per_thread, params::block_dim_x,
+               params::block_dim_y, params::block_dim_z, type>(d_in, d_out, sum_partial_tile_op_t{});
+
+  REQUIRE_APPROX_EQ(h_reference, d_out);
+}
+
+CUB_TEST("Block reduce works with custom op",
+         "[reduce][block]",
+         types,
+         items_per_thread,
+         block_dim_xs,
+         block_dim_yzs,
+         algorithm)
+{
+  using params = params_t<TestType>;
+  using type   = typename params::type;
+
+  // Input sized to exactly one full tile; a single output slot receives the block aggregate.
+  thrust::device_vector<type> d_in(params::tile_size);
+  thrust::device_vector<type> d_out(1);
+  c2h::gen(CUB_SEED(10), d_in);
+
+  // Host reference: running maximum over the copied input.
+  thrust::host_vector<type> h_in = d_in;
+  type expected_max              = h_in[0];
+  for (auto it = h_in.begin() + 1; it != h_in.end(); ++it)
+  {
+    expected_max = std::max(expected_max, *it);
+  }
+  thrust::host_vector<type> h_reference(1, expected_max);
+
+  // Device result comes from the full-tile `cub::Max` reduction.
+  block_reduce<params::algorithm, params::items_per_thread, params::block_dim_x,
+               params::block_dim_y, params::block_dim_z, type>(d_in, d_out, max_full_tile_op_t{});
+
+  REQUIRE_APPROX_EQ(h_reference, d_out);
+}
+
+CUB_TEST("Block reduce works with custom op in partial tiles",
+         "[reduce][block]",
+         types,
+         single_item_per_thread,
+         block_dim_xs,
+         block_dim_yzs,
+         algorithm)
+{
+  using params = params_t<TestType>;
+  using type   = typename params::type;
+
+  // The input length comes from `random(1, params::tile_size)`, so the tile may be only partly filled.
+  thrust::device_vector<type> d_out(1);
+  thrust::device_vector<type> d_in(GENERATE_COPY(take(2, random(1, params::tile_size))));
+  c2h::gen(CUB_SEED(10), d_in);
+
+  // Host reference: running maximum over the copied input.
+  thrust::host_vector<type> h_in = d_in;
+  type expected_max              = h_in[0];
+  for (auto it = h_in.begin() + 1; it != h_in.end(); ++it)
+  {
+    expected_max = std::max(expected_max, *it);
+  }
+  thrust::host_vector<type> h_reference(1, expected_max);
+
+  // Device result comes from the partial-tile `cub::Max` reduction.
+  block_reduce<params::algorithm, params::items_per_thread, params::block_dim_x,
+               params::block_dim_y, params::block_dim_z, type>(d_in, d_out, max_partial_tile_op_t{});
+
+  REQUIRE_APPROX_EQ(h_reference, d_out);
+}
+
+CUB_TEST("Block reduce works with custom types",
+         "[reduce][block]",
+         block_dim_xs,
+         block_dim_yzs,
+         algorithm)
+{
+  using type = c2h::custom_type_t<c2h::accumulateable_t, c2h::equal_comparable_t>;
+
+  // Launch shape taken from the test combination; y and z share one extent.
+  constexpr int items_per_thread = 1;
+  constexpr int block_dim_x      = c2h::get<0, TestType>::value;
+  constexpr int block_dim_y      = c2h::get<1, TestType>::value;
+  constexpr int block_dim_z      = block_dim_y;
+  constexpr int tile_size        = items_per_thread * block_dim_x * block_dim_y * block_dim_z;
+  constexpr cub::BlockReduceAlgorithm algorithm = c2h::get<2, TestType>::value;
+
+  // The input length comes from `random(1, tile_size)`, so the tile may be only partly filled.
+  thrust::device_vector<type> d_out(1);
+  thrust::device_vector<type> d_in(GENERATE_COPY(take(2, random(1, tile_size))));
+  c2h::gen(CUB_SEED(10), d_in);
+
+  // Host reference: left-to-right sum over the copied input.
+  thrust::host_vector<type> h_in = d_in;
+  type expected_sum              = h_in[0];
+  for (auto it = h_in.begin() + 1; it != h_in.end(); ++it)
+  {
+    expected_sum = static_cast<type>(expected_sum + *it);
+  }
+  thrust::host_vector<type> h_reference(1, expected_sum);
+
+  block_reduce<algorithm, items_per_thread, block_dim_x, block_dim_y, block_dim_z, type>(
+    d_in, d_out, sum_partial_tile_op_t{});
+
+  REQUIRE(h_reference == d_out);
+}
+
+CUB_TEST("Block reduce works with vec types",
+         "[reduce][block]",
+         vec_types,
+         block_dim_xs,
+         block_dim_yzs,
+         algorithm)
+{
+  using type = c2h::get<0, TestType>;
+
+  constexpr int items_per_thread = 1;
+  constexpr int block_dim_x      = c2h::get<1, TestType>::value;
+  constexpr int block_dim_y      = c2h::get<2, TestType>::value;
+  constexpr int block_dim_z      = block_dim_y;
+  constexpr int tile_size        = items_per_thread * block_dim_x * block_dim_y * block_dim_z;
+  constexpr cub::BlockReduceAlgorithm algorithm = c2h::get<3, TestType>::value;
+
+  // The input length comes from `random(1, tile_size)`, so the tile may be only partly filled.
+  thrust::device_vector<type> d_out(1);
+  thrust::device_vector<type> d_in(GENERATE_COPY(take(2, random(1, tile_size))));
+  c2h::gen(CUB_SEED(10), d_in);
+
+  // Host reference: left-to-right sum over the copied input.
+  thrust::host_vector<type> h_in = d_in;
+  type expected_sum              = h_in[0];
+  for (auto it = h_in.begin() + 1; it != h_in.end(); ++it)
+  {
+    expected_sum = static_cast<type>(expected_sum + *it);
+  }
+  thrust::host_vector<type> h_reference(1, expected_sum);
+
+  block_reduce<algorithm, items_per_thread, block_dim_x, block_dim_y, block_dim_z, type>(
+    d_in, d_out, sum_partial_tile_op_t{});
+
+  REQUIRE(h_reference == d_out);
+}
+
diff --git a/include/cub/test/catch2_test_block_run_length_decode.cu b/include/cub/test/catch2_test_block_run_length_decode.cu
new file mode 100644
index 0000000..3bfc067
--- /dev/null
+++ b/include/cub/test/catch2_test_block_run_length_decode.cu
@@ -0,0 +1,638 @@
+/******************************************************************************
+ * Copyright (c) 2011-2022, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#include <cub/block/block_load.cuh>
+#include <cub/block/block_run_length_decode.cuh>
+#include <cub/block/block_store.cuh>
+#include <cub/device/device_scan.cuh>
+#include <cub/iterator/counting_input_iterator.cuh>
+#include <cub/iterator/transform_input_iterator.cuh>
+#include <cub/util_allocator.cuh>
+
+// Has to go after all cub headers. Otherwise, this test won't catch unused
+// variables in cub kernels.
+#include "catch2_test_helper.h"
+
+/******************************************************************************
+ * HELPER CLASS FOR RUN-LENGTH DECODING TESTS
+ ******************************************************************************/
+
+/**
+ * \brief Class template to facilitate testing the BlockRunLengthDecode algorithm for all its
+ * template parameter specialisations.
+ *
+ * \tparam ItemItT Iterator type providing the items (one per run) being run-length decoded
+ * \tparam RunLengthsItT Iterator type providing the runs' lengths
+ * \tparam RUNS_PER_THREAD The number of runs assigned to each thread
+ * \tparam DECODED_ITEMS_PER_THREAD The number of run-length decoded items that each thread decodes
+ * \tparam TEST_RELATIVE_OFFSETS_ Whether to also retrieve each decoded item's relative offset
+ *         within its run
+ * \tparam TEST_RUN_OFFSETS_ Whether to pass in each run's offset instead of each run's length
+ * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension
+ * \tparam BLOCK_DIM_Y The thread block length in threads along the Y dimension
+ * \tparam BLOCK_DIM_Z The thread block length in threads along the Z dimension
+ */
+template <typename ItemItT,
+          typename RunLengthsItT,
+          int RUNS_PER_THREAD,
+          int DECODED_ITEMS_PER_THREAD,
+          bool TEST_RELATIVE_OFFSETS_,
+          bool TEST_RUN_OFFSETS_,
+          int BLOCK_DIM_X,
+          int BLOCK_DIM_Y = 1,
+          int BLOCK_DIM_Z = 1>
+class AgentTestBlockRunLengthDecode
+{
+public:
+  constexpr static uint32_t BLOCK_THREADS     = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z; // threads per block
+  constexpr static uint32_t RUNS_PER_BLOCK    = RUNS_PER_THREAD * BLOCK_THREADS; // runs per block-wide tile
+  constexpr static bool TEST_RELATIVE_OFFSETS = TEST_RELATIVE_OFFSETS_; // public copy of the template flag
+
+private:
+  using RunItemT   = cub::detail::value_t<ItemItT>; // value type of ItemItT
+  using RunLengthT = cub::detail::value_t<RunLengthsItT>; // value type of RunLengthsItT
+  // Block-wide scan used by the Int2Type<true> overload below to turn run lengths into run offsets.
+  using BlockRunOffsetScanT =
+    cub::BlockScan<RunLengthT, BLOCK_DIM_X, cub::BLOCK_SCAN_RAKING, BLOCK_DIM_Y, BLOCK_DIM_Z>;
+  // NOTE(review): BLOCK_DIM_Y/BLOCK_DIM_Z are not forwarded here, unlike the other collectives - confirm.
+  using BlockRunLengthDecodeT =
+    cub::BlockRunLengthDecode<RunItemT, BLOCK_DIM_X, RUNS_PER_THREAD, DECODED_ITEMS_PER_THREAD>;
+
+  using BlockLoadRunItemT = cub::BlockLoad<RunItemT,
+                                           BLOCK_DIM_X,
+                                           RUNS_PER_THREAD,
+                                           cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                                           BLOCK_DIM_Y,
+                                           BLOCK_DIM_Z>;
+
+  using BlockLoadRunLengthsT = cub::BlockLoad<RunLengthT,
+                                              BLOCK_DIM_X,
+                                              RUNS_PER_THREAD,
+                                              cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                                              BLOCK_DIM_Y,
+                                              BLOCK_DIM_Z>;
+
+  using BlockStoreDecodedItemT = cub::BlockStore<RunItemT,
+                                                 BLOCK_DIM_X,
+                                                 DECODED_ITEMS_PER_THREAD,
+                                                 cub::BLOCK_STORE_WARP_TRANSPOSE,
+                                                 BLOCK_DIM_Y,
+                                                 BLOCK_DIM_Z>;
+
+  using BlockStoreRelativeOffsetT = cub::BlockStore<RunLengthT,
+                                                    BLOCK_DIM_X,
+                                                    DECODED_ITEMS_PER_THREAD,
+                                                    cub::BLOCK_STORE_WARP_TRANSPOSE,
+                                                    BLOCK_DIM_Y,
+                                                    BLOCK_DIM_Z>;
+  // Overload taken when TEST_RUN_OFFSETS_ is true: scans the run lengths into offsets and decodes from those.
+  __device__ __forceinline__ BlockRunLengthDecodeT
+  InitBlockRunLengthDecode(RunItemT (&unique_items)[RUNS_PER_THREAD],
+                           RunLengthT (&run_lengths)[RUNS_PER_THREAD],
+                           RunLengthT &decoded_size,
+                           cub::Int2Type<true> /*test_run_offsets*/)
+  {
+    RunLengthT run_offsets[RUNS_PER_THREAD];
+    BlockRunOffsetScanT(temp_storage.run_offsets_scan_storage)
+      .ExclusiveSum(run_lengths, run_offsets, decoded_size);
+
+    // Ensure temporary shared memory can be repurposed
+    cub::CTA_SYNC();
+
+    // Construct BlockRunLengthDecode and initialize with the run offsets
+    return BlockRunLengthDecodeT(temp_storage.decode.run_length_decode_storage,
+                                 unique_items,
+                                 run_offsets);
+  }
+  // Overload taken when TEST_RUN_OFFSETS_ is false: hands the run lengths straight to the decoder.
+  __device__ __forceinline__ BlockRunLengthDecodeT
+  InitBlockRunLengthDecode(RunItemT (&unique_items)[RUNS_PER_THREAD],
+                           RunLengthT (&run_lengths)[RUNS_PER_THREAD],
+                           RunLengthT &decoded_size,
+                           cub::Int2Type<false> /*test_run_offsets*/)
+  {
+    // Construct BlockRunLengthDecode and initialize with the run lengths
+    return BlockRunLengthDecodeT(temp_storage.decode.run_length_decode_storage,
+                                 unique_items,
+                                 run_lengths,
+                                 decoded_size);
+  }
+  // Loads one tile of unique items and run lengths into per-thread arrays; partial tiles use the size-limited Load.
+  __device__ __forceinline__ void LoadRuns(ItemItT d_block_unique_items,
+                                           RunLengthsItT d_block_run_lengths,
+                                           RunItemT (&unique_items)[RUNS_PER_THREAD],
+                                           RunLengthT (&run_lengths)[RUNS_PER_THREAD],
+                                           size_t num_valid_items)
+  {
+    if (num_valid_items < RUNS_PER_BLOCK)
+    {
+      BlockLoadRunItemT(temp_storage.load_uniques_storage)
+        .Load(d_block_unique_items, unique_items, num_valid_items);
+    }
+    else
+    {
+      BlockLoadRunItemT(temp_storage.load_uniques_storage).Load(d_block_unique_items, unique_items);
+    }
+
+    // Ensure BlockLoad's temporary shared memory can be repurposed
+    cub::CTA_SYNC();
+
+    // Load this block's tile of run lengths; a partial tile also passes 0 (presumably the out-of-range fill value)
+    if (num_valid_items < RUNS_PER_BLOCK)
+      BlockLoadRunLengthsT(temp_storage.load_run_lengths_storage)
+        .Load(d_block_run_lengths, run_lengths, num_valid_items, static_cast<RunLengthT>(0));
+    else
+      BlockLoadRunLengthsT(temp_storage.load_run_lengths_storage)
+        .Load(d_block_run_lengths, run_lengths);
+
+    // Ensure temporary shared memory can be repurposed
+    cub::CTA_SYNC();
+  }
+
+public:
+  union TempStorage // members overlap in memory; the code above calls CTA_SYNC between their uses
+  {
+    typename BlockLoadRunItemT::TempStorage load_uniques_storage;
+    typename BlockLoadRunLengthsT::TempStorage load_run_lengths_storage;
+    cub::detail::
+      conditional_t<TEST_RUN_OFFSETS_, typename BlockRunOffsetScanT::TempStorage, cub::NullType>
+        run_offsets_scan_storage;
+    struct // the three decode-phase storages sit side by side and do not overlap each other
+    {
+      typename BlockRunLengthDecodeT::TempStorage run_length_decode_storage;
+      typename BlockStoreDecodedItemT::TempStorage store_decoded_runs_storage;
+      typename BlockStoreRelativeOffsetT::TempStorage store_relative_offsets;
+    } decode;
+  };
+
+  TempStorage &temp_storage; // storage supplied by the caller and held by reference
+
+  __device__ __forceinline__ AgentTestBlockRunLengthDecode(TempStorage &temp_storage)
+      : temp_storage(temp_storage)
+  {}
+
+  /**
+   * \brief Loads the given block (or tile) of runs, and computes their "decompressed" (run-length
+   * decoded) size.
+   */
+  __device__ __forceinline__ uint32_t GetDecodedSize(ItemItT d_block_unique_items,
+                                                     RunLengthsItT d_block_run_lengths,
+                                                     size_t num_valid_runs)
+  {
+    // Load this block's tile of encoded runs
+    RunItemT unique_items[RUNS_PER_THREAD];
+    RunLengthT run_lengths[RUNS_PER_THREAD];
+    LoadRuns(d_block_unique_items, d_block_run_lengths, unique_items, run_lengths, num_valid_runs);
+
+    // Init the BlockRunLengthDecode only to obtain the total decoded size of this block's tile (i.e., the
+    // "decompressed" size); the decoder object itself is not used afterwards
+    uint32_t decoded_size = 0U; // passed as RunLengthT& below
+    BlockRunLengthDecodeT run_length_decode =
+      InitBlockRunLengthDecode(unique_items,
+                               run_lengths,
+                               decoded_size,
+                               cub::Int2Type<TEST_RUN_OFFSETS_>());
+    return decoded_size;
+  }
+
+  /**
+   * \brief Loads the given block (or tile) of runs, run-length decodes them, and writes the results
+   * to \p d_block_decoded_out (plus relative offsets to \p d_block_rel_out if TEST_RELATIVE_OFFSETS).
+   */
+  template <typename UniqueItemOutItT, typename RelativeOffsetOutItT>
+  __device__ __forceinline__ uint32_t WriteDecodedRuns(ItemItT d_block_unique_items,
+                                                       RunLengthsItT d_block_run_lengths,
+                                                       UniqueItemOutItT d_block_decoded_out,
+                                                       RelativeOffsetOutItT d_block_rel_out,
+                                                       size_t num_valid_runs)
+  {
+    // Load this block's tile of encoded runs
+    RunItemT unique_items[RUNS_PER_THREAD];
+    RunLengthT run_lengths[RUNS_PER_THREAD];
+    LoadRuns(d_block_unique_items, d_block_run_lengths, unique_items, run_lengths, num_valid_runs);
+
+    // Init the BlockRunLengthDecode and get the total decoded size of this block's tile (i.e., the
+    // "decompressed" size)
+    uint32_t decoded_size = 0U; // passed as RunLengthT& below
+    BlockRunLengthDecodeT run_length_decode =
+      InitBlockRunLengthDecode(unique_items,
+                               run_lengths,
+                               decoded_size,
+                               cub::Int2Type<TEST_RUN_OFFSETS_>());
+
+    // Run-length decode ("decompress") the runs into a window buffer of limited size. This is
+    // repeated until all runs have been decoded.
+    uint32_t decoded_window_offset = 0U;
+    while (decoded_window_offset < decoded_size)
+    {
+      RunLengthT relative_offsets[DECODED_ITEMS_PER_THREAD];
+      RunItemT decoded_items[DECODED_ITEMS_PER_THREAD];
+
+      // The number of decoded items that are valid within this window (aka pass) of run-length
+      // decoding
+      uint32_t num_valid_items = decoded_size - decoded_window_offset;
+      run_length_decode.RunLengthDecode(decoded_items, relative_offsets, decoded_window_offset);
+      BlockStoreDecodedItemT(temp_storage.decode.store_decoded_runs_storage)
+        .Store(d_block_decoded_out + decoded_window_offset, decoded_items, num_valid_items);
+
+      if (TEST_RELATIVE_OFFSETS)
+      {
+        BlockStoreRelativeOffsetT(temp_storage.decode.store_relative_offsets)
+          .Store(d_block_rel_out + decoded_window_offset, relative_offsets, num_valid_items);
+      }
+
+      decoded_window_offset += DECODED_ITEMS_PER_THREAD * BLOCK_THREADS; // advance by one window
+    }
+    return decoded_size;
+  }
+};
+
+/******************************************************************************
+ * [STAGE 1] RUN-LENGTH DECODING TEST KERNEL
+ ******************************************************************************/
+template <typename AgentTestBlockRunLengthDecode,
+          typename ItemItT,
+          typename RunLengthsItT,
+          typename OffsetT,
+          typename DecodedSizesOutT>
+__launch_bounds__(AgentTestBlockRunLengthDecode::BLOCK_THREADS) __global__
+  void BlockRunLengthDecodeGetSizeKernel(const ItemItT d_unique_items,
+                                         const RunLengthsItT d_run_lengths,
+                                         const OffsetT num_runs,
+                                         DecodedSizesOutT d_decoded_sizes)
+{
+  constexpr OffsetT RUNS_PER_BLOCK = AgentTestBlockRunLengthDecode::RUNS_PER_BLOCK;
+  // Each block handles one tile of RUNS_PER_BLOCK runs and stores that tile's decoded size at index blockIdx.x.
+  __shared__ typename AgentTestBlockRunLengthDecode::TempStorage temp_storage;
+
+  OffsetT block_offset   = blockIdx.x * RUNS_PER_BLOCK; // index of this block's first run
+  OffsetT num_valid_runs = (block_offset + RUNS_PER_BLOCK >= num_runs) ? (num_runs - block_offset)
+                                                                       : RUNS_PER_BLOCK; // tile may be cut short
+
+  AgentTestBlockRunLengthDecode run_length_decode_agent(temp_storage);
+  uint64_t num_decoded_items = run_length_decode_agent.GetDecodedSize(d_unique_items + block_offset,
+                                                                      d_run_lengths + block_offset,
+                                                                      num_valid_runs);
+  // No thread-index guard: every thread of the block executes this store.
+  d_decoded_sizes[blockIdx.x] = num_decoded_items;
+}
+
+/******************************************************************************
+ * [STAGE 2] RUN-LENGTH DECODING TEST KERNEL
+ ******************************************************************************/
+template <typename AgentTestBlockRunLengthDecode,
+          typename ItemItT,
+          typename RunLengthsItT,
+          typename DecodedSizesOutT,
+          typename OffsetT,
+          typename DecodedItemsOutItT,
+          typename RelativeOffsetOutItT>
+__launch_bounds__(AgentTestBlockRunLengthDecode::BLOCK_THREADS) __global__
+  void BlockRunLengthDecodeTestKernel(const ItemItT d_unique_items,
+                                      const RunLengthsItT d_run_lengths,
+                                      const DecodedSizesOutT d_decoded_offsets,
+                                      const OffsetT num_runs,
+                                      DecodedItemsOutItT d_decoded_items,
+                                      RelativeOffsetOutItT d_relative_offsets)
+
+{
+  constexpr OffsetT RUNS_PER_BLOCK = AgentTestBlockRunLengthDecode::RUNS_PER_BLOCK;
+  // Each block decodes its tile of runs and writes its output starting at d_decoded_offsets[blockIdx.x].
+  __shared__ typename AgentTestBlockRunLengthDecode::TempStorage temp_storage;
+
+  OffsetT block_offset   = blockIdx.x * RUNS_PER_BLOCK; // index of this block's first run
+  OffsetT num_valid_runs = (block_offset + RUNS_PER_BLOCK >= num_runs) ? (num_runs - block_offset)
+                                                                       : RUNS_PER_BLOCK; // tile may be cut short
+  // Both output iterators are advanced by the same per-block offset; the returned size is not used.
+  AgentTestBlockRunLengthDecode run_length_decode_agent(temp_storage);
+  run_length_decode_agent.WriteDecodedRuns(d_unique_items + block_offset,
+                                           d_run_lengths + block_offset,
+                                           d_decoded_items + d_decoded_offsets[blockIdx.x],
+                                           d_relative_offsets + d_decoded_offsets[blockIdx.x],
+                                           num_valid_runs);
+}
+
+struct ModOp // unary functor; used below as the transform that produces the run lengths
+{
+  using T = uint32_t;
+  __host__ __device__ __forceinline__ T operator()(const T &x) const { return 1 + (x % 100); } // result in [1, 100]
+};
+
+template <uint32_t RUNS_PER_THREAD,
+          uint32_t DECODED_ITEMS_PER_THREAD,
+          uint32_t BLOCK_DIM_X,
+          uint32_t BLOCK_DIM_Y,
+          uint32_t BLOCK_DIM_Z,
+          bool TEST_RUN_OFFSETS,
+          bool TEST_RELATIVE_OFFSETS>
+void TestAlgorithmSpecialisation() // host driver: size kernel -> offset scan -> decode kernel -> CPU comparison
+{
+  constexpr uint32_t THREADS_PER_BLOCK = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z; // flattened block size
+  constexpr uint32_t RUNS_PER_BLOCK    = RUNS_PER_THREAD * THREADS_PER_BLOCK;
+
+  using RunItemT   = float;
+  using RunLengthT = uint32_t;
+  using ItemItT    = cub::CountingInputIterator<RunItemT>;
+  using RunLengthsItT =
+    cub::TransformInputIterator<RunLengthT, ModOp, cub::CountingInputIterator<RunLengthT>>;
+
+  ItemItT d_unique_items(1000U); // counting iterator constructed with base value 1000
+  RunLengthsItT d_run_lengths(cub::CountingInputIterator<RunLengthT>(0), ModOp{}); // ModOp over a counter from 0
+
+  constexpr uint32_t num_runs   = 10000;
+  constexpr uint32_t num_blocks = (num_runs + (RUNS_PER_BLOCK - 1U)) / RUNS_PER_BLOCK; // rounded up
+
+  size_t temp_storage_bytes      = 0ULL;
+  void *temp_storage             = nullptr;
+  uint32_t *h_num_decoded_total  = nullptr;
+  uint32_t *d_decoded_sizes      = nullptr;
+  uint32_t *d_decoded_offsets    = nullptr;
+  RunItemT *d_decoded_out        = nullptr;
+  RunLengthT *d_relative_offsets = nullptr;
+  RunItemT *h_decoded_out        = nullptr;
+  RunLengthT *h_relative_offsets = nullptr;
+
+  using AgentTestBlockRunLengthDecodeT = AgentTestBlockRunLengthDecode<ItemItT,
+                                                                       RunLengthsItT,
+                                                                       RUNS_PER_THREAD,
+                                                                       DECODED_ITEMS_PER_THREAD,
+                                                                       TEST_RELATIVE_OFFSETS,
+                                                                       TEST_RUN_OFFSETS,
+                                                                       THREADS_PER_BLOCK, // flattened: all threads on X
+                                                                       1,
+                                                                       1>;
+  // NOTE(review): none of the TIMER_* enumerators below is referenced elsewhere in this function.
+  enum : uint32_t
+  {
+    TIMER_SIZE_BEGIN = 0,
+    TIMER_SIZE_END,
+    TIMER_DECODE_BEGIN,
+    TIMER_DECODE_END,
+    NUM_TIMERS,
+  };
+
+  // Get temporary storage requirements for the scan (for computing offsets for the per-block
+  // run-length decoded items); the device pointers are still null here and the result is not checked
+  cub::DeviceScan::InclusiveSum(nullptr,
+                                temp_storage_bytes,
+                                d_decoded_sizes,
+                                d_decoded_offsets,
+                                num_blocks);
+
+  // Allocate device memory
+  CubDebugExit(cudaMalloc(&temp_storage, temp_storage_bytes));
+  CubDebugExit(cudaMalloc(&d_decoded_sizes, num_blocks * sizeof(*d_decoded_sizes)));
+  // Allocate for the exclusive sum PLUS the overall aggregate
+  CubDebugExit(cudaMalloc(&d_decoded_offsets, (num_blocks + 1) * sizeof(*d_decoded_offsets)));
+  CubDebugExit(cudaMallocHost(&h_num_decoded_total, sizeof(*h_num_decoded_total)));
+
+  // Get the per-block number of items being decoded (i-th thread block writing size to
+  // d_decoded_sizes[i]). NOTE(review): no launch-error check follows this kernel launch.
+  BlockRunLengthDecodeGetSizeKernel<AgentTestBlockRunLengthDecodeT>
+    <<<num_blocks, THREADS_PER_BLOCK, 0U>>>(d_unique_items,
+                                            d_run_lengths,
+                                            num_runs,
+                                            d_decoded_sizes);
+
+  // Compute per-block output offsets: zero element 0, inclusive-sum into [1..num_blocks] (exclusive sum + aggregate)
+  CubDebugExit(cudaMemsetAsync(d_decoded_offsets, 0, sizeof(d_decoded_offsets[0])));
+  CubDebugExit(cub::DeviceScan::InclusiveSum(temp_storage,
+                                             temp_storage_bytes,
+                                             d_decoded_sizes,
+                                             &d_decoded_offsets[1],
+                                             num_blocks));
+
+  // Copy the total decoded size to CPU in order to allocate just the right amount of device memory
+  CubDebugExit(cudaMemcpy(h_num_decoded_total,
+                          &d_decoded_offsets[num_blocks],
+                          sizeof(*h_num_decoded_total),
+                          cudaMemcpyDeviceToHost));
+
+  // Allocate host (cudaMallocHost) and device memory for the run-length decoded output
+  CubDebugExit(cudaMallocHost(&h_decoded_out, (*h_num_decoded_total) * sizeof(RunItemT)));
+  CubDebugExit(cudaMalloc(&d_decoded_out, (*h_num_decoded_total) * sizeof(RunItemT)));
+  if (TEST_RELATIVE_OFFSETS)
+  {
+    CubDebugExit(cudaMalloc(&d_relative_offsets, (*h_num_decoded_total) * sizeof(RunLengthT)));
+    CubDebugExit(cudaMallocHost(&h_relative_offsets, (*h_num_decoded_total) * sizeof(RunLengthT)));
+  }
+
+  // Perform the block-wise run-length decoding (each block taking its offset from
+  // d_decoded_offsets). NOTE(review): no launch-error check follows this kernel launch either.
+  BlockRunLengthDecodeTestKernel<AgentTestBlockRunLengthDecodeT>
+    <<<num_blocks, THREADS_PER_BLOCK, 0U>>>(d_unique_items,
+                                            d_run_lengths,
+                                            d_decoded_offsets,
+                                            num_runs,
+                                            d_decoded_out,
+                                            d_relative_offsets); // still nullptr unless TEST_RELATIVE_OFFSETS
+
+  // Copy back results for verification
+  CubDebugExit(cudaMemcpy(h_decoded_out,
+                          d_decoded_out,
+                          (*h_num_decoded_total) * sizeof(*h_decoded_out),
+                          cudaMemcpyDeviceToHost));
+
+  if (TEST_RELATIVE_OFFSETS)
+  {
+    // Copy back the relative offsets
+    CubDebugExit(cudaMemcpy(h_relative_offsets,
+                            d_relative_offsets,
+                            (*h_num_decoded_total) * sizeof(*h_relative_offsets),
+                            cudaMemcpyDeviceToHost));
+  }
+
+  // Generate host-side run-length decoded data for verification (pairs of item and offset within its run)
+  std::vector<std::pair<RunItemT, RunLengthT>> host_golden;
+  host_golden.reserve(*h_num_decoded_total);
+  for (uint32_t run = 0; run < num_runs; run++)
+  {
+    for (RunLengthT i = 0; i < d_run_lengths[run]; i++) // the iterators are indexed directly in host code
+    {
+      host_golden.push_back({d_unique_items[run], i});
+    }
+  }
+
+  // Verify the total run-length decoded size is correct
+  REQUIRE(host_golden.size() == h_num_decoded_total[0]);
+  // Verify the run-length decoded data is correct. NOTE(review): if FAIL aborts the test case, the
+  // cmp_eq updates after it and the clean-up at the end are not reached on a mismatch - confirm.
+  bool cmp_eq = true;
+  for (uint32_t i = 0; i < host_golden.size(); i++)
+  {
+    if (host_golden[i].first != h_decoded_out[i])
+    {
+      FAIL("Mismatch at #" << i << ": CPU item: " << host_golden[i].first
+           << ", GPU: " << h_decoded_out[i] << "\n");
+      cmp_eq = false; // unlike the relative-offset branch below, there is no break here
+    }
+    if (TEST_RELATIVE_OFFSETS)
+    {
+      if (host_golden[i].second != h_relative_offsets[i])
+      {
+        FAIL("Mismatch of relative offset at #" << i
+             << ": CPU item: " << host_golden[i].first << ", GPU: " << h_decoded_out[i]
+             << "; relative offsets: CPU: " << host_golden[i].second
+             << ", GPU: " << h_relative_offsets[i] << "\n");
+        cmp_eq = false;
+        break;
+      }
+    }
+  }
+
+  REQUIRE(cmp_eq == true);
+
+  // Clean up memory allocations (cudaFree for device buffers, cudaFreeHost for cudaMallocHost buffers)
+  CubDebugExit(cudaFree(temp_storage));
+  CubDebugExit(cudaFree(d_decoded_sizes));
+  CubDebugExit(cudaFree(d_decoded_offsets));
+  CubDebugExit(cudaFree(d_decoded_out));
+  CubDebugExit(cudaFreeHost(h_num_decoded_total));
+  CubDebugExit(cudaFreeHost(h_decoded_out));
+
+  if (TEST_RELATIVE_OFFSETS)
+  {
+    CubDebugExit(cudaFree(d_relative_offsets));
+    CubDebugExit(cudaFreeHost(h_relative_offsets));
+  }
+}
+
+constexpr bool DO_TEST_RELATIVE_OFFSETS     = true; // values for the TEST_RELATIVE_OFFSETS template argument
+constexpr bool DO_NOT_TEST_RELATIVE_OFFSETS = false;
+
+constexpr bool TEST_WITH_RUN_OFFSETS = true; // values for the TEST_RUN_OFFSETS template argument
+constexpr bool TEST_WITH_RUN_LENGTHS = false;
+
+template <int RunsPerThread,
+          int DecodedItemsPerThread,
+          int BlockDimX,
+          int BlockDimY = 1,
+          int BlockDimZ = 1>
+struct params_t // compile-time bundle describing one test configuration
+{
+  static constexpr int runs_per_thread          = RunsPerThread;
+  static constexpr int decoded_items_per_thread = DecodedItemsPerThread;
+  static constexpr int block_dim_x              = BlockDimX;
+  static constexpr int block_dim_y              = BlockDimY; // 1 unless given explicitly
+  static constexpr int block_dim_z              = BlockDimZ; // 1 unless given explicitly
+};
+
+CUB_TEST_LIST("Block Run Length Decode works with run lengths and offsets relative to each run",
+              "[rld][block]",
+              params_t<1, 1, 64>,
+              params_t<1, 3, 32, 2, 3>, // the only entry with Y and Z extents other than 1
+              params_t<1, 1, 128>,
+              params_t<1, 8, 128>,
+              params_t<3, 1, 256>,
+              params_t<1, 8, 256>,
+              params_t<8, 1, 256>,
+              params_t<1, 1, 256>,
+              params_t<2, 2, 384>)
+{
+  using params = TestType; // presumably one params_t entry from the list above per test instance
+
+  TestAlgorithmSpecialisation<params::runs_per_thread,
+                              params::decoded_items_per_thread,
+                              params::block_dim_x,
+                              params::block_dim_y,
+                              params::block_dim_z,
+                              TEST_WITH_RUN_LENGTHS, // TEST_RUN_OFFSETS = false
+                              DO_TEST_RELATIVE_OFFSETS>(); // TEST_RELATIVE_OFFSETS = true
+}
+
+CUB_TEST_LIST("Block Run Length Decode works with run lengths and performs normal run-length "
+              "decoding",
+              "[rld][block]",
+              params_t<1, 1, 64>,
+              params_t<1, 3, 32, 2, 3>, // the only entry with Y and Z extents other than 1
+              params_t<1, 1, 128>,
+              params_t<1, 8, 128>,
+              params_t<3, 1, 256>,
+              params_t<1, 8, 256>,
+              params_t<8, 1, 256>,
+              params_t<1, 1, 256>,
+              params_t<2, 2, 384>)
+{
+  using params = TestType; // presumably one params_t entry from the list above per test instance
+
+  TestAlgorithmSpecialisation<params::runs_per_thread,
+                              params::decoded_items_per_thread,
+                              params::block_dim_x,
+                              params::block_dim_y,
+                              params::block_dim_z,
+                              TEST_WITH_RUN_LENGTHS, // TEST_RUN_OFFSETS = false
+                              DO_NOT_TEST_RELATIVE_OFFSETS>(); // TEST_RELATIVE_OFFSETS = false
+}
+
+CUB_TEST_LIST("Block Run Length Decode works with run offsets and generates offsets relative to "
+              "each run",
+              "[rld][block]",
+              params_t<1, 1, 64>,
+              params_t<1, 3, 32, 2, 3>, // the only entry with Y and Z extents other than 1
+              params_t<1, 1, 128>,
+              params_t<1, 8, 128>,
+              params_t<3, 1, 256>,
+              params_t<1, 8, 256>,
+              params_t<8, 1, 256>,
+              params_t<1, 1, 256>,
+              params_t<2, 2, 384>)
+{
+  using params = TestType; // presumably one params_t entry from the list above per test instance
+
+  TestAlgorithmSpecialisation<params::runs_per_thread,
+                              params::decoded_items_per_thread,
+                              params::block_dim_x,
+                              params::block_dim_y,
+                              params::block_dim_z,
+                              TEST_WITH_RUN_OFFSETS, // TEST_RUN_OFFSETS = true
+                              DO_TEST_RELATIVE_OFFSETS>(); // TEST_RELATIVE_OFFSETS = true
+}
+
+CUB_TEST_LIST("Block Run Length Decode works with run offsets and performs normal run-length "
+              "decoding",
+              "[rld][block]",
+              params_t<1, 1, 64>,
+              params_t<1, 3, 32, 2, 3>, // the only entry with Y and Z extents other than 1
+              params_t<1, 1, 128>,
+              params_t<1, 8, 128>,
+              params_t<3, 1, 256>,
+              params_t<1, 8, 256>,
+              params_t<8, 1, 256>,
+              params_t<1, 1, 256>,
+              params_t<2, 2, 384>)
+{
+  using params = TestType; // presumably one params_t entry from the list above per test instance
+
+  TestAlgorithmSpecialisation<params::runs_per_thread,
+                              params::decoded_items_per_thread,
+                              params::block_dim_x,
+                              params::block_dim_y,
+                              params::block_dim_z,
+                              TEST_WITH_RUN_OFFSETS, // TEST_RUN_OFFSETS = true
+                              DO_NOT_TEST_RELATIVE_OFFSETS>(); // TEST_RELATIVE_OFFSETS = false
+}
+
diff --git a/include/cub/test/catch2_test_block_scan.cu b/include/cub/test/catch2_test_block_scan.cu
new file mode 100644
index 0000000..3e613d0
--- /dev/null
+++ b/include/cub/test/catch2_test_block_scan.cu
@@ -0,0 +1,536 @@
+/******************************************************************************
+ * Copyright (c) 2011-2022, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#include <cub/block/block_scan.cuh>
+
+#include <thrust/host_vector.h>
+
+// Has to go after all cub headers. Otherwise, this test won't catch unused
+// variables in cub kernels.
+#include "catch2_test_helper.h"
+
+template <cub::BlockScanAlgorithm Algorithm,
+          int ItemsPerThread,
+          int BlockDimX,
+          int BlockDimY,
+          int BlockDimZ,
+          class T,
+          class ActionT>
+__global__ void block_scan_kernel(T *in, T *out, ActionT action)
+{
+  // Each thread handles ItemsPerThread consecutive elements, located by its row-major thread id.
+  using scan_impl_t = cub::BlockScan<T, BlockDimX, Algorithm, BlockDimY, BlockDimZ>;
+  using scratch_t   = typename scan_impl_t::TempStorage;
+  __shared__ scratch_t scratch;
+
+  T items[ItemsPerThread];
+
+  const int linear_tid = static_cast<int>(cub::RowMajorTid(BlockDimX, BlockDimY, BlockDimZ));
+  const int first      = linear_tid * ItemsPerThread;
+
+  for (int i = 0; i < ItemsPerThread; ++i)
+  {
+    // Load this thread's slice of the input.
+    items[i] = in[first + i];
+  }
+  __syncthreads();
+
+  // `action` chooses which BlockScan member runs on the per-thread items.
+  scan_impl_t scanner(scratch);
+  action(scanner, items);
+
+  for (int i = 0; i < ItemsPerThread; ++i)
+  {
+    // Store the items as the action left them.
+    out[first + i] = items[i];
+  }
+}
+
+template <cub::BlockScanAlgorithm Algorithm,
+          int ItemsPerThread,
+          int BlockDimX,
+          int BlockDimY,
+          int BlockDimZ,
+          class T,
+          class ActionT>
+void block_scan(thrust::device_vector<T> &in, thrust::device_vector<T> &out, ActionT action)
+{
+  // Host-side launcher: runs block_scan_kernel on a single thread block.
+  const dim3 threads(BlockDimX, BlockDimY, BlockDimZ);
+  T *d_in_ptr  = thrust::raw_pointer_cast(in.data());
+  T *d_out_ptr = thrust::raw_pointer_cast(out.data());
+  block_scan_kernel<Algorithm, ItemsPerThread, BlockDimX, BlockDimY, BlockDimZ, T, ActionT>
+    <<<1, threads>>>(d_in_ptr, d_out_ptr, action);
+  // Check the launch itself first, then wait for the kernel and check its completion status.
+  REQUIRE(cudaSuccess == cudaPeekAtLastError());
+  REQUIRE(cudaSuccess == cudaDeviceSynchronize());
+}
+
+enum class scan_mode
+{
+  exclusive, // selects the Exclusive* BlockScan calls in the scan functors
+  inclusive  // selects the Inclusive* BlockScan calls in the scan functors
+};
+
+// Scan action: in-place prefix sum over the per-thread items; Mode picks inclusive or exclusive.
+template <scan_mode Mode> struct sum_op_t
+{
+  template <int ItemsPerThread, class BlockScanT, class T>
+  __device__ void operator()(BlockScanT &scan, T (&thread_data)[ItemsPerThread]) const
+  {
+    if (Mode != scan_mode::exclusive)
+    {
+      scan.InclusiveSum(thread_data, thread_data);
+    }
+    else
+    {
+      scan.ExclusiveSum(thread_data, thread_data);
+    }
+  }
+};
+
+// Scan action: in-place cub::Min scan over int items; Mode picks inclusive or exclusive.
+template <scan_mode Mode> struct min_op_t
+{
+  template <int ItemsPerThread, class BlockScanT>
+  __device__ void operator()(BlockScanT &scan, int (&thread_data)[ItemsPerThread]) const
+  {
+    if (Mode != scan_mode::exclusive)
+    {
+      scan.InclusiveScan(thread_data, thread_data, cub::Min{});
+    }
+    else
+    {
+      scan.ExclusiveScan(thread_data, thread_data, cub::Min{});
+    }
+  }
+};
+
+// Scan action: prefix sum that also stores the block aggregate as seen by one chosen thread.
+template <class T, scan_mode Mode> struct sum_aggregate_op_t
+{
+  int m_target_thread_id; // row-major id of the thread that writes the aggregate out
+  T *m_d_block_aggregate; // location the chosen thread writes to (dereferenced in device code)
+
+  template <int ItemsPerThread, class BlockScanT>
+  __device__ void operator()(BlockScanT &scan, T (&thread_data)[ItemsPerThread]) const
+  {
+    T total{};
+    if (Mode != scan_mode::exclusive)
+    {
+      scan.InclusiveSum(thread_data, thread_data, total);
+    }
+    else
+    {
+      scan.ExclusiveSum(thread_data, thread_data, total);
+    }
+
+    // Only the selected thread stores its copy of the aggregate.
+    const int linear_tid =
+      static_cast<int>(cub::RowMajorTid(blockDim.x, blockDim.y, blockDim.z));
+    if (linear_tid == m_target_thread_id)
+    {
+      *m_d_block_aggregate = total;
+    }
+  }
+};
+
+// Scan action: prefix sum seeded with m_prefix through a block-prefix callback functor.
+template <class T, scan_mode Mode> struct sum_prefix_op_t
+{
+  T m_prefix;
+
+  // Callback passed to the BlockScan call; it is invoked with the block aggregate.
+  struct block_prefix_op_t
+  {
+    int linear_tid;
+    T prefix;
+
+    __device__ block_prefix_op_t(int linear_tid, T prefix)
+        : linear_tid(linear_tid)
+        , prefix(prefix)
+    {}
+
+    __device__ T operator()(T block_aggregate)
+    {
+      // Thread 0 returns the prefix held before this call; every other thread returns T{}.
+      T seed = (linear_tid == 0) ? prefix : T{};
+      prefix = prefix + block_aggregate;
+      return seed;
+    }
+  };
+
+  template <int ItemsPerThread, class BlockScanT>
+  __device__ void operator()(BlockScanT &scan, T (&thread_data)[ItemsPerThread]) const
+  {
+    const int tid = static_cast<int>(cub::RowMajorTid(blockDim.x, blockDim.y, blockDim.z));
+    block_prefix_op_t callback{tid, m_prefix};
+    if (Mode != scan_mode::exclusive)
+    {
+      scan.InclusiveSum(thread_data, thread_data, callback);
+    }
+    else
+    {
+      scan.ExclusiveSum(thread_data, thread_data, callback);
+    }
+  }
+};
+
+// Scan action: cub::Min scan seeded with m_prefix through a block-prefix callback functor.
+template <class T, scan_mode Mode> struct min_prefix_op_t
+{
+  T m_prefix;
+  static constexpr T min_identity = std::numeric_limits<T>::max(); // neutral element for min
+
+  // Callback passed to the BlockScan call; it is invoked with the block aggregate.
+  struct block_prefix_op_t
+  {
+    int linear_tid;
+    T prefix;
+
+    __device__ block_prefix_op_t(int linear_tid, T prefix)
+        : linear_tid(linear_tid)
+        , prefix(prefix)
+    {}
+
+    __device__ T operator()(T block_aggregate)
+    {
+      // Thread 0 returns the prefix held before this call; other threads return min_identity.
+      T seed = (linear_tid == 0) ? prefix : min_identity;
+      prefix = cub::Min{}(prefix, block_aggregate);
+      return seed;
+    }
+  };
+
+  template <int ItemsPerThread, class BlockScanT>
+  __device__ void operator()(BlockScanT &scan, T (&thread_data)[ItemsPerThread]) const
+  {
+    const int tid = static_cast<int>(cub::RowMajorTid(blockDim.x, blockDim.y, blockDim.z));
+    block_prefix_op_t callback{tid, m_prefix};
+    if (Mode != scan_mode::exclusive)
+    {
+      scan.InclusiveScan(thread_data, thread_data, cub::Min{}, callback);
+    }
+    else
+    {
+      scan.ExclusiveScan(thread_data, thread_data, cub::Min{}, callback);
+    }
+  }
+};
+
+// Sequential host reference for the device scans; rewrites `result` in place.
+// Returns the reduction of the original elements only (initial_value is not folded in),
+// or a value-initialised T when `result` is empty.
+template <class T, class ScanOpT>
+T host_scan(scan_mode mode, thrust::host_vector<T> &result, ScanOpT scan_op, T initial_value = T{})
+{
+  if (result.empty())
+  {
+    return {};
+  }
+
+  const bool is_exclusive = (mode == scan_mode::exclusive);
+
+  // `running` carries initial_value through the scan; `total` tracks the plain aggregate.
+  T running = static_cast<T>(scan_op(initial_value, result[0]));
+  T total   = result[0];
+
+  // Exclusive output lags one element behind: slot i holds the scan of everything before i.
+  if (is_exclusive)
+  {
+    result[0] = initial_value;
+  }
+  else
+  {
+    result[0] = running;
+  }
+
+  for (std::size_t pos = 1; pos < result.size(); ++pos)
+  {
+    const T item = result[pos];
+    T next       = static_cast<T>(scan_op(running, item));
+    result[pos]  = is_exclusive ? running : next;
+    running      = next;
+    total        = static_cast<T>(scan_op(total, item));
+  }
+  return total;
+}
+
+// %PARAM% ALGO_TYPE alg 0:1:2
+// %PARAM% TEST_MODE mode 0:1
+
+using types            = c2h::type_list<std::uint8_t, std::uint16_t, std::int32_t, std::int64_t>;
+using vec_types        = c2h::type_list<ulonglong4, uchar3, short2>;
+using block_dim_x      = c2h::enum_type_list<int, 17, 32, 65, 96>;
+using block_dim_yz     = c2h::enum_type_list<int, 1, 2>; // one value is used for both y and z
+using items_per_thread = c2h::enum_type_list<int, 1, 9>;
+using algorithms       = c2h::enum_type_list<cub::BlockScanAlgorithm,
+                                       cub::BlockScanAlgorithm::BLOCK_SCAN_RAKING,
+                                       cub::BlockScanAlgorithm::BLOCK_SCAN_WARP_SCANS,
+                                       cub::BlockScanAlgorithm::BLOCK_SCAN_RAKING_MEMOIZE>;
+using algorithm = c2h::enum_type_list<cub::BlockScanAlgorithm, c2h::get<ALGO_TYPE, algorithms>::value>;
+// `algorithm` keeps only the entry picked by ALGO_TYPE; `modes` only the mode picked by TEST_MODE.
+#if TEST_MODE == 0
+using modes = c2h::enum_type_list<scan_mode, scan_mode::inclusive>;
+#else
+using modes = c2h::enum_type_list<scan_mode, scan_mode::exclusive>;
+#endif
+
+// Names the entries of TestType (read with c2h::get) as compile-time constants.
+template <class TestType> struct params_t
+{
+  using type = typename c2h::get<0, TestType>;
+
+  static constexpr int items_per_thread = c2h::get<3, TestType>::value;
+  static constexpr int block_dim_x      = c2h::get<1, TestType>::value;
+  static constexpr int block_dim_y      = c2h::get<2, TestType>::value;
+  static constexpr int block_dim_z      = block_dim_y; // y and z always share one extent
+  static constexpr int tile_size = block_dim_x * block_dim_y * block_dim_z * items_per_thread;
+  // Indices match the argument order of the "works with sum" CUB_TEST that uses this struct.
+  static constexpr cub::BlockScanAlgorithm algorithm = c2h::get<4, TestType>::value;
+  static constexpr scan_mode mode                    = c2h::get<5, TestType>::value;
+};
+
+CUB_TEST("Block scan works with sum",
+         "[scan][block]",
+         types,
+         block_dim_x,
+         block_dim_yz,
+         items_per_thread,
+         algorithm,
+         modes)
+{
+  using cfg    = params_t<TestType>;
+  using item_t = typename cfg::type;
+
+  thrust::device_vector<item_t> d_out(cfg::tile_size);
+  thrust::device_vector<item_t> d_in(cfg::tile_size);
+  c2h::gen(CUB_SEED(10), d_in);
+
+  block_scan<cfg::algorithm,
+             cfg::items_per_thread,
+             cfg::block_dim_x,
+             cfg::block_dim_y,
+             cfg::block_dim_z>(d_in, d_out, sum_op_t<cfg::mode>{});
+
+  // Reference: the same scan computed sequentially on a host copy of the input.
+  thrust::host_vector<item_t> h_out = d_in;
+  host_scan(cfg::mode, h_out, std::plus<item_t>{});
+
+  REQUIRE_APPROX_EQ(h_out, d_out);
+}
+
+CUB_TEST("Block scan works with vec types", "[scan][block]", vec_types, algorithm, modes)
+{
+  // Fixed 256x1x1 block with three items per thread; only type, algorithm and mode vary.
+  using item_t = typename c2h::get<0, TestType>;
+  constexpr cub::BlockScanAlgorithm algo = c2h::get<1, TestType>::value;
+  constexpr scan_mode scan_kind          = c2h::get<2, TestType>::value;
+
+  constexpr int per_thread = 3;
+  constexpr int dim_x      = 256;
+  constexpr int dim_y      = 1;
+  constexpr int dim_z      = 1;
+  constexpr int num_items  = per_thread * dim_x * dim_y * dim_z;
+
+  thrust::device_vector<item_t> d_out(num_items);
+  thrust::device_vector<item_t> d_in(num_items);
+  c2h::gen(CUB_SEED(10), d_in);
+
+  block_scan<algo, per_thread, dim_x, dim_y, dim_z>(d_in, d_out, sum_op_t<scan_kind>{});
+
+  // Reference: the same scan computed sequentially on a host copy of the input.
+  thrust::host_vector<item_t> h_out = d_in;
+  host_scan(scan_kind, h_out, std::plus<item_t>{});
+
+  REQUIRE(h_out == d_out);
+}
+
+CUB_TEST("Block scan works with custom types", "[scan][block]", algorithm, modes)
+{
+  // Fixed 256x1x1 block with three items per thread; only algorithm and mode vary.
+  using item_t = c2h::custom_type_t<c2h::accumulateable_t, c2h::equal_comparable_t>;
+  constexpr cub::BlockScanAlgorithm algo = c2h::get<0, TestType>::value;
+  constexpr scan_mode scan_kind          = c2h::get<1, TestType>::value;
+
+  constexpr int per_thread = 3;
+  constexpr int dim_x      = 256;
+  constexpr int dim_y      = 1;
+  constexpr int dim_z      = 1;
+  constexpr int num_items  = per_thread * dim_x * dim_y * dim_z;
+
+  thrust::device_vector<item_t> d_out(num_items);
+  thrust::device_vector<item_t> d_in(num_items);
+  c2h::gen(CUB_SEED(10), d_in);
+
+  block_scan<algo, per_thread, dim_x, dim_y, dim_z>(d_in, d_out, sum_op_t<scan_kind>{});
+
+  // Reference: the same scan computed sequentially on a host copy of the input.
+  thrust::host_vector<item_t> h_out = d_in;
+  host_scan(scan_kind, h_out, std::plus<item_t>{});
+
+  REQUIRE(h_out == d_out);
+}
+
+CUB_TEST("Block scan returns valid block aggregate",
+         "[scan][block]",
+         algorithm,
+         modes,
+         block_dim_yz)
+{
+  // TestType order: <algorithm, mode, block_dim_yz>; y and z share the same extent.
+  constexpr cub::BlockScanAlgorithm algo = c2h::get<0, TestType>::value;
+  constexpr scan_mode scan_kind          = c2h::get<1, TestType>::value;
+  constexpr int dim_yz                   = c2h::get<2, TestType>::value;
+  constexpr int dim_x                    = 64;
+  constexpr int per_thread               = 3;
+  constexpr int num_threads              = dim_x * dim_yz * dim_yz;
+  constexpr int num_items                = per_thread * num_threads;
+
+  using item_t = c2h::custom_type_t<c2h::accumulateable_t, c2h::equal_comparable_t>;
+
+  const int target_thread_id = GENERATE_COPY(take(2, random(0, num_threads - 1)));
+
+  thrust::device_vector<item_t> d_block_aggregate(1);
+  thrust::device_vector<item_t> d_out(num_items);
+  thrust::device_vector<item_t> d_in(num_items);
+  c2h::gen(CUB_SEED(10), d_in);
+
+  item_t *d_aggregate_ptr = thrust::raw_pointer_cast(d_block_aggregate.data());
+  block_scan<algo, per_thread, dim_x, dim_yz, dim_yz>(
+    d_in,
+    d_out,
+    sum_aggregate_op_t<item_t, scan_kind>{target_thread_id, d_aggregate_ptr});
+
+  thrust::host_vector<item_t> h_out = d_in;
+  item_t block_aggregate = host_scan(scan_kind, h_out, std::plus<item_t>{});
+
+  REQUIRE(h_out == d_out);
+  REQUIRE(block_aggregate == d_block_aggregate[0]);
+}
+
+CUB_TEST("Block scan supports prefix op",
+         "[scan][block]",
+         algorithm,
+         modes,
+         block_dim_yz)
+{
+  // TestType order: <algorithm, mode, block_dim_yz>; y and z share the same extent.
+  constexpr cub::BlockScanAlgorithm algo = c2h::get<0, TestType>::value;
+  constexpr scan_mode scan_kind          = c2h::get<1, TestType>::value;
+  constexpr int dim_yz                   = c2h::get<2, TestType>::value;
+  constexpr int dim_x                    = 64;
+  constexpr int per_thread               = 3;
+  constexpr int num_threads              = dim_x * dim_yz * dim_yz;
+  constexpr int num_items                = per_thread * num_threads;
+
+  using item_t = int;
+
+  const item_t prefix = GENERATE_COPY(take(2, random(0, num_items)));
+
+  thrust::device_vector<item_t> d_out(num_items);
+  thrust::device_vector<item_t> d_in(num_items);
+  c2h::gen(CUB_SEED(10), d_in);
+
+  block_scan<algo, per_thread, dim_x, dim_yz, dim_yz>(d_in,
+                                                      d_out,
+                                                      sum_prefix_op_t<item_t, scan_kind>{prefix});
+
+  // The host reference receives the same prefix as its initial value.
+  thrust::host_vector<item_t> h_out = d_in;
+  host_scan(scan_kind, h_out, std::plus<item_t>{}, prefix);
+
+  REQUIRE(h_out == d_out);
+}
+
+CUB_TEST("Block scan supports custom scan op",
+         "[scan][block]",
+         algorithm,
+         modes,
+         block_dim_yz)
+{
+  // TestType order: <algorithm, mode, block_dim_yz>; y and z share the same extent.
+  constexpr cub::BlockScanAlgorithm algo = c2h::get<0, TestType>::value;
+  constexpr scan_mode scan_kind          = c2h::get<1, TestType>::value;
+  constexpr int dim_yz                   = c2h::get<2, TestType>::value;
+  constexpr int dim_x                    = 64;
+  constexpr int per_thread               = 3;
+  constexpr int num_threads              = dim_x * dim_yz * dim_yz;
+  constexpr int num_items                = per_thread * num_threads;
+
+  using item_t = int;
+
+  thrust::device_vector<item_t> d_out(num_items);
+  thrust::device_vector<item_t> d_in(num_items);
+  c2h::gen(CUB_SEED(10), d_in);
+  d_in[0] = INT_MIN; // element 0 equals the INT_MIN seed handed to the host reference
+
+  block_scan<algo, per_thread, dim_x, dim_yz, dim_yz>(d_in, d_out, min_op_t<scan_kind>{});
+
+  // NOTE(review): with element 0 pinned to INT_MIN every expected output is INT_MIN,
+  // so this case compares against a constant tile; confirm that is the intended coverage.
+  thrust::host_vector<item_t> h_out = d_in;
+  const auto host_min = [](item_t l, item_t r) { return std::min(l, r); };
+  host_scan(scan_kind, h_out, host_min, INT_MIN);
+
+  REQUIRE(h_out == d_out);
+}
+
+CUB_TEST("Block scan supports prefix op and custom scan op",
+         "[scan][block]",
+         algorithm,
+         modes,
+         block_dim_yz)
+{
+  // TestType order: <algorithm, mode, block_dim_yz>; y and z share the same extent.
+  constexpr cub::BlockScanAlgorithm algo = c2h::get<0, TestType>::value;
+  constexpr scan_mode scan_kind          = c2h::get<1, TestType>::value;
+  constexpr int dim_yz                   = c2h::get<2, TestType>::value;
+  constexpr int dim_x                    = 64;
+  constexpr int per_thread               = 3;
+  constexpr int num_threads              = dim_x * dim_yz * dim_yz;
+  constexpr int num_items                = per_thread * num_threads;
+
+  using item_t = int;
+
+  const item_t prefix = GENERATE_COPY(take(2, random(0, num_items)));
+
+  thrust::device_vector<item_t> d_out(num_items);
+  thrust::device_vector<item_t> d_in(num_items);
+  c2h::gen(CUB_SEED(10), d_in);
+
+  block_scan<algo, per_thread, dim_x, dim_yz, dim_yz>(d_in,
+                                                      d_out,
+                                                      min_prefix_op_t<item_t, scan_kind>{prefix});
+
+  // The host reference receives the same prefix as its initial value.
+  thrust::host_vector<item_t> h_out = d_in;
+  host_scan(scan_kind, h_out, [](item_t a, item_t b) { return std::min(a, b); }, prefix);
+
+  REQUIRE(h_out == d_out);
+}
+
diff --git a/include/cub/test/catch2_test_block_shuffle.cu b/include/cub/test/catch2_test_block_shuffle.cu
new file mode 100644
index 0000000..a43240e
--- /dev/null
+++ b/include/cub/test/catch2_test_block_shuffle.cu
@@ -0,0 +1,427 @@
+/******************************************************************************
+ * Copyright (c) 2011-2022, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Test of BlockShuffle utilities
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <thrust/host_vector.h>
+#include <thrust/sort.h>
+
+#include <algorithm>
+
+#include <cub/block/block_shuffle.cuh>
+
+// Has to go after all cub headers. Otherwise, this test won't catch unused
+// variables in cub kernels.
+#include "catch2_test_helper.h"
+
+template <int BlockDimX,
+          int BlockDimY,
+          int BlockDimZ,
+          int ItemsPerThread,
+          class T,
+          class ActionT>
+__global__ void block_shuffle_kernel(T *data, ActionT action)
+{
+  // Every thread works on its own ItemsPerThread-long slice of `data`, read and written in place.
+  using shuffle_impl_t = cub::BlockShuffle<T, BlockDimX, BlockDimY, BlockDimZ>;
+  using scratch_t      = typename shuffle_impl_t::TempStorage;
+  __shared__ scratch_t scratch;
+
+  T items[ItemsPerThread];
+
+  T *const slice = data + cub::RowMajorTid(BlockDimX, BlockDimY, BlockDimZ) * ItemsPerThread;
+  for (int i = 0; i < ItemsPerThread; ++i)
+  {
+    items[i] = slice[i];
+  }
+  __syncthreads();
+
+  shuffle_impl_t shuffler(scratch);
+  action(shuffler, items);
+  // Store the items as the action left them, over the slice they were read from.
+  for (int i = 0; i < ItemsPerThread; ++i)
+  {
+    slice[i] = items[i];
+  }
+}
+
+// Shuffle action: in-place BlockShuffle::Up over the per-thread items.
+struct up_op_t
+{
+  template <class BlockShuffleT, class T, int ItemsPerThread>
+  __device__ void operator()(BlockShuffleT &block_shuffle,
+                             T (&thread_data)[ItemsPerThread]) const
+  {
+    block_shuffle.Up(thread_data, thread_data);
+  }
+};
+
+// Shuffle action: BlockShuffle::Offset on the first per-thread item, by a signed distance.
+struct offset_op_t
+{
+  int m_distance;
+
+  __host__ offset_op_t(int distance)
+      : m_distance(distance)
+  {}
+
+  template <class BlockShuffleT, class T, int ItemsPerThread>
+  __device__ void operator()(BlockShuffleT &block_shuffle,
+                             T (&thread_data)[ItemsPerThread]) const
+  {
+    block_shuffle.Offset(thread_data[0], thread_data[0], m_distance);
+  }
+};
+
+// Shuffle action: BlockShuffle::Rotate on the first per-thread item, by an unsigned distance.
+struct rotate_op_t
+{
+  unsigned int m_distance;
+
+  __host__ rotate_op_t(unsigned int distance)
+      : m_distance(distance)
+  {}
+
+  template <class BlockShuffleT, class T, int ItemsPerThread>
+  __device__ void operator()(BlockShuffleT &block_shuffle,
+                             T (&thread_data)[ItemsPerThread]) const
+  {
+    block_shuffle.Rotate(thread_data[0], thread_data[0], m_distance);
+  }
+};
+
+// Shuffle action: BlockShuffle::Up that additionally captures the suffix output argument,
+// as seen by one chosen thread, into m_d_suffix_ptr.
+template <class T> struct up_with_suffix_op_t
+{
+  int m_target_thread_id;
+  T *m_d_suffix_ptr;
+
+  __host__ up_with_suffix_op_t(int target_thread_id, T *d_suffix_ptr)
+      : m_target_thread_id(target_thread_id)
+      , m_d_suffix_ptr(d_suffix_ptr)
+  {}
+
+  template <class BlockShuffleT, int ItemsPerThread>
+  __device__ void operator()(BlockShuffleT &block_shuffle,
+                             T (&thread_data)[ItemsPerThread]) const
+  {
+    T tile_suffix{};
+    block_shuffle.Up(thread_data, thread_data, tile_suffix);
+
+    // Only the selected thread stores its copy of the suffix.
+    const bool is_reporter =
+      cub::RowMajorTid(blockDim.x, blockDim.y, blockDim.z) == m_target_thread_id;
+    if (is_reporter)
+    {
+      m_d_suffix_ptr[0] = tile_suffix;
+    }
+  }
+};
+
+// Shuffle action: in-place BlockShuffle::Down over the per-thread items.
+struct down_op_t
+{
+  template <class BlockShuffleT, class T, int ItemsPerThread>
+  __device__ void operator()(BlockShuffleT &block_shuffle,
+                             T (&thread_data)[ItemsPerThread]) const
+  {
+    block_shuffle.Down(thread_data, thread_data);
+  }
+};
+
+// Shuffle action: BlockShuffle::Down that additionally captures the prefix output argument,
+// as seen by one chosen thread, into m_d_prefix_ptr.
+template <class T> struct down_with_prefix_op_t
+{
+  int m_target_thread_id;
+  T *m_d_prefix_ptr;
+
+  __host__ down_with_prefix_op_t(int target_thread_id, T *d_prefix_ptr)
+      : m_target_thread_id(target_thread_id)
+      , m_d_prefix_ptr(d_prefix_ptr)
+  {}
+
+  template <class BlockShuffleT, int ItemsPerThread>
+  __device__ void operator()(BlockShuffleT &block_shuffle,
+                             T (&thread_data)[ItemsPerThread]) const
+  {
+    T tile_prefix{};
+    block_shuffle.Down(thread_data, thread_data, tile_prefix);
+
+    // Only the selected thread stores its copy of the prefix.
+    const bool is_reporter =
+      cub::RowMajorTid(blockDim.x, blockDim.y, blockDim.z) == m_target_thread_id;
+    if (is_reporter)
+    {
+      m_d_prefix_ptr[0] = tile_prefix;
+    }
+  }
+};
+
+template <int ItemsPerThread,
+          int BlockDimX,
+          int BlockDimY,
+          int BlockDimZ,
+          class T,
+          class ActionT>
+void block_shuffle(thrust::device_vector<T> &data, ActionT action)
+{
+  // Single-block launch; note the kernel takes the block dimensions before ItemsPerThread.
+  const dim3 threads(BlockDimX, BlockDimY, BlockDimZ);
+  T *d_ptr = thrust::raw_pointer_cast(data.data());
+
+  block_shuffle_kernel<BlockDimX, BlockDimY, BlockDimZ, ItemsPerThread>
+    <<<1, threads>>>(d_ptr, action);
+
+  // Check the launch itself first, then wait for the kernel and check its completion status.
+  REQUIRE( cudaSuccess == cudaPeekAtLastError() );
+  REQUIRE( cudaSuccess == cudaDeviceSynchronize() );
+}
+
+// %PARAM% MULTI_DIM mdim 0:1
+// %PARAM% DIM_IDX dim_idx 0:1:2
+
+#if MULTI_DIM
+using block_dim_xs  = c2h::enum_type_list<int, 7, 32, 64>;
+using block_dim_yz  = c2h::enum_type_list<int, 2>;
+#else
+using block_dim_xs  = c2h::enum_type_list<int, 64, 512, 1024>;
+using block_dim_yz  = c2h::enum_type_list<int, 1>;
+#endif
+// Only the x-dimension candidate selected by the DIM_IDX macro is kept.
+using block_dim_x   = c2h::enum_type_list<int, c2h::get<DIM_IDX, block_dim_xs>::value>;
+// Offset/Rotate tests use one item per thread; Up/Down tests use the wider items_per_thread list.
+using types = c2h::type_list<std::int32_t, std::int64_t>;
+using items_per_thread = c2h::enum_type_list<int, 1, 2, 15>;
+using single_item_per_thread = c2h::enum_type_list<int, 1>;
+
+// Names the entries of TestType (read with c2h::get) as compile-time constants.
+template <class TestType> struct params_t
+{
+  using type = typename c2h::get<0, TestType>;
+
+  static constexpr int items_per_thread = c2h::get<1, TestType>::value;
+  static constexpr int block_dim_x      = c2h::get<2, TestType>::value;
+  static constexpr int block_dim_y      = c2h::get<3, TestType>::value;
+  static constexpr int block_dim_z      = block_dim_y; // y and z always share one extent
+  static constexpr int threads_in_block = block_dim_z * block_dim_y * block_dim_x;
+  static constexpr int tile_size        = threads_in_block * items_per_thread;
+};
+
+
+CUB_TEST("Block shuffle offset works",
+         "[shuffle][block]",
+         types,
+         single_item_per_thread,
+         block_dim_x,
+         block_dim_yz)
+{
+  using cfg    = params_t<TestType>;
+  using item_t = typename cfg::type;
+
+  thrust::device_vector<item_t> d_data(cfg::tile_size);
+  c2h::gen(CUB_SEED(10), d_data);
+
+  // Signed distance drawn from [1 - tile_size, tile_size - 1].
+  const int distance =
+    GENERATE_COPY(take(4, random(1 - cfg::tile_size, cfg::tile_size - 1)));
+
+  thrust::host_vector<item_t> h_data = d_data;
+  thrust::host_vector<item_t> h_ref(cfg::tile_size);
+
+  // Reference: item `distance` slots away, or the slot's own item when that is outside the tile.
+  for (int i = 0; i < static_cast<int>(h_data.size()); ++i)
+  {
+    const int source   = i + distance;
+    const bool in_tile = (source >= 0) && (source < cfg::tile_size);
+    h_ref[i]           = in_tile ? h_data[source] : h_data[i];
+  }
+
+  block_shuffle<cfg::items_per_thread,
+                cfg::block_dim_x,
+                cfg::block_dim_y,
+                cfg::block_dim_z>(d_data, offset_op_t{distance});
+
+  REQUIRE(h_ref == d_data);
+}
+
+CUB_TEST("Block shuffle rotate works",
+         "[shuffle][block]",
+         types,
+         single_item_per_thread,
+         block_dim_x,
+         block_dim_yz)
+{
+  using params = params_t<TestType>;
+  using type = typename params::type;
+
+  thrust::device_vector<type> d_data(params::tile_size);
+  c2h::gen(CUB_SEED(10), d_data);
+
+  // Rotation distance drawn from [0, tile_size - 1]. The expected order is built on the
+  // host from a copy of the input, so no device-side reference vector is needed.
+  const unsigned int distance =
+    GENERATE_COPY(take(4, random(0, params::tile_size - 1)));
+
+  thrust::host_vector<type> h_ref = d_data;
+  std::rotate(h_ref.begin(), h_ref.begin() + distance, h_ref.end());
+
+  block_shuffle<params::items_per_thread,
+                params::block_dim_x,
+                params::block_dim_y,
+                params::block_dim_z>(d_data, rotate_op_t{distance});
+
+  REQUIRE(h_ref == d_data);
+}
+
+CUB_TEST("Block shuffle up works",
+         "[shuffle][block]",
+         types,
+         items_per_thread,
+         block_dim_x,
+         block_dim_yz)
+{
+  using cfg    = params_t<TestType>;
+  using item_t = typename cfg::type;
+
+  thrust::device_vector<item_t> d_data(cfg::tile_size);
+  c2h::gen(CUB_SEED(10), d_data);
+  // Expected tile: items move one slot towards higher indices; slot 0 keeps its value.
+  thrust::device_vector<item_t> d_ref(cfg::tile_size);
+  thrust::copy(d_data.begin(), d_data.end() - 1, d_ref.begin() + 1);
+  thrust::copy(d_data.begin(), d_data.begin() + 1, d_ref.begin());
+
+  block_shuffle<cfg::items_per_thread,
+                cfg::block_dim_x,
+                cfg::block_dim_y,
+                cfg::block_dim_z>(d_data, up_op_t{});
+
+  REQUIRE(d_ref == d_data);
+}
+
+CUB_TEST("Block shuffle up works when suffix is required",
+         "[shuffle][block]",
+         types,
+         items_per_thread,
+         block_dim_x,
+         block_dim_yz)
+{
+  using cfg    = params_t<TestType>;
+  using item_t = typename cfg::type;
+
+  thrust::device_vector<item_t> d_data(cfg::tile_size);
+  c2h::gen(CUB_SEED(10), d_data);
+
+  const int target_thread_id =
+    GENERATE_COPY(take(2, random(0, cfg::threads_in_block - 1)));
+  // Expected tile: items move one slot towards higher indices; slot 0 keeps its value.
+  thrust::device_vector<item_t> d_ref(cfg::tile_size);
+  thrust::copy(d_data.begin(), d_data.end() - 1, d_ref.begin() + 1);
+  thrust::copy(d_data.begin(), d_data.begin() + 1, d_ref.begin());
+  // Expected suffix: the last item of the original tile.
+  thrust::device_vector<item_t> d_suffix(1);
+  thrust::device_vector<item_t> d_suffix_ref(1);
+  thrust::copy(d_data.end() - 1, d_data.end(), d_suffix_ref.begin());
+
+  item_t *d_suffix_ptr = thrust::raw_pointer_cast(d_suffix.data());
+  block_shuffle<cfg::items_per_thread,
+                cfg::block_dim_x,
+                cfg::block_dim_y,
+                cfg::block_dim_z>(
+    d_data,
+    up_with_suffix_op_t<item_t>{target_thread_id, d_suffix_ptr});
+
+  REQUIRE(d_ref == d_data);
+  REQUIRE(d_suffix_ref == d_suffix);
+}
+
+CUB_TEST("Block shuffle down works",
+         "[shuffle][block]",
+         types,
+         items_per_thread,
+         block_dim_x,
+         block_dim_yz)
+{
+  using cfg    = params_t<TestType>;
+  using item_t = typename cfg::type;
+
+  thrust::device_vector<item_t> d_data(cfg::tile_size);
+  c2h::gen(CUB_SEED(10), d_data);
+  // Expected tile: items move one slot towards lower indices; the last slot keeps its value.
+  thrust::device_vector<item_t> d_ref(cfg::tile_size);
+  thrust::copy(d_data.begin() + 1, d_data.end(), d_ref.begin());
+  thrust::copy(d_data.end() - 1, d_data.end(), d_ref.end() - 1);
+
+  block_shuffle<cfg::items_per_thread,
+                cfg::block_dim_x,
+                cfg::block_dim_y,
+                cfg::block_dim_z>(d_data, down_op_t{});
+
+  REQUIRE(d_ref == d_data);
+}
+
+CUB_TEST("Block shuffle down works when prefix is required",
+         "[shuffle][block]",
+         types,
+         items_per_thread,
+         block_dim_x,
+         block_dim_yz)
+{
+  using cfg    = params_t<TestType>;
+  using item_t = typename cfg::type;
+
+  thrust::device_vector<item_t> d_data(cfg::tile_size);
+  c2h::gen(CUB_SEED(10), d_data);
+
+  const int target_thread_id =
+    GENERATE_COPY(take(2, random(0, cfg::threads_in_block - 1)));
+  // Expected tile: items move one slot towards lower indices; the last slot keeps its value.
+  thrust::device_vector<item_t> d_ref(cfg::tile_size);
+  thrust::copy(d_data.begin() + 1, d_data.end(), d_ref.begin());
+  thrust::copy(d_data.end() - 1, d_data.end(), d_ref.end() - 1);
+  // Expected prefix: the first item of the original tile.
+  thrust::device_vector<item_t> d_prefix(1);
+  thrust::device_vector<item_t> d_prefix_ref(1);
+  thrust::copy(d_data.begin(), d_data.begin() + 1, d_prefix_ref.begin());
+
+  item_t *d_prefix_ptr = thrust::raw_pointer_cast(d_prefix.data());
+  block_shuffle<cfg::items_per_thread,
+                cfg::block_dim_x,
+                cfg::block_dim_y,
+                cfg::block_dim_z>(
+    d_data,
+    down_with_prefix_op_t<item_t>{target_thread_id, d_prefix_ptr});
+
+  REQUIRE(d_ref == d_data);
+  REQUIRE(d_prefix_ref == d_prefix);
+}
diff --git a/include/cub/test/catch2_test_block_store.cu b/include/cub/test/catch2_test_block_store.cu
new file mode 100644
index 0000000..afa3e02
--- /dev/null
+++ b/include/cub/test/catch2_test_block_store.cu
@@ -0,0 +1,327 @@
+/******************************************************************************
+ * Copyright (c) 2011-2022, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#include <cub/block/block_store.cuh>
+#include <cub/iterator/cache_modified_output_iterator.cuh>
+#include <cub/iterator/discard_output_iterator.cuh>
+#include <cub/util_allocator.cuh>
+
+// Has to go after all cub headers. Otherwise, this test won't catch unused
+// variables in cub kernels.
+#include "catch2_test_helper.h"
+
+// Default mapping: thread `t` owns positions [t * ItemsPerThread, (t + 1) * ItemsPerThread).
+template <int ItemsPerThread, int ThreadsInBlock, cub::BlockStoreAlgorithm /* StoreAlgorithm */>
+struct output_idx
+{
+  static __device__ int get(int item)
+  {
+    const int tid = static_cast<int>(threadIdx.x);
+    return tid * ItemsPerThread + item;
+  }
+};
+
+// BLOCK_STORE_STRIPED mapping: consecutive items of a thread are ThreadsInBlock positions apart.
+template <int ItemsPerThread, int ThreadsInBlock>
+struct output_idx<ItemsPerThread, ThreadsInBlock, cub::BlockStoreAlgorithm::BLOCK_STORE_STRIPED>
+{
+  static __device__ int get(int item)
+  {
+    const int tid = static_cast<int>(threadIdx.x);
+    return tid + ThreadsInBlock * item;
+  }
+};
+
+// Kernel under test, selected when BlockStore's temporary storage fits in 48 KiB: every thread
+// gathers its items from `input` into a local array, then the block writes them to `output`
+// through cub::BlockStore.
+template <typename InputIteratorT,
+          typename OutputIteratorT,
+          int ItemsPerThread,
+          int ThreadsInBlock,
+          cub::BlockStoreAlgorithm StoreAlgorithm>
+__global__ void kernel(std::integral_constant<bool, true>,
+                       InputIteratorT input,
+                       OutputIteratorT output,
+                       int num_items)
+{
+  using input_t       = cub::detail::value_t<InputIteratorT>;
+  using block_store_t = cub::BlockStore<input_t, ThreadsInBlock, ItemsPerThread, StoreAlgorithm>;
+  using idx_t         = output_idx<ItemsPerThread, ThreadsInBlock, StoreAlgorithm>;
+
+  __shared__ typename block_store_t::TempStorage storage;
+  block_store_t block_store(storage);
+
+  // Slots that map at or past num_items are never assigned.
+  input_t data[ItemsPerThread];
+  for (int item = 0; item < ItemsPerThread; ++item)
+  {
+    const int pos = idx_t::get(item);
+    if (pos < num_items)
+    {
+      data[item] = input[pos];
+    }
+  }
+
+  // A full tile uses the plain Store overload; anything shorter passes the valid item count.
+  constexpr int tile_size = ItemsPerThread * ThreadsInBlock;
+  if (num_items == tile_size)
+  {
+    block_store.Store(output, data);
+  }
+  else
+  {
+    block_store.Store(output, data, num_items);
+  }
+}
+
+// Fallback overload, selected by the `false` tag: every thread copies its items from `input`
+// to `output` directly, without going through cub::BlockStore.
+template <typename InputIteratorT,
+          typename OutputIteratorT,
+          int ItemsPerThread,
+          int ThreadsInBlock,
+          cub::BlockStoreAlgorithm /* StoreAlgorithm */>
+__global__ void kernel(std::integral_constant<bool, false>,
+                       InputIteratorT input,
+                       OutputIteratorT output,
+                       int num_items)
+{
+  using idx_t =
+    output_idx<ItemsPerThread, ThreadsInBlock, cub::BlockStoreAlgorithm::BLOCK_STORE_DIRECT>;
+
+  for (int item = 0; item < ItemsPerThread; ++item)
+  {
+    const int pos = idx_t::get(item);
+    // Positions at or past the end of the input have nothing to copy.
+    if (pos >= num_items) { continue; }
+    output[pos] = input[pos];
+  }
+}
+
+// Host-side launcher for the `kernel` overloads: one block of ThreadsInBlock threads. The tag
+// argument picks the BlockStore kernel when its TempStorage is at most 48 KiB and the plain
+// copy kernel otherwise. Launch and execution errors fail the enclosing test.
+template <int ItemsPerThread,
+          int ThreadsInBlock,
+          cub::BlockStoreAlgorithm StoreAlgorithm,
+          typename InputIteratorT,
+          typename OutputIteratorT>
+void block_store(InputIteratorT input,
+                 OutputIteratorT output,
+                 int num_items)
+{
+  using input_t = cub::detail::value_t<InputIteratorT>;
+  using storage_t =
+    typename cub::BlockStore<input_t, ThreadsInBlock, ItemsPerThread, StoreAlgorithm>::TempStorage;
+
+  // Compile-time tag used for overload selection between the two `kernel` templates.
+  using fits_in_shared_t = std::integral_constant<bool, sizeof(storage_t) <= 1024 * 48>;
+
+  constexpr int grid_size  = 1;
+  constexpr int block_size = ThreadsInBlock;
+
+  kernel<InputIteratorT,
+         OutputIteratorT,
+         ItemsPerThread,
+         ThreadsInBlock,
+         StoreAlgorithm><<<grid_size, block_size>>>(fits_in_shared_t{}, input, output, num_items);
+
+  REQUIRE(cudaSuccess == cudaPeekAtLastError());
+  REQUIRE(cudaSuccess == cudaDeviceSynchronize());
+}
+
+
+// %PARAM% IPT it 1:11
+
+using types = c2h::type_list<std::uint8_t, std::int32_t, std::int64_t>;
+using vec_types = c2h::type_list<long2, double2>;
+// Block-size lists; `a_block_size` is the single size used by the vector-type test.
+using even_threads_in_block = c2h::enum_type_list<int, 32, 128>;
+using odd_threads_in_block = c2h::enum_type_list<int, 15, 65>;
+using a_block_size = c2h::enum_type_list<int, 256>;
+// NOTE(review): IPT is not defined in this file; presumably injected via the %PARAM% line above.
+using items_per_thread = c2h::enum_type_list<int, IPT>;
+using store_algorithm  = c2h::enum_type_list<
+  cub::BlockStoreAlgorithm,
+  cub::BlockStoreAlgorithm::BLOCK_STORE_DIRECT,
+  cub::BlockStoreAlgorithm::BLOCK_STORE_STRIPED,
+  cub::BlockStoreAlgorithm::BLOCK_STORE_VECTORIZE,
+  cub::BlockStoreAlgorithm::BLOCK_STORE_TRANSPOSE,
+  cub::BlockStoreAlgorithm::BLOCK_STORE_WARP_TRANSPOSE,             
+  cub::BlockStoreAlgorithm::BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED>;
+// Algorithms paired with `odd_threads_in_block`; omits both WARP_TRANSPOSE variants.
+using odd_store_algorithm = c2h::enum_type_list<
+  cub::BlockStoreAlgorithm,
+  cub::BlockStoreAlgorithm::BLOCK_STORE_DIRECT,
+  cub::BlockStoreAlgorithm::BLOCK_STORE_STRIPED,
+  cub::BlockStoreAlgorithm::BLOCK_STORE_VECTORIZE,
+  cub::BlockStoreAlgorithm::BLOCK_STORE_TRANSPOSE>;
+
+// Unpacks a CUB_TEST parameter list: <0> element type, <1> items per thread,
+// <2> threads per block, <3> store algorithm.
+template <class TestType>
+struct params_t
+{
+  using type = typename c2h::get<0, TestType>;
+  static constexpr int items_per_thread = c2h::get<1, TestType>::value;
+  static constexpr int threads_in_block = c2h::get<2, TestType>::value;
+  static constexpr int tile_size        = threads_in_block * items_per_thread;
+  static constexpr cub::BlockStoreAlgorithm store_algorithm = c2h::get<3, TestType>::value;
+};
+
+// Sends a randomly sized input (drawn from random(0, tile_size)) through the block_store helper
+// and expects an identical output, for each type / block size / algorithm combination.
+CUB_TEST("Block store works with even block sizes",
+         "[store][block]",
+         types,
+         items_per_thread,
+         even_threads_in_block,
+         store_algorithm)
+{
+  using params = params_t<TestType>;
+  using type   = typename params::type;
+
+  const int num_items = GENERATE_COPY(take(10, random(0, params::tile_size)));
+  thrust::device_vector<type> d_input(num_items);
+  c2h::gen(CUB_SEED(10), d_input);
+  thrust::device_vector<type> d_output(d_input.size());
+
+  type *d_in  = thrust::raw_pointer_cast(d_input.data());
+  type *d_out = thrust::raw_pointer_cast(d_output.data());
+  block_store<params::items_per_thread, params::threads_in_block, params::store_algorithm>(
+    d_in, d_out, num_items);
+
+  REQUIRE(d_input == d_output);
+}
+
+// Round trip through block_store using block sizes from `odd_threads_in_block` and the algorithm
+// subset in `odd_store_algorithm`; the input length is drawn from random(0, tile_size).
+CUB_TEST("Block store works with odd block sizes",
+         "[store][block]",
+         types,
+         items_per_thread,
+         odd_threads_in_block,
+         odd_store_algorithm)
+{
+  using params = params_t<TestType>;
+  using type   = typename params::type;
+
+  thrust::device_vector<type> d_input(
+    GENERATE_COPY(take(10, random(0, params::tile_size))));
+  c2h::gen(CUB_SEED(10), d_input);
+
+  thrust::device_vector<type> d_output(d_input.size());
+
+  block_store<params::items_per_thread, params::threads_in_block, params::store_algorithm>(
+    thrust::raw_pointer_cast(d_input.data()),
+    thrust::raw_pointer_cast(d_output.data()),
+    static_cast<int>(d_input.size()));
+
+  REQUIRE(d_input == d_output);
+}
+
+// Round trip through block_store for the CUDA vector types in `vec_types`, using the single
+// block size in `a_block_size`; the input length is drawn from random(0, tile_size).
+CUB_TEST("Block store works with vector types",
+         "[store][block]",
+         vec_types,
+         items_per_thread,
+         a_block_size,
+         store_algorithm)
+{
+  using params = params_t<TestType>;
+  using type   = typename params::type;
+
+  thrust::device_vector<type> d_input(
+    GENERATE_COPY(take(10, random(0, params::tile_size))));
+  c2h::gen(CUB_SEED(10), d_input);
+
+  thrust::device_vector<type> d_output(d_input.size());
+
+  block_store<params::items_per_thread, params::threads_in_block, params::store_algorithm>(
+    thrust::raw_pointer_cast(d_input.data()),
+    thrust::raw_pointer_cast(d_output.data()),
+    static_cast<int>(d_input.size()));
+
+  REQUIRE(d_input == d_output);
+}
+
+// Round trip for a user-defined element type (c2h custom type with equality comparison) with a
+// fixed block of 64 threads, across every store algorithm.
+CUB_TEST("Block store works with custom types",
+         "[store][block]",
+         items_per_thread,
+         store_algorithm)
+{
+  using type = c2h::custom_type_t<c2h::equal_comparable_t>;
+
+  constexpr int ipt           = c2h::get<0, TestType>::value;
+  constexpr int block_threads = 64;
+  constexpr int tile_size     = ipt * block_threads;
+  static constexpr cub::BlockStoreAlgorithm algorithm = c2h::get<1, TestType>::value;
+
+  const int num_items = GENERATE_COPY(take(10, random(0, tile_size)));
+  thrust::device_vector<type> d_input(num_items);
+  c2h::gen(CUB_SEED(10), d_input);
+  thrust::device_vector<type> d_output(d_input.size());
+
+  type *d_in  = thrust::raw_pointer_cast(d_input.data());
+  type *d_out = thrust::raw_pointer_cast(d_output.data());
+  block_store<ipt, block_threads, algorithm>(d_in, d_out, num_items);
+
+  REQUIRE(d_input == d_output);
+}
+
+// Round trip with the output wrapped in cub::CacheModifiedOutputIterator (STORE_DEFAULT) instead
+// of a raw pointer; uses `int` items and a fixed block of 64 threads.
+CUB_TEST("Block store works with caching iterators",
+         "[store][block]",
+         items_per_thread,
+         store_algorithm)
+{
+  using type       = int;
+  using out_iter_t = cub::CacheModifiedOutputIterator<cub::CacheStoreModifier::STORE_DEFAULT, type>;
+
+  constexpr int ipt           = c2h::get<0, TestType>::value;
+  constexpr int block_threads = 64;
+  constexpr int tile_size     = ipt * block_threads;
+  static constexpr cub::BlockStoreAlgorithm algorithm = c2h::get<1, TestType>::value;
+
+  const int num_items = GENERATE_COPY(take(10, random(0, tile_size)));
+  thrust::device_vector<type> d_input(num_items);
+  c2h::gen(CUB_SEED(10), d_input);
+
+  thrust::device_vector<type> d_output(d_input.size());
+  out_iter_t out(thrust::raw_pointer_cast(d_output.data()));
+
+  type *d_in = thrust::raw_pointer_cast(d_input.data());
+  block_store<ipt, block_threads, algorithm>(d_in, out, num_items);
+
+  REQUIRE(d_input == d_output);
+}
+
diff --git a/include/cub/test/catch2_test_cdp_helper.h b/include/cub/test/catch2_test_cdp_helper.h
new file mode 100644
index 0000000..f48f590
--- /dev/null
+++ b/include/cub/test/catch2_test_cdp_helper.h
@@ -0,0 +1,170 @@
+/******************************************************************************
+ * Copyright (c) 2011-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#pragma once
+
+#include <thrust/device_vector.h>
+#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
+
+#include "catch2_test_helper.h"
+
+//! @file This file contains utilities for device-scope API tests
+//!
+//! Device-scope API in CUB can be launched from the host or device side.
+//! Utilities in this file facilitate testing in both cases.
+//!
+//!
+//! ```
+//! // Add PARAM to make CMake generate a test for both host and device launch:
+//! // %PARAM% TEST_CDP cdp 0:1
+//!
+//! // Declare CDP wrapper for CUB API. The wrapper will accept the same
+//! // arguments as the CUB API. The wrapper name is provided as the second argument.
+//! DECLARE_CDP_WRAPPER(cub::DeviceReduce::Sum, cub_reduce_sum);
+//!
+//! CUB_TEST("Reduce test", "[device][reduce]")
+//! {
+//!   // ...
+//!   // Invoke the wrapper from the test. It'll allocate temporary storage and
+//!   // invoke the CUB API on the host or device side while checking return
+//!   // codes and launch errors.
+//!   cub_reduce_sum(d_in, d_out, n, should_be_invoked_on_device);
+//! }
+//!
+//! ```
+//!
+//! Consult `test/catch2_test_cdp_wrapper.cu` for more usage examples.
+
+#if !defined(TEST_CDP)
+#error Test file should contain %PARAM% TEST_CDP cdp 0:1
+#endif
+
+#if TEST_CDP == 1
+// Kernel trampoline: runs `action` on the device and stores its return code in `*d_error`.
+template <class ActionT, class... Args>
+__global__ void device_side_api_launch_kernel(
+  std::uint8_t *d_temp_storage, std::size_t *temp_storage_bytes, cudaError_t *d_error,
+  ActionT action, Args... args)
+{
+  std::size_t &bytes = *temp_storage_bytes;
+  *d_error           = action(d_temp_storage, bytes, args...);
+}
+
+// Device-side launch: runs `action` inside a <<<1, 1>>> kernel twice - first with a null
+// temporary storage pointer, then with a device buffer of the size left by the first pass.
+template <class ActionT, class... Args>
+void device_side_api_launch(ActionT action, Args... args)
+{
+  thrust::device_vector<cudaError_t> d_error(1, cudaErrorInvalidValue);
+  thrust::device_vector<std::size_t> d_temp_storage_bytes(1, 0);
+  cudaError_t *d_error_ptr = thrust::raw_pointer_cast(d_error.data());
+  std::size_t *d_bytes_ptr = thrust::raw_pointer_cast(d_temp_storage_bytes.data());
+
+  // Pass 1: null storage; the action is expected to write the required size to *d_bytes_ptr.
+  std::uint8_t *d_null_storage = nullptr;
+  device_side_api_launch_kernel<<<1, 1>>>(
+    d_null_storage, d_bytes_ptr, d_error_ptr, action, args...);
+  REQUIRE(cudaSuccess == cudaPeekAtLastError());
+  REQUIRE(cudaSuccess == cudaDeviceSynchronize());
+  REQUIRE(cudaSuccess == d_error[0]);
+
+  // Pass 2: same action, now with a buffer of the size read back from the device.
+  thrust::device_vector<std::uint8_t> temp_storage(d_temp_storage_bytes[0]);
+  std::uint8_t *d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
+  device_side_api_launch_kernel<<<1, 1>>>(
+    d_temp_storage, d_bytes_ptr, d_error_ptr, action, args...);
+  REQUIRE(cudaSuccess == cudaPeekAtLastError());
+  REQUIRE(cudaSuccess == cudaDeviceSynchronize());
+  REQUIRE(cudaSuccess == d_error[0]);
+}
+
+#define cdp_launch device_side_api_launch
+
+// Declares `<WRAPPED_API_NAME>_device_invocable_t`, a functor whose call operator forwards to API.
+#define DECLARE_CDP_INVOCABLE(API, WRAPPED_API_NAME) \
+  struct WRAPPED_API_NAME##_device_invocable_t \
+  { \
+    template <class... ArgTs> \
+    CUB_RUNTIME_FUNCTION cudaError_t \
+    operator()(std::uint8_t *d_temp_storage, std::size_t &temp_storage_bytes, ArgTs... api_args) \
+    { \
+      return API(d_temp_storage, temp_storage_bytes, api_args...); \
+    } \
+  };
+
+// Defines `WRAPPED_API_NAME(args...)`, which runs API through cdp_launch (device-side variant).
+#define DECLARE_CDP_WRAPPER(API, WRAPPED_API_NAME) \
+  DECLARE_CDP_INVOCABLE(API, WRAPPED_API_NAME); \
+  template <class... ArgTs> static void WRAPPED_API_NAME(ArgTs... api_args) \
+  { \
+    cdp_launch(WRAPPED_API_NAME##_device_invocable_t{}, api_args...); \
+  }
+
+#else
+
+// Host-side launch: a sizing call with null storage, then a second call with a buffer of that size.
+template <class ActionT, class... Args>
+void host_side_api_launch(ActionT action, Args... args)
+{
+  std::size_t temp_storage_bytes{};
+  std::uint8_t *d_temp_storage = nullptr;
+  cudaError_t error            = action(d_temp_storage, temp_storage_bytes, args...);
+  REQUIRE(cudaSuccess == cudaPeekAtLastError());
+  REQUIRE(cudaSuccess == cudaDeviceSynchronize());
+  REQUIRE(cudaSuccess == error);
+
+  thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
+  d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
+  error          = action(d_temp_storage, temp_storage_bytes, args...);
+  REQUIRE(cudaSuccess == cudaPeekAtLastError());
+  REQUIRE(cudaSuccess == cudaDeviceSynchronize());
+  REQUIRE(cudaSuccess == error);
+}
+
+#define cdp_launch host_side_api_launch
+
+// Declares `<WRAPPED_API_NAME>_host_invocable_t`, a functor whose call operator forwards to API.
+#define DECLARE_CDP_INVOCABLE(API, WRAPPED_API_NAME) \
+  struct WRAPPED_API_NAME##_host_invocable_t \
+  { \
+    template <class... ArgTs> \
+    CUB_RUNTIME_FUNCTION cudaError_t \
+    operator()(std::uint8_t *d_temp_storage, std::size_t &temp_storage_bytes, ArgTs... api_args) \
+    { \
+      return API(d_temp_storage, temp_storage_bytes, api_args...); \
+    } \
+  };
+
+// Defines `WRAPPED_API_NAME(args...)`, which runs API through cdp_launch (host-side variant).
+#define DECLARE_CDP_WRAPPER(API, WRAPPED_API_NAME) \
+  DECLARE_CDP_INVOCABLE(API, WRAPPED_API_NAME); \
+  template <class... ArgTs> static void WRAPPED_API_NAME(ArgTs... api_args) \
+  { \
+    cdp_launch(WRAPPED_API_NAME##_host_invocable_t{}, api_args...); \
+  }
+
+#endif
diff --git a/include/cub/test/catch2_test_cdp_wrapper.cu b/include/cub/test/catch2_test_cdp_wrapper.cu
new file mode 100644
index 0000000..edd7238
--- /dev/null
+++ b/include/cub/test/catch2_test_cdp_wrapper.cu
@@ -0,0 +1,229 @@
+/******************************************************************************
+ * Copyright (c) 2011-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#include <thrust/count.h>
+
+#include <cuda/std/tuple>
+
+// Has to go after all cub headers. Otherwise, this test won't catch unused
+// variables in cub kernels.
+#include "catch2_test_cdp_helper.h"
+#include "catch2_test_helper.h"
+
+// %PARAM% TEST_CDP cdp 0:1
+
+// Element-wise kernel: d_out[i] = d_in[i] * 2 for each global thread index i < num_items.
+template <class T>
+__global__ void cub_api_example_x2_0_kernel(const T *d_in, T *d_out, int num_items)
+{
+  const int gid = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if (gid >= num_items) { return; }
+
+  d_out[gid] = d_in[gid] * T{2};
+}
+
+// Element-wise kernel: d_out[i] = d_in[i] / 2 for each global thread index i < num_items.
+template <class T>
+__global__ void cub_api_example_x0_5_kernel(const T *d_in, T *d_out, int num_items)
+{
+  const int gid = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if (gid >= num_items) { return; }
+
+  d_out[gid] = d_in[gid] / T{2};
+}
+
+// Stand-in for a CUB device-scope algorithm, used to exercise the CDP launch helpers. It follows
+// the two-call temporary storage convention and rejects calls made from the unexpected side.
+struct cub_api_example_t
+{
+  static constexpr int threads_in_block = 256;
+
+  // Common implementation behind x2_0 / x0_5: validate, then launch `kernel` over num_items.
+  template <class T, class KernelT>
+  CUB_RUNTIME_FUNCTION static cudaError_t invoke(std::uint8_t *d_temp_storage,
+                                                 std::size_t &temp_storage_bytes,
+                                                 KernelT kernel,
+                                                 const T *d_in,
+                                                 T *d_out,
+                                                 int num_items,
+                                                 bool should_be_invoked_on_device)
+  {
+    // Fail when the code path being compiled (host or device) disagrees with the caller's flag.
+    NV_IF_TARGET(NV_IS_HOST,
+                 (if (should_be_invoked_on_device) { return cudaErrorLaunchFailure; }),
+                 (if (!should_be_invoked_on_device) { return cudaErrorLaunchFailure; }));
+
+    const auto required_bytes = static_cast<std::size_t>(num_items);
+
+    // First call (null storage): only report the required size.
+    if (d_temp_storage == nullptr)
+    {
+      temp_storage_bytes = required_bytes;
+      return cudaSuccess;
+    }
+
+    // Second call: the size handed back must match what the first call reported.
+    if (temp_storage_bytes != required_bytes)
+    {
+      return cudaErrorInvalidValue;
+    }
+
+    // One thread per item, rounded up to whole blocks.
+    const int grid_size = (num_items + threads_in_block - 1) / threads_in_block;
+    return thrust::cuda_cub::launcher::triple_chevron(grid_size, threads_in_block, 0, 0)
+      .doit(kernel, d_in, d_out, num_items);
+  }
+
+  // d_out[i] = d_in[i] * 2
+  template <class T>
+  CUB_RUNTIME_FUNCTION static cudaError_t x2_0(std::uint8_t *d_temp_storage,
+                                               std::size_t &temp_storage_bytes,
+                                               const T *d_in,
+                                               T *d_out,
+                                               int num_items,
+                                               bool should_be_invoked_on_device)
+  {
+    return invoke(d_temp_storage, temp_storage_bytes, cub_api_example_x2_0_kernel<T>,
+                  d_in, d_out, num_items, should_be_invoked_on_device);
+  }
+
+  // d_out[i] = d_in[i] / 2
+  template <class T>
+  CUB_RUNTIME_FUNCTION static cudaError_t x0_5(std::uint8_t *d_temp_storage,
+                                               std::size_t &temp_storage_bytes,
+                                               const T *d_in,
+                                               T *d_out,
+                                               int num_items,
+                                               bool should_be_invoked_on_device)
+  {
+    return invoke(d_temp_storage, temp_storage_bytes, cub_api_example_x0_5_kernel<T>,
+                  d_in, d_out, num_items, should_be_invoked_on_device);
+  }
+};
+
+DECLARE_CDP_WRAPPER(cub_api_example_t::x2_0, x2_0); // defines x2_0(...), routed via cdp_launch
+DECLARE_CDP_WRAPPER(cub_api_example_t::x0_5, x0_5); // defines x0_5(...), routed via cdp_launch
+
+// Runs the macro-generated x2_0 / x0_5 wrappers back to back: 21 -> 42 into `out`, then 42 -> 21
+// in place, verifying after each call that every element holds the expected value.
+CUB_TEST("CDP wrapper works with predefined invocables", "[test][utils]")
+{
+  constexpr bool should_be_invoked_on_device = TEST_CDP;
+
+  int n = 42;
+  thrust::device_vector<int> in(n, 21);
+  thrust::device_vector<int> out(n);
+  int *d_in  = thrust::raw_pointer_cast(in.data());
+  int *d_out = thrust::raw_pointer_cast(out.data());
+
+  const auto expected = static_cast<std::size_t>(n);
+  {
+    x2_0(d_in, d_out, n, should_be_invoked_on_device);
+
+    const auto matches = thrust::count(out.begin(), out.end(), 42);
+    const auto actual  = static_cast<std::size_t>(matches);
+    REQUIRE(actual == expected);
+  }
+
+  {
+    x0_5(d_out, d_out, n, should_be_invoked_on_device);
+
+    const auto matches = thrust::count(out.begin(), out.end(), 21);
+    const auto actual  = static_cast<std::size_t>(matches);
+    REQUIRE(actual == expected);
+  }
+}
+
+// Hand-written functor (no DECLARE_CDP_WRAPPER involved) that forwards to
+// cub_api_example_t::x2_0, so it can be handed directly to cdp_launch.
+struct custom_x2_0_invocable
+{
+  template <class T>
+  CUB_RUNTIME_FUNCTION cudaError_t operator()(std::uint8_t *d_temp_storage,
+                                              std::size_t &temp_storage_bytes,
+                                              const T *d_in,
+                                              T *d_out,
+                                              int num_items,
+                                              bool should_be_invoked_on_device)
+  {
+    using api = cub_api_example_t;
+    return api::x2_0(
+      d_temp_storage, temp_storage_bytes,
+      d_in, d_out, num_items, should_be_invoked_on_device);
+  }
+};
+
+// Hand-written functor (no DECLARE_CDP_WRAPPER involved) that forwards to
+// cub_api_example_t::x0_5, so it can be handed directly to cdp_launch.
+struct custom_x0_5_invocable
+{
+  template <class T>
+  CUB_RUNTIME_FUNCTION cudaError_t operator()(std::uint8_t *d_temp_storage,
+                                              std::size_t &temp_storage_bytes,
+                                              const T *d_in,
+                                              T *d_out,
+                                              int num_items,
+                                              bool should_be_invoked_on_device)
+  {
+    using api = cub_api_example_t;
+    return api::x0_5(
+      d_temp_storage, temp_storage_bytes,
+      d_in, d_out, num_items, should_be_invoked_on_device);
+  }
+};
+
+// Same 21 -> 42 -> 21 sequence as the predefined-invocable test, but passing the hand-written
+// functors straight to cdp_launch instead of using macro-generated wrappers.
+CUB_TEST("CDP wrapper works with custom invocables", "[test][utils]")
+{
+  constexpr bool should_be_invoked_on_device = TEST_CDP;
+
+  int n = 42;
+  thrust::device_vector<int> in(n, 21);
+  thrust::device_vector<int> out(n);
+  int *d_in  = thrust::raw_pointer_cast(in.data());
+  int *d_out = thrust::raw_pointer_cast(out.data());
+
+  const auto expected = static_cast<std::size_t>(n);
+  {
+    cdp_launch(custom_x2_0_invocable{}, d_in, d_out, n, should_be_invoked_on_device);
+
+    const auto matches = thrust::count(out.begin(), out.end(), 42);
+    const auto actual  = static_cast<std::size_t>(matches);
+    REQUIRE(actual == expected);
+  }
+
+  {
+    cdp_launch(custom_x0_5_invocable{}, d_out, d_out, n, should_be_invoked_on_device);
+
+    const auto matches = thrust::count(out.begin(), out.end(), 21);
+    const auto actual  = static_cast<std::size_t>(matches);
+    REQUIRE(actual == expected);
+  }
+}
diff --git a/include/cub/test/catch2_test_device_decoupled_look_back.cu b/include/cub/test/catch2_test_device_decoupled_look_back.cu
new file mode 100644
index 0000000..9b4b2f2
--- /dev/null
+++ b/include/cub/test/catch2_test_device_decoupled_look_back.cu
@@ -0,0 +1,168 @@
+/******************************************************************************
+ * Copyright (c) 2011-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#undef NDEBUG
+#include <cub/device/device_scan.cuh>
+
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+
+#include <cassert>
+
+// Has to go after all cub headers. Otherwise, this test won't catch unused
+// variables in cub kernels.
+#include "catch2_test_helper.h"
+
+template <class ScanTileStateT>
+__global__ void init_kernel(ScanTileStateT tile_state, int blocks_in_grid)
+{
+  tile_state.InitializeStatus(blocks_in_grid); // the kernel only delegates to the tile state
+}
+
+template <class MessageT>
+__global__ void decoupled_look_back_kernel(cub::ScanTileState<MessageT> tile_state,
+                                           MessageT *tile_data)
+{
+  using scan_op_t         = cub::Sum;
+  using scan_tile_state_t = cub::ScanTileState<MessageT>;
+  using tile_prefix_op    = cub::TilePrefixCallbackOp<MessageT, scan_op_t, scan_tile_state_t>;
+  using temp_storage_t    = typename tile_prefix_op::TempStorage;
+  // Each block reads tile_data[tile_idx] as its tile aggregate and overwrites that slot with
+  // the tile's inclusive prefix. `temp_storage` is shared memory handed to the prefix op.
+  __shared__ temp_storage_t temp_storage;
+
+  scan_op_t scan_op{};
+  const unsigned int threads_in_warp = 32;
+  const unsigned int tid             = threadIdx.x;
+
+  // Construct prefix op
+  tile_prefix_op prefix(tile_state, temp_storage, scan_op);
+  const unsigned int tile_idx = prefix.GetTileIdx();
+
+  // "Compute" tile aggregate: every thread of the block reads the same slot.
+  MessageT tile_aggregate = tile_data[tile_idx];
+  __syncthreads(); // all reads above must complete before thread 0 overwrites the slot below
+  if (tile_idx == 0)
+  {
+    // There are no blocks to look back to, immediately set the inclusive state
+    if (tid == 0)
+    {
+      tile_state.SetInclusive(tile_idx, tile_aggregate);
+      tile_data[tile_idx] = tile_aggregate;
+    }
+  }
+  else
+  {
+    // Only the first warp in the block can perform the look back
+    const unsigned int warp_id = tid / threads_in_warp;
+
+    if (warp_id == 0)
+    {
+      // Perform the decoupled look-back
+      // Invocation of the prefix will block until the look-back is complete.
+      MessageT exclusive_prefix = prefix(tile_aggregate);
+
+      if (tid == 0)
+      {
+        MessageT inclusive_prefix = scan_op(exclusive_prefix, tile_aggregate);
+        tile_data[tile_idx]       = inclusive_prefix;
+      }
+    }
+    __syncthreads();
+    // Past the barrier, every thread cross-checks the stored result against the prefix op.
+    assert(tile_data[tile_idx] == prefix.GetInclusivePrefix());
+    assert(tile_aggregate == prefix.GetBlockAggregate());
+  }
+}
+
+using message_types = c2h::type_list<std::uint8_t, std::uint16_t, std::uint32_t, std::uint64_t>;
+
+// Builds the expected result on the host: element i of the returned vector is the running sum
+// of tile_aggregates[0..i], accumulated in `MessageT`. An empty input gives an empty vector.
+template <class MessageT>
+thrust::host_vector<MessageT>
+compute_reference(const thrust::device_vector<MessageT> &tile_aggregates)
+{
+  if (tile_aggregates.empty())
+  {
+    return {};
+  }
+
+  thrust::host_vector<MessageT> prefix_sums = tile_aggregates;
+  MessageT *const sums                      = thrust::raw_pointer_cast(prefix_sums.data());
+  const std::size_t count                   = prefix_sums.size();
+  MessageT running_sum                      = sums[0];
+  for (std::size_t idx = 1; idx < count; ++idx)
+  {
+    sums[idx] = (running_sum += sums[idx]);
+  }
+  return prefix_sums;
+}
+
+CUB_TEST("Decoupled look-back works with various message types",
+         "[decoupled look-back][device]",
+         message_types)
+{
+  using message_t         = typename c2h::get<0, TestType>;
+  using scan_tile_state_t = cub::ScanTileState<message_t>;
+  // The number of tiles (one thread block each in the look-back launch) is randomized.
+  const int max_tiles = 1024 * 1024;
+  const int num_tiles = GENERATE_COPY(take(10, random(1, max_tiles)));
+
+  thrust::device_vector<message_t> tile_data(num_tiles);
+  message_t *d_tile_data = thrust::raw_pointer_cast(tile_data.data());
+  // Generate the per-tile inputs and compute the expected inclusive prefix sums on the host.
+  c2h::gen(CUB_SEED(2), tile_data);
+  thrust::host_vector<message_t> reference = compute_reference(tile_data);
+
+  // Query temporary storage requirements
+  std::size_t temp_storage_bytes{};
+  scan_tile_state_t::AllocationSize(num_tiles, temp_storage_bytes);
+
+  // Allocate temporary storage
+  thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
+  std::uint8_t *d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
+
+  // Initialize temporary storage
+  scan_tile_state_t tile_status;
+  cudaError_t status = tile_status.Init(num_tiles, d_temp_storage, temp_storage_bytes);
+  REQUIRE(status == cudaSuccess);
+  // Run the status-initialization kernel with enough 256-thread blocks to cover every tile.
+  const unsigned int threads_in_init_block = 256;
+  const unsigned int blocks_in_init_grid = cub::DivideAndRoundUp(num_tiles, threads_in_init_block);
+  init_kernel<<<blocks_in_init_grid, threads_in_init_block>>>(tile_status, num_tiles);
+  REQUIRE(cudaSuccess == cudaPeekAtLastError());
+  REQUIRE(cudaSuccess == cudaDeviceSynchronize());
+
+  // Launch decoupled look-back
+  const unsigned int threads_in_block = 256;
+  decoupled_look_back_kernel<<<num_tiles, threads_in_block>>>(tile_status, d_tile_data);
+  REQUIRE(cudaSuccess == cudaPeekAtLastError());
+  REQUIRE(cudaSuccess == cudaDeviceSynchronize());
+  // The kernel rewrote `tile_data` in place; it must now equal the host reference.
+  REQUIRE(reference == tile_data);
+}
diff --git a/include/cub/test/catch2_test_device_radix_sort_custom.cu b/include/cub/test/catch2_test_device_radix_sort_custom.cu
new file mode 100644
index 0000000..ff89801
--- /dev/null
+++ b/include/cub/test/catch2_test_device_radix_sort_custom.cu
@@ -0,0 +1,1693 @@
+/******************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#include <cub/device/device_radix_sort.cuh>
+
+#include <thrust/detail/raw_pointer_cast.h>
+#include <thrust/device_vector.h>
+#include <thrust/gather.h>
+#include <thrust/host_vector.h>
+#include <thrust/reverse.h>
+#include <thrust/sequence.h>
+
+#include <algorithm>
+#include <bitset>
+#include <climits>
+#include <limits>
+
+// Has to go after all cub headers. Otherwise, this test won't catch unused
+// variables in cub kernels.
+#include "catch2_test_cdp_helper.h"
+#include "catch2_test_helper.h"
+#include "cub/util_type.cuh"
+
+DECLARE_CDP_WRAPPER(cub::DeviceRadixSort::SortKeys, sort_keys);
+DECLARE_CDP_WRAPPER(cub::DeviceRadixSort::SortPairs, sort_pairs);
+DECLARE_CDP_WRAPPER(cub::DeviceRadixSort::SortKeysDescending, sort_keys_descending);
+DECLARE_CDP_WRAPPER(cub::DeviceRadixSort::SortPairsDescending, sort_pairs_descending);
+
+// %PARAM% TEST_CDP cdp 0:1
+
+using key   = c2h::custom_type_t<c2h::equal_comparable_t,
+                               c2h::lexicographical_less_comparable_t,
+                               c2h::lexicographical_greater_comparable_t>;
+using value = std::size_t;
+
+// Decomposer that exposes only the `key` member of a custom type to the sort.
+struct key_decomposer_t
+{
+  // Returns a one-element tuple holding a reference to `key.key`.
+  template <template <typename> class... Ps>
+  __host__ __device__ ::cuda::std::tuple<std::size_t &>
+  operator()(c2h::custom_type_t<Ps...> &key) const
+  { return ::cuda::std::tuple<std::size_t &>{key.key}; }
+};
+
+// Decomposer that exposes both the `key` and `val` members of a custom type to the sort.
+struct pair_decomposer_t
+{
+  // Returns a two-element tuple of references, `key.key` first and `key.val` second.
+  template <template <typename> class... Ps>
+  __host__ __device__ ::cuda::std::tuple<std::size_t &, std::size_t &>
+  operator()(c2h::custom_type_t<Ps...> &key) const
+  { return ::cuda::std::tuple<std::size_t &, std::size_t &>{key.key, key.val}; }
+};
+
+constexpr std::size_t bits_per_size_t = sizeof(std::size_t) * CHAR_BIT; // width of one member
+constexpr std::size_t bits_per_pair_t = bits_per_size_t * 2; // two members packed together
+
+// Packs `key.key` (upper half) and `key.val` (lower half) into one bitset and clears every
+// bit that lies outside the half-open range [begin_bit, end_bit).
+template <template <typename> class... Ps>
+std::bitset<bits_per_pair_t> to_bitset(c2h::custom_type_t<Ps...> &key, int begin_bit, int end_bit)
+{
+  std::bitset<bits_per_pair_t> packed = std::bitset<bits_per_pair_t>(key.key) << bits_per_size_t;
+  packed |= std::bitset<bits_per_pair_t>(key.val);
+  // First drop the low bits [0, begin_bit), then the high bits [end_bit, bits_per_pair_t).
+  int pos = 0;
+  while (pos < begin_bit)
+  {
+    packed.reset(pos++);
+  }
+  for (pos = end_bit; pos < static_cast<int>(bits_per_pair_t); ++pos)
+  {
+    packed.reset(pos);
+  }
+  return packed;
+}
+
+// Inverse of `to_bitset`: the upper half of `bits` is stored in `pair.key`, the lower half in
+// `pair.val`. `bits` is taken by value, so the caller's bitset is not modified.
+template <template <typename> class... Ps>
+void from_bitset(std::bitset<bits_per_pair_t> bits, c2h::custom_type_t<Ps...> &pair)
+{
+  pair.key = (bits >> bits_per_size_t).to_ullong();
+  pair.val = ((bits << bits_per_size_t) >> bits_per_size_t).to_ullong();
+}
+
+// Returns a copy of `keys` in which every bit outside [begin_bit, end_bit) is cleared. When
+// the range covers the whole packed pair, the keys are returned unchanged.
+static thrust::host_vector<key> get_striped_keys(thrust::host_vector<key> keys,
+                                                 int begin_bit,
+                                                 int end_bit)
+{
+  const bool needs_masking = (begin_bit > 0) || (end_bit < static_cast<int>(bits_per_pair_t));
+  for (std::size_t idx = 0; needs_masking && idx < keys.size(); ++idx)
+  {
+    key &item = keys[idx];
+    from_bitset(to_bitset(item, begin_bit, end_bit), item);
+  }
+  return keys;
+}
+
+// Computes the stable sort order of `h_keys` restricted to bits [begin_bit, end_bit):
+// result[i] is the index in `h_keys` of the element that lands at sorted position i.
+static thrust::host_vector<std::size_t> get_permutation(const thrust::host_vector<key> &h_keys,
+                                                        bool is_descending,
+                                                        int begin_bit,
+                                                        int end_bit)
+{
+  thrust::host_vector<key> masked_keys = get_striped_keys(h_keys, begin_bit, end_bit);
+
+  thrust::host_vector<std::size_t> order(h_keys.size());
+  thrust::sequence(order.begin(), order.end());
+
+  // Indices are compared through their masked keys; ties keep their original relative order.
+  const auto precedes = [&](std::size_t lhs, std::size_t rhs) {
+    return is_descending ? masked_keys[lhs] > masked_keys[rhs]
+                         : masked_keys[lhs] < masked_keys[rhs];
+  };
+  std::stable_sort(order.begin(), order.end(), precedes);
+
+  return order;
+}
+
+// Host reference: copies the keys to the host and returns them reordered by `get_permutation`.
+static thrust::device_vector<key> reference_sort_keys(const thrust::device_vector<key> &d_keys,
+                                                      bool is_descending,
+                                                      int begin_bit,
+                                                      int end_bit)
+{
+  const thrust::host_vector<key> host_keys(d_keys);
+  const auto order = get_permutation(host_keys, is_descending, begin_bit, end_bit);
+  thrust::host_vector<key> sorted_keys(host_keys.size());
+  thrust::gather(order.cbegin(), order.cend(), host_keys.cbegin(), sorted_keys.begin());
+  return sorted_keys;
+}
+
+// Host reference for pair sorting: applies the permutation from `get_permutation` to keys and
+// values together and returns both reordered sequences.
+static std::pair<thrust::device_vector<key>, thrust::device_vector<value>>
+reference_sort_pairs(const thrust::device_vector<key> &d_keys,
+                     const thrust::device_vector<value> &d_values,
+                     bool is_descending,
+                     int begin_bit,
+                     int end_bit)
+{
+  const thrust::host_vector<key> host_keys(d_keys);
+  const thrust::host_vector<value> host_values(d_values);
+  const auto order = get_permutation(host_keys, is_descending, begin_bit, end_bit);
+
+  thrust::host_vector<key> sorted_keys(host_keys.size());
+  thrust::host_vector<value> sorted_values(host_values.size());
+
+  const auto source      = thrust::make_zip_iterator(host_keys.cbegin(), host_values.cbegin());
+  const auto destination = thrust::make_zip_iterator(sorted_keys.begin(), sorted_values.begin());
+  thrust::gather(order.cbegin(), order.cend(), source, destination);
+  return std::make_pair(sorted_keys, sorted_values);
+}
+
+CUB_TEST("Device radix sort works with parts of custom i128_t", "[radix][sort][device]")
+{
+  const int max_items = 1 << 18;
+  const int num_items = GENERATE_COPY(take(4, random(max_items / 2, max_items)));
+  // key_decomposer_t exposes only `key.key`; the reference therefore sorts on bits [64, 128).
+  thrust::device_vector<key> in_keys(num_items);
+  thrust::device_vector<key> out_keys(num_items);
+  c2h::gen(CUB_SEED(10), in_keys);
+  // NOTE(review): 64/128 assume a 64-bit std::size_t (cf. bits_per_size_t) - confirm.
+  auto reference_keys = reference_sort_keys(in_keys, false, 64, 128);
+  sort_keys(thrust::raw_pointer_cast(in_keys.data()),
+            thrust::raw_pointer_cast(out_keys.data()),
+            num_items,
+            key_decomposer_t{});
+
+  REQUIRE(reference_keys == out_keys);
+}
+
+CUB_TEST("Device radix descending sort works with custom i128_t", "[radix][sort][device]")
+{
+  const int max_items = 1 << 18;
+  const int num_items = GENERATE_COPY(take(4, random(max_items / 2, max_items)));
+  // Despite the test name, both directions are exercised via `is_descending` below.
+  thrust::device_vector<key> in_keys(num_items);
+  thrust::device_vector<key> out_keys(num_items);
+  c2h::gen(CUB_SEED(10), in_keys);
+  // pair_decomposer_t exposes both members, so the reference covers the full [0, 128) range.
+  const bool is_descending = GENERATE(false, true);
+  auto reference_keys      = reference_sort_keys(in_keys, is_descending, 0, 128);
+
+  if (is_descending)
+  {
+    sort_keys_descending(thrust::raw_pointer_cast(in_keys.data()),
+                         thrust::raw_pointer_cast(out_keys.data()),
+                         num_items,
+                         pair_decomposer_t{});
+  }
+  else
+  {
+    sort_keys(thrust::raw_pointer_cast(in_keys.data()),
+              thrust::raw_pointer_cast(out_keys.data()),
+              num_items,
+              pair_decomposer_t{});
+  }
+
+  REQUIRE(reference_keys == out_keys);
+}
+
+CUB_TEST("Device radix sort can sort pairs with custom i128_t keys", "[radix][sort][device]")
+{
+  const int max_items = 1 << 18;
+  const int num_items = GENERATE_COPY(take(4, random(max_items / 2, max_items)));
+
+  thrust::device_vector<key> in_keys(num_items);
+  thrust::device_vector<key> out_keys(num_items);
+
+  thrust::device_vector<value> in_values(num_items);
+  thrust::device_vector<value> out_values(num_items);
+  c2h::gen(CUB_SEED(10), in_keys);
+  c2h::gen(CUB_SEED(1), in_values);
+  // The host reference permutes keys and values together; both directions are covered.
+  const bool is_descending = GENERATE(false, true);
+  auto reference           = reference_sort_pairs(in_keys, in_values, is_descending, 0, 128);
+
+  if (is_descending)
+  {
+    sort_pairs_descending(thrust::raw_pointer_cast(in_keys.data()),
+                          thrust::raw_pointer_cast(out_keys.data()),
+                          thrust::raw_pointer_cast(in_values.data()),
+                          thrust::raw_pointer_cast(out_values.data()),
+                          num_items,
+                          pair_decomposer_t{});
+  }
+  else
+  {
+    sort_pairs(thrust::raw_pointer_cast(in_keys.data()),
+               thrust::raw_pointer_cast(out_keys.data()),
+               thrust::raw_pointer_cast(in_values.data()),
+               thrust::raw_pointer_cast(out_values.data()),
+               num_items,
+               pair_decomposer_t{});
+  }
+  // Keys and their attached values must both match the reference ordering.
+  REQUIRE(reference.first == out_keys);
+  REQUIRE(reference.second == out_values);
+}
+
+struct double_buffer_sort_t // invocable handed to `cdp_launch` by the "(db)" tests
+{
+  bool is_descending; // selects the *Descending API variants when true
+  int *selector;      // receives `keys.selector` after the sort call
+  // Keys-only overload: dispatches to SortKeys / SortKeysDescending on `is_descending`.
+  template <class... As>
+  CUB_RUNTIME_FUNCTION cudaError_t operator()(std::uint8_t *d_temp_storage,
+                                              std::size_t &temp_storage_bytes,
+                                              cub::DoubleBuffer<key> keys,
+                                              As... as)
+  {
+    const cudaError_t status =
+      is_descending
+        ? cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, keys, as...)
+        : cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, keys, as...);
+    // `keys` is a by-value copy, so its selector is published through the pointer.
+    *selector = keys.selector;
+    return status;
+  }
+  // Key/value overload: dispatches to SortPairs / SortPairsDescending on `is_descending`.
+  template <class... As>
+  CUB_RUNTIME_FUNCTION cudaError_t operator()(std::uint8_t *d_temp_storage,
+                                              std::size_t &temp_storage_bytes,
+                                              cub::DoubleBuffer<key> keys,
+                                              cub::DoubleBuffer<value> values,
+                                              As... as)
+  {
+    const cudaError_t status =
+      is_descending
+        ? cub::DeviceRadixSort::SortPairsDescending(d_temp_storage,
+                                                    temp_storage_bytes,
+                                                    keys,
+                                                    values,
+                                                    as...)
+        : cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, keys, values, as...);
+    // Only the key buffer's selector is reported; callers apply it to the values as well.
+    *selector = keys.selector;
+    return status;
+  }
+};
+
+CUB_TEST("Device radix sort works with custom i128_t (db)", "[radix][sort][device]")
+{
+  const int max_items = 1 << 18;
+  const int num_items = GENERATE_COPY(take(4, random(max_items / 2, max_items)));
+  // Host allocation through which the sort invocable reports the resulting buffer selector.
+  int *selector = nullptr;
+  REQUIRE(cudaSuccess == cudaMallocHost(&selector, sizeof(int)));
+
+  thrust::device_vector<key> keys_1(num_items);
+  thrust::device_vector<key> keys_2(num_items);
+  c2h::gen(CUB_SEED(2), keys_1);
+
+  key *d_keys_1 = thrust::raw_pointer_cast(keys_1.data());
+  key *d_keys_2 = thrust::raw_pointer_cast(keys_2.data());
+
+  cub::DoubleBuffer<key> keys(d_keys_1, d_keys_2);
+  // The reference is taken from `keys_1` before the sort gets to touch either buffer.
+  const bool is_descending = GENERATE(false, true);
+  auto reference_keys      = reference_sort_keys(keys_1, is_descending, 0, 128);
+  cdp_launch(double_buffer_sort_t{is_descending, selector}, keys, num_items, pair_decomposer_t{});
+  // Adopt the reported selector so that Current() names the buffer holding the result.
+  keys.selector = *selector;
+  REQUIRE(cudaSuccess == cudaFreeHost(selector));
+
+  thrust::device_vector<key> &out_keys = keys.Current() == d_keys_1 ? keys_1 : keys_2;
+
+  REQUIRE(reference_keys == out_keys);
+}
+
+CUB_TEST("Device radix sort works with custom i128_t keys (db)", "[radix][sort][device]")
+{
+  const int max_items = 1 << 18;
+  const int num_items = GENERATE_COPY(take(4, random(max_items / 2, max_items)));
+  // Host allocation through which the sort invocable reports the resulting buffer selector.
+  int *selector = nullptr;
+  REQUIRE(cudaSuccess == cudaMallocHost(&selector, sizeof(int)));
+
+  thrust::device_vector<key> keys_1(num_items);
+  thrust::device_vector<key> keys_2(num_items);
+  c2h::gen(CUB_SEED(2), keys_1);
+
+  thrust::device_vector<value> values_1(num_items);
+  thrust::device_vector<value> values_2(num_items);
+  c2h::gen(CUB_SEED(1), values_1);
+
+  key *d_keys_1 = thrust::raw_pointer_cast(keys_1.data());
+  key *d_keys_2 = thrust::raw_pointer_cast(keys_2.data());
+
+  value *d_values_1 = thrust::raw_pointer_cast(values_1.data());
+  value *d_values_2 = thrust::raw_pointer_cast(values_2.data());
+
+  cub::DoubleBuffer<key> keys(d_keys_1, d_keys_2);
+  cub::DoubleBuffer<value> values(d_values_1, d_values_2);
+
+  const bool is_descending = GENERATE(false, true);
+  // The reference is taken from the first buffers before the sort gets to touch them.
+  auto reference_keys = reference_sort_pairs(keys_1, values_1, is_descending, 0, 128);
+  cdp_launch(double_buffer_sort_t{is_descending, selector},
+             keys,
+             values,
+             num_items,
+             pair_decomposer_t{});
+  // One selector is reported (the key buffer's); it is applied to the values buffer as well.
+  keys.selector   = *selector;
+  values.selector = *selector;
+  REQUIRE(cudaSuccess == cudaFreeHost(selector));
+
+  thrust::device_vector<key> &out_keys     = keys.Current() == d_keys_1 ? keys_1 : keys_2;
+  thrust::device_vector<value> &out_values = values.Current() == d_values_1 ? values_1 : values_2;
+
+  REQUIRE(reference_keys.first == out_keys);
+  REQUIRE(reference_keys.second == out_values);
+}
+
+CUB_TEST("Device radix descending sort works with bits of custom i128_t", "[radix][sort][device]")
+{
+  const int max_items = 1 << 18;
+  const int num_items = GENERATE_COPY(take(1, random(max_items / 2, max_items)));
+
+  thrust::device_vector<key> in_keys(num_items);
+  thrust::device_vector<key> out_keys(num_items);
+  c2h::gen(CUB_SEED(2), in_keys);
+  // Random sub-range of the 128 key bits; end_bit is never drawn below begin_bit.
+  const int begin_bit      = GENERATE_COPY(take(4, random(0, 120)));
+  const int end_bit        = GENERATE_COPY(take(4, random(begin_bit, 128)));
+  const bool is_descending = GENERATE(false, true);
+  // The reference orders the original keys by their value masked to [begin_bit, end_bit).
+  auto reference_keys = reference_sort_keys(in_keys, is_descending, begin_bit, end_bit);
+
+  if (is_descending)
+  {
+    sort_keys_descending(thrust::raw_pointer_cast(in_keys.data()),
+                         thrust::raw_pointer_cast(out_keys.data()),
+                         num_items,
+                         pair_decomposer_t{},
+                         begin_bit,
+                         end_bit);
+  }
+  else
+  {
+    sort_keys(thrust::raw_pointer_cast(in_keys.data()),
+              thrust::raw_pointer_cast(out_keys.data()),
+              num_items,
+              pair_decomposer_t{},
+              begin_bit,
+              end_bit);
+  }
+
+  REQUIRE(reference_keys == out_keys);
+}
+
+CUB_TEST("Device radix sort can sort pairs with bits of custom i128_t keys",
+         "[radix][sort][device]")
+{
+  const int max_items = 1 << 18;
+  const int num_items = GENERATE_COPY(take(1, random(max_items / 2, max_items)));
+
+  thrust::device_vector<key> in_keys(num_items);
+  thrust::device_vector<key> out_keys(num_items);
+
+  thrust::device_vector<value> in_values(num_items);
+  thrust::device_vector<value> out_values(num_items);
+  c2h::gen(CUB_SEED(2), in_keys);
+  c2h::gen(CUB_SEED(1), in_values);
+  // Random sub-range of the 128 key bits; end_bit is never drawn below begin_bit.
+  const int begin_bit      = GENERATE_COPY(take(4, random(0, 120)));
+  const int end_bit        = GENERATE_COPY(take(4, random(begin_bit, 128)));
+  const bool is_descending = GENERATE(false, true);
+  // Keys and values are permuted together by the key value masked to [begin_bit, end_bit).
+  auto reference = reference_sort_pairs(in_keys, in_values, is_descending, begin_bit, end_bit);
+
+  if (is_descending)
+  {
+    sort_pairs_descending(thrust::raw_pointer_cast(in_keys.data()),
+                          thrust::raw_pointer_cast(out_keys.data()),
+                          thrust::raw_pointer_cast(in_values.data()),
+                          thrust::raw_pointer_cast(out_values.data()),
+                          num_items,
+                          pair_decomposer_t{},
+                          begin_bit,
+                          end_bit);
+  }
+  else
+  {
+    sort_pairs(thrust::raw_pointer_cast(in_keys.data()),
+               thrust::raw_pointer_cast(out_keys.data()),
+               thrust::raw_pointer_cast(in_values.data()),
+               thrust::raw_pointer_cast(out_values.data()),
+               num_items,
+               pair_decomposer_t{},
+               begin_bit,
+               end_bit);
+  }
+
+  REQUIRE(reference.first == out_keys);
+  REQUIRE(reference.second == out_values);
+}
+
+CUB_TEST("Device radix sort works with bits of custom i128_t (db)", "[radix][sort][device]")
+{
+  const int max_items = 1 << 18;
+  const int num_items = GENERATE_COPY(take(4, random(max_items / 2, max_items)));
+  // Host allocation through which the sort invocable reports the resulting buffer selector.
+  int *selector = nullptr;
+  REQUIRE(cudaSuccess == cudaMallocHost(&selector, sizeof(int)));
+
+  thrust::device_vector<key> keys_1(num_items);
+  thrust::device_vector<key> keys_2(num_items);
+  c2h::gen(CUB_SEED(2), keys_1);
+
+  key *d_keys_1 = thrust::raw_pointer_cast(keys_1.data());
+  key *d_keys_2 = thrust::raw_pointer_cast(keys_2.data());
+
+  cub::DoubleBuffer<key> keys(d_keys_1, d_keys_2);
+  // Random sub-range of the 128 key bits; end_bit is never drawn below begin_bit.
+  const int begin_bit      = GENERATE_COPY(take(4, random(0, 120)));
+  const int end_bit        = GENERATE_COPY(take(4, random(begin_bit, 128)));
+  const bool is_descending = GENERATE(false, true);
+  // The reference is taken from `keys_1` before the sort gets to touch either buffer.
+  auto reference_keys = reference_sort_keys(keys_1, is_descending, begin_bit, end_bit);
+  cdp_launch(double_buffer_sort_t{is_descending, selector},
+             keys,
+             num_items,
+             pair_decomposer_t{},
+             begin_bit,
+             end_bit);
+  // Adopt the reported selector so that Current() names the buffer holding the result.
+  keys.selector = *selector;
+  REQUIRE(cudaSuccess == cudaFreeHost(selector));
+
+  thrust::device_vector<key> &out_keys = keys.Current() == d_keys_1 ? keys_1 : keys_2;
+
+  REQUIRE(reference_keys == out_keys);
+}
+
+CUB_TEST("Device radix sort works with bits of custom i128_t keys (db)", "[radix][sort][device]")
+{
+  const int max_items = 1 << 18;
+  const int num_items = GENERATE_COPY(take(4, random(max_items / 2, max_items)));
+  // Host allocation through which the sort invocable reports the resulting buffer selector.
+  int *selector = nullptr;
+  REQUIRE(cudaSuccess == cudaMallocHost(&selector, sizeof(int)));
+
+  thrust::device_vector<key> keys_1(num_items);
+  thrust::device_vector<key> keys_2(num_items);
+  c2h::gen(CUB_SEED(2), keys_1);
+
+  thrust::device_vector<value> values_1(num_items);
+  thrust::device_vector<value> values_2(num_items);
+  c2h::gen(CUB_SEED(1), values_1);
+
+  key *d_keys_1 = thrust::raw_pointer_cast(keys_1.data());
+  key *d_keys_2 = thrust::raw_pointer_cast(keys_2.data());
+
+  value *d_values_1 = thrust::raw_pointer_cast(values_1.data());
+  value *d_values_2 = thrust::raw_pointer_cast(values_2.data());
+
+  cub::DoubleBuffer<key> keys(d_keys_1, d_keys_2);
+  cub::DoubleBuffer<value> values(d_values_1, d_values_2);
+  // Random sub-range of the 128 key bits; end_bit is never drawn below begin_bit.
+  const int begin_bit      = GENERATE_COPY(take(4, random(0, 120)));
+  const int end_bit        = GENERATE_COPY(take(4, random(begin_bit, 128)));
+  const bool is_descending = GENERATE(false, true);
+  // The reference is taken from the first buffers before the sort gets to touch them.
+  auto reference_keys = reference_sort_pairs(keys_1, values_1, is_descending, begin_bit, end_bit);
+  cdp_launch(double_buffer_sort_t{is_descending, selector},
+             keys,
+             values,
+             num_items,
+             pair_decomposer_t{},
+             begin_bit,
+             end_bit);
+  // One selector is reported (the key buffer's); it is applied to the values buffer as well.
+  keys.selector   = *selector;
+  values.selector = *selector;
+  REQUIRE(cudaSuccess == cudaFreeHost(selector));
+
+  thrust::device_vector<key> &out_keys     = keys.Current() == d_keys_1 ? keys_1 : keys_2;
+  thrust::device_vector<value> &out_values = values.Current() == d_values_1 ? values_1 : values_2;
+
+  REQUIRE(reference_keys.first == out_keys);
+  REQUIRE(reference_keys.second == out_values);
+}
+
+#if TEST_CDP != 1
+
+// example-begin custom-type
+struct custom_t
+{
+  float f;           // first member exposed by decomposer_t
+  int unused;        // not exposed by decomposer_t
+  long long int lli; // second member exposed by decomposer_t
+
+  custom_t() = default;
+  custom_t(float f, long long int lli)
+      : f(f)
+      , unused(42) // fixed filler value
+      , lli(lli)
+  {}
+};
+
+struct decomposer_t
+{
+  __host__ __device__ ::cuda::std::tuple<float &, long long int &>
+  operator()(custom_t &key) const
+  {
+    return {key.f, key.lli}; // references to `f` and `lli`; `unused` is left out
+  }
+};
+// example-end custom-type
+
+static __host__ std::ostream &operator<<(std::ostream &os, const custom_t &self)
+{
+  return os << "{ " << self.f << ", " << self.lli << " }"; // `unused` is not printed
+}
+
+static __host__ __device__ bool operator==(const custom_t &lhs, const custom_t &rhs)
+{ // `unused` is ignored; `f` uses floating-point ==, so +0.0f and -0.0f compare equal
+  return lhs.f == rhs.f && lhs.lli == rhs.lli;
+}
+
+CUB_TEST("Device radix sort works against some corner cases", "[radix][sort][device]")
+{
+  SECTION("Keys")
+  {
+    // example-begin keys
+    const int num_items = 6;
+
+    thrust::device_vector<custom_t> in = {
+      {+2.5f, 4},
+      {-2.5f, 0},
+      {+1.1f, 3},
+      {+0.0f, 1},
+      {-0.0f, 2},
+      {+3.7f, 5}
+    };
+
+    thrust::device_vector<custom_t> out(num_items);
+
+    const custom_t *d_in = thrust::raw_pointer_cast(in.data());
+    custom_t *d_out      = thrust::raw_pointer_cast(out.data());
+
+    // 1) Get temp storage size
+    std::uint8_t *d_temp_storage{};
+    std::size_t temp_storage_bytes{};
+
+    cub::DeviceRadixSort::SortKeys(d_temp_storage,
+                                   temp_storage_bytes,
+                                   d_in,
+                                   d_out,
+                                   num_items,
+                                   decomposer_t{});
+
+    // 2) Allocate temp storage
+    thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
+    d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
+
+    // 3) Sort keys
+    cub::DeviceRadixSort::SortKeys(d_temp_storage,
+                                   temp_storage_bytes,
+                                   d_in,
+                                   d_out,
+                                   num_items,
+                                   decomposer_t{});
+
+    thrust::device_vector<custom_t> expected_output = {
+      {-2.5f, 0},
+      {+0.0f, 1},
+      {-0.0f, 2},
+      {+1.1f, 3},
+      {+2.5f, 4},
+      {+3.7f, 5}
+    };
+    // example-end keys
+
+    REQUIRE(expected_output == out);
+  }
+
+  SECTION("KeysDescending")
+  {
+    // example-begin keys-descending
+    std::uint8_t *d_temp_storage{};
+    std::size_t temp_storage_bytes{};
+
+    const int num_items = 6;
+
+    thrust::device_vector<custom_t> in = {
+      {+1.1f, 2},
+      {+2.5f, 1},
+      {-0.0f, 4},
+      {+0.0f, 3},
+      {-2.5f, 5},
+      {+3.7f, 0}
+    };
+
+    thrust::device_vector<custom_t> out(num_items);
+
+    const custom_t *d_in = thrust::raw_pointer_cast(in.data());
+    custom_t *d_out      = thrust::raw_pointer_cast(out.data());
+
+    cub::DeviceRadixSort::SortKeysDescending(d_temp_storage,
+                                             temp_storage_bytes,
+                                             d_in,
+                                             d_out,
+                                             num_items,
+                                             decomposer_t{});
+
+    thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
+    d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
+
+    cub::DeviceRadixSort::SortKeysDescending(d_temp_storage,
+                                             temp_storage_bytes,
+                                             d_in,
+                                             d_out,
+                                             num_items,
+                                             decomposer_t{});
+
+    thrust::device_vector<custom_t> expected_output = {
+      {+3.7f, 0},
+      {+2.5f, 1},
+      {+1.1f, 2},
+      {-0.0f, 4},
+      {+0.0f, 3},
+      {-2.5f, 5}
+    };
+    // example-end keys-descending
+
+    REQUIRE(expected_output == out);
+  }
+
+  SECTION("Pairs") // ascending key/value sort from input buffers into separate output buffers
+  {
+    // example-begin pairs
+    std::uint8_t *d_temp_storage{};
+    std::size_t temp_storage_bytes{};
+
+    const int num_items = 6;
+
+    thrust::device_vector<custom_t> keys_in = {
+      {+2.5f, 4},
+      {-2.5f, 0},
+      {+1.1f, 3},
+      {+0.0f, 1},
+      {-0.0f, 2},
+      {+3.7f, 5}
+    };
+
+    thrust::device_vector<custom_t> keys_out(num_items);
+
+    const custom_t *d_keys_in = thrust::raw_pointer_cast(keys_in.data());
+    custom_t *d_keys_out      = thrust::raw_pointer_cast(keys_out.data());
+
+    thrust::device_vector<int> vals_in = {4, 0, 3, 1, 2, 5}; // each value repeats its key's integer member
+    thrust::device_vector<int> vals_out(num_items);
+
+    const int *d_vals_in = thrust::raw_pointer_cast(vals_in.data());
+    int *d_vals_out      = thrust::raw_pointer_cast(vals_out.data());
+
+    cub::DeviceRadixSort::SortPairs(d_temp_storage, // 1) d_temp_storage is null: get temp storage size
+                                    temp_storage_bytes,
+                                    d_keys_in,
+                                    d_keys_out,
+                                    d_vals_in,
+                                    d_vals_out,
+                                    num_items,
+                                    decomposer_t{});
+
+    thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes); // 2) allocate temp storage
+    d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
+
+    cub::DeviceRadixSort::SortPairs(d_temp_storage, // 3) sort pairs
+                                    temp_storage_bytes,
+                                    d_keys_in,
+                                    d_keys_out,
+                                    d_vals_in,
+                                    d_vals_out,
+                                    num_items,
+                                    decomposer_t{});
+
+    thrust::device_vector<custom_t> expected_keys = {
+      {-2.5f, 0},
+      {+0.0f, 1}, // NOTE(review): +0.0f/-0.0f look tied, with the integer member deciding; confirm in decomposer_t
+      {-0.0f, 2},
+      {+1.1f, 3},
+      {+2.5f, 4},
+      {+3.7f, 5}
+    };
+
+    thrust::device_vector<int> expected_vals = {0, 1, 2, 3, 4, 5}; // same order as the integer members of expected_keys
+    // example-end pairs
+
+    REQUIRE(expected_keys == keys_out);
+    REQUIRE(expected_vals == vals_out);
+  }
+
+  SECTION("PairsDescending") // descending key/value sort from input buffers into separate output buffers
+  {
+    // example-begin pairs-descending
+    std::uint8_t *d_temp_storage{};
+    std::size_t temp_storage_bytes{};
+
+    const int num_items = 6;
+
+    thrust::device_vector<custom_t> keys_in = {
+      {+1.1f, 2},
+      {+2.5f, 1},
+      {-0.0f, 4},
+      {+0.0f, 3},
+      {-2.5f, 5},
+      {+3.7f, 0}
+    };
+
+    thrust::device_vector<custom_t> keys_out(num_items);
+
+    const custom_t *d_keys_in = thrust::raw_pointer_cast(keys_in.data());
+    custom_t *d_keys_out      = thrust::raw_pointer_cast(keys_out.data());
+
+    thrust::device_vector<int> vals_in = {2, 1, 4, 3, 5, 0}; // each value repeats its key's integer member
+    thrust::device_vector<int> vals_out(num_items);
+
+    const int *d_vals_in = thrust::raw_pointer_cast(vals_in.data());
+    int *d_vals_out      = thrust::raw_pointer_cast(vals_out.data());
+
+    cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, // 1) d_temp_storage is null: get temp storage size
+                                              temp_storage_bytes,
+                                              d_keys_in,
+                                              d_keys_out,
+                                              d_vals_in,
+                                              d_vals_out,
+                                              num_items,
+                                              decomposer_t{});
+
+    thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes); // 2) allocate temp storage
+    d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
+
+    cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, // 3) sort pairs
+                                              temp_storage_bytes,
+                                              d_keys_in,
+                                              d_keys_out,
+                                              d_vals_in,
+                                              d_vals_out,
+                                              num_items,
+                                              decomposer_t{});
+
+    thrust::device_vector<custom_t> expected_keys = {
+      {+3.7f, 0},
+      {+2.5f, 1},
+      {+1.1f, 2},
+      {-0.0f, 4}, // NOTE(review): -0.0f/+0.0f look tied, with the integer member deciding; confirm in decomposer_t
+      {+0.0f, 3},
+      {-2.5f, 5}
+    };
+
+    thrust::device_vector<int> expected_vals = {0, 1, 2, 4, 3, 5}; // same order as the integer members of expected_keys
+    // example-end pairs-descending
+
+    REQUIRE(expected_keys == keys_out);
+    REQUIRE(expected_vals == vals_out);
+  }
+}
+
+CUB_TEST("Device radix sort works against some corner cases (db)", "[radix][sort][device]")
+{
+  SECTION("Keys") // ascending key sort through a cub::DoubleBuffer
+  {
+    // example-begin keys-db
+    std::uint8_t *d_temp_storage{};
+    std::size_t temp_storage_bytes{};
+
+    const int num_items = 6;
+
+    thrust::device_vector<custom_t> keys_buf = {
+      {+2.5f, 4},
+      {-2.5f, 0},
+      {+1.1f, 3},
+      {+0.0f, 1},
+      {-0.0f, 2},
+      {+3.7f, 5}
+    };
+
+    thrust::device_vector<custom_t> keys_alt_buf(num_items);
+
+    custom_t *d_keys_buf     = thrust::raw_pointer_cast(keys_buf.data());
+    custom_t *d_keys_alt_buf = thrust::raw_pointer_cast(keys_alt_buf.data());
+
+    cub::DoubleBuffer<custom_t> d_keys(d_keys_buf, d_keys_alt_buf); // input lives in keys_buf; keys_alt_buf is the second buffer
+
+    cub::DeviceRadixSort::SortKeys(d_temp_storage, // 1) d_temp_storage is null: get temp storage size
+                                   temp_storage_bytes,
+                                   d_keys,
+                                   num_items,
+                                   decomposer_t{});
+
+    thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes); // 2) allocate temp storage
+    d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
+
+    cub::DeviceRadixSort::SortKeys(d_temp_storage, // 3) sort keys
+                                   temp_storage_bytes,
+                                   d_keys,
+                                   num_items,
+                                   decomposer_t{});
+
+    thrust::device_vector<custom_t> &current = // vector whose storage d_keys.Current() points at
+      d_keys.Current() == d_keys_buf ? keys_buf : keys_alt_buf;
+
+    thrust::device_vector<custom_t> expected_output = {
+      {-2.5f, 0},
+      {+0.0f, 1}, // NOTE(review): +0.0f/-0.0f look tied, with the integer member deciding; confirm in decomposer_t
+      {-0.0f, 2},
+      {+1.1f, 3},
+      {+2.5f, 4},
+      {+3.7f, 5}
+    };
+    // example-end keys-db
+
+    REQUIRE(expected_output == current);
+  }
+
+  SECTION("KeysDescending") // descending key sort through a cub::DoubleBuffer
+  {
+    // example-begin keys-descending-db
+    std::uint8_t *d_temp_storage{};
+    std::size_t temp_storage_bytes{};
+
+    const int num_items = 6;
+
+    thrust::device_vector<custom_t> keys_buf = {
+      {+1.1f, 2},
+      {+2.5f, 1},
+      {-0.0f, 4},
+      {+0.0f, 3},
+      {-2.5f, 5},
+      {+3.7f, 0}
+    };
+
+    thrust::device_vector<custom_t> keys_alt_buf(num_items);
+
+    custom_t *d_keys_buf     = thrust::raw_pointer_cast(keys_buf.data());
+    custom_t *d_keys_alt_buf = thrust::raw_pointer_cast(keys_alt_buf.data());
+
+    cub::DoubleBuffer<custom_t> d_keys(d_keys_buf, d_keys_alt_buf); // input lives in keys_buf; keys_alt_buf is the second buffer
+
+    cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, // 1) d_temp_storage is null: get temp storage size
+                                             temp_storage_bytes,
+                                             d_keys,
+                                             num_items,
+                                             decomposer_t{});
+
+    thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes); // 2) allocate temp storage
+    d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
+
+    cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, // 3) sort keys
+                                             temp_storage_bytes,
+                                             d_keys,
+                                             num_items,
+                                             decomposer_t{});
+
+    thrust::device_vector<custom_t> &current = // vector whose storage d_keys.Current() points at
+      d_keys.Current() == d_keys_buf ? keys_buf : keys_alt_buf;
+
+    thrust::device_vector<custom_t> expected_output = {
+      {+3.7f, 0},
+      {+2.5f, 1},
+      {+1.1f, 2},
+      {-0.0f, 4}, // NOTE(review): -0.0f/+0.0f look tied, with the integer member deciding; confirm in decomposer_t
+      {+0.0f, 3},
+      {-2.5f, 5}
+    };
+    // example-end keys-descending-db
+
+    REQUIRE(expected_output == current);
+  }
+
+  SECTION("Pairs") // ascending key/value sort through cub::DoubleBuffer wrappers
+  {
+    // example-begin pairs-db
+    std::uint8_t *d_temp_storage{};
+    std::size_t temp_storage_bytes{};
+
+    const int num_items = 6;
+
+    thrust::device_vector<custom_t> keys_buf = {
+      {+2.5f, 4},
+      {-2.5f, 0},
+      {+1.1f, 3},
+      {+0.0f, 1},
+      {-0.0f, 2},
+      {+3.7f, 5}
+    };
+
+    thrust::device_vector<custom_t> keys_alt_buf(num_items);
+
+    custom_t *d_keys_buf     = thrust::raw_pointer_cast(keys_buf.data());
+    custom_t *d_keys_alt_buf = thrust::raw_pointer_cast(keys_alt_buf.data());
+
+    thrust::device_vector<int> vals_buf = {4, 0, 3, 1, 2, 5}; // each value repeats its key's integer member
+    thrust::device_vector<int> vals_alt_buf(num_items);
+
+    int *d_vals_buf     = thrust::raw_pointer_cast(vals_buf.data());
+    int *d_vals_alt_buf = thrust::raw_pointer_cast(vals_alt_buf.data());
+
+    cub::DoubleBuffer<custom_t> d_keys(d_keys_buf, d_keys_alt_buf); // input lives in keys_buf; keys_alt_buf is the second buffer
+    cub::DoubleBuffer<int> d_vals(d_vals_buf, d_vals_alt_buf); // input lives in vals_buf; vals_alt_buf is the second buffer
+
+    cub::DeviceRadixSort::SortPairs(d_temp_storage, // 1) d_temp_storage is null: get temp storage size
+                                    temp_storage_bytes,
+                                    d_keys,
+                                    d_vals,
+                                    num_items,
+                                    decomposer_t{});
+
+    thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes); // 2) allocate temp storage
+    d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
+
+    cub::DeviceRadixSort::SortPairs(d_temp_storage, // 3) sort pairs
+                                    temp_storage_bytes,
+                                    d_keys,
+                                    d_vals,
+                                    num_items,
+                                    decomposer_t{});
+
+    thrust::device_vector<custom_t> &current_keys = // vector whose storage d_keys.Current() points at
+      d_keys.Current() == d_keys_buf ? keys_buf : keys_alt_buf;
+
+    thrust::device_vector<int> &current_vals = // vector whose storage d_vals.Current() points at
+      d_vals.Current() == d_vals_buf ? vals_buf : vals_alt_buf;
+
+    thrust::device_vector<custom_t> expected_keys = {
+      {-2.5f, 0},
+      {+0.0f, 1}, // NOTE(review): +0.0f/-0.0f look tied, with the integer member deciding; confirm in decomposer_t
+      {-0.0f, 2},
+      {+1.1f, 3},
+      {+2.5f, 4},
+      {+3.7f, 5}
+    };
+
+    thrust::device_vector<int> expected_vals = {0, 1, 2, 3, 4, 5}; // same order as the integer members of expected_keys
+    // example-end pairs-db
+
+    REQUIRE(expected_keys == current_keys);
+    REQUIRE(expected_vals == current_vals);
+  }
+
+  SECTION("PairsDescending") // descending key/value sort through cub::DoubleBuffer wrappers
+  {
+    // example-begin pairs-descending-db
+    std::uint8_t *d_temp_storage{};
+    std::size_t temp_storage_bytes{};
+
+    const int num_items = 6;
+
+    thrust::device_vector<custom_t> keys_buf = {
+      {+1.1f, 2},
+      {+2.5f, 1},
+      {-0.0f, 4},
+      {+0.0f, 3},
+      {-2.5f, 5},
+      {+3.7f, 0}
+    };
+
+    thrust::device_vector<custom_t> keys_alt_buf(num_items);
+
+    custom_t *d_keys_buf     = thrust::raw_pointer_cast(keys_buf.data());
+    custom_t *d_keys_alt_buf = thrust::raw_pointer_cast(keys_alt_buf.data());
+
+    thrust::device_vector<int> vals_buf = {2, 1, 4, 3, 5, 0}; // each value repeats its key's integer member
+    thrust::device_vector<int> vals_alt_buf(num_items);
+
+    int *d_vals_buf     = thrust::raw_pointer_cast(vals_buf.data());
+    int *d_vals_alt_buf = thrust::raw_pointer_cast(vals_alt_buf.data());
+
+    cub::DoubleBuffer<custom_t> d_keys(d_keys_buf, d_keys_alt_buf); // input lives in keys_buf; keys_alt_buf is the second buffer
+    cub::DoubleBuffer<int> d_vals(d_vals_buf, d_vals_alt_buf); // input lives in vals_buf; vals_alt_buf is the second buffer
+
+    cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, // 1) d_temp_storage is null: get temp storage size
+                                              temp_storage_bytes,
+                                              d_keys,
+                                              d_vals,
+                                              num_items,
+                                              decomposer_t{});
+
+    thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes); // 2) allocate temp storage
+    d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
+
+    cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, // 3) sort pairs
+                                              temp_storage_bytes,
+                                              d_keys,
+                                              d_vals,
+                                              num_items,
+                                              decomposer_t{});
+
+    thrust::device_vector<custom_t> &current_keys = // vector whose storage d_keys.Current() points at
+      d_keys.Current() == d_keys_buf ? keys_buf : keys_alt_buf;
+
+    thrust::device_vector<int> &current_vals = // vector whose storage d_vals.Current() points at
+      d_vals.Current() == d_vals_buf ? vals_buf : vals_alt_buf;
+
+    thrust::device_vector<custom_t> expected_keys = {
+      {+3.7f, 0},
+      {+2.5f, 1},
+      {+1.1f, 2},
+      {-0.0f, 4}, // NOTE(review): -0.0f/+0.0f look tied, with the integer member deciding; confirm in decomposer_t
+      {+0.0f, 3},
+      {-2.5f, 5}
+    };
+
+    thrust::device_vector<int> expected_vals = {0, 1, 2, 4, 3, 5}; // same order as the integer members of expected_keys
+    // example-end pairs-descending-db
+
+    REQUIRE(expected_keys == current_keys);
+    REQUIRE(expected_vals == current_vals);
+  }
+}
+
+CUB_TEST("Device radix sort works against some corner cases (bits)", "[radix][sort][device]")
+{
+  SECTION("Keys") // ascending key sort restricted to the bit subrange [begin_bit, end_bit)
+  {
+    // example-begin keys-bits
+    const int num_items = 2;
+    thrust::device_vector<custom_t> in = {
+      {24.2f, 1ll << 61}, //
+      {42.4f, 1ll << 60}  //
+    };
+
+    const int begin_bit = sizeof(long long int) * 8 - 4; // 60
+    const int end_bit = sizeof(long long int) * 8 + 4; // 68
+
+    // Decomposition orders the bits as follows:
+    //
+    //                    <------------- fp32 -----------> <------ int64 ------>
+    // decompose(in[0]) = 01000001110000011001100110011010 00100000000000...0000
+    // decompose(in[1]) = 01000010001010011001100110011010 00010000000000...0000
+    //                    <-----------  higher bits  /  lower bits  ----------->
+    //
+    // The bit subrange `[60, 68)` (top 4 int64 bits, low 4 fp32 bits) specifies differentiating key bits:
+    //
+    //                    <------------- fp32 -----------> <------ int64 ------>
+    // decompose(in[0]) = xxxxxxxxxxxxxxxxxxxxxxxxxxxx1010 0010xxxxxxxxxx...xxxx
+    // decompose(in[1]) = xxxxxxxxxxxxxxxxxxxxxxxxxxxx1010 0001xxxxxxxxxx...xxxx
+    //                    <-----------  higher bits  /  lower bits  ----------->
+
+    thrust::device_vector<custom_t> out(num_items);
+
+    const custom_t *d_in = thrust::raw_pointer_cast(in.data());
+    custom_t *d_out      = thrust::raw_pointer_cast(out.data());
+
+    // 1) Get temp storage size
+    std::uint8_t *d_temp_storage{};
+    std::size_t temp_storage_bytes{};
+
+    cub::DeviceRadixSort::SortKeys(d_temp_storage,
+                                   temp_storage_bytes,
+                                   d_in,
+                                   d_out,
+                                   num_items,
+                                   decomposer_t{},
+                                   begin_bit,
+                                   end_bit);
+
+    // 2) Allocate temp storage
+    thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
+    d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
+
+    // 3) Sort keys
+    cub::DeviceRadixSort::SortKeys(d_temp_storage,
+                                   temp_storage_bytes,
+                                   d_in,
+                                   d_out,
+                                   num_items,
+                                   decomposer_t{},
+                                   begin_bit,
+                                   end_bit);
+
+    thrust::device_vector<custom_t> expected_output = {
+      {42.4f, 1ll << 60}, // selected bits: 1010 0001
+      {24.2f, 1ll << 61}  // selected bits: 1010 0010
+    };
+    // example-end keys-bits
+
+    REQUIRE(expected_output == out);
+  }
+
+  SECTION("KeysDescending") // descending key sort restricted to the bit subrange [begin_bit, end_bit)
+  {
+    // example-begin keys-descending-bits
+    const int num_items = 2;
+    thrust::device_vector<custom_t> in = {
+      {42.4f, 1ll << 60}, //
+      {24.2f, 1ll << 61}  //
+    };
+
+    const int begin_bit = sizeof(long long int) * 8 - 4; // 60
+    const int end_bit = sizeof(long long int) * 8 + 4; // 68
+
+    // Decomposition orders the bits as follows:
+    //
+    //                    <------------- fp32 -----------> <------ int64 ------>
+    // decompose(in[0]) = 01000010001010011001100110011010 00010000000000...0000
+    // decompose(in[1]) = 01000001110000011001100110011010 00100000000000...0000
+    //                    <-----------  higher bits  /  lower bits  ----------->
+    //
+    // The bit subrange `[60, 68)` (top 4 int64 bits, low 4 fp32 bits) specifies differentiating key bits:
+    //
+    //                    <------------- fp32 -----------> <------ int64 ------>
+    // decompose(in[0]) = xxxxxxxxxxxxxxxxxxxxxxxxxxxx1010 0001xxxxxxxxxx...xxxx
+    // decompose(in[1]) = xxxxxxxxxxxxxxxxxxxxxxxxxxxx1010 0010xxxxxxxxxx...xxxx
+    //                    <-----------  higher bits  /  lower bits  ----------->
+
+    thrust::device_vector<custom_t> out(num_items);
+
+    const custom_t *d_in = thrust::raw_pointer_cast(in.data());
+    custom_t *d_out      = thrust::raw_pointer_cast(out.data());
+
+    // 1) Get temp storage size
+    std::uint8_t *d_temp_storage{};
+    std::size_t temp_storage_bytes{};
+
+    cub::DeviceRadixSort::SortKeysDescending(d_temp_storage,
+                                             temp_storage_bytes,
+                                             d_in,
+                                             d_out,
+                                             num_items,
+                                             decomposer_t{},
+                                             begin_bit,
+                                             end_bit);
+
+    // 2) Allocate temp storage
+    thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
+    d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
+
+    // 3) Sort keys
+    cub::DeviceRadixSort::SortKeysDescending(d_temp_storage,
+                                             temp_storage_bytes,
+                                             d_in,
+                                             d_out,
+                                             num_items,
+                                             decomposer_t{},
+                                             begin_bit,
+                                             end_bit);
+
+    thrust::device_vector<custom_t> expected_output = {
+      {24.2f, 1ll << 61}, // selected bits: 1010 0010
+      {42.4f, 1ll << 60}  // selected bits: 1010 0001
+    };
+    // example-end keys-descending-bits
+
+    REQUIRE(expected_output == out);
+  }
+
+  SECTION("Pairs") // ascending key/value sort restricted to the bit subrange [begin_bit, end_bit)
+  {
+    // example-begin pairs-bits
+    const int num_items = 2;
+    thrust::device_vector<custom_t> keys_in = {
+      {24.2f, 1ll << 61}, //
+      {42.4f, 1ll << 60}  //
+    };
+
+    thrust::device_vector<int> vals_in = { 1, 0 };
+
+    const int begin_bit = sizeof(long long int) * 8 - 4; // 60
+    const int end_bit = sizeof(long long int) * 8 + 4; // 68
+
+    // Decomposition orders the bits as follows:
+    //
+    //                         <------------- fp32 -----------> <------ int64 ------>
+    // decompose(keys_in[0]) = 01000001110000011001100110011010 00100000000000...0000
+    // decompose(keys_in[1]) = 01000010001010011001100110011010 00010000000000...0000
+    //                         <-----------  higher bits  /  lower bits  ----------->
+    //
+    // The bit subrange `[60, 68)` (top 4 int64 bits, low 4 fp32 bits) specifies differentiating key bits:
+    //
+    //                         <------------- fp32 -----------> <------ int64 ------>
+    // decompose(keys_in[0]) = xxxxxxxxxxxxxxxxxxxxxxxxxxxx1010 0010xxxxxxxxxx...xxxx
+    // decompose(keys_in[1]) = xxxxxxxxxxxxxxxxxxxxxxxxxxxx1010 0001xxxxxxxxxx...xxxx
+    //                         <-----------  higher bits  /  lower bits  ----------->
+
+    thrust::device_vector<custom_t> keys_out(num_items);
+    thrust::device_vector<int> vals_out(num_items);
+
+    const custom_t *d_keys_in = thrust::raw_pointer_cast(keys_in.data());
+    custom_t *d_keys_out      = thrust::raw_pointer_cast(keys_out.data());
+    const int *d_vals_in      = thrust::raw_pointer_cast(vals_in.data());
+    int *d_vals_out           = thrust::raw_pointer_cast(vals_out.data());
+
+    // 1) Get temp storage size
+    std::uint8_t *d_temp_storage{};
+    std::size_t temp_storage_bytes{};
+
+    cub::DeviceRadixSort::SortPairs(d_temp_storage,
+                                   temp_storage_bytes,
+                                   d_keys_in,
+                                   d_keys_out,
+                                   d_vals_in,
+                                   d_vals_out,
+                                   num_items,
+                                   decomposer_t{},
+                                   begin_bit,
+                                   end_bit);
+
+    // 2) Allocate temp storage
+    thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
+    d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
+
+    // 3) Sort pairs
+    cub::DeviceRadixSort::SortPairs(d_temp_storage,
+                                   temp_storage_bytes,
+                                   d_keys_in,
+                                   d_keys_out,
+                                   d_vals_in,
+                                   d_vals_out,
+                                   num_items,
+                                   decomposer_t{},
+                                   begin_bit,
+                                   end_bit);
+
+    thrust::device_vector<custom_t> expected_keys = {
+      {42.4f, 1ll << 60}, // selected bits: 1010 0001
+      {24.2f, 1ll << 61}  // selected bits: 1010 0010
+    };
+
+    thrust::device_vector<int> expected_vals = { 0, 1 }; // each value stays paired with its key
+    // example-end pairs-bits
+
+    REQUIRE(expected_keys == keys_out);
+    REQUIRE(expected_vals == vals_out);
+  }
+
+  SECTION("PairsDescending") // descending key/value sort restricted to the bit subrange [begin_bit, end_bit)
+  {
+    // example-begin pairs-descending-bits
+    const int num_items = 2;
+    thrust::device_vector<custom_t> keys_in = {
+      {42.4f, 1ll << 60}, //
+      {24.2f, 1ll << 61}  //
+    };
+
+    thrust::device_vector<int> vals_in = { 1, 0 };
+
+    const int begin_bit = sizeof(long long int) * 8 - 4; // 60
+    const int end_bit = sizeof(long long int) * 8 + 4; // 68
+
+    // Decomposition orders the bits as follows:
+    //
+    //                         <------------- fp32 -----------> <------ int64 ------>
+    // decompose(keys_in[0]) = 01000010001010011001100110011010 00010000000000...0000
+    // decompose(keys_in[1]) = 01000001110000011001100110011010 00100000000000...0000
+    //                         <-----------  higher bits  /  lower bits  ----------->
+    //
+    // The bit subrange `[60, 68)` (top 4 int64 bits, low 4 fp32 bits) specifies differentiating key bits:
+    //
+    //                         <------------- fp32 -----------> <------ int64 ------>
+    // decompose(keys_in[0]) = xxxxxxxxxxxxxxxxxxxxxxxxxxxx1010 0001xxxxxxxxxx...xxxx
+    // decompose(keys_in[1]) = xxxxxxxxxxxxxxxxxxxxxxxxxxxx1010 0010xxxxxxxxxx...xxxx
+    //                         <-----------  higher bits  /  lower bits  ----------->
+
+    thrust::device_vector<custom_t> keys_out(num_items);
+    thrust::device_vector<int> vals_out(num_items);
+
+    const custom_t *d_keys_in = thrust::raw_pointer_cast(keys_in.data());
+    custom_t *d_keys_out      = thrust::raw_pointer_cast(keys_out.data());
+    const int *d_vals_in      = thrust::raw_pointer_cast(vals_in.data());
+    int *d_vals_out           = thrust::raw_pointer_cast(vals_out.data());
+
+    // 1) Get temp storage size
+    std::uint8_t *d_temp_storage{};
+    std::size_t temp_storage_bytes{};
+
+    cub::DeviceRadixSort::SortPairsDescending(d_temp_storage,
+                                              temp_storage_bytes,
+                                              d_keys_in,
+                                              d_keys_out,
+                                              d_vals_in,
+                                              d_vals_out,
+                                              num_items,
+                                              decomposer_t{},
+                                              begin_bit,
+                                              end_bit);
+
+    // 2) Allocate temp storage
+    thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
+    d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
+
+    // 3) Sort pairs
+    cub::DeviceRadixSort::SortPairsDescending(d_temp_storage,
+                                              temp_storage_bytes,
+                                              d_keys_in,
+                                              d_keys_out,
+                                              d_vals_in,
+                                              d_vals_out,
+                                              num_items,
+                                              decomposer_t{},
+                                              begin_bit,
+                                              end_bit);
+
+    thrust::device_vector<custom_t> expected_keys = {
+      {24.2f, 1ll << 61}, // selected bits: 1010 0010
+      {42.4f, 1ll << 60}  // selected bits: 1010 0001
+    };
+
+    thrust::device_vector<int> expected_vals = { 0, 1 }; // each value stays paired with its key
+    // example-end pairs-descending-bits
+
+    REQUIRE(expected_keys == keys_out);
+    REQUIRE(expected_vals == vals_out);
+  }
+}
+
+CUB_TEST("Device radix sort works against some corner cases (bits) (db)", "[radix][sort][device]")
+{
+  SECTION("Keys") // ascending key sort over bits [begin_bit, end_bit) through a cub::DoubleBuffer
+  {
+    // example-begin keys-bits-db
+    const int num_items = 2;
+
+    thrust::device_vector<custom_t> keys_buf = {
+      {24.2f, 1ll << 61}, //
+      {42.4f, 1ll << 60}  //
+    };
+
+    const int begin_bit = sizeof(long long int) * 8 - 4; // 60
+    const int end_bit   = sizeof(long long int) * 8 + 4; // 68
+
+    // Decomposition orders the bits as follows:
+    //
+    //                          <------------- fp32 -----------> <------ int64 ------>
+    // decompose(keys_buf[0]) = 01000001110000011001100110011010 00100000000000...0000
+    // decompose(keys_buf[1]) = 01000010001010011001100110011010 00010000000000...0000
+    //                          <-----------  higher bits  /  lower bits  ----------->
+    //
+    // The bit subrange `[60, 68)` (top 4 int64 bits, low 4 fp32 bits) specifies differentiating key bits:
+    //
+    //                          <------------- fp32 -----------> <------ int64 ------>
+    // decompose(keys_buf[0]) = xxxxxxxxxxxxxxxxxxxxxxxxxxxx1010 0010xxxxxxxxxx...xxxx
+    // decompose(keys_buf[1]) = xxxxxxxxxxxxxxxxxxxxxxxxxxxx1010 0001xxxxxxxxxx...xxxx
+    //                          <-----------  higher bits  /  lower bits  ----------->
+
+    thrust::device_vector<custom_t> keys_alt_buf(num_items);
+
+    custom_t *d_keys_buf     = thrust::raw_pointer_cast(keys_buf.data());
+    custom_t *d_keys_alt_buf = thrust::raw_pointer_cast(keys_alt_buf.data());
+
+    cub::DoubleBuffer<custom_t> d_keys(d_keys_buf, d_keys_alt_buf); // input lives in keys_buf; keys_alt_buf is the second buffer
+
+    // 1) Get temp storage size
+    std::uint8_t *d_temp_storage{};
+    std::size_t temp_storage_bytes{};
+
+    cub::DeviceRadixSort::SortKeys(d_temp_storage,
+                                   temp_storage_bytes,
+                                   d_keys,
+                                   num_items,
+                                   decomposer_t{},
+                                   begin_bit,
+                                   end_bit);
+
+    // 2) Allocate temp storage
+    thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
+    d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
+
+    // 3) Sort keys
+    cub::DeviceRadixSort::SortKeys(d_temp_storage,
+                                   temp_storage_bytes,
+                                   d_keys,
+                                   num_items,
+                                   decomposer_t{},
+                                   begin_bit,
+                                   end_bit);
+
+    thrust::device_vector<custom_t> &current_keys = // vector whose storage d_keys.Current() points at
+      d_keys.Current() == d_keys_buf ? keys_buf : keys_alt_buf;
+
+    thrust::device_vector<custom_t> expected_output = {
+      {42.4f, 1ll << 60}, // selected bits: 1010 0001
+      {24.2f, 1ll << 61}  // selected bits: 1010 0010
+    };
+    // example-end keys-bits-db
+
+    REQUIRE(expected_output == current_keys);
+  }
+
+  SECTION("KeysDescending") // descending key sort over bits [begin_bit, end_bit) through a cub::DoubleBuffer
+  {
+    // example-begin keys-descending-bits-db
+    const int num_items                      = 2;
+    thrust::device_vector<custom_t> keys_buf = { //
+      {42.4f, 1ll << 60}, //
+      {24.2f, 1ll << 61}  //
+    };
+
+    const int begin_bit = sizeof(long long int) * 8 - 4; // 60
+    const int end_bit   = sizeof(long long int) * 8 + 4; // 68
+
+    // Decomposition orders the bits as follows:
+    //
+    //                          <------------- fp32 -----------> <------ int64 ------>
+    // decompose(keys_buf[0]) = 01000010001010011001100110011010 00010000000000...0000
+    // decompose(keys_buf[1]) = 01000001110000011001100110011010 00100000000000...0000
+    //                          <-----------  higher bits  /  lower bits  ----------->
+    //
+    // The bit subrange `[60, 68)` (top 4 int64 bits, low 4 fp32 bits) specifies differentiating key bits:
+    //
+    //                          <------------- fp32 -----------> <------ int64 ------>
+    // decompose(keys_buf[0]) = xxxxxxxxxxxxxxxxxxxxxxxxxxxx1010 0001xxxxxxxxxx...xxxx
+    // decompose(keys_buf[1]) = xxxxxxxxxxxxxxxxxxxxxxxxxxxx1010 0010xxxxxxxxxx...xxxx
+    //                          <-----------  higher bits  /  lower bits  ----------->
+
+    thrust::device_vector<custom_t> keys_alt_buf(num_items);
+
+    custom_t *d_keys_buf     = thrust::raw_pointer_cast(keys_buf.data());
+    custom_t *d_keys_alt_buf = thrust::raw_pointer_cast(keys_alt_buf.data());
+
+    cub::DoubleBuffer<custom_t> d_keys(d_keys_buf, d_keys_alt_buf); // input lives in keys_buf; keys_alt_buf is the second buffer
+
+    // 1) Get temp storage size
+    std::uint8_t *d_temp_storage{};
+    std::size_t temp_storage_bytes{};
+
+    cub::DeviceRadixSort::SortKeysDescending(d_temp_storage,
+                                             temp_storage_bytes,
+                                             d_keys,
+                                             num_items,
+                                             decomposer_t{},
+                                             begin_bit,
+                                             end_bit);
+
+    // 2) Allocate temp storage
+    thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
+    d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
+
+    // 3) Sort keys
+    cub::DeviceRadixSort::SortKeysDescending(d_temp_storage,
+                                             temp_storage_bytes,
+                                             d_keys,
+                                             num_items,
+                                             decomposer_t{},
+                                             begin_bit,
+                                             end_bit);
+
+    thrust::device_vector<custom_t> &current_keys = // vector whose storage d_keys.Current() points at
+      d_keys.Current() == d_keys_buf ? keys_buf : keys_alt_buf;
+
+    thrust::device_vector<custom_t> expected_output = {
+      {24.2f, 1ll << 61}, // selected bits: 1010 0010
+      {42.4f, 1ll << 60}  // selected bits: 1010 0001
+    };
+    // example-end keys-descending-bits-db
+
+    REQUIRE(expected_output == current_keys);
+  }
+
+  SECTION("Pairs")
+  {
+    // example-begin pairs-bits-db
+    const int num_items                      = 2;
+    thrust::device_vector<custom_t> keys_buf = {
+      {24.2f, 1ll << 61}, //
+      {42.4f, 1ll << 60}  //
+    };
+
+    thrust::device_vector<int> vals_buf = {1, 0};
+
+    const int begin_bit = sizeof(long long int) * 8 - 4; // 60
+    const int end_bit   = sizeof(long long int) * 8 + 4; // 68
+
+    // Decomposition orders the bits as follows:
+    //
+    //                    <------------- fp32 -----------> <------ int64 ------>
+    // decompose(in[0]) = 01000001110000011001100110011010 00100000000000...0000
+    // decompose(in[1]) = 01000010001010011001100110011010 00010000000000...0000
+    //                    <-----------  higher bits  /  lower bits  ----------->
+    //
+    // The bit subrange `[60, 68)` specifies differentiating key bits:
+    //
+    //                    <------------- fp32 -----------> <------ int64 ------>
+    // decompose(in[0]) = xxxxxxxxxxxxxxxxxxxxxxxxxxxx1010 0010xxxxxxxxxx...xxxx
+    // decompose(in[1]) = xxxxxxxxxxxxxxxxxxxxxxxxxxxx1010 0001xxxxxxxxxx...xxxx
+    //                    <-----------  higher bits  /  lower bits  ----------->
+
+    thrust::device_vector<custom_t> keys_alt_buf(num_items);
+    thrust::device_vector<int> vals_alt_buf(num_items);
+
+    custom_t *d_keys_buf     = thrust::raw_pointer_cast(keys_buf.data());
+    custom_t *d_keys_alt_buf = thrust::raw_pointer_cast(keys_alt_buf.data());
+    int *d_vals_buf          = thrust::raw_pointer_cast(vals_buf.data());
+    int *d_vals_alt_buf      = thrust::raw_pointer_cast(vals_alt_buf.data());
+
+    cub::DoubleBuffer<custom_t> d_keys(d_keys_buf, d_keys_alt_buf);
+    cub::DoubleBuffer<int> d_vals(d_vals_buf, d_vals_alt_buf);
+
+    // 1) Get temp storage size
+    std::uint8_t *d_temp_storage{};
+    std::size_t temp_storage_bytes{};
+
+    cub::DeviceRadixSort::SortPairs(d_temp_storage,
+                                    temp_storage_bytes,
+                                    d_keys,
+                                    d_vals,
+                                    num_items,
+                                    decomposer_t{},
+                                    begin_bit,
+                                    end_bit);
+
+    // 2) Allocate temp storage
+    thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
+    d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
+
+    // 3) Sort pairs
+    cub::DeviceRadixSort::SortPairs(d_temp_storage,
+                                    temp_storage_bytes,
+                                    d_keys,
+                                    d_vals,
+                                    num_items,
+                                    decomposer_t{},
+                                    begin_bit,
+                                    end_bit);
+
+    thrust::device_vector<custom_t> &current_keys = //
+      d_keys.Current() == d_keys_buf ? keys_buf : keys_alt_buf;
+
+    thrust::device_vector<int> &current_vals = //
+      d_vals.Current() == d_vals_buf ? vals_buf : vals_alt_buf;
+
+    thrust::device_vector<custom_t> expected_keys = {
+      {42.4f, 1ll << 60}, //
+      {24.2f, 1ll << 61}  //
+    };
+
+    thrust::device_vector<int> expected_vals = {0, 1};
+    // example-end pairs-bits-db
+
+    REQUIRE(expected_keys == current_keys);
+    REQUIRE(expected_vals == current_vals);
+  }
+
+  SECTION("PairsDescending")
+  {
+    // example-begin pairs-descending-bits-db
+    const int num_items = 2;
+
+    thrust::device_vector<custom_t> keys_buf = {
+      {42.4f, 1ll << 60}, //
+      {24.2f, 1ll << 61}  //
+    };
+
+    thrust::device_vector<int> vals_buf = {1, 0};
+
+    const int begin_bit = sizeof(long long int) * 8 - 4; // 60
+    const int end_bit   = sizeof(long long int) * 8 + 4; // 68
+
+    // Decomposition orders the bits as follows:
+    //
+    //                    <------------- fp32 -----------> <------ int64 ------>
+    // decompose(in[0]) = 01000010001010011001100110011010 00010000000000...0000
+    // decompose(in[1]) = 01000001110000011001100110011010 00100000000000...0000
+    //                    <-----------  higher bits  /  lower bits  ----------->
+    //
+    // The bit subrange `[60, 68)` specifies differentiating key bits:
+    //
+    //                    <------------- fp32 -----------> <------ int64 ------>
+    // decompose(in[0]) = xxxxxxxxxxxxxxxxxxxxxxxxxxxx1010 0001xxxxxxxxxx...xxxx
+    // decompose(in[1]) = xxxxxxxxxxxxxxxxxxxxxxxxxxxx1010 0010xxxxxxxxxx...xxxx
+    //                    <-----------  higher bits  /  lower bits  ----------->
+
+    thrust::device_vector<custom_t> keys_alt_buf(num_items);
+    thrust::device_vector<int> vals_alt_buf(num_items);
+
+    custom_t *d_keys_buf     = thrust::raw_pointer_cast(keys_buf.data());
+    custom_t *d_keys_alt_buf = thrust::raw_pointer_cast(keys_alt_buf.data());
+    int *d_vals_buf          = thrust::raw_pointer_cast(vals_buf.data());
+    int *d_vals_alt_buf      = thrust::raw_pointer_cast(vals_alt_buf.data());
+
+    cub::DoubleBuffer<custom_t> d_keys(d_keys_buf, d_keys_alt_buf);
+    cub::DoubleBuffer<int> d_vals(d_vals_buf, d_vals_alt_buf);
+
+    // 1) Get temp storage size
+    std::uint8_t *d_temp_storage{};
+    std::size_t temp_storage_bytes{};
+
+    cub::DeviceRadixSort::SortPairsDescending(d_temp_storage,
+                                              temp_storage_bytes,
+                                              d_keys,
+                                              d_vals,
+                                              num_items,
+                                              decomposer_t{},
+                                              begin_bit,
+                                              end_bit);
+
+    // 2) Allocate temp storage
+    thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
+    d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
+
+    // 3) Sort pairs
+    cub::DeviceRadixSort::SortPairsDescending(d_temp_storage,
+                                              temp_storage_bytes,
+                                              d_keys,
+                                              d_vals,
+                                              num_items,
+                                              decomposer_t{},
+                                              begin_bit,
+                                              end_bit);
+
+    thrust::device_vector<custom_t> &current_keys = //
+      d_keys.Current() == d_keys_buf ? keys_buf : keys_alt_buf;
+
+    thrust::device_vector<int> &current_vals = //
+      d_vals.Current() == d_vals_buf ? vals_buf : vals_alt_buf;
+
+    thrust::device_vector<custom_t> expected_keys = {
+      {24.2f, 1ll << 61}, //
+      {42.4f, 1ll << 60}  //
+    };
+
+    thrust::device_vector<int> expected_vals = {0, 1};
+    // example-end pairs-descending-bits-db
+
+    REQUIRE(expected_keys == current_keys);
+    REQUIRE(expected_vals == current_vals);
+  }
+}
+#endif
diff --git a/include/cub/test/catch2_test_helper.h b/include/cub/test/catch2_test_helper.h
new file mode 100644
index 0000000..0a07315
--- /dev/null
+++ b/include/cub/test/catch2_test_helper.h
@@ -0,0 +1,206 @@
+/******************************************************************************
+* Copyright (c) 2011-2022, NVIDIA CORPORATION.  All rights reserved.
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions are met:
+*     * Redistributions of source code must retain the above copyright
+*       notice, this list of conditions and the following disclaimer.
+*     * Redistributions in binary form must reproduce the above copyright
+*       notice, this list of conditions and the following disclaimer in the
+*       documentation and/or other materials provided with the distribution.
+*     * Neither the name of the NVIDIA CORPORATION nor the
+*       names of its contributors may be used to endorse or promote products
+*       derived from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+******************************************************************************/
+
+#pragma once
+
+#include <metal.hpp>
+#include <type_traits>
+#include <cstdint>
+#include <tuple>
+
+#include <thrust/host_vector.h>
+#include <thrust/device_vector.h>
+
+#include "cub/util_compiler.cuh"
+#include "test_util_vec.h"
+
+// cudafe considers some catch2 variables as unused. We have to suppress
+// these warnings until NVBug 3423950 is addressed.
+#if (CUB_HOST_COMPILER == CUB_HOST_COMPILER_MSVC) || \
+    (CUB_HOST_COMPILER == CUB_HOST_COMPILER_CLANG) || \
+    defined(__ICC) || defined(_NVHPC_CUDA)
+#  if defined(__NVCC_DIAG_PRAGMA_SUPPORT__)
+#    pragma nv_diag_suppress 177
+#  else
+#    pragma diag_suppress 177
+#  endif
+#endif
+
+#ifdef CUB_CONFIG_MAIN
+#define CATCH_CONFIG_RUNNER
+#endif
+#include <catch2/catch.hpp>
+
+#ifndef VAR_IDX
+#define VAR_IDX 0
+#endif
+
+namespace c2h
+{
+// List of types; thin alias over metal::list.
+template <class... Types>
+using type_list = metal::list<Types...>;
+// Number of entries in a type list (metal::size).
+template <class List>
+using size = metal::size<List>;
+// Entry of `List` at position `Position` (metal::at).
+template <std::size_t Position, class List>
+using get = metal::at<List, metal::number<Position>>;
+// Cartesian product of the given type lists (metal::cartesian).
+template <typename... Lists>
+using cartesian_product = metal::cartesian<Lists...>;
+// Type list with one std::integral_constant<T, V> per value V.
+template <class T, T... Values>
+using enum_type_list = c2h::type_list<std::integral_constant<T, Values>...>;
+// Pair of two types (metal::pair).
+template <class First, class Second>
+using pair = metal::pair<First, Second>;
+// First member of pair `Pair` (metal::first).
+template <class Pair>
+using first = metal::first<Pair>;
+// Second member of pair `Pair` (metal::second).
+template <class Pair>
+using second = metal::second<Pair>;
+// metal::iota over `Start`, `Count` and `Stride`, each wrapped in metal::number.
+template <std::size_t Start, std::size_t Count, std::size_t Stride = 1>
+using iota = metal::iota<metal::number<Start>, metal::number<Count>, metal::number<Stride>>;
+
+} // namespace c2h
+
+namespace detail {
+  // Device data: stage it in a host_vector, then copy that into a std::vector.
+  template <typename T>
+  std::vector<T> to_vec(thrust::device_vector<T> const& vec)
+  {
+    thrust::host_vector<T> host_copy = vec;
+    return std::vector<T>{host_copy.begin(), host_copy.end()};
+  }
+  // Host data: copy the elements straight into a std::vector.
+  template <typename T>
+  std::vector<T> to_vec(thrust::host_vector<T> const& vec)
+  {
+    return std::vector<T>{vec.begin(), vec.end()};
+  }
+  // Already a std::vector: return a copy.
+  template <typename T>
+  std::vector<T> to_vec(std::vector<T> const& vec)
+  {
+    return vec;
+  }
+}
+// Converts `ref` and `out` to std::vector via detail::to_vec, then requires Catch::Approx equality.
+#define REQUIRE_APPROX_EQ(ref,out) { \
+  auto vec_ref = detail::to_vec(ref);  \
+  auto vec_out = detail::to_vec(out);  \
+  REQUIRE_THAT(vec_ref, Catch::Approx(vec_out)); \
+}
+
+#include <c2h/generators.cuh>
+#include <c2h/custom_type.cuh>
+
+// Test name: stringized NAME followed by the stringized PARAM in parentheses.
+#define CUB_TEST_NAME_IMPL(NAME, PARAM) \
+  CUB_TEST_STR(NAME) "(" CUB_TEST_STR(PARAM) ")"
+
+#define CUB_TEST_NAME(NAME) \
+  CUB_TEST_NAME_IMPL(NAME, VAR_IDX)
+// Two-level paste so that arguments such as __LINE__ are macro-expanded before `##` applies.
+#define CUB_TEST_CONCAT(A, B) CUB_TEST_CONCAT_INNER(A, B)
+#define CUB_TEST_CONCAT_INNER(A, B) A ## B
+// Declares a Catch2 template test case over the cartesian product of the given type lists.
+#define CUB_TEST_IMPL(ID, NAME, TAG, ...)                                       \
+  using CUB_TEST_CONCAT(types_, ID) =                                           \
+    c2h::cartesian_product<__VA_ARGS__>;                                        \
+  TEMPLATE_LIST_TEST_CASE(CUB_TEST_NAME(NAME), TAG, CUB_TEST_CONCAT(types_, ID))
+
+#define CUB_TEST(NAME, TAG, ...) \
+  CUB_TEST_IMPL(__LINE__, NAME, TAG, __VA_ARGS__)
+// Declares a Catch2 template test case over the explicitly listed types.
+#define CUB_TEST_LIST_IMPL(ID, NAME, TAG, ...)                                  \
+  using CUB_TEST_CONCAT(types_, ID) =                                           \
+    c2h::type_list<__VA_ARGS__>;                                                \
+  TEMPLATE_LIST_TEST_CASE(CUB_TEST_NAME(NAME), TAG, CUB_TEST_CONCAT(types_, ID))
+
+#define CUB_TEST_LIST(NAME, TAG, ...) \
+  CUB_TEST_LIST_IMPL(__LINE__, NAME, TAG, __VA_ARGS__)
+
+#define CUB_TEST_STR(a) #a
+// Seed (c2h::seed_t) drawn from a Catch2 generator of N random `unsigned long long` values.
+#define CUB_SEED(N)                                                            \
+  c2h::seed_t{                                                                 \
+    GENERATE_COPY(                                                             \
+      take(N,                                                                  \
+           random(std::numeric_limits<unsigned long long int>::min(),          \
+                  std::numeric_limits<unsigned long long int>::max())))        \
+  }
+
+#ifdef CUB_CONFIG_MAIN
+#include <cuda_runtime.h>
+
+// Exits with -1 unless `device_id` indexes one of the CUDA devices reported by the runtime.
+static int device_guard(int device_id)
+{
+  int num_devices = 0;
+  if (cudaSuccess != cudaGetDeviceCount(&num_devices))
+  {
+    std::cerr << "Can't query devices number." << std::endl;
+    std::exit(-1);
+  }
+  const bool id_in_range = (0 <= device_id) && (device_id < num_devices);
+  if (!id_in_range)
+  {
+    std::cerr << "Invalid device ID: " << device_id << std::endl;
+    std::exit(-1);
+  }
+  return device_id;
+}
+
+
+// Test entry point: adds `-d/--device` to Catch's CLI, binds that CUDA device, runs the session.
+int main(int argc, char *argv[])
+{
+  Catch::Session session;
+  int device_id {};
+
+  // Build a new parser on top of Catch's
+  using namespace Catch::clara;
+  auto cli = session.cli()
+           | Opt(device_id, "device")["-d"]["--device"]("device id to use");
+  session.cli(cli);
+
+  const int returnCode = session.applyCommandLine(argc, argv);
+  if (returnCode != 0) { return returnCode; }
+  // device_guard() only range-checks the id; selecting the device can still fail.
+  if (cudaSetDevice(device_guard(device_id)) != cudaSuccess)
+  {
+    std::cerr << "Can't set device: " << device_id << std::endl;
+    return -1;
+  }
+  return session.run(argc, argv);
+}
+#endif
+
diff --git a/include/cub/test/catch2_test_printing.cu b/include/cub/test/catch2_test_printing.cu
new file mode 100644
index 0000000..a2bdef7
--- /dev/null
+++ b/include/cub/test/catch2_test_printing.cu
@@ -0,0 +1,36 @@
+#include "test_util.h"
+
+#include "catch2_test_helper.h"
+
+// Renders `val` through its stream insertion operator and returns the text.
+template <class T> std::string print(T val)
+{
+  std::ostringstream out;
+  out << val;
+  return out.str();
+}
+
+#if CUB_IS_INT128_ENABLED
+TEST_CASE("Test utils can print __int128", "[test][utils]")
+{
+  REQUIRE( print(__int128_t{0}) == "0" );
+  REQUIRE( print(__int128_t{42}) == "42" );
+  REQUIRE( print(__int128_t{-1}) == "-1" );
+  REQUIRE( print(__int128_t{-42}) == "-42" );
+  REQUIRE( print(-1 * (__int128_t{1} << 120)) == "-1329227995784915872903807060280344576" ); // -(2^120)
+}
+// Unsigned counterpart; 2^120 exercises a value that does not fit in 64 bits.
+TEST_CASE("Test utils can print __uint128", "[test][utils]")
+{
+  REQUIRE( print(__uint128_t{0}) == "0" );
+  REQUIRE( print(__uint128_t{1}) == "1" );
+  REQUIRE( print(__uint128_t{42}) == "42" );
+  REQUIRE( print(__uint128_t{1} << 120) == "1329227995784915872903807060280344576" ); // 2^120
+}
+#endif // CUB_IS_INT128_ENABLED
+// Checks that a cub::KeyValuePair is printed as "(key,value)".
+TEST_CASE("Test utils can print KeyValuePair", "[test][utils]")
+{
+  REQUIRE( print(cub::KeyValuePair<int, int>{42, -42}) == "(42,-42)" );
+}
+
diff --git a/include/cub/test/catch2_test_radix_operations.cu b/include/cub/test/catch2_test_radix_operations.cu
new file mode 100644
index 0000000..5925ebc
--- /dev/null
+++ b/include/cub/test/catch2_test_radix_operations.cu
@@ -0,0 +1,686 @@
+/******************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#include <cub/block/radix_rank_sort_operations.cuh>
+
+#include <thrust/detail/raw_pointer_cast.h>
+#include <thrust/host_vector.h>
+
+#include <bitset>
+#include <climits>
+#include <limits>
+#include <type_traits>
+
+#include <catch2_test_helper.h>
+
+// Reference digit extractor for unsigned keys: `num_bits` bits starting at `bit_start`.
+template <typename KeyT>
+struct fundamental_extractor_t
+{
+  std::uint32_t bit_start;
+  std::uint32_t mask;
+  // The mask is built in unsigned math: a signed `(1 << 31) - 1` overflows `int`.
+  __host__ __device__ fundamental_extractor_t(std::uint32_t bit_start = 0,
+                                              std::uint32_t num_bits  = 0)
+      : bit_start(bit_start)
+      , mask(num_bits < 32 ? (std::uint32_t{1} << num_bits) - 1 : ~std::uint32_t{0})
+  {}
+  __host__ __device__ std::uint32_t Digit(KeyT key) const
+  {
+    return std::uint32_t(key >> KeyT(bit_start)) & mask;
+  }
+};
+
+// Returns sizeof(T) bytes filled by c2h::gen on the device, copied back to the host.
+template <class T> thrust::host_vector<std::uint8_t> get_random_buffer()
+{
+  thrust::device_vector<std::uint8_t> device_bytes(sizeof(T));
+  c2h::gen(CUB_SEED(3), device_bytes);
+  return device_bytes; // implicit device_vector -> host_vector conversion
+}
+
+constexpr int max_digit_bits = sizeof(std::uint32_t) * CHAR_BIT;
+using digit_bits_t           = std::bitset<max_digit_bits>;
+
+// Copies `num_bits` bits of `buffer`, starting at bit index `current_bit` (bit 0 being the
+// least significant bit of byte 0), into the low bits of the result; other bits stay zero.
+digit_bits_t buffer_to_digit_bits(const char *buffer, int current_bit, int num_bits)
+{
+  digit_bits_t digit; // default-constructed: all bits cleared
+
+  for (int offset = 0; offset < num_bits; ++offset)
+  {
+    const int absolute_bit   = current_bit + offset;
+    const unsigned char byte = static_cast<unsigned char>(buffer[absolute_bit / CHAR_BIT]);
+
+    digit[offset] = ((byte >> (absolute_bit % CHAR_BIT)) & 1u) != 0;
+  }
+
+  return digit;
+}
+
+using fundamental_types = c2h::type_list<std::uint8_t, std::uint16_t, std::uint32_t, std::uint64_t>;
+using a_few_fundamental_types = c2h::type_list<std::uint8_t, std::uint64_t>;
+
+/**
+ * This test checks that radix operations can extract certain bits out of unsigned integers.
+ * It runs for each `current_bit` of the key and each `num_bits` in `[1, max_bits)` (see loops).
+ * Example for `current_bit = 5`, and `num_bits = 4`:
+ *
+ *          [-------]
+ *    src: 1 1 0 0 1 1 0 0 1 1
+ *    bit: 9 8 7 6 5 4 3 2 1 0
+ *    dst: 0 0 0 0 0 0 1 0 0 1
+ *
+ */
+CUB_TEST("Radix operations extract digits from fundamental types",
+         "[radix][operations]",
+         fundamental_types)
+{
+  using key_t        = typename c2h::get<0, TestType>;
+  using traits       = cub::detail::radix::traits_t<key_t>;
+  using extractor_t  = fundamental_extractor_t<key_t>;
+  using decomposer_t = cub::detail::identity_decomposer_t;
+
+  auto decomposer            = decomposer_t{};
+  constexpr int max_key_bits = sizeof(key_t) * CHAR_BIT;
+  REQUIRE(traits::default_end_bit(decomposer) == max_key_bits);
+
+  key_t val{};
+  thrust::host_vector<char> output_buffer_mem(sizeof(std::uint32_t));
+  const thrust::host_vector<char> input_buffer_mem = get_random_buffer<key_t>();
+
+  char *output_buffer      = thrust::raw_pointer_cast(output_buffer_mem.data());
+  const char *input_buffer = thrust::raw_pointer_cast(input_buffer_mem.data());
+  std::memcpy(&val, input_buffer, sizeof(key_t)); // the key holds exactly the input bytes
+  // Each extracted digit must equal the same bit window read straight from the input bytes.
+  for (int current_bit = 0; current_bit < max_key_bits; current_bit++)
+  {
+    const int max_bits = std::min(max_key_bits - current_bit, max_digit_bits);
+
+    for (int num_bits = 1; num_bits < max_bits; num_bits++) // `num_bits == max_bits` is not run
+    {
+      auto extractor =
+        traits::template digit_extractor<extractor_t>(current_bit, num_bits, decomposer);
+
+      std::uint32_t digit = extractor.Digit(val);
+      std::memcpy(output_buffer, &digit, sizeof(std::uint32_t));
+
+      digit_bits_t result    = buffer_to_digit_bits(output_buffer, 0, num_bits);
+      digit_bits_t reference = buffer_to_digit_bits(input_buffer, current_bit, num_bits);
+
+      REQUIRE(reference == result);
+    }
+  }
+}
+
+template <class T>
+struct tuple_decomposer_t;
+// Decomposer for `::cuda::std::tuple`: exposes every element by reference, in tuple order.
+template <typename... Ts>
+struct tuple_decomposer_t<::cuda::std::tuple<Ts...>>
+{
+  template <std::size_t... Indices>
+  __host__ __device__ ::cuda::std::tuple<Ts &...> extract(::cuda::std::tuple<Ts...> &key,
+                                                          thrust::index_sequence<Indices...>) const
+  {
+    return ::cuda::std::tie(::cuda::std::get<Indices>(key)...);
+  }
+  // Builds the index pack for all elements and forwards to `extract`.
+  __host__ __device__ ::cuda::std::tuple<Ts &...> operator()(::cuda::std::tuple<Ts...> &key) const
+  {
+    return extract(key, thrust::make_index_sequence<sizeof...(Ts)>{});
+  }
+};
+
+// clang-format off
+// Helpers that copy tuple elements out of a raw byte buffer, from element I down to element 0.
+// Base case: element 0 ends the recursion.
+template <std::size_t I, class... Ts>
+typename ::cuda::std::enable_if<I == 0>::type
+buffer_to_tpl_helper(const char *buffer, ::cuda::std::tuple<Ts...> &tpl)
+{
+  using element_t = typename ::cuda::std::tuple_element<I, ::cuda::std::tuple<Ts...>>::type;
+  std::memcpy(&::cuda::std::get<I>(tpl), buffer, sizeof(element_t));
+}
+// Reads element I from the front of `buffer`, then continues with element I - 1 right after it.
+template <std::size_t I, class... Ts>
+typename ::cuda::std::enable_if<I != 0>::type
+buffer_to_tpl_helper(const char *buffer, ::cuda::std::tuple<Ts...> &tpl)
+{
+  using element_t = typename ::cuda::std::tuple_element<I, ::cuda::std::tuple<Ts...>>::type;
+  std::memcpy(&::cuda::std::get<I>(tpl), buffer, sizeof(element_t));
+  buffer_to_tpl_helper<I - 1>(buffer + sizeof(element_t), tpl);
+}
+// Fills `tpl` from tightly packed bytes; the last tuple element sits at the lowest address.
+template <class... Ts>
+void buffer_to_tpl(const char *buffer, ::cuda::std::tuple<Ts...> &tpl)
+{
+  buffer_to_tpl_helper<sizeof...(Ts) - 1>(buffer, tpl);
+}
+
+// Helpers that copy tuple elements into a raw byte buffer, from element I down to element 0.
+// Base case: element 0 ends the recursion.
+template <std::size_t I, class... Ts>
+typename ::cuda::std::enable_if<I == 0>::type
+tpl_to_buffer_helper(char *buffer, ::cuda::std::tuple<Ts...> &tpl)
+{
+  using element_t = typename ::cuda::std::tuple_element<I, ::cuda::std::tuple<Ts...>>::type;
+  std::memcpy(buffer, &::cuda::std::get<I>(tpl), sizeof(element_t));
+}
+// Writes element I at the front of `buffer`, then continues with element I - 1 right after it.
+template <std::size_t I, class... Ts>
+typename ::cuda::std::enable_if<I != 0>::type
+tpl_to_buffer_helper(char *buffer, ::cuda::std::tuple<Ts...> &tpl)
+{
+  using element_t = typename ::cuda::std::tuple_element<I, ::cuda::std::tuple<Ts...>>::type;
+  std::memcpy(buffer, &::cuda::std::get<I>(tpl), sizeof(element_t));
+  tpl_to_buffer_helper<I - 1>(buffer + sizeof(element_t), tpl);
+}
+// Packs `tpl` into `buffer` without padding; the last tuple element lands at the lowest address.
+template <class... Ts>
+void tpl_to_buffer(char *buffer, ::cuda::std::tuple<Ts...> &tpl)
+{
+  tpl_to_buffer_helper<sizeof...(Ts) - 1>(buffer, tpl);
+}
+
+// Past the last element: contributes no bits.
+template <std::size_t I = 0, class... Ts>
+typename ::cuda::std::enable_if<(I >= sizeof...(Ts)), int>::type
+tpl_to_max_bits(::cuda::std::tuple<Ts...> &)
+{
+  return 0;
+}
+// Sum of the bit widths of elements I..N-1; padding between tuple members is not counted.
+template <std::size_t I = 0, class... Ts>
+typename ::cuda::std::enable_if<(I < sizeof...(Ts)), int>::type
+tpl_to_max_bits(::cuda::std::tuple<Ts...> &tpl)
+{
+  using element_t = typename ::cuda::std::tuple_element<I, ::cuda::std::tuple<Ts...>>::type;
+  return sizeof(element_t) * CHAR_BIT + tpl_to_max_bits<I + 1>(tpl);
+}
+
+// Recursion end: every element has been assigned.
+template <std::size_t I = 0, class... Ts>
+typename ::cuda::std::enable_if<(I >= sizeof...(Ts))>::type
+tpl_to_min(::cuda::std::tuple<Ts...> &) {}
+// Sets elements I..N-1 of `tpl` to std::numeric_limits<element type>::lowest().
+template <std::size_t I = 0, class... Ts>
+typename ::cuda::std::enable_if<(I < sizeof...(Ts))>::type
+tpl_to_min(::cuda::std::tuple<Ts...> &tpl)
+{
+  using element_t = typename ::cuda::std::tuple_element<I, ::cuda::std::tuple<Ts...>>::type;
+  ::cuda::std::get<I>(tpl) = std::numeric_limits<element_t>::lowest();
+  tpl_to_min<I + 1>(tpl);
+}
+
+// Recursion end: every element has been assigned.
+template <std::size_t I = 0, class... Ts>
+typename ::cuda::std::enable_if<(I >= sizeof...(Ts))>::type
+tpl_to_max(::cuda::std::tuple<Ts...> &) {}
+// Sets elements I..N-1 of `tpl` to std::numeric_limits<element type>::max().
+template <std::size_t I = 0, class... Ts>
+typename ::cuda::std::enable_if<(I < sizeof...(Ts))>::type
+tpl_to_max(::cuda::std::tuple<Ts...> &tpl)
+{
+  using element_t = typename ::cuda::std::tuple_element<I, ::cuda::std::tuple<Ts...>>::type;
+  ::cuda::std::get<I>(tpl) = std::numeric_limits<element_t>::max();
+  tpl_to_max<I + 1>(tpl);
+}
+// clang-format on
+
+/**
+ * This test checks that radix operations can extract certain bits out of aggregate types.
+ * It runs for each `current_bit` and each `num_bits` in `[1, max_bits)`, excluding padding bits.
+ * For example, `struct custom_t { short s = 65535; float f = -42.2f; };` has the following binary
+ * representation:
+ *
+ *    <------------ `.f` ------------><-- padding ---><---- `.s` ---->
+ *    s< exp. ><----- mantissa ------><-- padding ---><--- short ---->
+ *    1100001000101000110011001100110100000000000000001111111111111111
+ *                               +---~                ~--+
+ *    <           <----  higher bits  /  lower bits  ---->           >
+ *
+ * For `current_bit = 12`, and `num_bits = 9`:
+ *    dst: 0000011011111
+ *         <   fp  ><sh>
+ *
+ */
+template <class... Ts>
+void test_tuple()
+{
+  using tpl_t        = ::cuda::std::tuple<Ts...>;
+  using traits       = cub::detail::radix::traits_t<tpl_t>;
+  using decomposer_t = tuple_decomposer_t<tpl_t>;
+  using extractor_t  = cub::detail::radix::custom_digit_extractor_t<decomposer_t>;
+
+  tpl_t tpl{};
+  thrust::host_vector<char> output_buffer_mem(sizeof(std::uint32_t));
+  const thrust::host_vector<char> input_buffer_mem = get_random_buffer<tpl_t>();
+
+  char *output_buffer      = thrust::raw_pointer_cast(output_buffer_mem.data());
+  const char *input_buffer = thrust::raw_pointer_cast(input_buffer_mem.data());
+  buffer_to_tpl(input_buffer, tpl); // elements are read from the packed front of the buffer
+
+  auto decomposer        = decomposer_t{};
+  const int max_key_bits = tpl_to_max_bits(tpl);
+  REQUIRE(traits::default_end_bit(decomposer) == max_key_bits);
+  // Each extracted digit must equal the same bit window read straight from the input bytes.
+  for (int current_bit = 0; current_bit < max_key_bits; current_bit++)
+  {
+    const int max_bits = std::min(max_key_bits - current_bit, max_digit_bits);
+
+    for (int num_bits = 1; num_bits < max_bits; num_bits++) // `num_bits == max_bits` is not run
+    {
+      auto extractor =
+        traits::template digit_extractor<extractor_t>(current_bit, num_bits, decomposer);
+
+      std::uint32_t digit = extractor.Digit(tpl);
+      std::memcpy(output_buffer, &digit, sizeof(std::uint32_t));
+
+      digit_bits_t result    = buffer_to_digit_bits(output_buffer, 0, num_bits);
+      digit_bits_t reference = buffer_to_digit_bits(input_buffer, current_bit, num_bits);
+
+      // Provides readable error messages:
+      //  00000000000000000000000000000000
+      //  ==
+      //  00000000000000000000000000000001
+      REQUIRE(reference == result);
+    }
+  }
+}
+// Instantiates test_tuple for every pair of types drawn from `fundamental_types`.
+CUB_TEST("Radix operations extract digits from pairs",
+         "[radix][operations]",
+         fundamental_types,
+         fundamental_types)
+{
+  test_tuple<typename c2h::get<0, TestType>, //
+             typename c2h::get<1, TestType>>();
+}
+// Instantiates test_tuple for every triple of types drawn from `fundamental_types`.
+CUB_TEST("Radix operations extract digits from triples",
+         "[radix][operations]",
+         fundamental_types,
+         fundamental_types,
+         fundamental_types)
+{
+  test_tuple<typename c2h::get<0, TestType>, //
+             typename c2h::get<1, TestType>, //
+             typename c2h::get<2, TestType>>();
+}
+// Four-element tuples draw their types from the shorter `a_few_fundamental_types` list.
+CUB_TEST("Radix operations extract digits from tetrads",
+         "[radix][operations]",
+         a_few_fundamental_types,
+         a_few_fundamental_types,
+         a_few_fundamental_types,
+         a_few_fundamental_types)
+{
+  test_tuple<typename c2h::get<0, TestType>, //
+             typename c2h::get<1, TestType>, //
+             typename c2h::get<2, TestType>, //
+             typename c2h::get<3, TestType>>();
+}
+
+/**
+ * This test checks that radix operations can invert bits (`~`) of fundamental types.
+ *
+ *    src: 1 1 0 0 1 1 0 0 1 1
+ *    dst: 0 0 1 1 0 0 1 1 0 0
+ *
+ */
+CUB_TEST("Radix operations inverse fundamental types", "[radix][operations]", fundamental_types)
+{
+  using key_t        = typename c2h::get<0, TestType>;
+  using traits       = cub::detail::radix::traits_t<key_t>;
+  using decomposer_t = cub::detail::identity_decomposer_t;
+
+  auto decomposer = decomposer_t{};
+
+  key_t val{};
+  thrust::host_vector<char> output_buffer_mem(sizeof(key_t));
+  thrust::host_vector<char> input_buffer_mem = get_random_buffer<key_t>();
+
+  char *output_buffer = thrust::raw_pointer_cast(output_buffer_mem.data());
+  char *input_buffer  = thrust::raw_pointer_cast(input_buffer_mem.data());
+  std::memcpy(&val, input_buffer, sizeof(key_t));
+
+  // `val` keeps the original bits; the input buffer becomes the byte-wise inverted reference.
+  for (std::size_t i = 0; i < input_buffer_mem.size(); i++)
+  {
+    input_buffer[i] = ~input_buffer[i];
+  }
+
+  key_t inv = traits::bit_ordered_inversion_policy::inverse(decomposer, val);
+  std::memcpy(output_buffer, &inv, sizeof(key_t));
+
+  REQUIRE(input_buffer_mem == output_buffer_mem);
+}
+
+/**
+ * This test checks that radix operations can invert bits (`~`) of aggregate types.
+ * For example, `struct custom_t { short s = 65535; float f = -42.2f; };`:
+ *
+ *      <------------ `.f` ------------><-- padding ---><---- `.s` ---->
+ *      s< exp. ><----- mantissa ------><-- padding ---><--- short ---->
+ * src: 1100001000101000110011001100110100000000000000001111111111111111
+ *      +------------------------------~                ~--------------+
+ * dst: 0011110111010111001100110011001011111111111111110000000000000000
+ *      <           <----  higher bits  /  lower bits  ---->           >
+ *
+ */
+CUB_TEST("Radix operations inverse pairs",
+         "[radix][operations]",
+         fundamental_types,
+         fundamental_types)
+{
+  using tpl_t = ::cuda::std::tuple<typename c2h::get<0, TestType>, //
+                                   typename c2h::get<1, TestType>>;
+
+  using traits       = cub::detail::radix::traits_t<tpl_t>;
+  using decomposer_t = tuple_decomposer_t<tpl_t>;
+
+  auto decomposer = decomposer_t{};
+
+  tpl_t tpl{};
+  thrust::host_vector<char> input_buffer_mem = get_random_buffer<tpl_t>();
+
+  char *input_buffer = thrust::raw_pointer_cast(input_buffer_mem.data());
+  buffer_to_tpl(input_buffer, tpl);
+
+  for (std::size_t i = 0; i < input_buffer_mem.size(); i++)
+  {
+    input_buffer[i] = ~input_buffer[i];
+  }
+
+  // Start from the inverted input so bytes beyond the packed elements already match.
+  thrust::host_vector<char> output_buffer_mem = input_buffer_mem;
+  char *output_buffer                         = thrust::raw_pointer_cast(output_buffer_mem.data());
+
+  tpl_t inv = traits::bit_ordered_inversion_policy::inverse(decomposer, tpl);
+  tpl_to_buffer(output_buffer, inv);
+
+  REQUIRE(input_buffer_mem == output_buffer_mem);
+}
+
+/**
+ * This test checks that radix operations can get a value that, when converted
+ * to binary-comparable representation, yields the smallest possible value.
+ *
+ * For the unsigned `fundamental_types` the expected value is
+ * `std::numeric_limits<key_t>::lowest()`.
+ */
+CUB_TEST("Radix operations infere minimal value for fundamental types",
+         "[radix][operations]",
+         fundamental_types)
+{
+  using key_t        = typename c2h::get<0, TestType>;
+  using traits       = cub::detail::radix::traits_t<key_t>;
+  using decomposer_t = cub::detail::identity_decomposer_t;
+
+  key_t ref = std::numeric_limits<key_t>::lowest();
+  key_t val = traits::min_raw_binary_key(decomposer_t{});
+
+  REQUIRE(ref == val);
+}
+// Tuple variant: each element of the expected key is its type's `lowest()` (via tpl_to_min).
+CUB_TEST("Radix operations infere minimal value for pair types",
+         "[radix][operations]",
+         fundamental_types,
+         fundamental_types)
+{
+  using tpl_t = ::cuda::std::tuple<typename c2h::get<0, TestType>, //
+                                   typename c2h::get<1, TestType>>;
+
+  using traits       = cub::detail::radix::traits_t<tpl_t>;
+  using decomposer_t = tuple_decomposer_t<tpl_t>;
+
+  tpl_t ref;
+  tpl_to_min(ref);
+
+  tpl_t val = traits::min_raw_binary_key(decomposer_t{});
+
+  REQUIRE(ref == val);
+}
+
+/**
+ * This test checks that radix operations can get a value that, when converted
+ * to binary-comparable representation, yields the largest possible value.
+ */
+CUB_TEST("Radix operations infere maximal value for fundamental types",
+         "[radix][operations]",
+         fundamental_types)
+{
+  using key_t        = typename c2h::get<0, TestType>;
+  using traits       = cub::detail::radix::traits_t<key_t>;
+  using decomposer_t = cub::detail::identity_decomposer_t;
+
+  key_t ref = std::numeric_limits<key_t>::max(); // expected value for the unsigned key types
+  key_t val = traits::max_raw_binary_key(decomposer_t{});
+
+  REQUIRE(ref == val);
+}
+// Tuple variant: each element of the expected key is its type's `max()` (via tpl_to_max).
+CUB_TEST("Radix operations infere maximal value for pair types",
+         "[radix][operations]",
+         fundamental_types,
+         fundamental_types)
+{
+  using tpl_t = ::cuda::std::tuple<typename c2h::get<0, TestType>, //
+                                   typename c2h::get<1, TestType>>;
+
+  using traits       = cub::detail::radix::traits_t<tpl_t>;
+  using decomposer_t = tuple_decomposer_t<tpl_t>;
+
+  tpl_t ref;
+  tpl_to_max(ref);
+
+  tpl_t val = traits::max_raw_binary_key(decomposer_t{});
+
+  REQUIRE(ref == val);
+}
+
+using fundamental_signed_types = // signed integer key types for the tuple reorder test
+  c2h::type_list<std::int8_t, std::int16_t, std::int32_t, std::int64_t>;
+
+/**
+ * This test checks that radix operations can convert a value to a binary-comparable
+ * representation. For example, `42.0f` is larger than `-42.0f`, but if we look at the
+ * binary representation, it's not the case because of the sign bit:
+ *
+ *         s< exp. ><----- mantissa ------>
+ *  42.0f: 01000010001010000000000000000000
+ * -42.0f: 11000010001010000000000000000000
+ *
+ */
+CUB_TEST("Radix operations reorder values for pair types",
+         "[radix][operations]",
+         fundamental_signed_types,
+         fundamental_signed_types)
+{
+  using T1    = typename c2h::get<0, TestType>;
+  using UT1   = typename std::make_unsigned<T1>::type;
+  using T2    = typename c2h::get<1, TestType>;
+  using UT2   = typename std::make_unsigned<T2>::type;
+  using tpl_t = ::cuda::std::tuple<T1, T2>;
+
+  using traits            = cub::detail::radix::traits_t<tpl_t>;
+  using conversion_policy = typename traits::bit_ordered_conversion_policy;
+  using decomposer_t      = tuple_decomposer_t<tpl_t>;
+
+  std::bitset<sizeof(T1) * CHAR_BIT> bs_1;
+  std::bitset<sizeof(T2) * CHAR_BIT> bs_2;
+
+  // 10000(0): set only the most significant bit of each member
+  bs_1.set(sizeof(T1) * CHAR_BIT - 1);
+  bs_2.set(sizeof(T2) * CHAR_BIT - 1);
+
+  UT1 ul_1 = static_cast<UT1>(bs_1.to_ullong());
+  UT2 ul_2 = static_cast<UT2>(bs_2.to_ullong());
+
+  T1 l_1 = reinterpret_cast<T1 &>(ul_1); // reinterpret the unsigned bits as the signed type
+  T2 l_2 = reinterpret_cast<T2 &>(ul_2);
+  // Sanity check: that bit pattern must be the lowest value of the signed type.
+  REQUIRE(l_1 == std::numeric_limits<T1>::lowest());
+  REQUIRE(l_2 == std::numeric_limits<T2>::lowest());
+  // Case 1: the lowest values must map to all-zero members and convert back unchanged.
+  {
+    tpl_t ref{T1{0}, T2{0}};
+    const tpl_t unordered_val = tpl_t{l_1, l_2};
+    const tpl_t ordered_val   = conversion_policy::to_bit_ordered(decomposer_t{}, unordered_val);
+
+    REQUIRE(ref == ordered_val);
+
+    const tpl_t restored_val = conversion_policy::from_bit_ordered(decomposer_t{}, ordered_val);
+    REQUIRE(restored_val == unordered_val);
+  }
+  // Case 2: start again from the largest signed values.
+  ul_1 = static_cast<UT1>(std::numeric_limits<T1>::max());
+  ul_2 = static_cast<UT2>(std::numeric_limits<T2>::max());
+
+  l_1 = reinterpret_cast<T1 &>(ul_1);
+  l_2 = reinterpret_cast<T2 &>(ul_2);
+
+  bs_1 = ul_1;
+  bs_2 = ul_2;
+  // Sanity check: the most significant bit is clear for the largest signed values.
+  REQUIRE_FALSE(bs_1[sizeof(T1) * CHAR_BIT - 1]);
+  REQUIRE_FALSE(bs_2[sizeof(T2) * CHAR_BIT - 1]);
+
+  {
+    const tpl_t unordered_val = tpl_t{l_1, l_2};
+    const tpl_t ordered_val   = conversion_policy::to_bit_ordered(decomposer_t{}, unordered_val);
+
+    ul_1 = reinterpret_cast<const UT1 &>(::cuda::std::get<0>(ordered_val));
+    ul_2 = reinterpret_cast<const UT2 &>(::cuda::std::get<1>(ordered_val));
+    // Viewed as unsigned, the converted members must be the largest unsigned values.
+    REQUIRE(ul_1 == std::numeric_limits<UT1>::max());
+    REQUIRE(ul_2 == std::numeric_limits<UT2>::max());
+
+    const tpl_t restored_val = conversion_policy::from_bit_ordered(decomposer_t{}, ordered_val);
+    REQUIRE(restored_val == unordered_val); // round trip restores the original tuple
+  }
+}
+
+struct fp_aggregate_t // aggregate key with two floating-point members
+{
+  double fp64; // double-precision member
+  float fp32;  // single-precision member
+};
+
+struct fp_aggregate_decomposer_t // yields member references in the order (fp64, fp32)
+{
+  __host__ __device__ ::cuda::std::tuple<double &, float &> operator()(fp_aggregate_t &val) const
+  {
+    return ::cuda::std::tuple<double &, float &>{val.fp64, val.fp32};
+  }
+};
+
+struct flipped_fp_aggregate_decomposer_t // yields member references in the order (fp32, fp64)
+{
+  __host__ __device__ ::cuda::std::tuple<float &, double &> operator()(fp_aggregate_t &val) const
+  {
+    return ::cuda::std::tuple<float &, double &>{val.fp32, val.fp64};
+  }
+};
+
+/**
+ * This test checks that radix sort is guaranteed to treat +0/-0 as the same value.
+ */
+TEST_CASE("Radix operations treat -0/+0 as being equal", "[radix][operations]")
+{
+  using decomposer_t      = fp_aggregate_decomposer_t;
+  using extractor_t       = cub::detail::radix::custom_digit_extractor_t<decomposer_t>;
+  using traits            = cub::detail::radix::traits_t<fp_aggregate_t>;
+  using conversion_policy = typename traits::bit_ordered_conversion_policy;
+
+  // Two aggregates that differ only in the sign of their zero members.
+  fp_aggregate_t negative{-0.0, -0.0f};
+  fp_aggregate_t positive{+0.0, +0.0f};
+  fp_aggregate_t ordered_negative = conversion_policy::to_bit_ordered(decomposer_t{}, negative);
+  fp_aggregate_t ordered_positive = conversion_policy::to_bit_ordered(decomposer_t{}, positive);
+  const int num_bits = CHAR_BIT; // width of one extracted digit
+
+  // NOTE(review): with CHAR_BIT == 8 this loop runs once, so only the digit at bit 0 is compared.
+  for (int bit = 0; bit < 8; bit += num_bits)
+  {
+    auto extractor = traits::digit_extractor<extractor_t>(bit, num_bits, decomposer_t{});
+    // The extracted digits must match even though the zeros have different signs.
+    const std::uint32_t digit_negative = extractor.Digit(ordered_negative);
+    const std::uint32_t digit_positive = extractor.Digit(ordered_positive);
+    REQUIRE(digit_positive == digit_negative);
+  }
+}
+
+/**
+ * This test checks that radix operations respect the order of fields in the
+ * tuple instead of looking at the binary key representation.
+ */
+TEST_CASE("Radix operations allow fields permutation", "[radix][operations]")
+{
+  using traits            = cub::detail::radix::traits_t<fp_aggregate_t>;
+  using conversion_policy = typename traits::bit_ordered_conversion_policy;
+  using decomposer_t      = flipped_fp_aggregate_decomposer_t;
+  using extractor_t       = cub::detail::radix::custom_digit_extractor_t<decomposer_t>;
+
+  fp_aggregate_t lhs{4.2, 2.4f};
+  fp_aggregate_t rhs{2.4, 4.2f};
+  // Compared in declaration order (fp64, fp32), lhs is the greater aggregate.
+  REQUIRE(::cuda::std::tie(lhs.fp64, lhs.fp32) > cuda::std::tie(rhs.fp64, rhs.fp32));
+  // The flipped decomposer exposes (fp32, fp64); each aggregate is converted from its own value.
+  fp_aggregate_t ordered_lhs = conversion_policy::to_bit_ordered(decomposer_t{}, lhs);
+  fp_aggregate_t ordered_rhs = conversion_policy::to_bit_ordered(decomposer_t{}, rhs);
+
+  const int num_bits       = CHAR_BIT;
+  const int aggregate_bits = static_cast<int>(sizeof(float) + sizeof(double)) * CHAR_BIT;
+  // Walk the digits from the highest bit offset down and stop at the first one that differs.
+  for (int current_bit = aggregate_bits - num_bits; current_bit >= 0; current_bit -= num_bits)
+  {
+    auto extractor = traits::digit_extractor<extractor_t>(current_bit, num_bits, decomposer_t{});
+
+    const std::uint32_t digit_lhs = extractor.Digit(ordered_lhs);
+    const std::uint32_t digit_rhs = extractor.Digit(ordered_rhs);
+
+    if (digit_lhs == digit_rhs)
+    {
+      continue;
+    }
+    // In that digit, scanning from bit 31 down, rhs must show a set bit before lhs does.
+    std::bitset<32> bs_lhs(digit_lhs);
+    std::bitset<32> bs_rhs(digit_rhs);
+
+    for (int bit = 31; bit >= 0; bit--)
+    {
+      REQUIRE_FALSE(bs_lhs[bit]);
+      if (bs_rhs[bit])
+      {
+        return;
+      }
+    }
+  }
+}
diff --git a/include/cub/test/catch2_test_util_type.cu b/include/cub/test/catch2_test_util_type.cu
new file mode 100644
index 0000000..1604580
--- /dev/null
+++ b/include/cub/test/catch2_test_util_type.cu
@@ -0,0 +1,70 @@
+/******************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#include <cub/iterator/counting_input_iterator.cuh>
+#include <cub/iterator/discard_output_iterator.cuh>
+#include <cub/util_type.cuh>
+
+#include <cuda/std/type_traits>
+
+// Has to go after all cub headers. Otherwise, this test won't catch unused
+// variables in cub kernels.
+#include "catch2_test_helper.h"
+
+CUB_TEST("Tests non_void_value_t", "[util][type]")
+{
+  // Fallback type and the two fancy iterators used by the cases below.
+  using fallback_t  = float;
+  using discard_it  = cub::DiscardOutputIterator<std::size_t>; // value type is void
+  using counting_it = cub::CountingInputIterator<int>;         // value type is int
+
+  // Pointers to void, with any cv-qualification, resolve to the fallback type.
+  using from_const_void    = cub::detail::non_void_value_t<const void *, fallback_t>;
+  using from_cv_void       = cub::detail::non_void_value_t<const volatile void *, fallback_t>;
+  using from_volatile_void = cub::detail::non_void_value_t<volatile void *, fallback_t>;
+  using from_plain_void    = cub::detail::non_void_value_t<void *, fallback_t>;
+
+  STATIC_REQUIRE(::cuda::std::is_same<fallback_t, from_const_void>::value);
+  STATIC_REQUIRE(::cuda::std::is_same<fallback_t, from_cv_void>::value);
+  STATIC_REQUIRE(::cuda::std::is_same<fallback_t, from_volatile_void>::value);
+  STATIC_REQUIRE(::cuda::std::is_same<fallback_t, from_plain_void>::value);
+
+  // A pointer to a non-void type reports its pointee type; the fallback is not used.
+  using from_int_ptr = cub::detail::non_void_value_t<int *, void>;
+
+  STATIC_REQUIRE(::cuda::std::is_same<int, from_int_ptr>::value);
+
+  // A fancy iterator whose value type is void resolves to the fallback type.
+  using from_discard_it = cub::detail::non_void_value_t<discard_it, fallback_t>;
+
+  STATIC_REQUIRE(::cuda::std::is_same<fallback_t, from_discard_it>::value);
+
+  // A fancy iterator whose value type is int reports int.
+  using from_counting_it = cub::detail::non_void_value_t<counting_it, fallback_t>;
+
+  STATIC_REQUIRE(::cuda::std::is_same<int, from_counting_it>::value);
+}
diff --git a/include/cub/test/catch2_test_warp_exchange.cu b/include/cub/test/catch2_test_warp_exchange.cu
new file mode 100644
index 0000000..6987aec
--- /dev/null
+++ b/include/cub/test/catch2_test_warp_exchange.cu
@@ -0,0 +1,354 @@
+/******************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#include <cub/util_macro.cuh>
+#include <cub/warp/warp_exchange.cuh>
+
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include <thrust/reverse.h>
+#include <thrust/sequence.h>
+
+#include <type_traits>
+
+#include "fill_striped.cuh"
+// Has to go after all cub headers. Otherwise, this test won't catch unused
+// variables in cub kernels.
+#include "catch2_test_helper.h"
+
+template <typename InputT, typename OutputT, int ItemsPerThread, typename = void>
+struct exchange_data_t;
+// Same input and output type: `output` is a reference to `input`, so one array serves both roles.
+template <typename InputT, typename OutputT, int ItemsPerThread>
+struct exchange_data_t<InputT,
+                       OutputT,
+                       ItemsPerThread,
+                       typename std::enable_if<std::is_same<InputT, OutputT>::value>::type>
+{
+  InputT input[ItemsPerThread];
+  OutputT (&output)[ItemsPerThread] = input;
+  // Forwards to the ScatterToStriped overload that takes a single items array.
+  template <int WarpThreads>
+  inline __device__ void
+  scatter(cub::WarpExchange<InputT, ItemsPerThread, WarpThreads> &exchange,
+          int (&ranks)[ItemsPerThread])
+  {
+    exchange.ScatterToStriped(input, ranks);
+  }
+};
+
+template <typename InputT, typename OutputT, int ItemsPerThread>
+struct exchange_data_t<InputT,
+                       OutputT,
+                       ItemsPerThread,
+                       typename std::enable_if<!std::is_same<InputT, OutputT>::value>::type>
+{
+  InputT input[ItemsPerThread];   // items before the exchange
+  OutputT output[ItemsPerThread]; // separate destination, since the types differ
+  // Forwards to the ScatterToStriped overload that takes distinct input and output arrays.
+  template <int WarpThreads>
+  inline __device__ void
+  scatter(cub::WarpExchange<InputT, ItemsPerThread, WarpThreads> &exchange,
+          int (&ranks)[ItemsPerThread])
+  {
+    exchange.ScatterToStriped(input, output, ranks);
+  }
+};
+
+// Test kernel: each logical warp reverses its own tile through ScatterToStriped.
+// input_data / output_data are indexed as consecutive tiles, one per logical warp.
+template <int LOGICAL_WARP_THREADS,
+          int ITEMS_PER_THREAD,
+          int TOTAL_WARPS,
+          typename InputT,
+          typename OutputT>
+__global__ void scatter_kernel(const InputT *input_data, OutputT *output_data)
+{
+  using warp_exchange_t = cub::WarpExchange<InputT, ITEMS_PER_THREAD, LOGICAL_WARP_THREADS>;
+  using storage_t       = typename warp_exchange_t::TempStorage;
+
+  constexpr int tile_size = ITEMS_PER_THREAD * LOGICAL_WARP_THREADS;
+  __shared__ storage_t temp_storage[TOTAL_WARPS];
+
+  // Split the linear thread index into a logical warp and a lane within it.
+  const int linear_tid = cub::RowMajorTid(blockDim.x, blockDim.y, blockDim.z);
+  const int warp_idx   = linear_tid / LOGICAL_WARP_THREADS;
+  const int lane_idx   = linear_tid % LOGICAL_WARP_THREADS;
+
+  // Every logical warp works on its own tile and its own temporary storage.
+  const InputT *warp_in = input_data + warp_idx * tile_size;
+  OutputT *warp_out     = output_data + warp_idx * tile_size;
+  warp_exchange_t exchange(temp_storage[warp_idx]);
+
+  exchange_data_t<InputT, OutputT, ITEMS_PER_THREAD> exchange_data;
+  int ranks[ITEMS_PER_THREAD];
+
+  // Load items in blocked order and rank them so that the tile ends up reversed.
+  for (int i = 0; i < ITEMS_PER_THREAD; i++)
+  {
+    const auto src_idx     = lane_idx * ITEMS_PER_THREAD + i;
+    exchange_data.input[i] = warp_in[src_idx];
+    ranks[i]               = tile_size - 1 - src_idx;
+  }
+
+  exchange_data.scatter(exchange, ranks);
+
+  // The scattered items are in striped arrangement; write them back accordingly.
+  for (int i = 0; i < ITEMS_PER_THREAD; i++)
+  {
+    warp_out[i * LOGICAL_WARP_THREADS + lane_idx] = exchange_data.output[i];
+  }
+}
+
+template <int LOGICAL_WARP_THREADS,
+          int ITEMS_PER_THREAD,
+          int TOTAL_WARPS,
+          typename InputT,
+          typename OutputT>
+void warp_scatter_strided(thrust::device_vector<InputT> &in, thrust::device_vector<OutputT> &out)
+{
+  constexpr int block_threads = LOGICAL_WARP_THREADS * TOTAL_WARPS; // one block holds all warps
+  scatter_kernel<LOGICAL_WARP_THREADS, ITEMS_PER_THREAD, TOTAL_WARPS, InputT, OutputT>
+    <<<1, block_threads>>>(thrust::raw_pointer_cast(in.data()), thrust::raw_pointer_cast(out.data()));
+  REQUIRE(cudaSuccess == cudaPeekAtLastError());
+  REQUIRE(cudaSuccess == cudaDeviceSynchronize());
+}
+
+// Generic test kernel: each thread loads ITEMS_PER_THREAD consecutive items, `action`
+// performs the exchange, and each thread stores its output items at the same offsets.
+template <int LOGICAL_WARP_THREADS,
+          int ITEMS_PER_THREAD,
+          int TOTAL_WARPS,
+          typename InputT,
+          typename OutputT,
+          typename ActionT>
+__global__ void kernel(const InputT *input_data, OutputT *output_data, ActionT action)
+{
+  using warp_exchange_t = cub::WarpExchange<InputT, ITEMS_PER_THREAD, LOGICAL_WARP_THREADS>;
+  using storage_t       = typename warp_exchange_t::TempStorage;
+
+  constexpr int tile_size = ITEMS_PER_THREAD * LOGICAL_WARP_THREADS;
+  __shared__ storage_t temp_storage[TOTAL_WARPS];
+
+  // Split the linear thread index into a logical warp and a lane within it.
+  const int linear_tid = cub::RowMajorTid(blockDim.x, blockDim.y, blockDim.z);
+  const int warp_idx   = linear_tid / LOGICAL_WARP_THREADS;
+  const int lane_idx   = linear_tid % LOGICAL_WARP_THREADS;
+
+  warp_exchange_t exchange(temp_storage[warp_idx]);
+  exchange_data_t<InputT, OutputT, ITEMS_PER_THREAD> exchange_data;
+
+  // Offset of this thread's first item: start of the warp's tile plus the lane's slice.
+  const int thread_offset = warp_idx * tile_size + lane_idx * ITEMS_PER_THREAD;
+
+  for (int i = 0; i < ITEMS_PER_THREAD; i++)
+  {
+    exchange_data.input[i] = input_data[thread_offset + i];
+  }
+
+  action(exchange_data.input, exchange_data.output, exchange);
+
+  for (int i = 0; i < ITEMS_PER_THREAD; i++)
+  {
+    output_data[thread_offset + i] = exchange_data.output[i];
+  }
+}
+
+template <int LOGICAL_WARP_THREADS,
+          int ITEMS_PER_THREAD,
+          int TOTAL_WARPS,
+          typename InputT,
+          typename OutputT,
+          typename ActionT>
+void warp_exchange(thrust::device_vector<InputT> &in,
+                   thrust::device_vector<OutputT> &out,
+                   ActionT action)
+{
+  // Raw device pointers for the kernel; a single block holds all logical warps.
+  const InputT *d_in_ptr = thrust::raw_pointer_cast(in.data());
+  OutputT *d_out_ptr     = thrust::raw_pointer_cast(out.data());
+  kernel<LOGICAL_WARP_THREADS, ITEMS_PER_THREAD, TOTAL_WARPS, InputT, OutputT, ActionT>
+    <<<1, LOGICAL_WARP_THREADS * TOTAL_WARPS>>>(d_in_ptr, d_out_ptr, action);
+  REQUIRE(cudaSuccess == cudaPeekAtLastError());
+  REQUIRE(cudaSuccess == cudaDeviceSynchronize());
+}
+
+// Action functor for the generic kernel: forwards the thread's items to
+// WarpExchange::BlockedToStriped.
+struct blocked_to_striped
+{
+  template <typename InputT, typename OutputT,
+            int WarpThreads, int ExchangeItems, int ArrayItems>
+  __device__ void operator()(InputT (&input)[ArrayItems],
+                             OutputT (&output)[ArrayItems],
+                             cub::WarpExchange<InputT, ExchangeItems, WarpThreads> &exchange)
+  {
+    // The array extent and the exchange's item count are deduced as separate parameters.
+    exchange.BlockedToStriped(input, output);
+  }
+};
+
+// Action functor for the generic kernel: forwards the thread's items to
+// WarpExchange::StripedToBlocked.
+struct striped_to_blocked
+{
+  template <typename InputT, typename OutputT,
+            int WarpThreads, int ExchangeItems, int ArrayItems>
+  __device__ void operator()(InputT (&input)[ArrayItems],
+                             OutputT (&output)[ArrayItems],
+                             cub::WarpExchange<InputT, ExchangeItems, WarpThreads> &exchange)
+  {
+    // The array extent and the exchange's item count are deduced as separate parameters.
+    exchange.StripedToBlocked(input, output);
+  }
+};
+
+// Host-side reference: a copy of d_input with every tile_size-long chunk reversed.
+template <typename T>
+thrust::host_vector<T> compute_host_reference(const thrust::device_vector<T> &d_input,
+                                              int tile_size)
+{
+  thrust::host_vector<T> h_result = d_input; // device -> host copy
+  const int num_tiles = CUB_QUOTIENT_CEILING(static_cast<int>(d_input.size()), tile_size);
+
+  for (int tile = 0; tile < num_tiles; tile++)
+  {
+    auto tile_begin = h_result.begin() + tile_size * tile;
+    thrust::reverse(tile_begin, tile_begin + tile_size);
+  }
+  return h_result;
+}
+
+using inout_types = c2h::type_list<c2h::pair<std::uint16_t, std::int64_t>, // (input, output) type pairs
+                                   c2h::pair<std::uint16_t, std::uint32_t>,
+                                   c2h::pair<std::int32_t, std::int32_t>,
+                                   c2h::pair<std::int64_t, std::int64_t>,
+                                   c2h::pair<uchar3, uchar3>,
+                                   c2h::pair<ulonglong4, ulonglong4>>;
+
+using logical_warp_threads = c2h::enum_type_list<int, 4, 16, 32>; // logical warp sizes under test
+using items_per_thread     = c2h::enum_type_list<int, 1, 4, 7>; // items held by each thread
+
+// Number of logical warps to launch for a given logical warp size.
+template <int logical_warp_threads>
+struct total_warps_t
+{
+private:
+  static constexpr int max_warps      = 2;
+  static constexpr bool is_arch_warp  = (CUB_WARP_THREADS(0) == logical_warp_threads);
+  static constexpr bool is_pow_of_two = (0 == (logical_warp_threads & (logical_warp_threads - 1)));
+
+public: // max_warps for an architectural-size or power-of-two warp, otherwise a single warp
+  static constexpr int value() { return (is_arch_warp || is_pow_of_two) ? max_warps : 1; }
+};
+
+template <class TestType>
+struct params_t // unpacks one test type combination into named parameters
+{
+  using in_type  = typename c2h::first<c2h::get<0, TestType>>; // first element of the type pair
+  using out_type = typename c2h::second<c2h::get<0, TestType>>; // second element of the type pair
+
+  static constexpr int logical_warp_threads = c2h::get<1, TestType>::value;
+  static constexpr int items_per_thread     = c2h::get<2, TestType>::value;
+  static constexpr int total_warps          = total_warps_t<logical_warp_threads>::value();
+  static constexpr int tile_size            = logical_warp_threads * items_per_thread; // items per logical warp
+  static constexpr int total_item_count     = total_warps * tile_size; // items across all logical warps
+};
+
+CUB_TEST("Scatter to striped works",
+         "[exchange][warp]",
+         inout_types,
+         logical_warp_threads,
+         items_per_thread)
+{
+  using params   = params_t<TestType>;
+  using in_type  = typename params::in_type;
+  using out_type = typename params::out_type;
+
+  thrust::device_vector<in_type> d_in(params::total_item_count);
+  c2h::gen(c2h::modulo_t{d_in.size()}, d_in);
+  thrust::device_vector<out_type> d_out(params::total_item_count);
+  warp_scatter_strided<params::logical_warp_threads, params::items_per_thread, params::total_warps>(
+    d_in,
+    d_out);
+
+  // Each logical warp is expected to have reversed its own tile.
+  auto h_expected_output = compute_host_reference(d_in, params::tile_size);
+  REQUIRE(h_expected_output == d_out);
+}
+
+CUB_TEST("Blocked to striped works",
+         "[exchange][warp]",
+         inout_types,
+         logical_warp_threads,
+         items_per_thread)
+{
+  using params   = params_t<TestType>;
+  using in_type  = typename params::in_type;
+  using out_type = typename params::out_type;
+
+  thrust::device_vector<in_type> d_in(params::total_item_count);
+  c2h::gen(c2h::modulo_t{d_in.size()}, d_in);
+  thrust::device_vector<out_type> d_out(params::total_item_count, out_type{});
+  warp_exchange<params::logical_warp_threads, params::items_per_thread, params::total_warps>(
+    d_in,
+    d_out,
+    blocked_to_striped{});
+
+  // Reference: fill_striped (from fill_striped.cuh) produces the expected host-side data.
+  thrust::host_vector<out_type> h_expected_output(d_out.size());
+  fill_striped<params::logical_warp_threads,
+               params::items_per_thread,
+               params::logical_warp_threads * params::total_warps>(h_expected_output.begin());
+  REQUIRE(h_expected_output == d_out);
+}
+
+CUB_TEST("Striped to blocked works",
+         "[exchange][warp]",
+         inout_types,
+         logical_warp_threads,
+         items_per_thread)
+{
+  using params   = params_t<TestType>;
+  using in_type  = typename params::in_type;
+  using out_type = typename params::out_type;
+
+  thrust::host_vector<in_type> h_in(params::total_item_count); // input built on the host by fill_striped
+  fill_striped<params::logical_warp_threads,
+               params::items_per_thread,
+               params::logical_warp_threads * params::total_warps>(h_in.begin());
+  thrust::device_vector<in_type> d_in = h_in;
+  thrust::device_vector<out_type> d_out(params::total_item_count, out_type{});
+
+  warp_exchange<params::logical_warp_threads, params::items_per_thread, params::total_warps>(
+    d_in,
+    d_out,
+    striped_to_blocked{});
+  // Reference: the sequence c2h::gen produces for modulo_t over the output size.
+  thrust::device_vector<out_type> d_expected_output(d_out.size());
+  c2h::gen(c2h::modulo_t{d_out.size()}, d_expected_output);
+  REQUIRE(d_expected_output == d_out);
+}
diff --git a/include/cub/test/catch2_test_warp_load.cu b/include/cub/test/catch2_test_warp_load.cu
new file mode 100644
index 0000000..1bd6bbb
--- /dev/null
+++ b/include/cub/test/catch2_test_warp_load.cu
@@ -0,0 +1,387 @@
+/******************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#include <cub/detail/cpp_compatibility.cuh>
+#include <cub/iterator/cache_modified_input_iterator.cuh>
+#include <cub/warp/warp_load.cuh>
+
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include <thrust/sequence.h>
+
+#include "fill_striped.cuh"
+// Has to go after all cub headers. Otherwise, this test won't catch unused
+// variables in cub kernels.
+#include "catch2_test_helper.h"
+
+// Test kernel: every logical warp loads its tile through `action.load` and then
+// checks the loaded registers through `action.verify`.
+template <cub::WarpLoadAlgorithm LoadAlgorithm,
+          int LOGICAL_WARP_THREADS,
+          int ITEMS_PER_THREAD,
+          int TOTAL_WARPS,
+          typename T,
+          typename InputIteratorT,
+          typename ActionT>
+__global__ void warp_load_kernel(InputIteratorT input_iterator, ActionT action, int *error_counter)
+{
+  using warp_load_t = cub::WarpLoad<T, ITEMS_PER_THREAD, LoadAlgorithm, LOGICAL_WARP_THREADS>;
+  using storage_t   = typename warp_load_t::TempStorage;
+
+  constexpr int tile_size = ITEMS_PER_THREAD * LOGICAL_WARP_THREADS;
+  __shared__ storage_t storage[TOTAL_WARPS];
+
+  // Only threadIdx.x is used to derive the logical warp index.
+  const int warp_idx = threadIdx.x / LOGICAL_WARP_THREADS;
+  warp_load_t load(storage[warp_idx]);
+
+  // Let the action perform the load under test, starting at this warp's tile ...
+  T reg[ITEMS_PER_THREAD];
+  action.load(load, input_iterator + (warp_idx * tile_size), reg);
+
+  // ... and check the registers; the action receives error_counter to report mismatches.
+  action.verify(reg, error_counter);
+}
+
+// Launches warp_load_kernel in a single block and requires launch and execution to succeed.
+template <cub::WarpLoadAlgorithm LoadAlgorithm,
+          int LOGICAL_WARP_THREADS,
+          int ITEMS_PER_THREAD,
+          int TOTAL_WARPS,
+          typename T,
+          typename InputIteratorT,
+          typename ActionT>
+void warp_load(InputIteratorT input_iterator, ActionT action, int *error_counter)
+{
+  constexpr int block_threads = TOTAL_WARPS * LOGICAL_WARP_THREADS;
+
+  warp_load_kernel<LoadAlgorithm, LOGICAL_WARP_THREADS, ITEMS_PER_THREAD, TOTAL_WARPS,
+                   T, InputIteratorT, ActionT>
+    <<<1, block_threads>>>(input_iterator, action, error_counter);
+
+  // Check the launch first, then wait for the kernel and check its execution status.
+  REQUIRE(cudaSuccess == cudaPeekAtLastError());
+  REQUIRE(cudaSuccess == cudaDeviceSynchronize());
+}
+
+/**
+ * @brief WarpLoad test action for guarded loads: registers whose tile position is at or
+ *        beyond `valid_items` are expected to hold `oob_default`.
+ */
+template <cub::WarpLoadAlgorithm LoadAlgorithm, int LOGICAL_WARP_THREADS, typename T>
+struct guarded_load_t
+{
+  int valid_items; // number of in-range items in a logical warp's tile
+  T oob_default;   // value expected in registers past valid_items
+
+  template <int ITEMS_PER_THREAD, typename InputIteratorT>
+  __device__ void load(cub::WarpLoad<T, ITEMS_PER_THREAD, LoadAlgorithm, LOGICAL_WARP_THREADS> load,
+                       InputIteratorT input,
+                       T (&reg)[ITEMS_PER_THREAD])
+  {
+    // Guarded Load overload: passes the valid item count and the out-of-bounds default.
+    load.Load(input, reg, valid_items, oob_default);
+  }
+
+  template <int ITEMS_PER_THREAD>
+  __device__ void verify(T (&reg)[ITEMS_PER_THREAD], int *error_counter)
+  {
+    constexpr bool is_striped = (LoadAlgorithm == cub::WarpLoadAlgorithm::WARP_LOAD_STRIPED);
+    const auto linear_tid     = cub::RowMajorTid(blockDim.x, blockDim.y, blockDim.z);
+    const auto lane_id        = linear_tid % LOGICAL_WARP_THREADS;
+
+    for (int item = 0; item < ITEMS_PER_THREAD; item++)
+    {
+      // Position of this register within the warp's tile for the tested arrangement.
+      const auto tile_pos = is_striped ? item * LOGICAL_WARP_THREADS + lane_id
+                                       : lane_id * ITEMS_PER_THREAD + item;
+
+      // In-range registers hold linear_tid * ITEMS_PER_THREAD + item; the rest hold the default.
+      const T expected = (tile_pos >= valid_items)
+                           ? oob_default
+                           : static_cast<T>(linear_tid * ITEMS_PER_THREAD + item);
+
+      if (reg[item] != expected)
+      {
+        atomicAdd(error_counter, 1);
+      }
+    }
+  }
+};
+
+/**
+ * @brief WarpLoad test action for unguarded loads: every register is checked against its index.
+ */
+struct unguarded_load_t
+{
+  template <cub::WarpLoadAlgorithm LoadAlgorithm,
+            int LOGICAL_WARP_THREADS,
+            int ITEMS_PER_THREAD,
+            typename T,
+            typename InputIteratorT>
+  __device__ void load(cub::WarpLoad<T, ITEMS_PER_THREAD, LoadAlgorithm, LOGICAL_WARP_THREADS> load,
+                       InputIteratorT input,
+                       T (&reg)[ITEMS_PER_THREAD])
+  {
+    load.Load(input, reg);
+  }
+
+  template <typename T, int ITEMS_PER_THREAD>
+  __device__ void verify(T (&reg)[ITEMS_PER_THREAD], int *error_counter)
+  {
+    const auto first_expected = threadIdx.x * ITEMS_PER_THREAD;
+    // Register i of a thread must hold first_expected + i; each mismatch bumps the counter.
+    for (int i = 0; i < ITEMS_PER_THREAD; i++)
+    {
+      if (reg[i] != static_cast<T>(first_expected + i))
+      {
+        atomicAdd(error_counter, 1);
+      }
+    }
+  }
+};
+
+// Builds a device vector of TOTAL_WARPS tiles, laid out for the selected load algorithm:
+// a striped fill for WARP_LOAD_STRIPED, the c2h::modulo_t sequence for the others.
+template <cub::WarpLoadAlgorithm LoadAlgorithm,
+          int LOGICAL_WARP_THREADS,
+          int ITEMS_PER_THREAD,
+          int TOTAL_WARPS,
+          typename T>
+thrust::device_vector<T> generate_input()
+{
+  const int tile_size = LOGICAL_WARP_THREADS * ITEMS_PER_THREAD;
+  const int num_items = TOTAL_WARPS * tile_size;
+
+  thrust::device_vector<T> d_input(num_items);
+
+  if (LoadAlgorithm != cub::WarpLoadAlgorithm::WARP_LOAD_STRIPED)
+  {
+    c2h::gen(c2h::modulo_t{num_items}, d_input);
+  }
+  else
+  {
+    // A different stripe pattern is needed here, so the items/threads
+    // parameters passed to fill_striped are swapped.
+    constexpr int FAKE_BLOCK_SIZE = ITEMS_PER_THREAD * TOTAL_WARPS;
+
+    thrust::host_vector<T> h_input(num_items);
+    fill_striped<ITEMS_PER_THREAD, LOGICAL_WARP_THREADS, FAKE_BLOCK_SIZE>(h_input.begin());
+    d_input = h_input;
+  }
+
+  return d_input;
+}
+
+// %PARAM% LWT lwt 4:16:32
+// %PARAM% ALGO_TYPE alg 0:1:2:3
+
+using types            = c2h::type_list<std::uint8_t, std::uint16_t, std::int32_t, std::int64_t>;
+using items_per_thread = c2h::enum_type_list<int, 1, 4, 7>;
+using logical_warp_threads = c2h::enum_type_list<int, LWT>; // single warp size, taken from LWT
+using algorithms           = c2h::enum_type_list<cub::WarpLoadAlgorithm,
+                                       cub::WarpLoadAlgorithm::WARP_LOAD_DIRECT,
+                                       cub::WarpLoadAlgorithm::WARP_LOAD_STRIPED,
+                                       cub::WarpLoadAlgorithm::WARP_LOAD_TRANSPOSE,
+                                       cub::WarpLoadAlgorithm::WARP_LOAD_VECTORIZE>;
+using algorithm = // the one entry of `algorithms` selected by index ALGO_TYPE
+  c2h::enum_type_list<cub::WarpLoadAlgorithm, c2h::get<ALGO_TYPE, algorithms>::value>;
+
+using cache_load_modifier = c2h::enum_type_list<cub::CacheLoadModifier,
+                                                cub::CacheLoadModifier::LOAD_DEFAULT,
+                                                cub::CacheLoadModifier::LOAD_CA,
+                                                cub::CacheLoadModifier::LOAD_CG,
+                                                cub::CacheLoadModifier::LOAD_CS,
+                                                cub::CacheLoadModifier::LOAD_CV,
+                                                cub::CacheLoadModifier::LOAD_LDG,
+                                                cub::CacheLoadModifier::LOAD_VOLATILE>;
+
+constexpr int guarded_load_tests_count = 30; // NOTE(review): presumably the repetition count of the guarded tests; usage is outside this view
+
+// Picks how many logical warps a test launches: 2 when the logical warp size is
+// the architectural warp size or a power of two, otherwise 1.
+template <int logical_warp_threads>
+struct total_warps_t
+{
+private:
+  static constexpr bool matches_arch = (CUB_WARP_THREADS(0) == logical_warp_threads);
+  static constexpr bool single_bit   = (0 == (logical_warp_threads & (logical_warp_threads - 1)));
+
+public:
+  static constexpr int value() { return (matches_arch || single_bit) ? 2 : 1; }
+};
+
+template <class TestType>
+struct params_t // compile-time view of one CUB_TEST parameter combination
+{
+  using type = typename c2h::get<0, TestType>;
+  // Entries 1..3 follow the CUB_TEST type-list order: warp size, items per thread, algorithm.
+  static constexpr int logical_warp_threads         = c2h::get<1, TestType>::value;
+  static constexpr int items_per_thread             = c2h::get<2, TestType>::value;
+  static constexpr cub::WarpLoadAlgorithm algorithm = c2h::get<3, TestType>::value;
+  static constexpr int total_warps                  = total_warps_t<logical_warp_threads>::value();
+  static constexpr int tile_size                    = logical_warp_threads * items_per_thread; // items per logical warp
+  static constexpr int total_item_count             = total_warps * tile_size; // items across all warps
+};
+
+CUB_TEST("Warp load guarded range works with pointer",
+         "[load][warp]",
+         types,
+         logical_warp_threads,
+         items_per_thread,
+         algorithm)
+{
+  using params     = params_t<TestType>;
+  using type       = typename params::type;
+  using delegate_t = guarded_load_t<params::algorithm, params::logical_warp_threads, type>;
+  // valid_items comes from random(0, tile_size - 1); the same number cast to `type` is oob_default.
+  const int valid_items =
+    GENERATE_COPY(take(guarded_load_tests_count, random(0, params::tile_size - 1)));
+  const auto oob_default = static_cast<type>(valid_items);
+
+  auto d_in = generate_input<params::algorithm,
+                             params::logical_warp_threads,
+                             params::items_per_thread,
+                             params::total_warps,
+                             type>();
+  thrust::device_vector<int> d_error_counter(1, 0);
+  // The input is handed to warp_load as a raw device pointer; the counter starts at zero.
+  warp_load<params::algorithm,
+            params::logical_warp_threads,
+            params::items_per_thread,
+            params::total_warps,
+            type>(thrust::raw_pointer_cast(d_in.data()),
+                  delegate_t{valid_items, oob_default},
+                  thrust::raw_pointer_cast(d_error_counter.data()));
+  // The test passes only if the device-side counter is still zero afterwards.
+  const int num_errors           = d_error_counter[0];
+  const int expected_error_count = 0;
+  REQUIRE(num_errors == expected_error_count);
+}
+
+CUB_TEST("Warp load guarded range works with cache modified iterator",
+         "[load][warp]",
+         types,
+         logical_warp_threads,
+         items_per_thread,
+         algorithm,
+         cache_load_modifier)
+{
+  using params     = params_t<TestType>;
+  using type       = typename params::type;
+  using delegate_t = guarded_load_t<params::algorithm, params::logical_warp_threads, type>;
+  constexpr cub::CacheLoadModifier load_modifier = c2h::get<4, TestType>::value;
+  // valid_items comes from random(0, tile_size - 1); the same number cast to `type` is oob_default.
+  const int valid_items =
+    GENERATE_COPY(take(guarded_load_tests_count, random(0, params::tile_size - 1)));
+  const auto oob_default = static_cast<type>(valid_items);
+
+  auto d_in = generate_input<params::algorithm,
+                             params::logical_warp_threads,
+                             params::items_per_thread,
+                             params::total_warps,
+                             type>();
+  auto in_it =
+    cub::CacheModifiedInputIterator<load_modifier, type>(thrust::raw_pointer_cast(d_in.data()));
+  thrust::device_vector<int> d_error_counter(1, 0);
+  // The input reaches warp_load through the cache-modified iterator instead of a raw pointer.
+  warp_load<params::algorithm,
+            params::logical_warp_threads,
+            params::items_per_thread,
+            params::total_warps,
+            type>(in_it,
+                  delegate_t{valid_items, oob_default},
+                  thrust::raw_pointer_cast(d_error_counter.data()));
+  // Copy the counter into a host int (not a device_reference proxy) before asserting on it.
+  const int num_errors           = d_error_counter[0];
+  const int expected_error_count = 0;
+  REQUIRE(num_errors == expected_error_count);
+}
+
+CUB_TEST("Warp load unguarded range works with pointer",
+         "[load][warp]",
+         types,
+         logical_warp_threads,
+         items_per_thread,
+         algorithm)
+{
+  using params     = params_t<TestType>;
+  using type       = typename params::type;
+  using delegate_t = unguarded_load_t;
+  // Unguarded variant: the delegate carries no valid-item count or out-of-bounds value.
+  auto d_in = generate_input<params::algorithm,
+                             params::logical_warp_threads,
+                             params::items_per_thread,
+                             params::total_warps,
+                             type>();
+  thrust::device_vector<int> d_error_counter(1, 0);
+  // The input is handed to warp_load as a raw device pointer; the counter starts at zero.
+  warp_load<params::algorithm,
+            params::logical_warp_threads,
+            params::items_per_thread,
+            params::total_warps,
+            type>(thrust::raw_pointer_cast(d_in.data()),
+                  delegate_t{},
+                  thrust::raw_pointer_cast(d_error_counter.data()));
+  // Copy the counter into a host int (not a device_reference proxy) before asserting on it.
+  const int num_errors           = d_error_counter[0];
+  const int expected_error_count = 0;
+  REQUIRE(num_errors == expected_error_count);
+}
+
+CUB_TEST("Warp load unguarded range works with cache modified iterator",
+         "[load][warp]",
+         types,
+         logical_warp_threads,
+         items_per_thread,
+         algorithm,
+         cache_load_modifier)
+{
+  using params                                   = params_t<TestType>;
+  using type                                     = typename params::type;
+  using delegate_t                               = unguarded_load_t;
+  constexpr cub::CacheLoadModifier load_modifier = c2h::get<4, TestType>::value;
+  // Unguarded variant: the delegate carries no valid-item count or out-of-bounds value.
+  auto d_in = generate_input<params::algorithm,
+                             params::logical_warp_threads,
+                             params::items_per_thread,
+                             params::total_warps,
+                             type>();
+  auto in_it =
+    cub::CacheModifiedInputIterator<load_modifier, type>(thrust::raw_pointer_cast(d_in.data()));
+  thrust::device_vector<int> d_error_counter(1, 0);
+  // The input reaches warp_load through the cache-modified iterator instead of a raw pointer.
+  warp_load<params::algorithm,
+            params::logical_warp_threads,
+            params::items_per_thread,
+            params::total_warps,
+            type>(in_it, delegate_t{}, thrust::raw_pointer_cast(d_error_counter.data()));
+  // Copy the counter into a host int (not a device_reference proxy) before asserting on it.
+  const int num_errors           = d_error_counter[0];
+  const int expected_error_count = 0;
+  REQUIRE(num_errors == expected_error_count);
+}
diff --git a/include/cub/test/catch2_test_warp_mask.cu b/include/cub/test/catch2_test_warp_mask.cu
new file mode 100644
index 0000000..cf9450c
--- /dev/null
+++ b/include/cub/test/catch2_test_warp_mask.cu
@@ -0,0 +1,108 @@
+/******************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#include <cub/util_ptx.cuh>
+
+// Has to go after all cub headers. Otherwise, this test won't catch unused
+// variables in cub kernels.
+#include "catch2_test_helper.h"
+
+// Logical warps tested per size: CUB_WARP_THREADS(0) / size for power-of-two sizes, else 1.
+template <int logical_warp_threads>
+struct total_warps_t
+{
+private:
+  static constexpr bool fits_evenly         = cub::PowerOfTwo<logical_warp_threads>::VALUE;
+  static constexpr unsigned int per_hw_warp = CUB_WARP_THREADS(0) / logical_warp_threads;
+
+public:
+  static constexpr unsigned int value() { return fits_evenly ? per_hw_warp : 1; }
+};
+// True when bit `lane` of member_mask is set; 1u keeps the lane-31 shift out of int's sign bit.
+bool is_lane_involved(unsigned int member_mask, unsigned int lane)
+{
+  return (member_mask & (1u << lane)) != 0u;
+}
+
+using logical_warp_threads      = c2h::iota<1, 32>; // NOTE(review): presumably sizes 1..32 - confirm c2h::iota bounds
+using power_of_two_warp_threads = c2h::enum_type_list<int, 1, 2, 4, 8, 16, 32>; // power-of-two sizes only
+
+CUB_TEST("Warp mask ignores lanes before current logical warp",
+         "[mask][warp]",
+         power_of_two_warp_threads)
+{
+  constexpr int logical_warp_thread  = c2h::get<0, TestType>::value;
+  constexpr unsigned int total_warps = total_warps_t<logical_warp_thread>::value();
+  // For every logical warp, no lane below the warp's first lane may be set in its mask.
+  for (unsigned int warp_id = 0; warp_id < total_warps; warp_id++)
+  {
+    const unsigned int warp_mask  = cub::WarpMask<logical_warp_thread>(warp_id);
+    const unsigned int warp_begin = logical_warp_thread * warp_id;
+    // Lanes [0, warp_begin) precede this logical warp.
+    for (unsigned int prev_warp_lane = 0; prev_warp_lane < warp_begin; prev_warp_lane++)
+    {
+      REQUIRE_FALSE(is_lane_involved(warp_mask, prev_warp_lane));
+    }
+  }
+}
+
+CUB_TEST("Warp mask involves lanes of current logical warp", "[mask][warp]", logical_warp_threads)
+{
+  constexpr int logical_warp_thread  = c2h::get<0, TestType>::value;
+  constexpr unsigned int total_warps = total_warps_t<logical_warp_thread>::value();
+  // For every logical warp, each of its own lanes must be set in its mask.
+  for (unsigned int warp_id = 0; warp_id < total_warps; warp_id++)
+  {
+    const unsigned int warp_mask  = cub::WarpMask<logical_warp_thread>(warp_id);
+    const unsigned int warp_begin = logical_warp_thread * warp_id;
+    const unsigned int warp_end   = warp_begin + logical_warp_thread;
+    // Lanes [warp_begin, warp_end) form this logical warp.
+    for (unsigned int warp_lane = warp_begin; warp_lane < warp_end; warp_lane++)
+    {
+      REQUIRE(is_lane_involved(warp_mask, warp_lane));
+    }
+  }
+}
+
+CUB_TEST("Warp mask ignores lanes after current logical warp", "[mask][warp]", logical_warp_threads)
+{
+  constexpr int logical_warp_thread  = c2h::get<0, TestType>::value;
+  constexpr unsigned int total_warps = total_warps_t<logical_warp_thread>::value();
+  // For every logical warp, no lane at or above the warp's end may be set in its mask.
+  for (unsigned int warp_id = 0; warp_id < total_warps; warp_id++)
+  {
+    const unsigned int warp_mask  = cub::WarpMask<logical_warp_thread>(warp_id);
+    const unsigned int warp_begin = logical_warp_thread * warp_id;
+    const unsigned int warp_end   = warp_begin + logical_warp_thread;
+    // Lanes [warp_end, CUB_WARP_THREADS(0)) follow this logical warp.
+    for (unsigned int post_warp_lane = warp_end; post_warp_lane < CUB_WARP_THREADS(0);
+         post_warp_lane++)
+    {
+      REQUIRE_FALSE(is_lane_involved(warp_mask, post_warp_lane));
+    }
+  }
+}
diff --git a/include/cub/test/catch2_test_warp_merge_sort.cu b/include/cub/test/catch2_test_warp_merge_sort.cu
new file mode 100644
index 0000000..0df4905
--- /dev/null
+++ b/include/cub/test/catch2_test_warp_merge_sort.cu
@@ -0,0 +1,594 @@
+/******************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#include <cub/util_macro.cuh>
+#include <cub/util_ptx.cuh>
+#include <cub/warp/warp_merge_sort.cuh>
+
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include <thrust/iterator/constant_iterator.h>
+
+#include <algorithm>
+
+// Has to go after all cub headers. Otherwise, this test won't catch unused
+// variables in cub kernels.
+#include "c2h/custom_type.cuh"
+#include "catch2_test_helper.h"
+
+// Comparison functor passed to every Sort/StableSort call in this file.
+// Orders two values of the same type through their operator<.
+struct CustomLess
+{
+  template <typename T>
+  __device__ __host__ bool operator()(const T &left, const T &right)
+  { return left < right; }
+};
+
+/**
+ * @brief Keys-only test kernel: each logical warp loads its tile, lets `action` drive a
+ *        cub::WarpMergeSort, then stores the tile with invalid slots set to oob_default.
+ */
+template <int ITEMS_PER_THREAD,
+          int LOGICAL_WARP_THREADS,
+          int TOTAL_WARPS,
+          typename T,
+          typename SegmentSizeItT,
+          typename ActionT>
+__global__ void
+warp_merge_sort_kernel(T *in, T *out, SegmentSizeItT segment_sizes, T oob_default, ActionT action)
+{
+  using sorter_t  = cub::WarpMergeSort<T, ITEMS_PER_THREAD, LOGICAL_WARP_THREADS>;
+  using scratch_t = typename sorter_t::TempStorage;
+
+  // Logical warp this thread belongs to
+  const int warp_id = static_cast<int>(threadIdx.x) / LOGICAL_WARP_THREADS;
+
+  // Threads past the tested warps leave early (partially finished CTA)
+  if (warp_id >= TOTAL_WARPS)
+  {
+    return;
+  }
+
+  // Per-thread items plus one shared scratch slot per logical warp
+  T items[ITEMS_PER_THREAD];
+  __shared__ scratch_t scratch[TOTAL_WARPS];
+
+  // Warp-scope sorter bound to this warp's scratch slot
+  sorter_t warp_sort(scratch[warp_id]);
+
+  const int valid_items = segment_sizes[warp_id];
+  const int tile_begin  = LOGICAL_WARP_THREADS * ITEMS_PER_THREAD * warp_id;
+  const int first_item  = tile_begin + warp_sort.get_linear_tid() * ITEMS_PER_THREAD;
+
+  // Load this thread's consecutive items
+  for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+  {
+    items[i] = in[first_item + i];
+  }
+  cub::WARP_SYNC(warp_sort.get_member_mask());
+
+  // Run merge sort test
+  action(warp_sort, items, valid_items, oob_default);
+
+  // Store items; tile positions at or beyond valid_items receive oob_default
+  for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+  {
+    const int pos       = first_item + i;
+    const bool in_range = (pos - tile_begin) < valid_items;
+    out[pos]            = in_range ? items[i] : oob_default;
+  }
+}
+
+/**
+ * @brief Key-value test kernel: each logical warp loads its keys and values, lets `action`
+ *        drive a cub::WarpMergeSort, then stores both with invalid slots reset to defaults.
+ */
+template <int ITEMS_PER_THREAD,
+          int LOGICAL_WARP_THREADS,
+          int TOTAL_WARPS,
+          typename KeyT,
+          typename ValueT,
+          typename SegmentSizeItT,
+          typename ActionT>
+__global__ void warp_merge_sort_kernel(KeyT *keys_in,
+                                       KeyT *keys_out,
+                                       ValueT *values_in,
+                                       ValueT *values_out,
+                                       SegmentSizeItT segment_sizes,
+                                       KeyT oob_default,
+                                       ActionT action)
+{
+  using sorter_t  = cub::WarpMergeSort<KeyT, ITEMS_PER_THREAD, LOGICAL_WARP_THREADS, ValueT>;
+  using scratch_t = typename sorter_t::TempStorage;
+
+  // Linear thread index from cub::RowMajorTid and the logical warp it falls into
+  const int tid     = cub::RowMajorTid(blockDim.x, blockDim.y, blockDim.z);
+  const int warp_id = tid / LOGICAL_WARP_THREADS;
+
+  // Threads past the tested warps leave early (partially finished CTA)
+  if (warp_id >= TOTAL_WARPS)
+  {
+    return;
+  }
+
+  // Per-thread keys/values plus one shared scratch slot per logical warp
+  KeyT keys[ITEMS_PER_THREAD];
+  ValueT values[ITEMS_PER_THREAD];
+  __shared__ scratch_t scratch[TOTAL_WARPS];
+
+  // Warp-scope sorter bound to this warp's scratch slot
+  sorter_t warp_sort(scratch[warp_id]);
+
+  const int valid_items = segment_sizes[warp_id];
+  const int tile_begin  = LOGICAL_WARP_THREADS * ITEMS_PER_THREAD * warp_id;
+  const int first_item  = tile_begin + warp_sort.get_linear_tid() * ITEMS_PER_THREAD;
+
+  // Load this thread's consecutive key/value pairs
+  for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+  {
+    const int pos = first_item + i;
+    keys[i]       = keys_in[pos];
+    values[i]     = values_in[pos];
+  }
+  cub::WARP_SYNC(warp_sort.get_member_mask());
+
+  // Run merge sort test
+  action(warp_sort, keys, values, valid_items, oob_default);
+
+  // Store results; tile positions at or beyond valid_items get oob_default / ValueT{}
+  for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+  {
+    const int pos       = first_item + i;
+    const bool in_range = (pos - tile_begin) < valid_items;
+    keys_out[pos]       = in_range ? keys[i] : oob_default;
+    values_out[pos]     = in_range ? values[i] : ValueT{};
+  }
+}
+
+// -----------------------------------------------------------
+// Dimensions being instantiated:
+// {full,partial} x {stable, 'unstable'} x {keys, kv-pairs}
+// -----------------------------------------------------------
+
+/**
+ * @brief Keys-only action that forwards to WarpMergeSort::StableSort over the full tile
+ */
+struct warp_stable_sort_keys_t
+{
+  template <typename T, int ITEMS_PER_THREAD, typename WarpSortT>
+  __device__ void operator()(WarpSortT &sorter,
+                             T (&items)[ITEMS_PER_THREAD],
+                             int /*valid_items*/,
+                             T /*oob_default*/) const
+  {
+    sorter.StableSort(items, CustomLess{});
+  }
+};
+
+/**
+ * @brief Keys-only action that forwards to the partial-tile WarpMergeSort::StableSort overload
+ */
+struct warp_partial_stable_sort_keys_t
+{
+  template <typename T, int ITEMS_PER_THREAD, typename WarpSortT>
+  __device__ void operator()(WarpSortT &sorter,
+                             T (&items)[ITEMS_PER_THREAD],
+                             int valid_items,
+                             T oob_default) const
+  {
+    sorter.StableSort(items, CustomLess{}, valid_items, oob_default);
+  }
+};
+
+/**
+ * @brief Keys-only action that forwards to WarpMergeSort::Sort over the full tile
+ */
+struct warp_sort_keys_t
+{
+  template <typename T, int ITEMS_PER_THREAD, typename WarpSortT>
+  __device__ void operator()(WarpSortT &sorter,
+                             T (&items)[ITEMS_PER_THREAD],
+                             int /*valid_items*/,
+                             T /*oob_default*/) const
+  {
+    sorter.Sort(items, CustomLess{});
+  }
+};
+
+/**
+ * @brief Delegate wrapper for partial WarpMergeSort::Sort keys-only
+ */
+struct warp_partial_sort_keys_t
+{
+  template <typename T, int ITEMS_PER_THREAD, typename WarpSortT>
+  __device__ void operator()(WarpSortT &warp_sort,
+                             T (&thread_data)[ITEMS_PER_THREAD],
+                             int valid_items,
+                             T oob_default) const
+  {
+    warp_sort.Sort(thread_data, CustomLess{}, valid_items, oob_default);
+  }
+};
+
+/**
+ * @brief Key-value action that forwards to WarpMergeSort::StableSort over the full tile
+ */
+struct warp_stable_sort_pairs_t
+{
+  template <typename KeyT, typename ValueT, int ITEMS_PER_THREAD, typename WarpSortT>
+  __device__ void operator()(WarpSortT &sorter,
+                             KeyT (&keys)[ITEMS_PER_THREAD],
+                             ValueT (&values)[ITEMS_PER_THREAD],
+                             int /*valid_items*/,
+                             KeyT /*oob_default*/) const
+  {
+    sorter.StableSort(keys, values, CustomLess{});
+  }
+};
+
+/**
+ * @brief Key-value action that forwards to the partial-tile WarpMergeSort::StableSort overload
+ */
+struct warp_partial_stable_sort_pairs_t
+{
+  template <typename KeyT, typename ValueT, int ITEMS_PER_THREAD, typename WarpSortT>
+  __device__ void operator()(WarpSortT &sorter,
+                             KeyT (&keys)[ITEMS_PER_THREAD],
+                             ValueT (&values)[ITEMS_PER_THREAD],
+                             int valid_items,
+                             KeyT oob_default) const
+  {
+    sorter.StableSort(keys, values, CustomLess{}, valid_items, oob_default);
+  }
+};
+
+/**
+ * @brief Key-value action that forwards to WarpMergeSort::Sort over the full tile
+ */
+struct warp_sort_pairs_t
+{
+  template <typename KeyT, typename ValueT, int ITEMS_PER_THREAD, typename WarpSortT>
+  __device__ void operator()(WarpSortT &sorter,
+                             KeyT (&keys)[ITEMS_PER_THREAD],
+                             ValueT (&values)[ITEMS_PER_THREAD],
+                             int /*valid_items*/,
+                             KeyT /*oob_default*/) const
+  {
+    sorter.Sort(keys, values, CustomLess{});
+  }
+};
+
+/**
+ * @brief Delegate wrapper for partial WarpMergeSort::Sort key-value pairs
+ */
+struct warp_partial_sort_pairs_t
+{
+  template <typename KeyT, typename ValueT, int ITEMS_PER_THREAD, typename WarpSortT>
+  __device__ void operator()(WarpSortT &warp_sort,
+                             KeyT (&keys)[ITEMS_PER_THREAD],
+                             ValueT (&values)[ITEMS_PER_THREAD],
+                             int valid_items,
+                             KeyT oob_default) const
+  {
+    warp_sort.Sort(keys, values, CustomLess{}, valid_items, oob_default);
+  }
+};
+
+/**
+ * @brief Runs the keys-only kernel on one block of LOGICAL_WARP_THREADS * TOTAL_WARPS threads
+ */
+template <int ITEMS_PER_THREAD,
+          int LOGICAL_WARP_THREADS,
+          int TOTAL_WARPS,
+          typename T,
+          typename SegmentSizesItT,
+          typename ActionT>
+void warp_merge_sort(thrust::device_vector<T> &in,
+                     thrust::device_vector<T> &out,
+                     SegmentSizesItT segment_sizes,
+                     T oob_default,
+                     ActionT action)
+{
+  warp_merge_sort_kernel<ITEMS_PER_THREAD, LOGICAL_WARP_THREADS, TOTAL_WARPS>
+    <<<1, LOGICAL_WARP_THREADS * TOTAL_WARPS>>>(thrust::raw_pointer_cast(in.data()),
+                                                thrust::raw_pointer_cast(out.data()),
+                                                segment_sizes,
+                                                oob_default,
+                                                action);
+  // Fail the test if the launch or the subsequent device synchronization reports an error.
+  REQUIRE(cudaSuccess == cudaPeekAtLastError());
+  REQUIRE(cudaSuccess == cudaDeviceSynchronize());
+}
+
+/**
+ * @brief Runs the key-value kernel on one block of LOGICAL_WARP_THREADS * TOTAL_WARPS threads
+ */
+template <int ITEMS_PER_THREAD,
+          int LOGICAL_WARP_THREADS,
+          int TOTAL_WARPS,
+          typename KeyT,
+          typename ValueT,
+          typename SegmentSizesItT,
+          typename ActionT>
+void warp_merge_sort(thrust::device_vector<KeyT> &keys_in,
+                     thrust::device_vector<KeyT> &keys_out,
+                     thrust::device_vector<ValueT> &values_in,
+                     thrust::device_vector<ValueT> &values_out,
+                     SegmentSizesItT segment_sizes,
+                     KeyT oob_default,
+                     ActionT action)
+{
+  warp_merge_sort_kernel<ITEMS_PER_THREAD, LOGICAL_WARP_THREADS, TOTAL_WARPS>
+    <<<1, LOGICAL_WARP_THREADS * TOTAL_WARPS>>>(thrust::raw_pointer_cast(keys_in.data()),
+                                                thrust::raw_pointer_cast(keys_out.data()),
+                                                thrust::raw_pointer_cast(values_in.data()),
+                                                thrust::raw_pointer_cast(values_out.data()),
+                                                segment_sizes,
+                                                oob_default,
+                                                action);
+  // Fail the test if the launch or the subsequent device synchronization reports an error.
+  REQUIRE(cudaSuccess == cudaPeekAtLastError());
+  REQUIRE(cudaSuccess == cudaDeviceSynchronize());
+}
+
+/**
+ * @brief Host reference: stable-sorts the first segment_sizes[i] items of each segment and
+ * overwrites the rest of that segment (up to logical_warp_items) with oob_default.
+ */
+template <typename RandomItT, typename SegmentSizeItT, typename T>
+void compute_host_reference(RandomItT h_data,
+                            SegmentSizeItT segment_sizes,
+                            unsigned int num_segments,
+                            T oob_default,
+                            int logical_warp_items)
+{
+  for (unsigned int seg = 0; seg != num_segments; ++seg, h_data += logical_warp_items)
+  {
+    const unsigned int valid = segment_sizes[seg];
+    const auto valid_end     = h_data + valid;
+    std::stable_sort(h_data, valid_end);
+    std::fill(valid_end, h_data + logical_warp_items, oob_default);
+  }
+}
+
+/**
+ * @brief Stability requirement of the sorting algorithm
+ */
+enum class stability
+{
+  stable,  // tests select the StableSort delegates
+  unstable // tests select the Sort delegates
+};
+
+// Key types to test: integer types plus a custom type (equality + lexicographical less-than)
+using custom_t =
+  c2h::custom_type_t<c2h::equal_comparable_t, c2h::lexicographical_less_comparable_t>;
+using key_types = c2h::type_list<std::uint8_t, std::int32_t, std::int64_t, custom_t>;
+
+// Value types paired with the keys in the key-value tests
+using value_types = c2h::type_list<std::int32_t, custom_t>;
+
+// Logical warp sizes to test
+using logical_warp_threads = c2h::enum_type_list<int, 32, 4>;
+
+// Number of items per thread to test
+using items_per_thread_list = c2h::enum_type_list<int, 1, 4, 7>;
+
+// Whether the sort is required to be stable or not
+using stability_list = c2h::enum_type_list<stability, stability::stable, stability::unstable>;
+
+template <typename TestType>
+struct params_t // compile-time view of one CUB_TEST parameter combination
+{
+  using type = typename c2h::get<0, TestType>;
+  // Entries 1..3 follow the CUB_TEST type-list order: warp size, items per thread, stability.
+  static constexpr int logical_warp_threads = c2h::get<1, TestType>::value;
+  static constexpr int items_per_thread     = c2h::get<2, TestType>::value;
+  static constexpr int logical_warp_items   = logical_warp_threads * items_per_thread;
+  static constexpr int total_warps          = 2; // every test launches two logical warps
+  static constexpr int tile_size            = items_per_thread * total_warps * logical_warp_threads;
+  static constexpr bool is_stable           = c2h::get<3, TestType>::value == stability::stable;
+};
+
+CUB_TEST("Warp sort on keys-only works",
+         "[sort][warp]",
+         key_types,
+         logical_warp_threads,
+         items_per_thread_list,
+         stability_list)
+{
+  using params = params_t<TestType>;
+  using type   = typename params::type;
+  using warp_sort_delegate =
+    cub::detail::conditional_t<params::is_stable, warp_stable_sort_keys_t, warp_sort_keys_t>;
+
+  // Prepare test data: generated keys; the constant segment size gives every warp a full tile
+  thrust::device_vector<type> d_in(params::tile_size);
+  thrust::device_vector<type> d_out(params::tile_size);
+  auto segment_sizes     = thrust::make_constant_iterator(params::logical_warp_items);
+  const auto oob_default = std::numeric_limits<type>::max();
+  c2h::gen(CUB_SEED(10), d_in);
+
+  // Run test with the stable or unstable delegate chosen by params::is_stable
+  warp_merge_sort<params::items_per_thread, params::logical_warp_threads, params::total_warps>(
+    d_in,
+    d_out,
+    segment_sizes,
+    oob_default,
+    warp_sort_delegate{});
+
+  // Prepare verification data: sort a host copy of the input one warp tile at a time
+  thrust::host_vector<type> h_in_out = d_in;
+  compute_host_reference(h_in_out.begin(),
+                         segment_sizes,
+                         params::total_warps,
+                         oob_default,
+                         params::logical_warp_items);
+
+  // Verify results: the device output must equal the host reference
+  REQUIRE(h_in_out == d_out);
+}
+
+CUB_TEST("Warp sort keys-only on partial warp-tile works",
+         "[sort][warp]",
+         key_types,
+         logical_warp_threads,
+         items_per_thread_list,
+         stability_list)
+{
+  using params             = params_t<TestType>;
+  using type               = typename params::type;
+  using warp_sort_delegate = cub::detail::
+    conditional_t<params::is_stable, warp_partial_stable_sort_keys_t, warp_partial_sort_keys_t>;
+
+  // Prepare test data: generated keys plus one generated segment size per warp
+  thrust::device_vector<type> d_in(params::tile_size);
+  thrust::device_vector<type> d_out(params::tile_size);
+  thrust::device_vector<int> d_segment_sizes(params::total_warps);
+  const auto oob_default = std::numeric_limits<type>::max();
+  c2h::gen(CUB_SEED(5), d_in);
+  c2h::gen(CUB_SEED(5), d_segment_sizes, 0, params::logical_warp_items);
+
+  // Run test with the stable or unstable partial delegate chosen by params::is_stable
+  warp_merge_sort<params::items_per_thread, params::logical_warp_threads, params::total_warps>(
+    d_in,
+    d_out,
+    d_segment_sizes.cbegin(),
+    oob_default,
+    warp_sort_delegate{});
+
+  // Prepare verification data: host copies of the input and of the segment sizes
+  thrust::host_vector<type> h_in_out     = d_in;
+  thrust::host_vector<int> segment_sizes = d_segment_sizes;
+  compute_host_reference(h_in_out.begin(),
+                         segment_sizes,
+                         params::total_warps,
+                         oob_default,
+                         params::logical_warp_items);
+
+  // Verify results: the device output must equal the host reference
+  REQUIRE(h_in_out == d_out);
+}
+
+CUB_TEST("Warp sort on keys-value pairs works",
+         "[sort][warp]",
+         key_types,
+         logical_warp_threads,
+         items_per_thread_list,
+         stability_list,
+         value_types)
+{
+  using params     = params_t<TestType>;
+  using key_type   = typename params::type;
+  using value_type = typename c2h::get<4, TestType>;
+  using warp_sort_delegate =
+    cub::detail::conditional_t<params::is_stable, warp_stable_sort_pairs_t, warp_sort_pairs_t>;
+
+  // Prepare test data: generated keys; the constant segment size gives every warp a full tile
+  thrust::device_vector<key_type> d_keys_in(params::tile_size);
+  thrust::device_vector<key_type> d_keys_out(params::tile_size);
+  thrust::device_vector<value_type> d_values_in(params::tile_size);
+  thrust::device_vector<value_type> d_values_out(params::tile_size);
+  auto segment_sizes     = thrust::make_constant_iterator(params::logical_warp_items);
+  const auto oob_default = std::numeric_limits<key_type>::max();
+  c2h::gen(CUB_SEED(10), d_keys_in);
+
+  // Run test with the delegate chosen by params::is_stable, so Sort is exercised as well
+  warp_merge_sort<params::items_per_thread, params::logical_warp_threads, params::total_warps>(
+    d_keys_in,
+    d_keys_out,
+    d_values_in,
+    d_values_out,
+    segment_sizes,
+    oob_default,
+    warp_sort_delegate{});
+
+  // Prepare verification data: sort zipped host copies of keys and values per warp tile
+  thrust::host_vector<key_type> h_keys_in_out     = d_keys_in;
+  thrust::host_vector<value_type> h_values_in_out = d_values_in;
+  auto cpu_kv_pairs = thrust::make_zip_iterator(h_keys_in_out.begin(), h_values_in_out.begin());
+  compute_host_reference(cpu_kv_pairs,
+                         segment_sizes,
+                         params::total_warps,
+                         thrust::make_tuple(oob_default, value_type{}),
+                         params::logical_warp_items);
+
+  // Verify results: device keys and values must equal the host reference
+  REQUIRE(h_keys_in_out == d_keys_out);
+  REQUIRE(h_values_in_out == d_values_out);
+}
+
+CUB_TEST("Warp sort on key-value pairs of a partial warp-tile works",
+         "[sort][warp]",
+         key_types,
+         logical_warp_threads,
+         items_per_thread_list,
+         stability_list,
+         value_types)
+{
+  using params             = params_t<TestType>;
+  using key_type           = typename params::type;
+  using value_type         = typename c2h::get<4, TestType>;
+  using warp_sort_delegate = cub::detail::
+    conditional_t<params::is_stable, warp_partial_stable_sort_pairs_t, warp_partial_sort_pairs_t>;
+
+  // Prepare test data: generated keys plus one generated segment size per warp
+  thrust::device_vector<key_type> d_keys_in(params::tile_size);
+  thrust::device_vector<key_type> d_keys_out(params::tile_size);
+  thrust::device_vector<value_type> d_values_in(params::tile_size);
+  thrust::device_vector<value_type> d_values_out(params::tile_size);
+  thrust::device_vector<int> d_segment_sizes(params::total_warps);
+  const auto oob_default = std::numeric_limits<key_type>::max();
+  c2h::gen(CUB_SEED(5), d_keys_in);
+  c2h::gen(CUB_SEED(5), d_segment_sizes, 0, params::logical_warp_items);
+
+  // Run test with the stable or unstable partial delegate chosen by params::is_stable
+  warp_merge_sort<params::items_per_thread, params::logical_warp_threads, params::total_warps>(
+    d_keys_in,
+    d_keys_out,
+    d_values_in,
+    d_values_out,
+    d_segment_sizes.cbegin(),
+    oob_default,
+    warp_sort_delegate{});
+
+  // Prepare verification data: zipped host copies of keys/values and the host segment sizes
+  thrust::host_vector<key_type> h_keys_in_out     = d_keys_in;
+  thrust::host_vector<value_type> h_values_in_out = d_values_in;
+  thrust::host_vector<int> segment_sizes          = d_segment_sizes;
+  auto cpu_kv_pairs = thrust::make_zip_iterator(h_keys_in_out.begin(), h_values_in_out.begin());
+  compute_host_reference(cpu_kv_pairs,
+                         segment_sizes,
+                         params::total_warps,
+                         thrust::make_tuple(oob_default, value_type{}),
+                         params::logical_warp_items);
+
+  // Verify results: device keys and values must equal the host reference
+  REQUIRE(h_keys_in_out == d_keys_out);
+  REQUIRE(h_values_in_out == d_values_out);
+}
diff --git a/include/cub/test/catch2_test_warp_reduce.cu b/include/cub/test/catch2_test_warp_reduce.cu
new file mode 100644
index 0000000..bfd3a6f
--- /dev/null
+++ b/include/cub/test/catch2_test_warp_reduce.cu
@@ -0,0 +1,608 @@
+/******************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#include <cub/util_macro.cuh>
+#include <cub/warp/warp_reduce.cuh>
+
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include <thrust/iterator/constant_iterator.h>
+
+#include <cuda/std/functional>
+#include <cuda/std/limits>
+#include <cuda/std/type_traits>
+
+// Has to go after all cub headers. Otherwise, this test won't catch unused
+// variables in cub kernels.
+#include "c2h/custom_type.cuh"
+#include "catch2_test_helper.h"
+
+template <int LOGICAL_WARP_THREADS, int TOTAL_WARPS, typename T, typename ActionT>
+__global__ void warp_reduce_kernel(T *in, T *out, ActionT action)
+{
+  // One cub::WarpReduce temporary-storage slot per logical warp of the block
+  using reducer_t = cub::WarpReduce<T, LOGICAL_WARP_THREADS>;
+  __shared__ typename reducer_t::TempStorage warp_storage[TOTAL_WARPS];
+
+  // Thread index (only threadIdx.x is consulted) and the logical warp it falls in
+  const int thread_idx   = threadIdx.x;
+  const int logical_warp = thread_idx / LOGICAL_WARP_THREADS;
+
+  // Each thread loads exactly one input item
+  T thread_data = in[thread_idx];
+
+  // Bind the reducer to this warp's storage and let the test action drive it
+  reducer_t warp_reduce(warp_storage[logical_warp]);
+  auto item_result = action(thread_idx, warp_reduce, thread_data);
+
+  // Every thread stores the value the action returned for it, not only the
+  // lane that holds a warp aggregate
+  out[thread_idx] = item_result;
+}
+
+/**
+ * @brief Test action for WarpReduce::Sum: lane 0 yields the sum, other lanes echo their input
+ */
+template <typename T>
+struct warp_sum_t
+{
+  template <int LOGICAL_WARP_THREADS>
+  __device__ T operator()(int linear_tid,
+                          cub::WarpReduce<T, LOGICAL_WARP_THREADS> &warp_reduce,
+                          T &thread_data) const
+  {
+    auto warp_total = warp_reduce.Sum(thread_data);
+    return ((linear_tid % LOGICAL_WARP_THREADS) != 0) ? thread_data : warp_total;
+  }
+};
+
+/**
+ * @brief Test action for the WarpReduce::Sum overload that also receives a valid-item count
+ */
+template <typename T>
+struct warp_sum_partial_t
+{
+  int num_valid; // forwarded to Sum as its second argument
+  template <int LOGICAL_WARP_THREADS>
+  __device__ __forceinline__ T operator()(int linear_tid,
+                                          cub::WarpReduce<T, LOGICAL_WARP_THREADS> &warp_reduce,
+                                          T &thread_data) const
+  {
+    auto partial_total = warp_reduce.Sum(thread_data, num_valid);
+    return ((linear_tid % LOGICAL_WARP_THREADS) != 0) ? thread_data : partial_total;
+  }
+};
+
+/**
+ * @brief Test action for WarpReduce::Reduce with a caller-supplied binary reduction operator
+ */
+template <typename T, typename ReductionOpT>
+struct warp_reduce_t
+{
+  ReductionOpT reduction_op; // operator handed to Reduce
+  template <int LOGICAL_WARP_THREADS>
+  __device__ __forceinline__ T operator()(int linear_tid,
+                                          cub::WarpReduce<T, LOGICAL_WARP_THREADS> &warp_reduce,
+                                          T &thread_data) const
+  {
+    auto warp_aggregate = warp_reduce.Reduce(thread_data, reduction_op);
+    return ((linear_tid % LOGICAL_WARP_THREADS) != 0) ? thread_data : warp_aggregate;
+  }
+};
+
+/**
+ * @brief Test action for the WarpReduce::Reduce overload that also receives a valid-item count
+ */
+template <typename T, typename ReductionOpT>
+struct warp_reduce_partial_t
+{
+  int num_valid;             // forwarded to Reduce as its third argument
+  ReductionOpT reduction_op; // operator handed to Reduce
+  template <int LOGICAL_WARP_THREADS>
+  __device__ T operator()(int linear_tid,
+                          cub::WarpReduce<T, LOGICAL_WARP_THREADS> &warp_reduce,
+                          T &thread_data) const
+  {
+    auto partial_aggregate = warp_reduce.Reduce(thread_data, reduction_op, num_valid);
+    return ((linear_tid % LOGICAL_WARP_THREADS) != 0) ? thread_data : partial_aggregate;
+  }
+};
+
+/**
+ * @brief Test action for WarpReduce::TailSegmentedSum driven by per-thread flags in d_flags
+ */
+template <typename T>
+struct warp_seg_sum_tail_t
+{
+  uint8_t *d_flags; // one flag per thread, indexed by linear_tid
+  template <int LOGICAL_WARP_THREADS>
+  __device__ T operator()(int linear_tid,
+                          cub::WarpReduce<T, LOGICAL_WARP_THREADS> &warp_reduce,
+                          T &thread_data) const
+  {
+    // A segment begins at lane 0 or right after a flagged item; a non-zero lane implies tid > 0
+    const bool begins_seg = (linear_tid % LOGICAL_WARP_THREADS == 0) || d_flags[linear_tid - 1];
+    auto segment_sum      = warp_reduce.TailSegmentedSum(thread_data, d_flags[linear_tid]);
+    return begins_seg ? segment_sum : thread_data;
+  }
+};
+
+/**
+ * @brief Test action for WarpReduce::HeadSegmentedSum driven by per-thread flags in d_flags
+ */
+template <typename T>
+struct warp_seg_sum_head_t
+{
+  uint8_t *d_flags; // one flag per thread, indexed by linear_tid
+  template <int LOGICAL_WARP_THREADS>
+  __device__ T operator()(int linear_tid,
+                          cub::WarpReduce<T, LOGICAL_WARP_THREADS> &warp_reduce,
+                          T &thread_data) const
+  {
+    const bool begins_seg = (linear_tid % LOGICAL_WARP_THREADS == 0) || d_flags[linear_tid];
+    auto segment_sum      = warp_reduce.HeadSegmentedSum(thread_data, d_flags[linear_tid]);
+    return begins_seg ? segment_sum : thread_data;
+  }
+};
+
+/**
+ * @brief Test action for WarpReduce::TailSegmentedReduce driven by per-thread flags in d_flags
+ */
+template <typename T, typename ReductionOpT>
+struct warp_seg_reduce_tail_t
+{
+  uint8_t *d_flags;          // one flag per thread, indexed by linear_tid
+  ReductionOpT reduction_op; // operator handed to TailSegmentedReduce
+  template <int LOGICAL_WARP_THREADS>
+  __device__ T operator()(int linear_tid,
+                          cub::WarpReduce<T, LOGICAL_WARP_THREADS> &warp_reduce,
+                          T &thread_data) const
+  {
+    // A segment begins at lane 0 or right after a flagged item; a non-zero lane implies tid > 0
+    const bool begins_seg = (linear_tid % LOGICAL_WARP_THREADS == 0) || d_flags[linear_tid - 1];
+    auto seg_agg = warp_reduce.TailSegmentedReduce(thread_data, d_flags[linear_tid], reduction_op);
+    return begins_seg ? seg_agg : thread_data;
+  }
+};
+
+/**
+ * @brief Test action for WarpReduce::HeadSegmentedReduce driven by per-thread flags in d_flags
+ */
+template <typename T, typename ReductionOpT>
+struct warp_seg_reduce_head_t
+{
+  uint8_t *d_flags;          // one flag per thread, indexed by linear_tid
+  ReductionOpT reduction_op; // operator handed to HeadSegmentedReduce
+  template <int LOGICAL_WARP_THREADS>
+  __device__ T operator()(int linear_tid,
+                          cub::WarpReduce<T, LOGICAL_WARP_THREADS> &warp_reduce,
+                          T &thread_data) const
+  {
+    const bool begins_seg = (linear_tid % LOGICAL_WARP_THREADS == 0) || d_flags[linear_tid];
+    auto seg_agg = warp_reduce.HeadSegmentedReduce(thread_data, d_flags[linear_tid], reduction_op);
+    return begins_seg ? seg_agg : thread_data;
+  }
+};
+
+/**
+ * @brief Launches warp_reduce_kernel as one block and requires the launch and the sync to succeed
+ */
+template <int LOGICAL_WARP_THREADS, int TOTAL_WARPS, typename T, typename ActionT>
+void warp_reduce(thrust::device_vector<T> &in, thrust::device_vector<T> &out, ActionT action)
+{
+  constexpr int block_threads = LOGICAL_WARP_THREADS * TOTAL_WARPS;
+  T *d_in_ptr                 = thrust::raw_pointer_cast(in.data());
+  T *d_out_ptr                = thrust::raw_pointer_cast(out.data());
+  warp_reduce_kernel<LOGICAL_WARP_THREADS, TOTAL_WARPS, T, ActionT>
+    <<<1, block_threads>>>(d_in_ptr, d_out_ptr, action);
+  REQUIRE(cudaSuccess == cudaPeekAtLastError());
+  REQUIRE(cudaSuccess == cudaDeviceSynchronize());
+}
+
+/**
+ * @brief Floating-point overload: checks the device results against the expectation with REQUIRE_APPROX_EQ.
+ */
+template <typename T,
+          typename ::cuda::std::enable_if<::cuda::std::is_floating_point<T>::value, int>::type = 0>
+void verify_results(const thrust::host_vector<T> &expected_data,
+                    const thrust::device_vector<T> &test_results)
+{
+  REQUIRE_APPROX_EQ(expected_data, test_results);
+}
+
+/**
+ * @brief Non-floating-point overload: requires the device results to compare equal to the expectation.
+ */
+template <typename T,
+          typename ::cuda::std::enable_if<!::cuda::std::is_floating_point<T>::value, int>::type = 0>
+void verify_results(const thrust::host_vector<T> &expected_data,
+                    const thrust::device_vector<T> &test_results)
+{
+  REQUIRE(expected_data == test_results);
+}
+
+enum class reduce_mode // tells compute_host_reference where segment aggregates are recorded
+{
+  all,        // passed by the full and partial warp tests together with all-false flags
+  partial,    // not referenced anywhere in this file (the partial-warp tests pass `all`)
+  head_flags, // a set flag marks the first item of a segment
+  tail_flags, // a set flag marks the last item of a segment
+};
+
+template <typename InputItT, typename FlagInputItT, typename ReductionOp, typename ResultOutItT>
+void compute_host_reference(reduce_mode mode,         // picks head- or tail-flag bookkeeping
+                            InputItT h_in,            // items, indexed warp_offset + lane
+                            FlagInputItT h_flags,     // segment flags, indexed like h_in
+                            int warps,                // number of logical warps to process
+                            int logical_warp_threads, // stride between consecutive warps
+                            int valid_warp_threads,   // leading lanes of each warp that are reduced
+                            ReductionOp reduction_op, // binary operator combining two items
+                            ResultOutItT h_data_out)  // receives aggregates; other slots untouched
+{
+  // Accumulate segments back to front (lane 0 of each warp is implicitly a segment head)
+  for (int warp = 0; warp < warps; ++warp)
+  {
+    int warp_offset = warp * logical_warp_threads;
+    int item_offset = warp_offset + valid_warp_threads - 1;
+
+    // Both running aggregates start at the last valid item of the warp
+    auto head_aggregate = h_in[item_offset];
+    auto tail_aggregate = h_in[item_offset];
+
+    if (mode != reduce_mode::tail_flags && h_flags[item_offset]) // flagged last item: one-item segment
+    {
+      h_data_out[item_offset] = head_aggregate;
+    }
+    item_offset--;
+
+    // Work backwards towards lane 0
+    while (item_offset >= warp_offset)
+    {
+      if (h_flags[item_offset + 1]) // next item is flagged, so head_aggregate restarts at this item
+      {
+        head_aggregate = h_in[item_offset];
+      }
+      else
+      {
+        head_aggregate = reduction_op(head_aggregate, h_in[item_offset]);
+      }
+
+      if (h_flags[item_offset])
+      {
+        if (mode == reduce_mode::head_flags) // flagged item receives the aggregate accumulated so far
+        {
+          h_data_out[item_offset] = head_aggregate;
+        }
+        else if (mode == reduce_mode::tail_flags) // items after the flag form a finished segment
+        {
+          h_data_out[item_offset + 1] = tail_aggregate;
+          tail_aggregate              = h_in[item_offset]; // flagged item ends the preceding segment
+        }
+      }
+      else
+      {
+        tail_aggregate = reduction_op(tail_aggregate, h_in[item_offset]);
+      }
+
+      item_offset--;
+    }
+
+    // Record the aggregate of the segment that begins at lane 0 of this warp
+    if (mode == reduce_mode::tail_flags)
+    {
+      h_data_out[warp_offset] = tail_aggregate;
+    }
+    else
+    {
+      h_data_out[warp_offset] = head_aggregate;
+    }
+  }
+}
+
+// Types for the sum tests: integers, a c2h custom type and CUDA vector types
+using custom_t       = c2h::custom_type_t<c2h::accumulateable_t,
+                                    c2h::equal_comparable_t,
+                                    c2h::lexicographical_less_comparable_t>;
+using full_type_list = c2h::type_list<std::uint8_t,
+                                      std::uint16_t,
+                                      std::int32_t,
+                                      std::int64_t,
+                                      custom_t,
+                                      ulonglong4,
+                                      uchar3,
+                                      short2>;
+// Integer-only list used by the tests that reduce with cub::Min
+using builtin_type_list = c2h::type_list<std::uint8_t, std::uint16_t, std::int32_t, std::int64_t>;
+
+// Logical warp sizes to test (includes non-power-of-two widths and a single-thread warp)
+using logical_warp_threads = c2h::enum_type_list<int, 32, 16, 9, 7, 1>;
+// Flag interpretations covered by the segmented tests
+using segmented_modes =
+  c2h::enum_type_list<reduce_mode, reduce_mode::head_flags, reduce_mode::tail_flags>;
+
+template <int logical_warp_threads>
+struct total_warps_t
+{
+private:
+  // Two warps if the width equals CUB_WARP_THREADS(0) or passes the power-of-two bit test, else one
+  static constexpr int two_warps   = 2;
+  static constexpr bool is_hw_warp = (CUB_WARP_THREADS(0) == logical_warp_threads);
+  static constexpr bool is_pow2    = (0 == (logical_warp_threads & (logical_warp_threads - 1)));
+  static constexpr int warp_count  = (is_hw_warp || is_pow2) ? two_warps : 1;
+public:
+  static constexpr int value() { return warp_count; }
+};
+
+template <typename TestType>
+struct params_t // unpacks the compile-time parameters of one CUB_TEST instantiation
+{
+  using type = typename c2h::get<0, TestType>; // item type under test
+
+  static constexpr int logical_warp_threads = c2h::get<1, TestType>::value; // threads per logical warp
+  static constexpr int total_warps          = total_warps_t<logical_warp_threads>::value();
+  static constexpr int tile_size            = total_warps * logical_warp_threads; // items == threads launched
+};
+
+CUB_TEST("Warp sum works", "[reduce][warp]", full_type_list, logical_warp_threads)
+{
+  using cfg    = params_t<TestType>;
+  using item_t = typename cfg::type;
+
+  // Random input for every thread of the tile; the whole logical warp takes part
+  thrust::device_vector<item_t> d_src(cfg::tile_size);
+  thrust::device_vector<item_t> d_dst(cfg::tile_size);
+  const auto lanes_in_use = cfg::logical_warp_threads;
+  c2h::gen(CUB_SEED(10), d_src);
+
+  // Sum each logical warp on the device
+  warp_reduce<cfg::logical_warp_threads, cfg::total_warps>(d_src, d_dst, warp_sum_t<item_t>{});
+
+  // The expectation starts as a copy of the input because lanes without an aggregate
+  // return their own item; all-false flags leave one segment per logical warp
+  thrust::host_vector<item_t> h_src = d_src;
+  thrust::host_vector<item_t> h_ref = h_src;
+  auto no_flags                     = thrust::make_constant_iterator(false);
+  compute_host_reference(reduce_mode::all,
+                         h_src,
+                         no_flags,
+                         cfg::total_warps,
+                         cfg::logical_warp_threads,
+                         lanes_in_use,
+                         ::cuda::std::plus<item_t>{},
+                         h_ref.begin());
+
+  verify_results(h_ref, d_dst);
+}
+
+CUB_TEST("Warp reduce works", "[reduce][warp]", builtin_type_list, logical_warp_threads)
+{
+  using cfg    = params_t<TestType>;
+  using item_t = typename cfg::type;
+  using min_t  = cub::Min;
+
+  // Random input for every thread of the tile; the whole logical warp takes part
+  thrust::device_vector<item_t> d_src(cfg::tile_size);
+  thrust::device_vector<item_t> d_dst(cfg::tile_size);
+  const auto lanes_in_use = cfg::logical_warp_threads;
+  c2h::gen(CUB_SEED(10), d_src);
+
+  // Min-reduce each logical warp on the device
+  const warp_reduce_t<item_t, min_t> device_action{min_t{}};
+  warp_reduce<cfg::logical_warp_threads, cfg::total_warps>(d_src, d_dst, device_action);
+
+  // The expectation starts as a copy of the input because lanes without an aggregate
+  // return their own item; all-false flags leave one segment per logical warp
+  thrust::host_vector<item_t> h_src = d_src;
+  thrust::host_vector<item_t> h_ref = h_src;
+  auto no_flags                     = thrust::make_constant_iterator(false);
+  compute_host_reference(reduce_mode::all,
+                         h_src,
+                         no_flags,
+                         cfg::total_warps,
+                         cfg::logical_warp_threads,
+                         lanes_in_use,
+                         min_t{},
+                         h_ref.begin());
+
+  // Device output must match the host reference; verify_results picks exact or
+  // approximate comparison from the item type
+  verify_results(h_ref, d_dst);
+}
+
+CUB_TEST("Warp sum on partial warp works", "[reduce][warp]", full_type_list, logical_warp_threads)
+{
+  using cfg    = params_t<TestType>;
+  using item_t = typename cfg::type;
+
+  // Random input for the whole tile; only the first lanes_in_use lanes of each warp count
+  thrust::device_vector<item_t> d_src(cfg::tile_size);
+  thrust::device_vector<item_t> d_dst(cfg::tile_size);
+  const int lanes_in_use = GENERATE_COPY(take(2, random(1, cfg::logical_warp_threads)));
+  c2h::gen(CUB_SEED(10), d_src);
+
+  // Sum the valid lanes of each logical warp on the device
+  const warp_sum_partial_t<item_t> device_action{lanes_in_use};
+  warp_reduce<cfg::logical_warp_threads, cfg::total_warps>(d_src, d_dst, device_action);
+
+  // The expectation starts as a copy of the input because lanes without an aggregate
+  // return their own item; all-false flags leave one segment per logical warp
+  thrust::host_vector<item_t> h_src = d_src;
+  thrust::host_vector<item_t> h_ref = h_src;
+  auto no_flags                     = thrust::make_constant_iterator(false);
+  compute_host_reference(reduce_mode::all,
+                         h_src,
+                         no_flags,
+                         cfg::total_warps,
+                         cfg::logical_warp_threads,
+                         lanes_in_use,
+                         ::cuda::std::plus<item_t>{},
+                         h_ref.begin());
+
+  // Device output must match the host reference; verify_results picks exact or
+  // approximate comparison from the item type
+  verify_results(h_ref, d_dst);
+}
+
+CUB_TEST("Warp reduce on partial warp works",
+         "[reduce][warp]",
+         builtin_type_list,
+         logical_warp_threads)
+{
+  using cfg    = params_t<TestType>;
+  using item_t = typename cfg::type;
+  using min_t  = cub::Min;
+
+  // Random input for the whole tile; only the first lanes_in_use lanes of each warp count
+  thrust::device_vector<item_t> d_src(cfg::tile_size);
+  thrust::device_vector<item_t> d_dst(cfg::tile_size);
+  const int lanes_in_use = GENERATE_COPY(take(2, random(1, cfg::logical_warp_threads)));
+  c2h::gen(CUB_SEED(10), d_src);
+
+  // Min-reduce the valid lanes of each logical warp on the device
+  const warp_reduce_partial_t<item_t, min_t> device_action{lanes_in_use, min_t{}};
+  warp_reduce<cfg::logical_warp_threads, cfg::total_warps>(d_src, d_dst, device_action);
+
+  // The expectation starts as a copy of the input because lanes without an aggregate
+  // return their own item; all-false flags leave one segment per logical warp
+  thrust::host_vector<item_t> h_src = d_src;
+  thrust::host_vector<item_t> h_ref = h_src;
+  auto no_flags                     = thrust::make_constant_iterator(false);
+  compute_host_reference(reduce_mode::all,
+                         h_src,
+                         no_flags,
+                         cfg::total_warps,
+                         cfg::logical_warp_threads,
+                         lanes_in_use,
+                         min_t{},
+                         h_ref.begin());
+
+  // Device output must match the host reference; verify_results picks exact or
+  // approximate comparison from the item type
+  verify_results(h_ref, d_dst);
+}
+
+CUB_TEST("Warp segmented sum works",
+         "[reduce][warp]",
+         full_type_list,
+         logical_warp_threads,
+         segmented_modes)
+{
+  using cfg    = params_t<TestType>;
+  using item_t = typename cfg::type;
+
+  // The third test parameter selects head- or tail-flag segmentation
+  constexpr auto flag_mode = c2h::get<2, TestType>::value;
+  static_assert(flag_mode == reduce_mode::tail_flags ||
+                  flag_mode == reduce_mode::head_flags,
+                "Segmented tests must either be head or tail flags");
+  using action_t = cub::detail::conditional_t<(flag_mode == reduce_mode::tail_flags),
+                                              warp_seg_sum_tail_t<item_t>,
+                                              warp_seg_sum_head_t<item_t>>;
+
+  // Random items plus one random flag per item (flags generated with bounds 0 and 2)
+  thrust::device_vector<item_t> d_src(cfg::tile_size);
+  thrust::device_vector<uint8_t> d_flags(cfg::tile_size);
+  thrust::device_vector<item_t> d_dst(cfg::tile_size);
+  const auto lanes_in_use    = cfg::logical_warp_threads;
+  constexpr uint8_t flag_min = 0;
+  constexpr uint8_t flag_max = 2;
+  c2h::gen(CUB_SEED(5), d_src);
+  c2h::gen(CUB_SEED(5), d_flags, flag_min, flag_max);
+
+  // Run the segmented sum on the device; the action reads the flags through a raw pointer
+  const action_t device_action{thrust::raw_pointer_cast(d_flags.data())};
+  warp_reduce<cfg::logical_warp_threads, cfg::total_warps>(d_src, d_dst, device_action);
+
+  // The expectation starts as a copy of the input; the host reference overwrites
+  // only the slots that hold a segment aggregate
+  thrust::host_vector<item_t> h_src    = d_src;
+  thrust::host_vector<uint8_t> h_flags = d_flags;
+  thrust::host_vector<item_t> h_ref    = h_src;
+  compute_host_reference(flag_mode,
+                         h_src,
+                         h_flags,
+                         cfg::total_warps,
+                         cfg::logical_warp_threads,
+                         lanes_in_use,
+                         ::cuda::std::plus<item_t>{},
+                         h_ref.begin());
+
+  // Device output must match the host reference
+  verify_results(h_ref, d_dst);
+}
+
+CUB_TEST("Warp segmented reduction works",
+         "[reduce][warp]",
+         builtin_type_list,
+         logical_warp_threads,
+         segmented_modes)
+{
+  using cfg    = params_t<TestType>;
+  using item_t = typename cfg::type;
+  using min_t  = cub::Min;
+
+  // The third test parameter selects head- or tail-flag segmentation
+  constexpr auto flag_mode = c2h::get<2, TestType>::value;
+  static_assert(flag_mode == reduce_mode::tail_flags ||
+                  flag_mode == reduce_mode::head_flags,
+                "Segmented tests must either be head or tail flags");
+  using action_t =
+    cub::detail::conditional_t<(flag_mode == reduce_mode::tail_flags),
+                               warp_seg_reduce_tail_t<item_t, min_t>,
+                               warp_seg_reduce_head_t<item_t, min_t>>;
+
+  // Random items plus one random flag per item (flags generated with bounds 0 and 2)
+  thrust::device_vector<item_t> d_src(cfg::tile_size);
+  thrust::device_vector<uint8_t> d_flags(cfg::tile_size);
+  thrust::device_vector<item_t> d_dst(cfg::tile_size);
+  const auto lanes_in_use    = cfg::logical_warp_threads;
+  constexpr uint8_t flag_min = 0;
+  constexpr uint8_t flag_max = 2;
+  c2h::gen(CUB_SEED(5), d_src);
+  c2h::gen(CUB_SEED(5), d_flags, flag_min, flag_max);
+
+  // Run the segmented min-reduction on the device; flags are read through a raw pointer
+  const action_t device_action{thrust::raw_pointer_cast(d_flags.data()), min_t{}};
+  warp_reduce<cfg::logical_warp_threads, cfg::total_warps>(d_src, d_dst, device_action);
+
+  // The expectation starts as a copy of the input; the host reference overwrites
+  // only the slots that hold a segment aggregate
+  thrust::host_vector<item_t> h_src    = d_src;
+  thrust::host_vector<uint8_t> h_flags = d_flags;
+  thrust::host_vector<item_t> h_ref    = h_src;
+  compute_host_reference(flag_mode,
+                         h_src,
+                         h_flags,
+                         cfg::total_warps,
+                         cfg::logical_warp_threads,
+                         lanes_in_use,
+                         min_t{},
+                         h_ref.begin());
+
+  // Device output must match the host reference
+  verify_results(h_ref, d_dst);
+}
\ No newline at end of file
diff --git a/include/cub/test/catch2_test_warp_scan.cu b/include/cub/test/catch2_test_warp_scan.cu
new file mode 100644
index 0000000..5a5ba2b
--- /dev/null
+++ b/include/cub/test/catch2_test_warp_scan.cu
@@ -0,0 +1,689 @@
+/******************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#include <cub/util_macro.cuh>
+#include <cub/warp/warp_scan.cuh>
+
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+
+// Has to go after all cub headers. Otherwise, this test won't catch unused
+// variables in cub kernels.
+#include "catch2_test_helper.h"
+
+template <int LOGICAL_WARP_THREADS, int TOTAL_WARPS, class T, class ActionT>
+__global__ void warp_combine_scan_kernel(T *in, T *inclusive_out, T *exclusive_out, ActionT action)
+{
+  // One cub::WarpScan temporary-storage slot per logical warp of the block
+  using scanner_t = cub::WarpScan<T, LOGICAL_WARP_THREADS>;
+  __shared__ typename scanner_t::TempStorage warp_storage[TOTAL_WARPS];
+
+  // Thread index within the block and the logical warp it falls in
+  const int thread_idx   = cub::RowMajorTid(blockDim.x, blockDim.y, blockDim.z);
+  const int logical_warp = thread_idx / LOGICAL_WARP_THREADS;
+
+  // Each thread loads one item; both outputs are expected to be filled by the action
+  T thread_data = in[thread_idx];
+  T inclusive_item;
+  T exclusive_item;
+
+  scanner_t scan(warp_storage[logical_warp]);
+  action(scan, thread_data, inclusive_item, exclusive_item);
+
+  // Store this thread's inclusive and exclusive items
+  inclusive_out[thread_idx] = inclusive_item;
+  exclusive_out[thread_idx] = exclusive_item;
+}
+
+template <int LOGICAL_WARP_THREADS, int TOTAL_WARPS, class T, class ActionT>
+void warp_combine_scan(thrust::device_vector<T> &in,
+                       thrust::device_vector<T> &inclusive_out,
+                       thrust::device_vector<T> &exclusive_out,
+                       ActionT action)
+{
+  // Single block of LOGICAL_WARP_THREADS * TOTAL_WARPS threads; launch and sync must succeed
+  T *d_in        = thrust::raw_pointer_cast(in.data());
+  T *d_inclusive = thrust::raw_pointer_cast(inclusive_out.data());
+  T *d_exclusive = thrust::raw_pointer_cast(exclusive_out.data());
+  warp_combine_scan_kernel<LOGICAL_WARP_THREADS, TOTAL_WARPS, T, ActionT>
+    <<<1, LOGICAL_WARP_THREADS * TOTAL_WARPS>>>(d_in, d_inclusive, d_exclusive, action);
+  REQUIRE(cudaSuccess == cudaPeekAtLastError());
+  REQUIRE(cudaSuccess == cudaDeviceSynchronize());
+}
+
+template <int LOGICAL_WARP_THREADS, int TOTAL_WARPS, class T, class ActionT>
+__global__ void warp_scan_kernel(T *in, T *out, ActionT action)
+{
+  // One cub::WarpScan temporary-storage slot per logical warp of the block
+  using scanner_t = cub::WarpScan<T, LOGICAL_WARP_THREADS>;
+  __shared__ typename scanner_t::TempStorage warp_storage[TOTAL_WARPS];
+
+  // Thread index within the block and the logical warp it falls in
+  const int thread_idx   = cub::RowMajorTid(blockDim.x, blockDim.y, blockDim.z);
+  const int logical_warp = thread_idx / LOGICAL_WARP_THREADS;
+
+  // Each thread loads one item, which the action receives by reference
+  T thread_data = in[thread_idx];
+
+  scanner_t scan(warp_storage[logical_warp]);
+  action(scan, thread_data);
+
+  // thread_data now holds whatever the action left in it for this thread,
+  // and that value is what gets stored
+  out[thread_idx] = thread_data;
+}
+
+template <int LOGICAL_WARP_THREADS, int TOTAL_WARPS, class T, class ActionT>
+void warp_scan(thrust::device_vector<T> &in, thrust::device_vector<T> &out, ActionT action)
+{
+  // Single block of LOGICAL_WARP_THREADS * TOTAL_WARPS threads; launch and sync must succeed
+  T *d_in_ptr  = thrust::raw_pointer_cast(in.data());
+  T *d_out_ptr = thrust::raw_pointer_cast(out.data());
+  warp_scan_kernel<LOGICAL_WARP_THREADS, TOTAL_WARPS, T, ActionT>
+    <<<1, LOGICAL_WARP_THREADS * TOTAL_WARPS>>>(d_in_ptr, d_out_ptr, action);
+  REQUIRE(cudaSuccess == cudaPeekAtLastError());
+  REQUIRE(cudaSuccess == cudaDeviceSynchronize());
+}
+
+enum class scan_mode // selects the scan flavour used by the test actions and the host reference
+{
+  exclusive, // actions call ExclusiveSum / ExclusiveScan
+  inclusive  // actions call InclusiveSum / InclusiveScan
+};
+
+template <scan_mode Mode>
+struct sum_op_t // test action: in-place sum scan whose flavour is chosen by Mode
+{
+  template <class WarpScanT, class T>
+  __device__ void operator()(WarpScanT &scan, T &thread_data) const
+  {
+    if (Mode != scan_mode::exclusive)
+    {
+      scan.InclusiveSum(thread_data, thread_data); // thread_data is both input and output
+    }
+    else
+    {
+      scan.ExclusiveSum(thread_data, thread_data);
+    }
+  }
+};
+
+template <class T, scan_mode Mode>
+struct sum_aggregate_op_t // test action: in-place sum scan that also publishes the warp aggregate
+{
+  int m_target_thread_id; // lane within the logical warp that stores the aggregate
+  T *m_d_warp_aggregate;  // destination, indexed by logical warp
+
+  template <int LOGICAL_WARP_THREADS>
+  __device__ void operator()(cub::WarpScan<T, LOGICAL_WARP_THREADS> &scan, T &thread_data) const
+  {
+    T warp_total{};
+    if (Mode != scan_mode::exclusive)
+    {
+      scan.InclusiveSum(thread_data, thread_data, warp_total);
+    }
+    else
+    {
+      scan.ExclusiveSum(thread_data, thread_data, warp_total);
+    }
+
+    // Only the designated lane of each logical warp writes that warp's aggregate
+    const int thread_idx = cub::RowMajorTid(blockDim.x, blockDim.y, blockDim.z);
+    const int lane       = thread_idx % LOGICAL_WARP_THREADS;
+    if (lane == m_target_thread_id)
+    {
+      m_d_warp_aggregate[thread_idx / LOGICAL_WARP_THREADS] = warp_total;
+    }
+  }
+};
+
+template <scan_mode Mode>
+struct min_op_t // test action: in-place cub::Min scan whose flavour is chosen by Mode
+{
+  template <class T, class WarpScanT>
+  __device__ void operator()(WarpScanT &scan, T &thread_data) const
+  {
+    if (Mode != scan_mode::exclusive)
+    {
+      scan.InclusiveScan(thread_data, thread_data, cub::Min{}); // thread_data is input and output
+    }
+    else
+    {
+      scan.ExclusiveScan(thread_data, thread_data, cub::Min{});
+    }
+  }
+};
+
+template <class T, scan_mode Mode>
+struct min_aggregate_op_t // test action: in-place min scan that also publishes the warp aggregate
+{
+  int m_target_thread_id; // lane within the logical warp that stores the aggregate
+  T *m_d_warp_aggregate;  // destination, indexed by logical warp
+
+  template <int LOGICAL_WARP_THREADS>
+  __device__ void operator()(cub::WarpScan<T, LOGICAL_WARP_THREADS> &scan, T &thread_data) const
+  {
+    T warp_min{};
+    if (Mode != scan_mode::exclusive)
+    {
+      scan.InclusiveScan(thread_data, thread_data, cub::Min{}, warp_min);
+    }
+    else
+    {
+      scan.ExclusiveScan(thread_data, thread_data, cub::Min{}, warp_min);
+    }
+
+    // Only the designated lane of each logical warp writes that warp's aggregate
+    const int thread_idx = cub::RowMajorTid(blockDim.x, blockDim.y, blockDim.z);
+    const int lane       = thread_idx % LOGICAL_WARP_THREADS;
+    if (lane == m_target_thread_id)
+    {
+      m_d_warp_aggregate[thread_idx / LOGICAL_WARP_THREADS] = warp_min;
+    }
+  }
+};
+
+template <class T>
+struct min_init_value_op_t // test action: in-place exclusive cub::Min scan with an initial value
+{
+  T initial_value; // passed to ExclusiveScan ahead of the scan operator
+  template <class WarpScanT>
+  __device__ void operator()(WarpScanT &scan, T &thread_data) const
+  {
+    scan.ExclusiveScan(thread_data, thread_data, initial_value, cub::Min{}); // input and output alias
+  }
+};
+
+template <class T>
+struct min_init_value_aggregate_op_t // seeded exclusive min scan that also publishes the aggregate
+{
+  int m_target_thread_id; // lane within the logical warp that stores the aggregate
+  T initial_value;        // passed to ExclusiveScan ahead of the scan operator
+  T *m_d_warp_aggregate;  // destination, indexed by logical warp
+
+  template <int LOGICAL_WARP_THREADS>
+  __device__ void operator()(cub::WarpScan<T, LOGICAL_WARP_THREADS> &scan, T &thread_data) const
+  {
+    T warp_min{};
+    scan.ExclusiveScan(thread_data, thread_data, initial_value, cub::Min{}, warp_min);
+
+    // Only the designated lane of each logical warp writes that warp's aggregate
+    const int thread_idx = cub::RowMajorTid(blockDim.x, blockDim.y, blockDim.z);
+    const int lane       = thread_idx % LOGICAL_WARP_THREADS;
+    if (lane == m_target_thread_id)
+    {
+      m_d_warp_aggregate[thread_idx / LOGICAL_WARP_THREADS] = warp_min;
+    }
+  }
+};
+
+struct min_scan_op_t // test action: one WarpScan::Scan call with cub::Min filling both outputs
+{
+  template <class T, class WarpScanT>
+  __device__ void
+  operator()(WarpScanT &scan, T &thread_data, T &inclusive_output, T &exclusive_output) const
+  {
+    scan.Scan(thread_data, inclusive_output, exclusive_output, cub::Min{});
+  }
+};
+
+template <class T>
+struct min_init_value_scan_op_t // test action: WarpScan::Scan with cub::Min and an initial value
+{
+  T initial_value; // passed to Scan ahead of the scan operator
+  template <class WarpScanT>
+  __device__ void
+  operator()(WarpScanT &scan, T &thread_data, T &inclusive_output, T &exclusive_output) const
+  {
+    scan.Scan(thread_data, inclusive_output, exclusive_output, initial_value, cub::Min{});
+  }
+};
+
+template <class T, class ScanOpT>
+thrust::host_vector<T> compute_host_reference(scan_mode mode,                 // exclusive or inclusive
+                                              thrust::host_vector<T> &result, // in: items, out: scan
+                                              int logical_warp_threads,       // items per logical warp
+                                              ScanOpT scan_op,                // binary scan operator
+                                              T initial_value = T{})          // seed of each warp's scan
+{
+  if (result.empty()) // nothing to scan, so there are no warp aggregates either
+  {
+    return thrust::host_vector<T>{};
+  }
+  // TODO : assert result.size() % logical_warp_threads == 0 (the loops below visit every lane of num_warps warps)
+
+  int num_warps = CUB_QUOTIENT_CEILING(static_cast<int>(result.size()), logical_warp_threads);
+  thrust::host_vector<T> warp_accumulator(num_warps); // one aggregate per logical warp, returned to the caller
+  if (mode == scan_mode::exclusive)
+  {
+    for (int w = 0; w < num_warps; ++w)
+    {
+      T *output     = result.data() + w * logical_warp_threads;
+      T accumulator = output[0];                                         // aggregate of the raw items only
+      T current     = static_cast<T>(scan_op(initial_value, output[0])); // running value, seeded
+      output[0]     = initial_value;
+      for (int i = 1; i < logical_warp_threads; i++)
+      {
+        accumulator = static_cast<T>(scan_op(accumulator, output[i]));
+        T tmp       = output[i]; // keep the input item before its slot is overwritten
+        output[i]   = current;
+        current     = static_cast<T>(scan_op(current, tmp));
+      }
+      warp_accumulator[w] = accumulator;
+    }
+  }
+  else
+  {
+    for (int w = 0; w < num_warps; ++w)
+    {
+      T *output = result.data() + w * logical_warp_threads;
+      T current = initial_value; // here initial_value is folded into every output and the aggregate
+      for (int i = 0; i < logical_warp_threads; i++)
+      {
+        current   = static_cast<T>(scan_op(current, output[i]));
+        output[i] = current;
+      }
+      warp_accumulator[w] = current;
+    }
+  }
+
+  return warp_accumulator;
+}
+
+using types = c2h::type_list<std::uint8_t, std::uint16_t, std::int32_t, std::int64_t>;
+using logical_warp_threads = c2h::enum_type_list<int, 32, 16, 9, 2>;
+using modes = c2h::enum_type_list<scan_mode, scan_mode::exclusive, scan_mode::inclusive>;
+// Element types for the "Warp scan works with vec_types" test case:
+using vec_types = c2h::type_list<ulonglong4, uchar3, short2>;
+// NOTE(review): not referenced by the test cases visible below — confirm it is still used.
+using warp_combine_type = int;
+
+// Number of logical warps a test launches for a given logical warp width: two when the width
+// is the architectural warp size or a power of two, one otherwise.
+template <int logical_warp_threads>
+struct total_warps_t
+{
+  static constexpr int value()
+  {
+    return (logical_warp_threads == CUB_WARP_THREADS(0)                  // architectural warp
+            || (logical_warp_threads & (logical_warp_threads - 1)) == 0) // power of two
+             ? 2 : 1;
+  }
+};
+
+template <class TestType>
+struct params_t // compile-time parameters unpacked from one CUB_TEST instantiation
+{
+  using type = typename c2h::get<0, TestType>;
+  // Positions 1 and 2 of TestType carry the logical warp width and the scan mode:
+  static constexpr int logical_warp_threads = c2h::get<1, TestType>::value;
+  static constexpr scan_mode mode           = c2h::get<2, TestType>::value;
+  static constexpr int total_warps          = total_warps_t<logical_warp_threads>::value();
+  static constexpr int tile_size            = total_warps * logical_warp_threads;
+};
+
+CUB_TEST("Warp scan works with sum", "[scan][warp]", types, logical_warp_threads, modes)
+{
+  using params = params_t<TestType>;
+  using type   = typename params::type;
+  // Compares the device sum scan with the host reference; buffers hold tile_size items.
+  thrust::device_vector<type> d_out(params::tile_size);
+  thrust::device_vector<type> d_in(params::tile_size);
+  c2h::gen(CUB_SEED(10), d_in);
+  // Device side: warp_scan with the sum action selected by the scan mode.
+  warp_scan<params::logical_warp_threads, params::total_warps>(d_in,
+                                                               d_out,
+                                                               sum_op_t<params::mode>{});
+  // Host side: scan a copy of the input in place with std::plus, then compare.
+  thrust::host_vector<type> h_out = d_in;
+
+  compute_host_reference(params::mode, h_out, params::logical_warp_threads, std::plus<type>{});
+  REQUIRE_APPROX_EQ(h_out, d_out);
+}
+
+CUB_TEST("Warp scan works with vec_types", "[scan][warp]", vec_types, logical_warp_threads, modes)
+{
+  using params = params_t<TestType>;
+  using type   = typename params::type;
+  // Same check as the sum test, with the element type taken from vec_types.
+  thrust::device_vector<type> d_out(params::tile_size);
+  thrust::device_vector<type> d_in(params::tile_size);
+  c2h::gen(CUB_SEED(10), d_in);
+  // Device side: warp_scan with the sum action selected by the scan mode.
+  warp_scan<params::logical_warp_threads, params::total_warps>(d_in,
+                                                               d_out,
+                                                               sum_op_t<params::mode>{});
+  // Host side: std::plus reference on a copy of the input; results are compared with ==.
+  thrust::host_vector<type> h_out = d_in;
+
+  compute_host_reference(params::mode, h_out, params::logical_warp_threads, std::plus<type>{});
+  REQUIRE(h_out == d_out);
+}
+
+CUB_TEST("Warp scan works with custom types",
+         "[scan][warp]",
+         c2h::type_list<c2h::custom_type_t<c2h::accumulateable_t, c2h::equal_comparable_t>>,
+         logical_warp_threads,
+         modes)
+{
+  using params = params_t<TestType>;
+  using type   = typename params::type;
+  // Same check as the sum test, with a c2h::custom_type_t element type.
+  thrust::device_vector<type> d_out(params::tile_size);
+  thrust::device_vector<type> d_in(params::tile_size);
+  c2h::gen(CUB_SEED(10), d_in);
+  // Device side: warp_scan with the sum action selected by the scan mode.
+  warp_scan<params::logical_warp_threads, params::total_warps>(d_in,
+                                                               d_out,
+                                                               sum_op_t<params::mode>{});
+  // Host side: std::plus reference on a copy of the input; results are compared with ==.
+  thrust::host_vector<type> h_out = d_in;
+
+  compute_host_reference(params::mode, h_out, params::logical_warp_threads, std::plus<type>{});
+  REQUIRE(h_out == d_out);
+}
+
+CUB_TEST("Warp scan returns valid warp aggregate",
+         "[scan][warp]",
+         c2h::type_list<c2h::custom_type_t<c2h::accumulateable_t, c2h::equal_comparable_t>>,
+         logical_warp_threads,
+         modes)
+{
+  using params = params_t<TestType>;
+  using type   = typename params::type;
+  // Checks the scan output and the per-warp aggregates (one slot per logical warp).
+  thrust::device_vector<type> d_warp_aggregates(params::total_warps);
+  thrust::device_vector<type> d_out(params::tile_size);
+  thrust::device_vector<type> d_in(params::tile_size);
+  c2h::gen(CUB_SEED(10), d_in);
+  // Two random lane ids in [0, logical_warp_threads - 1]; presumably the lane that records it.
+  const int target_thread_id = GENERATE_COPY(take(2, random(0, params::logical_warp_threads - 1)));
+
+  warp_scan<params::logical_warp_threads, params::total_warps>(
+    d_in,
+    d_out,
+    sum_aggregate_op_t<type, params::mode>{target_thread_id,
+                                           thrust::raw_pointer_cast(d_warp_aggregates.data())});
+  // Host side: the reference scans h_out in place and returns the expected aggregates.
+  thrust::host_vector<type> h_out = d_in;
+
+  auto h_warp_aggregates =
+    compute_host_reference(params::mode, h_out, params::logical_warp_threads, std::plus<type>{});
+  REQUIRE(h_out == d_out);
+  REQUIRE(h_warp_aggregates == d_warp_aggregates);
+}
+
+// TODO : Do we need all the types?
+CUB_TEST("Warp scan works with custom scan op", "[scan][warp]", types, logical_warp_threads, modes)
+{
+  using params = params_t<TestType>;
+  using type   = typename params::type;
+  // Compares the device min scan (no initial value) with the host reference.
+  thrust::device_vector<type> d_out(params::tile_size);
+  thrust::device_vector<type> d_in(params::tile_size);
+  c2h::gen(CUB_SEED(10), d_in);
+
+  warp_scan<params::logical_warp_threads, params::total_warps>(d_in,
+                                                               d_out,
+                                                               min_op_t<params::mode>{});
+  // Host reference is seeded with numeric_limits::max(), which std::min leaves unchanged.
+  thrust::host_vector<type> h_out = d_in;
+
+  compute_host_reference(
+    params::mode,
+    h_out,
+    params::logical_warp_threads,
+    [](type l, type r) { return std::min(l, r); },
+    std::numeric_limits<type>::max());
+
+  // From the documentation -
+  // Computes an exclusive prefix scan using the specified binary scan functor
+  // across the calling warp. Because no initial value is supplied, the output
+  // computed for warp-lane0 is undefined.
+
+  // Overwrite those undefined device outputs (lane 0 of every logical warp) with
+  // the host values so that the comparison below only checks defined results.
+
+  if (params::mode == scan_mode::exclusive)
+  {
+    for (size_t i = 0; i < h_out.size(); i += params::logical_warp_threads)
+    {
+      d_out[i] = h_out[i];
+    }
+  }
+  REQUIRE_APPROX_EQ(h_out, d_out);
+}
+
+CUB_TEST("Warp custom op scan returns valid warp aggregate",
+         "[scan][warp]",
+         types,
+         logical_warp_threads,
+         modes)
+{
+  using params = params_t<TestType>;
+  using type   = typename params::type;
+  // Checks the min scan output and the per-warp aggregates (one slot per logical warp).
+  thrust::device_vector<type> d_warp_aggregates(params::total_warps);
+  thrust::device_vector<type> d_out(params::tile_size);
+  thrust::device_vector<type> d_in(params::tile_size);
+  c2h::gen(CUB_SEED(10), d_in);
+  // Two random lane ids in [0, logical_warp_threads - 1]; presumably the lane that records it.
+  const int target_thread_id = GENERATE_COPY(take(2, random(0, params::logical_warp_threads - 1)));
+
+  warp_scan<params::logical_warp_threads, params::total_warps>(
+    d_in,
+    d_out,
+    min_aggregate_op_t<type, params::mode>{target_thread_id,
+                                           thrust::raw_pointer_cast(d_warp_aggregates.data())});
+  // Host reference is seeded with numeric_limits::max(), which std::min leaves unchanged.
+  thrust::host_vector<type> h_out = d_in;
+
+  auto h_warp_aggregates = compute_host_reference(
+    params::mode,
+    h_out,
+    params::logical_warp_threads,
+    [](type l, type r) { return std::min(l, r); },
+    std::numeric_limits<type>::max());
+
+  // From the documentation -
+  // Computes an exclusive prefix scan using the specified binary scan functor
+  // across the calling warp. Because no initial value is supplied, the output
+  // computed for warp-lane0 is undefined.
+
+  // Overwrite those undefined device outputs (lane 0 of every logical warp) with
+  // the host values so that the comparison below only checks defined results.
+
+  if (params::mode == scan_mode::exclusive)
+  {
+    for (size_t i = 0; i < h_out.size(); i += params::logical_warp_threads)
+    {
+      d_out[i] = h_out[i];
+    }
+  }
+  REQUIRE(h_out == d_out);
+  REQUIRE(h_warp_aggregates == d_warp_aggregates);
+}
+
+CUB_TEST("Warp custom op scan works with initial value",
+         "[scan][warp]",
+         types,
+         logical_warp_threads,
+         c2h::enum_type_list<scan_mode, scan_mode::exclusive>)
+{
+  using params = params_t<TestType>;
+  using type   = typename params::type;
+  // Exclusive-only: compares the device min scan seeded with an initial value to the reference.
+  thrust::device_vector<type> d_out(params::tile_size);
+  thrust::device_vector<type> d_in(params::tile_size);
+  c2h::gen(CUB_SEED(10), d_in);
+  // Two random initial values drawn from [0, tile_size], converted to the element type.
+  const type initial_value = static_cast<type>(GENERATE_COPY(take(2, random(0, params::tile_size))));
+
+  warp_scan<params::logical_warp_threads, params::total_warps>(d_in,
+                                                               d_out,
+                                                               min_init_value_op_t<type>{
+                                                                 initial_value});
+  // Host side: the same initial value seeds the reference, so lane 0 is compared as well.
+  thrust::host_vector<type> h_out = d_in;
+
+  compute_host_reference(
+    params::mode,
+    h_out,
+    params::logical_warp_threads,
+    [](type l, type r) { return std::min(l, r); },
+    initial_value);
+
+  REQUIRE_APPROX_EQ(h_out, d_out);
+}
+
+CUB_TEST("Warp custom op scan with initial value returns valid warp aggregate",
+         "[scan][warp]",
+         types,
+         logical_warp_threads,
+         c2h::enum_type_list<scan_mode, scan_mode::exclusive>)
+{
+  using params = params_t<TestType>;
+  using type   = typename params::type;
+  // Exclusive-only: checks the seeded min scan output and the per-warp aggregates.
+  thrust::device_vector<type> d_warp_aggregates(params::total_warps);
+  thrust::device_vector<type> d_out(params::tile_size);
+  thrust::device_vector<type> d_in(params::tile_size);
+  c2h::gen(CUB_SEED(10), d_in);
+  // Two random lane ids and two random initial values (each generator sits on its own line).
+  const int target_thread_id = GENERATE_COPY(take(2, random(0, params::logical_warp_threads - 1)));
+  const type initial_value   = static_cast<type>(GENERATE_COPY(take(2, random(0, params::tile_size))));
+
+  warp_scan<params::logical_warp_threads, params::total_warps>(
+    d_in,
+    d_out,
+    min_init_value_aggregate_op_t<type>{target_thread_id,
+                                        initial_value,
+                                        thrust::raw_pointer_cast(d_warp_aggregates.data())});
+  // Host side: the reference scans h_out in place and returns the expected aggregates.
+  thrust::host_vector<type> h_out = d_in;
+
+  auto h_warp_aggregates = compute_host_reference(
+    params::mode,
+    h_out,
+    params::logical_warp_threads,
+    [](type l, type r) { return std::min(l, r); },
+    initial_value);
+
+  REQUIRE(h_out == d_out);
+  REQUIRE(h_warp_aggregates == d_warp_aggregates);
+}
+
+CUB_TEST("Warp combination scan works with custom scan op", "[scan][warp]", logical_warp_threads)
+{
+  constexpr int logical_warp_threads = c2h::get<0, TestType>();
+  constexpr int total_warps          = total_warps_t<logical_warp_threads>::value();
+  using type                         = int;
+  // One launch yields both the inclusive and the exclusive min scan of the same input.
+  thrust::device_vector<type> d_inclusive_out(total_warps * logical_warp_threads);
+  thrust::device_vector<type> d_exclusive_out(total_warps * logical_warp_threads);
+  thrust::device_vector<type> d_in(total_warps * logical_warp_threads);
+  c2h::gen(CUB_SEED(10), d_in);
+
+  warp_combine_scan<logical_warp_threads, total_warps>(d_in,
+                                                       d_inclusive_out,
+                                                       d_exclusive_out,
+                                                       min_scan_op_t{});
+  // Host side: two references on separate input copies, seeded with numeric_limits::max().
+  thrust::host_vector<type> h_exclusive_out = d_in;
+  thrust::host_vector<type> h_inclusive_out = d_in;
+
+  compute_host_reference(
+    scan_mode::exclusive,
+    h_exclusive_out,
+    logical_warp_threads,
+    [](type l, type r) { return std::min(l, r); },
+    std::numeric_limits<type>::max());
+
+  compute_host_reference(
+    scan_mode::inclusive,
+    h_inclusive_out,
+    logical_warp_threads,
+    [](type l, type r) { return std::min(l, r); },
+    std::numeric_limits<type>::max());
+
+  // According to WarpScan::Scan documentation -
+  // Because no initial value is supplied, the exclusive_output computed for warp-lane0 is
+  // undefined.
+
+  // Overwrite those undefined device outputs (lane 0 of every logical warp) with
+  // the host values so that the comparison below only checks defined results.
+
+  for (size_t i = 0; i < h_exclusive_out.size(); i += logical_warp_threads)
+  {
+    d_exclusive_out[i] = h_exclusive_out[i];
+  }
+
+  REQUIRE(h_inclusive_out == d_inclusive_out);
+  REQUIRE(h_exclusive_out == d_exclusive_out);
+}
+
+CUB_TEST("Warp combination custom scan works with initial value",
+         "[scan][warp]",
+         logical_warp_threads)
+{
+  constexpr int logical_warp_threads = c2h::get<0, TestType>();
+  constexpr int total_warps          = total_warps_t<logical_warp_threads>::value();
+  using type                         = int;
+  // One launch yields both the inclusive and the exclusive seeded min scan of the same input.
+  thrust::device_vector<type> d_inclusive_out(total_warps * logical_warp_threads);
+  thrust::device_vector<type> d_exclusive_out(total_warps * logical_warp_threads);
+  thrust::device_vector<type> d_in(total_warps * logical_warp_threads);
+  c2h::gen(CUB_SEED(10), d_in);
+  // Two random initial values drawn from [0, total_warps * logical_warp_threads].
+  const type initial_value = GENERATE_COPY(take(2, random(0, total_warps * logical_warp_threads)));
+
+  warp_combine_scan<logical_warp_threads, total_warps>(d_in,
+                                                       d_inclusive_out,
+                                                       d_exclusive_out,
+                                                       min_init_value_scan_op_t<type>{
+                                                         initial_value});
+  // Host side: both references get the same initial value, so every lane is compared.
+  thrust::host_vector<type> h_exclusive_out = d_in;
+  thrust::host_vector<type> h_inclusive_out = d_in;
+
+  compute_host_reference(
+    scan_mode::exclusive,
+    h_exclusive_out,
+    logical_warp_threads,
+    [](type l, type r) { return std::min(l, r); },
+    initial_value);
+
+  compute_host_reference(
+    scan_mode::inclusive,
+    h_inclusive_out,
+    logical_warp_threads,
+    [](type l, type r) { return std::min(l, r); },
+    initial_value);
+
+  REQUIRE(h_inclusive_out == d_inclusive_out);
+  REQUIRE(h_exclusive_out == d_exclusive_out);
+}
diff --git a/include/cub/test/catch2_test_warp_store.cu b/include/cub/test/catch2_test_warp_store.cu
new file mode 100644
index 0000000..7c9c157
--- /dev/null
+++ b/include/cub/test/catch2_test_warp_store.cu
@@ -0,0 +1,314 @@
+/******************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#include <cub/detail/cpp_compatibility.cuh>
+#include <cub/iterator/cache_modified_output_iterator.cuh>
+#include <cub/warp/warp_store.cuh>
+
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+
+#include "fill_striped.cuh"
+// Has to go after all cub headers. Otherwise, this test won't catch unused
+// variables in cub kernels.
+#include "catch2_test_helper.h"
+
+// Test kernel: every thread fills ITEMS_PER_THREAD registers with the consecutive values
+// tid * ITEMS_PER_THREAD + item, then `action` writes them through its logical warp's WarpStore.
+template <cub::WarpStoreAlgorithm StoreAlgorithm,
+          int LOGICAL_WARP_THREADS,
+          int ITEMS_PER_THREAD,
+          int TOTAL_WARPS,
+          typename T,
+          typename OutputIteratorT,
+          typename ActionT>
+__global__ void warp_store_kernel(OutputIteratorT output_iterator, ActionT action)
+{
+  using store_type = cub::WarpStore<T, ITEMS_PER_THREAD, StoreAlgorithm, LOGICAL_WARP_THREADS>;
+  constexpr int items_per_warp = ITEMS_PER_THREAD * LOGICAL_WARP_THREADS;
+  __shared__ typename store_type::TempStorage temp_storage[TOTAL_WARPS]; // one per logical warp
+
+  const int linear_tid   = cub::RowMajorTid(blockDim.x, blockDim.y, blockDim.z);
+  const int logical_warp = linear_tid / LOGICAL_WARP_THREADS;
+
+  T thread_items[ITEMS_PER_THREAD];
+  const int first_value = linear_tid * ITEMS_PER_THREAD;
+  for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+  {
+    thread_items[i] = static_cast<T>(first_value + i);
+  }
+
+  store_type store(temp_storage[logical_warp]);
+  action(store, output_iterator + (logical_warp * items_per_warp), thread_items);
+}
+
+// Host launcher: runs warp_store_kernel on one block of TOTAL_WARPS * LOGICAL_WARP_THREADS
+// threads. The launch result is not checked here.
+template <cub::WarpStoreAlgorithm StoreAlgorithm,
+          int LOGICAL_WARP_THREADS,
+          int ITEMS_PER_THREAD,
+          int TOTAL_WARPS,
+          typename T,
+          typename OutputIteratorT,
+          typename ActionT>
+void warp_store(OutputIteratorT output_iterator, ActionT action)
+{
+  constexpr int block_threads = TOTAL_WARPS * LOGICAL_WARP_THREADS;
+
+  auto kernel = warp_store_kernel<StoreAlgorithm, LOGICAL_WARP_THREADS, ITEMS_PER_THREAD,
+                                  TOTAL_WARPS, T, OutputIteratorT, ActionT>;
+  kernel<<<1, block_threads>>>(output_iterator, action);
+}
+
+// Action functor for guarded stores: forwards `valid_items` as the third argument of
+// WarpStore::Store.
+struct guarded_store_t
+{
+  // Passed through to Store unchanged.
+  int valid_items;
+
+  template <cub::WarpStoreAlgorithm Algorithm, int WarpThreads, int ItemsPerThread, typename ItemT,
+            typename OutputIt>
+  __device__ void operator()(cub::WarpStore<ItemT, ItemsPerThread, Algorithm, WarpThreads> store,
+                             OutputIt destination,
+                             ItemT (&thread_items)[ItemsPerThread])
+  {
+    store.Store(destination, thread_items, valid_items);
+  }
+};
+
+// Action functor for unguarded stores: calls the two-argument WarpStore::Store overload.
+// The tests that use it expect a full tile to be written (their reference is built with
+// valid_items == tile_size).
+struct unguarded_store_t
+{
+  template <cub::WarpStoreAlgorithm Algorithm,
+            int WarpThreads, int ItemsPerThread,
+            typename ItemT, typename OutputIt>
+  __device__ void operator()(cub::WarpStore<ItemT, ItemsPerThread, Algorithm, WarpThreads> store,
+                             OutputIt destination,
+                             ItemT (&thread_items)[ItemsPerThread])
+  {
+    store.Store(destination, thread_items);
+  }
+};
+
+// Expected output of a warp store test: the reference pattern for TOTAL_WARPS tiles
+// (fill_striped for WARP_STORE_STRIPED, c2h::gen with modulo_t otherwise), with the items at
+// positions >= valid_items of every tile reset to T{}. `valid_items` is a per-tile count.
+template <cub::WarpStoreAlgorithm StoreAlgorithm, int LOGICAL_WARP_THREADS, int ITEMS_PER_THREAD,
+          int TOTAL_WARPS, typename T>
+thrust::device_vector<T> compute_reference(int valid_items)
+{
+  const int tile_size        = LOGICAL_WARP_THREADS * ITEMS_PER_THREAD;
+  const int total_item_count = TOTAL_WARPS * tile_size;
+  thrust::device_vector<T> d_input(total_item_count);
+
+  CUB_IF_CONSTEXPR(StoreAlgorithm == cub::WarpStoreAlgorithm::WARP_STORE_STRIPED)
+  {
+    thrust::host_vector<T> input(total_item_count);
+    fill_striped<ITEMS_PER_THREAD, LOGICAL_WARP_THREADS, ITEMS_PER_THREAD * TOTAL_WARPS>(
+      input.begin());
+    d_input = input;
+  }
+  else
+  {
+    c2h::gen(c2h::modulo_t{d_input.size()}, d_input);
+  }
+  if (valid_items < tile_size) // compare per tile: a fully valid tile has nothing to clear
+  {
+    for (int warp_id = 0; warp_id < TOTAL_WARPS; warp_id++)
+    {
+      thrust::fill(d_input.begin() + warp_id * tile_size + valid_items,
+                   d_input.begin() + (warp_id + 1) * tile_size,
+                   T{});
+    }
+  }
+  return d_input;
+}
+
+// %PARAM% LWT lwt 4:16:32
+// %PARAM% ALGO_TYPE alg 0:1:2:3
+
+using types            = c2h::type_list<std::uint8_t, std::uint16_t, std::int32_t, std::int64_t>;
+using items_per_thread = c2h::enum_type_list<int, 1, 4, 7>;
+using logical_warp_threads = c2h::enum_type_list<int, LWT>; // NOTE(review): LWT presumably injected by the build — confirm
+using algorithms           = c2h::enum_type_list<cub::WarpStoreAlgorithm,
+                                       cub::WarpStoreAlgorithm::WARP_STORE_DIRECT,
+                                       cub::WarpStoreAlgorithm::WARP_STORE_STRIPED,
+                                       cub::WarpStoreAlgorithm::WARP_STORE_TRANSPOSE,
+                                       cub::WarpStoreAlgorithm::WARP_STORE_VECTORIZE>;
+using algorithm = // one-element list: the entry of `algorithms` at index ALGO_TYPE
+  c2h::enum_type_list<cub::WarpStoreAlgorithm, c2h::get<ALGO_TYPE, algorithms>::value>;
+// Store modifiers swept by the "cache modified iterator" test cases:
+using cache_store_modifier = c2h::enum_type_list<cub::CacheStoreModifier,
+                                                 cub::CacheStoreModifier::STORE_DEFAULT,
+                                                 cub::CacheStoreModifier::STORE_WB,
+                                                 cub::CacheStoreModifier::STORE_CG,
+                                                 cub::CacheStoreModifier::STORE_CS,
+                                                 cub::CacheStoreModifier::STORE_WT,
+                                                 cub::CacheStoreModifier::STORE_VOLATILE>;
+// How many random valid_items values each guarded test case draws:
+constexpr int guarded_store_tests_count = 30;
+
+// Number of logical warps a test launches for a given logical warp width: two when the width
+// is the architectural warp size or a power of two, one otherwise.
+template <int logical_warp_threads>
+struct total_warps_t
+{
+  static constexpr int value()
+  {
+    return (logical_warp_threads == CUB_WARP_THREADS(0)                  // architectural warp
+            || (logical_warp_threads & (logical_warp_threads - 1)) == 0) // power of two
+             ? 2 : 1;
+  }
+};
+
+template <class TestType>
+struct params_t // compile-time parameters unpacked from one CUB_TEST instantiation
+{
+  using type = typename c2h::get<0, TestType>;
+  // Positions 1-3 of TestType: logical warp width, items per thread, store algorithm.
+  static constexpr int logical_warp_threads          = c2h::get<1, TestType>::value;
+  static constexpr int items_per_thread              = c2h::get<2, TestType>::value;
+  static constexpr cub::WarpStoreAlgorithm algorithm = c2h::get<3, TestType>::value;
+  static constexpr int total_warps                   = total_warps_t<logical_warp_threads>::value();
+  static constexpr int tile_size                     = logical_warp_threads * items_per_thread;
+  static constexpr int total_item_count              = total_warps * tile_size;
+};
+
+CUB_TEST("Warp store guarded range works with pointer",
+         "[store][warp]",
+         types,
+         logical_warp_threads,
+         items_per_thread,
+         algorithm)
+{
+  using params = params_t<TestType>;
+  using type   = typename params::type;
+  // Guarded store through a raw pointer; d_out starts as type{} and must equal the reference.
+  thrust::device_vector<type> d_out(params::total_item_count, type{});
+  const int valid_items =
+    GENERATE_COPY(take(guarded_store_tests_count, random(0, params::tile_size - 1)));
+  auto out = thrust::raw_pointer_cast(d_out.data());
+  warp_store<params::algorithm,
+             params::logical_warp_threads,
+             params::items_per_thread,
+             params::total_warps,
+             type>(out, guarded_store_t{valid_items});
+  auto d_expected_output = compute_reference<params::algorithm,
+                                             params::logical_warp_threads,
+                                             params::items_per_thread,
+                                             params::total_warps,
+                                             type>(valid_items);
+  REQUIRE(d_expected_output == d_out);
+}
+
+CUB_TEST("Warp store guarded range works with cache modified iterator",
+         "[store][warp]",
+         types,
+         logical_warp_threads,
+         items_per_thread,
+         algorithm,
+         cache_store_modifier)
+{
+  using params                                     = params_t<TestType>;
+  using type                                       = typename params::type;
+  constexpr cub::CacheStoreModifier store_modifier = c2h::get<4, TestType>::value;
+  // Guarded store through a CacheModifiedOutputIterator wrapping d_out's raw pointer.
+  thrust::device_vector<type> d_out(params::total_item_count, type{});
+  const int valid_items =
+    GENERATE_COPY(take(guarded_store_tests_count, random(0, params::tile_size - 1)));
+  auto out =
+    cub::CacheModifiedOutputIterator<store_modifier, type>(thrust::raw_pointer_cast(d_out.data()));
+  warp_store<params::algorithm,
+             params::logical_warp_threads,
+             params::items_per_thread,
+             params::total_warps,
+             type>(out, guarded_store_t{valid_items});
+  auto d_expected_output = compute_reference<params::algorithm,
+                                             params::logical_warp_threads,
+                                             params::items_per_thread,
+                                             params::total_warps,
+                                             type>(valid_items);
+  REQUIRE(d_expected_output == d_out);
+}
+
+CUB_TEST("Warp store unguarded range works with pointer",
+         "[store][warp]",
+         types,
+         logical_warp_threads,
+         items_per_thread,
+         algorithm)
+{
+  using params = params_t<TestType>;
+  using type   = typename params::type;
+  // Unguarded store through a raw pointer; the reference is built for a full tile.
+  thrust::device_vector<type> d_out(params::total_item_count, type{});
+  const int valid_items = params::tile_size;
+  auto out              = thrust::raw_pointer_cast(d_out.data());
+  warp_store<params::algorithm,
+             params::logical_warp_threads,
+             params::items_per_thread,
+             params::total_warps,
+             type>(out, unguarded_store_t{});
+  auto d_expected_output = compute_reference<params::algorithm,
+                                             params::logical_warp_threads,
+                                             params::items_per_thread,
+                                             params::total_warps,
+                                             type>(valid_items);
+  REQUIRE(d_expected_output == d_out);
+}
+
+CUB_TEST("Warp store unguarded range works with cache modified iterator",
+         "[store][warp]",
+         types,
+         logical_warp_threads,
+         items_per_thread,
+         algorithm,
+         cache_store_modifier)
+{
+  using params                                     = params_t<TestType>;
+  using type                                       = typename params::type;
+  constexpr cub::CacheStoreModifier store_modifier = c2h::get<4, TestType>::value;
+  // Unguarded store through a CacheModifiedOutputIterator; the reference is a full tile.
+  thrust::device_vector<type> d_out(params::total_item_count, type{});
+  const int valid_items = params::tile_size;
+  auto out =
+    cub::CacheModifiedOutputIterator<store_modifier, type>(thrust::raw_pointer_cast(d_out.data()));
+  warp_store<params::algorithm,
+             params::logical_warp_threads,
+             params::items_per_thread,
+             params::total_warps,
+             type>(out, unguarded_store_t{});
+  auto d_expected_output = compute_reference<params::algorithm,
+                                             params::logical_warp_threads,
+                                             params::items_per_thread,
+                                             params::total_warps,
+                                             type>(valid_items);
+  REQUIRE(d_expected_output == d_out);
+}
diff --git a/include/cub/test/cmake/CMakeLists.txt b/include/cub/test/cmake/CMakeLists.txt
new file mode 100644
index 0000000..1f3ae43
--- /dev/null
+++ b/include/cub/test/cmake/CMakeLists.txt
@@ -0,0 +1,24 @@
+if (NOT CUB_IN_THRUST) # Thrust has its own checks for this:
+  # Test `find_package` on an installed CUB by configuring the test_install project with this build's generator, compilers and build type:
+  add_test(
+    NAME cub.test.cmake.test_install
+    COMMAND "${CMAKE_COMMAND}"
+      --log-level=VERBOSE
+      -G "${CMAKE_GENERATOR}"
+      -S "${CMAKE_CURRENT_SOURCE_DIR}/test_install"
+      -B "${CMAKE_CURRENT_BINARY_DIR}/test_install"
+      -D "CUB_BINARY_DIR=${CUB_BINARY_DIR}"
+      -D "CMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}"
+      -D "CMAKE_CUDA_COMPILER=${CMAKE_CUDA_COMPILER}"
+      -D "CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}"
+  )
+endif()
+
+# Run check_source_files.cmake in script mode (-P) to pattern-match the source code for issues:
+add_test(
+  NAME cub.test.cmake.check_source_files
+  COMMAND
+    "${CMAKE_COMMAND}"
+      -D "CUB_SOURCE_DIR=${CUB_SOURCE_DIR}"
+      -P "${CMAKE_CURRENT_LIST_DIR}/check_source_files.cmake"
+)
diff --git a/include/cub/test/cmake/check_source_files.cmake b/include/cub/test/cmake/check_source_files.cmake
new file mode 100644
index 0000000..1554a22
--- /dev/null
+++ b/include/cub/test/cmake/check_source_files.cmake
@@ -0,0 +1,178 @@
+# Check all source files for various issues that can be detected using pattern
+# matching.
+#
+# This is run as a ctest test named `cub.test.cmake.check_source_files`, or
+# manually with:
+# cmake -D "CUB_SOURCE_DIR=<CUB project root>" -P check_source_files.cmake
+
+cmake_minimum_required(VERSION 3.15)
+
+function(count_substrings input search_regex output_var) # sets <output_var> to the number of regex matches in <input>
+  string(REGEX MATCHALL "${search_regex}" hits "${input}") # one list entry per match
+  list(LENGTH hits hit_total)
+  set("${output_var}" "${hit_total}" PARENT_SCOPE)
+endfunction()
+
+set(found_errors 0)
+file(GLOB_RECURSE cub_srcs
+  RELATIVE "${CUB_SOURCE_DIR}"
+  "${CUB_SOURCE_DIR}/cub/*.cuh"
+  "${CUB_SOURCE_DIR}/cub/*.cu"
+  "${CUB_SOURCE_DIR}/cub/*.h"
+  "${CUB_SOURCE_DIR}/cub/*.cpp"
+)
+
+################################################################################
+# Namespace checks.
+# Check all files in CUB to make sure that they use
+# CUB_NAMESPACE_BEGIN/END instead of bare `namespace cub {}` declarations.
+set(namespace_exclusions
+  # This defines the macros and must have bare namespace declarations:
+  cub/util_namespace.cuh
+)
+
+set(bare_ns_regex "namespace[ \n\r\t]+cub[ \n\r\t]*\\{")
+
+# Validation check for the above regex:
+count_substrings([=[
+namespace cub{
+namespace cub {
+namespace  cub  {
+ namespace cub {
+namespace cub
+{
+namespace
+cub
+{
+]=]
+  ${bare_ns_regex} valid_count)
+if (NOT valid_count EQUAL 6)
+  message(FATAL_ERROR "Validation of bare namespace regex failed: "
+                      "Matched ${valid_count} times, expected 6.")
+endif()
+
+################################################################################
+# stdpar header checks.
+# Check all files in CUB to make sure that they aren't including <algorithm>,
+# <memory>, or <numeric> directly; these cause circular dependencies in nvc++'s
+# stdpar library.
+#
+# The following headers should be used instead:
+# <algorithm> -> <thrust/detail/algorithm_wrapper.h>
+# <memory>    -> <thrust/detail/memory_wrapper.h>
+# <numeric>   -> <thrust/detail/numeric_wrapper.h>
+set(stdpar_header_exclusions
+  # Placeholder -- none yet.
+)
+
+set(algorithm_regex "#[ \t]*include[ \t]+<algorithm>")
+set(memory_regex    "#[ \t]*include[ \t]+<memory>")
+set(numeric_regex   "#[ \t]*include[ \t]+<numeric>")
+
+# Validation check for the above regex pattern:
+count_substrings([=[
+#include <algorithm>
+# include <algorithm>
+#include  <algorithm>
+# include  <algorithm>
+# include  <algorithm> // ...
+]=]
+  ${algorithm_regex} valid_count)
+if (NOT valid_count EQUAL 5)
+  message(FATAL_ERROR "Validation of stdpar header regex failed: "
+    "Matched ${valid_count} times, expected 5.")
+endif()
+
+################################################################################
+# Legacy macro checks.
+# Check all files in CUB to make sure that they aren't using the legacy
+# CUB_RUNTIME_ENABLED and __THRUST_HAS_CUDART__ macros.
+#
+# These macros depend on __CUDA_ARCH__ and are not compatible with NV_IF_TARGET.
+# They are provided for legacy purposes and should be replaced with
+# [THRUST|CUB]_RDC_ENABLED and NV_IF_TARGET in Thrust/CUB code.
+#
+#
+set(legacy_macro_header_exclusions
+  # This header defines a legacy CUDART macro:
+  cub/detail/detect_cuda_runtime.cuh
+)
+
+set(cub_legacy_macro_regex "CUB_RUNTIME_ENABLED")
+set(thrust_legacy_macro_regex "__THRUST_HAS_CUDART__")
+
+################################################################################
+# Read source files:
+foreach(src ${cub_srcs})
+  file(READ "${CUB_SOURCE_DIR}/${src}" src_contents)
+
+  if (NOT ${src} IN_LIST namespace_exclusions)
+    count_substrings("${src_contents}" "${bare_ns_regex}" bare_ns_count)
+    count_substrings("${src_contents}" CUB_NS_PREFIX prefix_count)
+    count_substrings("${src_contents}" CUB_NS_POSTFIX postfix_count)
+    count_substrings("${src_contents}" CUB_NAMESPACE_BEGIN begin_count)
+    count_substrings("${src_contents}" CUB_NAMESPACE_END end_count)
+
+    if (NOT bare_ns_count EQUAL 0)
+      message("'${src}' contains 'namespace cub {...}'. Replace with CUB_NAMESPACE macros.")
+      set(found_errors 1)
+    endif()
+
+    if (NOT prefix_count EQUAL 0)
+      message("'${src}' contains 'CUB_NS_PREFIX'. Replace with CUB_NAMESPACE macros.")
+      set(found_errors 1)
+    endif()
+
+    if (NOT postfix_count EQUAL 0)
+      message("'${src}' contains 'CUB_NS_POSTFIX'. Replace with CUB_NAMESPACE macros.")
+      set(found_errors 1)
+    endif()
+
+    if (NOT begin_count EQUAL end_count)
+      message("'${src}' namespace macros are unbalanced:")
+      message(" - CUB_NAMESPACE_BEGIN occurs ${begin_count} times.")
+      message(" - CUB_NAMESPACE_END   occurs ${end_count} times.")
+      set(found_errors 1)
+    endif()
+  endif()
+
+  if (NOT ${src} IN_LIST stdpar_header_exclusions)
+    count_substrings("${src_contents}" "${algorithm_regex}" algorithm_count)
+    count_substrings("${src_contents}" "${memory_regex}" memory_count)
+    count_substrings("${src_contents}" "${numeric_regex}" numeric_count)
+
+    if (NOT algorithm_count EQUAL 0)
+      message("'${src}' includes the <algorithm> header. Replace with <thrust/detail/algorithm_wrapper.h>.")
+      set(found_errors 1)
+    endif()
+
+    if (NOT memory_count EQUAL 0)
+      message("'${src}' includes the <memory> header. Replace with <thrust/detail/memory_wrapper.h>.")
+      set(found_errors 1)
+    endif()
+
+    if (NOT numeric_count EQUAL 0)
+      message("'${src}' includes the <numeric> header. Replace with <thrust/detail/numeric_wrapper.h>.")
+      set(found_errors 1)
+    endif()
+  endif()
+
+  if (NOT ${src} IN_LIST legacy_macro_header_exclusions)
+    count_substrings("${src_contents}" "${thrust_legacy_macro_regex}" thrust_count)
+    count_substrings("${src_contents}" "${cub_legacy_macro_regex}" cub_count)
+
+    if (NOT thrust_count EQUAL 0)
+      message("'${src}' uses __THRUST_HAS_CUDART__. Replace with THRUST_RDC_ENABLED and NV_IF_TARGET.")
+      set(found_errors 1)
+    endif()
+
+    if (NOT cub_count EQUAL 0)
+      message("'${src}' uses CUB_RUNTIME_ENABLED. Replace with CUB_RDC_ENABLED and NV_IF_TARGET.")
+      set(found_errors 1)
+    endif()
+  endif()
+endforeach()
+
+if (NOT found_errors EQUAL 0)
+  message(FATAL_ERROR "Errors detected.")
+endif()
diff --git a/include/cub/test/cmake/test_install/CMakeLists.txt b/include/cub/test/cmake/test_install/CMakeLists.txt
new file mode 100644
index 0000000..c43e6b3
--- /dev/null
+++ b/include/cub/test/cmake/test_install/CMakeLists.txt
@@ -0,0 +1,93 @@
+# Test that an installation of the project can be located by find_package() call
+# with appropriate prefix settings.
+#
+# Expects CUB_BINARY_DIR to be set to an existing cub build directory.
+
+cmake_minimum_required(VERSION 3.15)
+
+project(CubTestInstall CXX CUDA)
+
+# This will eventually get deleted recursively -- keep that in mind if modifying
+set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/install_prefix/")
+
+function(do_manual_install)
+  # Inspired by the VTK-m install tests, we can just glob up all of the
+  # cmake_install.cmake, include (ie. run) them, and they'll effectively
+  # install the project into the current value of CMAKE_INSTALL_PREFIX.
+
+  # Gather all of the install files from CUB's root:
+  file(GLOB_RECURSE install_files
+    LIST_DIRECTORIES False
+    "${CUB_BINARY_DIR}/cmake_install.cmake"
+  )
+
+  message(STATUS "Locating install files...")
+  foreach (install_file IN LISTS install_files)
+    message(STATUS "  * ${install_file}")
+  endforeach()
+
+  message(STATUS "Building install tree...")
+  foreach(install_file IN LISTS install_files)
+    include("${install_file}")
+  endforeach()
+endfunction()
+
+function(do_cleanup)
+  message(STATUS "Removing ${CMAKE_INSTALL_PREFIX}")
+  file(REMOVE_RECURSE "${CMAKE_INSTALL_PREFIX}")
+endfunction()
+
+function(assert_boolean var_name expect)
+  if (expect)
+    if (NOT ${var_name})
+      message(FATAL_ERROR "'${var_name}' is false, expected true.")
+    endif()
+  else()
+    if (${var_name})
+      message(FATAL_ERROR "'${var_name}' is true, expected false.")
+    endif()
+  endif()
+endfunction()
+
+function(assert_target target_name)
+  if (NOT TARGET "${target_name}")
+    message(FATAL_ERROR "Target '${target_name}' not defined.")
+  endif()
+endfunction()
+
+function(find_installed_project) # Locate the installed CUB and verify it lives under CMAKE_INSTALL_PREFIX.
+  set(CMAKE_PREFIX_PATH "${CMAKE_INSTALL_PREFIX}")
+  find_package(CUB CONFIG)
+
+  if (NOT CUB_FOUND)
+    message(FATAL_ERROR
+      "find_package(CUB) failed. "
+      "CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}"
+    )
+  endif()
+
+  # Test some internal config vars to check that this is the expected install:
+  # TODO The cmake_path (3.19) command will provide more robust ways to do this
+
+  # Escape regex special characters in the install prefix, see
+  # https://gitlab.kitware.com/cmake/cmake/-/issues/18580
+  string(REGEX REPLACE "([][+.*()^])" "\\\\\\1"
+    prefix_regex
+    "${CMAKE_INSTALL_PREFIX}"
+  )
+  if (NOT _CUB_INCLUDE_DIR MATCHES "^${prefix_regex}")
+    message(FATAL_ERROR
+      "Found CUB in unexpected location: "
+      " * _CUB_INCLUDE_DIR=${_CUB_INCLUDE_DIR} "
+      " * ExpectedPrefix=${CMAKE_INSTALL_PREFIX}"
+    )
+  endif()
+
+  assert_target(CUB::CUB)
+
+endfunction()
+
+do_cleanup() # Prepare for new installation
+do_manual_install()
+find_installed_project()
+do_cleanup() # Clean up if successful
diff --git a/include/cub/test/fill_striped.cuh b/include/cub/test/fill_striped.cuh
new file mode 100644
index 0000000..e9df891
--- /dev/null
+++ b/include/cub/test/fill_striped.cuh
@@ -0,0 +1,163 @@
+/******************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#include <type_traits>
+
+template <typename T, typename = int>
+struct has_x : std::false_type
+{};
+
+template <typename T>
+struct has_x<T, decltype((void)T::x, 0)> : std::true_type
+{};
+
+template <typename T, typename = int>
+struct has_y : std::false_type
+{};
+
+template <typename T>
+struct has_y<T, decltype((void)T::y, 0)> : std::true_type
+{};
+
+template <typename T, typename = int>
+struct has_z : std::false_type
+{};
+
+template <typename T>
+struct has_z<T, decltype((void)T::z, 0)> : std::true_type
+{};
+
+template <typename T, typename = int>
+struct has_w : std::false_type
+{};
+
+template <typename T>
+struct has_w<T, decltype((void)T::w, 0)> : std::true_type
+{};
+
+template <typename ScalarT, typename = int>
+struct component_type_impl_t
+{
+  using type = ScalarT;
+};
+
+template <typename VectorT>
+struct component_type_impl_t<VectorT, decltype((void)VectorT::x, 0)>
+{
+  using type = decltype(std::declval<VectorT>().x);
+};
+
+template <typename T>
+using component_type_t = typename component_type_impl_t<T>::type;
+
+template <typename VectorT>
+struct scalar_to_vec_t
+{
+  using component_t = component_type_t<VectorT>;
+
+  template <typename T, typename V = VectorT>
+  __host__ __device__ __forceinline__
+    typename std::enable_if<std::is_same<V, VectorT>::value && !has_x<V>::value, V>::type
+    operator()(T scalar)
+  {
+    return static_cast<component_t>(scalar);
+  }
+
+  template <typename T, typename V = VectorT>
+  __host__ __device__ __forceinline__
+    typename std::enable_if<std::is_same<V, VectorT>::value && has_x<V>::value && !has_y<V>::value,
+                            V>::type
+    operator()(T scalar)
+  {
+    V val;
+    val.x = static_cast<component_t>(scalar);
+    return val;
+  }
+
+  template <typename T, typename V = VectorT>
+  __host__ __device__ __forceinline__
+    typename std::enable_if<std::is_same<V, VectorT>::value && has_y<V>::value && !has_z<V>::value,
+                            V>::type
+    operator()(T scalar)
+  {
+    V val;
+    val.x = static_cast<component_t>(scalar);
+    val.y = static_cast<component_t>(scalar);
+    return val;
+  }
+
+  template <typename T, typename V = VectorT>
+  __host__ __device__ __forceinline__
+    typename std::enable_if<std::is_same<V, VectorT>::value && has_z<V>::value && !has_w<V>::value,
+                            V>::type
+    operator()(T scalar)
+  {
+    V val;
+    val.x = static_cast<component_t>(scalar);
+    val.y = static_cast<component_t>(scalar);
+    val.z = static_cast<component_t>(scalar);
+    return val;
+  }
+
+  template <typename T, typename V = VectorT>
+  __host__ __device__ __forceinline__
+    typename std::enable_if<std::is_same<V, VectorT>::value && has_w<V>::value, V>::type
+    operator()(T scalar)
+  {
+    V val;
+    val.x = static_cast<component_t>(scalar);
+    val.y = static_cast<component_t>(scalar);
+    val.z = static_cast<component_t>(scalar);
+    val.w = static_cast<component_t>(scalar);
+    return val;
+  }
+};
+
+template <int LogicalWarpThreads, int ItemsPerThread, int BlockThreads, typename IteratorT>
+void fill_striped(IteratorT it) // writes one value per (warp, lane, item) through `it`, advancing it each time
+{
+  using T = cub::detail::value_t<IteratorT>;
+  // The value stored at each position is the index computed below, converted to T.
+  const int warps_in_block = BlockThreads / LogicalWarpThreads; // integer division
+  const int items_per_warp = LogicalWarpThreads * ItemsPerThread;
+  scalar_to_vec_t<T> convert;
+
+  for (int warp_id = 0; warp_id < warps_in_block; warp_id++)
+  {
+    const int warp_offset_val = items_per_warp * warp_id; // first index belonging to this warp
+
+    for (int lane_id = 0; lane_id < LogicalWarpThreads; lane_id++)
+    {
+      const int lane_offset = warp_offset_val + lane_id;
+      // Successive items of one lane are LogicalWarpThreads apart in index space.
+      for (int item = 0; item < ItemsPerThread; item++)
+      {
+        *(it++) = convert(lane_offset + item * LogicalWarpThreads);
+      }
+    }
+  }
+}
diff --git a/include/cub/test/half.h b/include/cub/test/half.h
new file mode 100644
index 0000000..7ef4e94
--- /dev/null
+++ b/include/cub/test/half.h
@@ -0,0 +1,345 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2019, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#pragma once
+
+/**
+ * \file
+ * Utilities for interacting with the opaque CUDA __half type
+ */
+
+#include <cub/util_type.cuh>
+
+#include <cuda_fp16.h>
+
+#include <cstdint>
+#include <cstring>
+#include <iosfwd>
+
+#ifdef __GNUC__
+// There's a ton of type-punning going on in this file.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#endif
+
+
+/******************************************************************************
+ * half_t
+ ******************************************************************************/
+
+/**
+ * Host-based fp16 data type compatible and convertible with __half
+ */
+struct half_t
+{
+    uint16_t __x;
+
+    /// Constructor from __half
+    __host__ __device__ __forceinline__
+    half_t(const __half &other)
+    {
+        __x = reinterpret_cast<const uint16_t&>(other);
+    }
+
+    /// Constructor from integer
+    __host__ __device__ __forceinline__
+    half_t(int a)
+    {
+        *this = half_t(float(a));
+    }
+
+    /// Constructor from std::size_t
+    __host__ __device__ __forceinline__
+    half_t(std::size_t a)
+    {
+        *this = half_t(float(a));
+    }
+
+    /// Default constructor
+    half_t() = default;
+
+    /// Constructor from float: builds the 16-bit half pattern (round to nearest, ties to even)
+    __host__ __device__ __forceinline__
+    half_t(float a)
+    {
+        // Conversion algorithm by Norbert Juffa
+        uint32_t ia;
+        std::memcpy(&ia, &a, sizeof(ia)); // bit-copy; casting float* to uint32_t* violates strict aliasing
+
+        uint16_t ir = (ia >> 16) & 0x8000; // start from the sign bit only
+
+        if ((ia & 0x7f800000) == 0x7f800000)
+        {
+            if ((ia & 0x7fffffff) == 0x7f800000)
+            {
+                ir |= 0x7c00; /* infinity */
+            }
+            else
+            {
+                ir = 0x7fff; /* canonical NaN */
+            }
+        }
+        else if ((ia & 0x7f800000) >= 0x33000000) // biased exponent >= 102; smaller inputs keep only the sign (signed zero)
+        {
+            int32_t shift = (int32_t) ((ia >> 23) & 0xff) - 127;
+            if (shift > 15)
+            {
+                ir |= 0x7c00; /* infinity */
+            }
+            else
+            {
+                ia = (ia & 0x007fffff) | 0x00800000; /* extract mantissa */
+                if (shift < -14)
+                { /* denormal */
+                    ir |= ia >> (-1 - shift);
+                    ia = ia << (32 - (-1 - shift));
+                }
+                else
+                { /* normal */
+                    ir |= ia >> (24 - 11);
+                    ia = ia << (32 - (24 - 11));
+                    ir = static_cast<uint16_t>(ir + ((14 + shift) << 10));
+                }
+                /* IEEE-754 round to nearest, ties to even (ia now holds the discarded bits) */
+                if ((ia > 0x80000000) || ((ia == 0x80000000) && (ir & 1)))
+                {
+                    ir++;
+                }
+            }
+        }
+
+        this->__x = ir;
+    }
+
+    /// Cast to __half
+    __host__ __device__ __forceinline__
+    operator __half() const
+    {
+        return reinterpret_cast<const __half&>(__x);
+    }
+
+    /// Cast to float
+    __host__ __device__ __forceinline__
+    operator float() const
+    {
+        // Stolen from Andrew Kerr
+
+        int sign        = ((this->__x >> 15) & 1);
+        int exp         = ((this->__x >> 10) & 0x1f);
+        int mantissa    = (this->__x & 0x3ff);
+        std::uint32_t f = 0;
+
+        if (exp > 0 && exp < 31)
+        {
+            // normal
+            exp += 112;
+            f = (sign << 31) | (exp << 23) | (mantissa << 13);
+        }
+        else if (exp == 0)
+        {
+            if (mantissa)
+            {
+                // subnormal
+                exp += 113;
+                while ((mantissa & (1 << 10)) == 0)
+                {
+                    mantissa <<= 1;
+                    exp--;
+                }
+                mantissa &= 0x3ff;
+                f = (sign << 31) | (exp << 23) | (mantissa << 13);
+            }
+            else if (sign)
+            {
+                f = 0x80000000; // negative zero
+            }
+            else
+            {
+                f = 0x0;        // zero
+            }
+        }
+        else if (exp == 31)
+        {
+            if (mantissa)
+            {
+                f = 0x7fffffff;     // not a number
+            }
+            else
+            {
+                f = (0xff << 23) | (sign << 31);    //  inf
+            }
+        }
+
+	static_assert(sizeof(float) == sizeof(std::uint32_t), "4-byte size check");
+	float ret{};
+	std::memcpy(&ret, &f, sizeof(float));
+	return ret;
+    }
+
+
+    /// Get raw storage
+    __host__ __device__ __forceinline__
+    uint16_t raw() const
+    {
+        return this->__x;
+    }
+
+    /// Equality
+    __host__ __device__ __forceinline__
+    bool operator ==(const half_t &other) const
+    {
+        return (this->__x == other.__x);
+    }
+
+    /// Inequality
+    __host__ __device__ __forceinline__
+    bool operator !=(const half_t &other) const
+    {
+        return (this->__x != other.__x);
+    }
+
+    /// Assignment by sum
+    __host__ __device__ __forceinline__
+    half_t& operator +=(const half_t &rhs)
+    {
+        *this = half_t(float(*this) + float(rhs));
+        return *this;
+    }
+
+    /// Multiply (computed in float, then converted back to half_t); const like operator/ and operator-
+    __host__ __device__ __forceinline__
+    half_t operator*(const half_t &other) const
+    {
+        return half_t(float(*this) * float(other));
+    }
+    
+    /// Divide
+    __host__ __device__ __forceinline__
+    half_t operator/(const half_t &other) const
+    {
+        return half_t(float(*this) / float(other));
+    }
+
+    /// Add (computed in float, then converted back to half_t); const like operator/ and operator-
+    __host__ __device__ __forceinline__
+    half_t operator+(const half_t &other) const
+    {
+        return half_t(float(*this) + float(other));
+    }
+    
+    /// Sub
+    __host__ __device__ __forceinline__
+    half_t operator-(const half_t &other) const
+    {
+        return half_t(float(*this) - float(other));
+    }
+
+    /// Less-than
+    __host__ __device__ __forceinline__
+    bool operator<(const half_t &other) const
+    {
+        return float(*this) < float(other);
+    }
+
+    /// Less-than-equal
+    __host__ __device__ __forceinline__
+    bool operator<=(const half_t &other) const
+    {
+        return float(*this) <= float(other);
+    }
+
+    /// Greater-than
+    __host__ __device__ __forceinline__
+    bool operator>(const half_t &other) const
+    {
+        return float(*this) > float(other);
+    }
+
+    /// Greater-than-equal
+    __host__ __device__ __forceinline__
+    bool operator>=(const half_t &other) const
+    {
+        return float(*this) >= float(other);
+    }
+
+    /// numeric_traits<half_t>::max
+    __host__ __device__ __forceinline__
+    static half_t (max)() {
+        uint16_t max_word = 0x7BFF;
+        return reinterpret_cast<half_t&>(max_word);
+    }
+
+    /// numeric_traits<half_t>::lowest
+    __host__ __device__ __forceinline__
+    static half_t lowest() {
+        uint16_t lowest_word = 0xFBFF;
+        return reinterpret_cast<half_t&>(lowest_word);
+    }
+};
+
+
+/******************************************************************************
+ * I/O stream overloads
+ ******************************************************************************/
+
+/// Insert \p half_t into the output stream, formatted as its float value (inline: defined in a header)
+inline std::ostream& operator<<(std::ostream &out, const half_t &x)
+{
+    out << (float)x;
+    return out;
+}
+
+
+/// Insert \p __half into the output stream by converting it to half_t first (inline: defined in a header)
+inline std::ostream& operator<<(std::ostream &out, const __half &x)
+{
+    return out << half_t(x);
+}
+
+
+/******************************************************************************
+ * Traits overloads
+ ******************************************************************************/
+
+template <>
+struct CUB_NS_QUALIFIER::FpLimits<half_t>
+{
+    static __host__ __device__ __forceinline__ half_t Max() { return (half_t::max)(); }
+
+    static __host__ __device__ __forceinline__ half_t Lowest() { return half_t::lowest(); }
+};
+
+template <>
+struct CUB_NS_QUALIFIER::NumericTraits<half_t>
+    : CUB_NS_QUALIFIER::
+        BaseTraits<FLOATING_POINT, true, false, unsigned short, half_t>
+{};
+
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif
diff --git a/include/cub/test/link_a.cu b/include/cub/test/link_a.cu
new file mode 100644
index 0000000..8a9b19f
--- /dev/null
+++ b/include/cub/test/link_a.cu
@@ -0,0 +1,11 @@
+#include <cub/cub.cuh>
+
+void a()
+{
+    printf("a() called\n");
+
+    cub::DoubleBuffer<unsigned int>     d_keys;
+    cub::DoubleBuffer<cub::NullType>    d_values;
+    size_t                              temp_storage_bytes = 0;
+    cub::DeviceRadixSort::SortPairs(NULL, temp_storage_bytes, d_keys, d_values, 1024);
+}
diff --git a/include/cub/test/link_b.cu b/include/cub/test/link_b.cu
new file mode 100644
index 0000000..a19ec40
--- /dev/null
+++ b/include/cub/test/link_b.cu
@@ -0,0 +1,11 @@
+#include <cub/cub.cuh>
+
+void b()
+{
+    printf("b() called\n");
+
+    cub::DoubleBuffer<unsigned int>     d_keys;
+    cub::DoubleBuffer<cub::NullType>    d_values;
+    size_t                              temp_storage_bytes = 0;
+    cub::DeviceRadixSort::SortPairs(NULL, temp_storage_bytes, d_keys, d_values, 1024);
+}
diff --git a/include/cub/test/link_main.cpp b/include/cub/test/link_main.cpp
new file mode 100644
index 0000000..ef677ee
--- /dev/null
+++ b/include/cub/test/link_main.cpp
@@ -0,0 +1,10 @@
+#include <stdio.h>
+
+extern void a();
+extern void b();
+
+int main()
+{
+    printf("hello world\n");
+    return 0;
+}
diff --git a/include/cub/test/mersenne.h b/include/cub/test/mersenne.h
new file mode 100644
index 0000000..2807ded
--- /dev/null
+++ b/include/cub/test/mersenne.h
@@ -0,0 +1,162 @@
+/*
+ A C-program for MT19937, with initialization improved 2002/1/26.
+ Coded by Takuji Nishimura and Makoto Matsumoto.
+
+ Before using, initialize the state by using init_genrand(seed)
+ or init_by_array(init_key, key_length).
+
+ Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura,
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ 1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ 3. The names of its contributors may not be used to endorse or promote
+ products derived from this software without specific prior written
+ permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+ Any feedback is very welcome.
+ http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html
+ email: m-mat @ math.sci.hiroshima-u.ac.jp (remove space)
+ */
+
+#include <stdio.h>
+
+namespace mersenne {
+
+/* Period parameters */
+const unsigned int N          = 624;
+const unsigned int M          = 397;
+const unsigned int MATRIX_A   = 0x9908b0df; /* constant vector a */
+const unsigned int UPPER_MASK = 0x80000000; /* most significant w-r bits */
+const unsigned int LOWER_MASK = 0x7fffffff; /* least significant r bits */
+
+static unsigned int mt[N];  /* the array for the state vector  */
+static int mti = N + 1;     /* mti==N+1 means mt[N] is not initialized */
+
+/* initializes mt[N] with a seed; every later word is derived from the previous one */
+void init_genrand(unsigned int s)
+{
+    mt[0] = s & 0xffffffff;
+    for (mti = 1; mti < static_cast<int>(N); mti++)
+    {
+        mt[mti] = (1812433253 * (mt[mti - 1] ^ (mt[mti - 1] >> 30)) + mti);
+
+        /* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. */
+        /* In the previous versions, MSBs of the seed affect   */
+        /* only MSBs of the array mt[].                        */
+        /* 2002/01/09 modified by Makoto Matsumoto             */
+
+        mt[mti] &= 0xffffffff;
+        /* for >32 bit machines */
+    } /* the loop exits with mti == N */
+}
+
+/* initialize by an array with array-length */
+/* init_key is the array for initializing keys */
+/* key_length is its length */
+/* slight change for C++, 2004/2/26 */
+void init_by_array(unsigned int init_key[], int key_length)
+{
+    int i, j, k;
+    init_genrand(19650218);
+    i = 1;
+    j = 0;
+    k = (static_cast<int>(N) > key_length
+	 ? static_cast<int>(N)
+	 : key_length);
+    for (; k; k--)
+    {
+        mt[i] = (mt[i] ^ ((mt[i - 1] ^ (mt[i - 1] >> 30)) * 1664525))
+            + init_key[j] + j;  /* non linear */
+        mt[i] &= 0xffffffff;    /* for WORDSIZE > 32 machines */
+        i++;
+        j++;
+        if (i >= static_cast<int>(N))
+        {
+            mt[0] = mt[N - 1];
+            i = 1;
+        }
+        if (j >= key_length) j = 0;
+    }
+    for (k = N - 1; k; k--)
+    {
+        mt[i] = (mt[i] ^ ((mt[i - 1] ^ (mt[i - 1] >> 30)) * 1566083941)) - i; /* non linear */
+        mt[i] &= 0xffffffff; /* for WORDSIZE > 32 machines */
+        i++;
+        if (i >= static_cast<int>(N))
+        {
+            mt[0] = mt[N - 1];
+            i = 1;
+        }
+    }
+
+    mt[0] = 0x80000000; /* MSB is 1; assuring non-zero initial array */
+}
+
+/* generates a random number on [0,0xffffffff]-interval; regenerates all N state words once they are used up */
+unsigned int genrand_int32(void)
+{
+    unsigned int y;
+    static unsigned int mag01[2] = { 0x0, MATRIX_A };
+
+    /* mag01[x] = x * MATRIX_A  for x=0,1 */
+
+    if (mti >= static_cast<int>(N))
+    { /* generate N words at one time */
+        int kk;
+
+        if (mti == static_cast<int>(N + 1)) /* if init_genrand() has not been called, */
+            init_genrand(5489);             /* a default initial seed is used */
+
+        for (kk = 0; kk < static_cast<int>(N - M); kk++)
+        {
+            y = (mt[kk] & UPPER_MASK) | (mt[kk + 1] & LOWER_MASK);
+            mt[kk] = mt[kk + M] ^ (y >> 1) ^ mag01[y & 0x1];
+        }
+        for (; kk < static_cast<int>(N - 1); kk++)
+        {
+            y = (mt[kk] & UPPER_MASK) | (mt[kk + 1] & LOWER_MASK);
+            mt[kk] = mt[kk - static_cast<int>(N - M)] ^ (y >> 1) ^ mag01[y & 0x1]; /* same slot as kk + (M - N), without unsigned wrap-around */
+        }
+        y = (mt[N - 1] & UPPER_MASK) | (mt[0] & LOWER_MASK);
+        mt[N - 1] = mt[M - 1] ^ (y >> 1) ^ mag01[y & 0x1];
+
+        mti = 0;
+    }
+
+    y = mt[mti++];
+
+    /* Tempering */
+    y ^= (y >> 11);
+    y ^= (y << 7) & 0x9d2c5680;
+    y ^= (y << 15) & 0xefc60000;
+    y ^= (y >> 18);
+
+    return y;
+}
+
+
+
+} // namespace mersenne
diff --git a/include/cub/test/test_allocator.cu b/include/cub/test/test_allocator.cu
new file mode 100644
index 0000000..fcd488b
--- /dev/null
+++ b/include/cub/test/test_allocator.cu
@@ -0,0 +1,452 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Test evaluation for caching allocator of device memory
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <stdio.h>
+
+#include <cub/util_allocator.cuh>
+#include "test_util.h"
+
+using namespace cub;
+
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+
+/**
+ * Test driver: checks CachingDeviceAllocator live/cached block bookkeeping, then times it against cudaMalloc/cudaFree
+ */
+int main(int argc, char** argv)
+{
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+
+    // Print usage
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--device=<device-id>]"
+            "[--bytes=<timing bytes>]"
+            "[--i=<timing iterations>]"
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+    // Get number of GPUs and current GPU; timing defaults can be overridden with --i and --bytes
+    int num_gpus;
+    int initial_gpu;
+    int timing_iterations           = 10000;
+    int timing_bytes                = 1024 * 1024;
+
+    if (CubDebug(cudaGetDeviceCount(&num_gpus))) exit(1);
+    if (CubDebug(cudaGetDevice(&initial_gpu))) exit(1);
+    args.GetCmdLineArgument("i", timing_iterations);
+    args.GetCmdLineArgument("bytes", timing_bytes);
+
+    // Create default allocator (caches up to 6MB in device allocations per GPU)
+    CachingDeviceAllocator allocator;
+    allocator.debug = true;
+
+    printf("Running single-gpu tests...\n"); fflush(stdout);
+
+    //
+    // Test0
+    //
+
+    // Create a new stream
+    cudaStream_t other_stream;
+    CubDebugExit(cudaStreamCreate(&other_stream));
+
+    // Allocate 999 bytes on the current gpu in stream0
+    char *d_999B_stream0_a;
+    char *d_999B_stream0_b;
+    CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_a, 999, 0));
+
+    // Run some big kernel in stream 0
+    EmptyKernel<void><<<32000, 512, 1024 * 8, 0>>>();
+
+    // Free d_999B_stream0_a
+    CubDebugExit(allocator.DeviceFree(d_999B_stream0_a));
+
+    // Allocate another 999 bytes in stream 0
+    CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_b, 999, 0));
+
+    // Check that we have 1 live block on the initial GPU
+    AssertEquals(allocator.live_blocks.size(), 1);
+
+    // Check that we have no cached block on the initial GPU
+    AssertEquals(allocator.cached_blocks.size(), 0);
+
+    // Run some big kernel in stream 0
+    EmptyKernel<void><<<32000, 512, 1024 * 8, 0>>>();
+
+    // Free d_999B_stream0_b
+    CubDebugExit(allocator.DeviceFree(d_999B_stream0_b));
+
+    // Allocate 999 bytes on the current gpu in other_stream (checked like every other allocation in this test)
+    char *d_999B_stream_other_a;
+    char *d_999B_stream_other_b;
+    CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream_other_a, 999, other_stream));
+
+    // Check that we have 1 live block on the initial GPU (a new block was allocated because d_999B_stream0_b is only available to stream 0 until it becomes idle)
+    AssertEquals(allocator.live_blocks.size(), 1);
+
+    // Check that we have one cached block on the initial GPU
+    AssertEquals(allocator.cached_blocks.size(), 1);
+
+    // Run some big kernel in other_stream
+    EmptyKernel<void><<<32000, 512, 1024 * 8, other_stream>>>();
+
+    // Free d_999B_stream_other_a
+    CubDebugExit(allocator.DeviceFree(d_999B_stream_other_a));
+
+    // Check that we can now use both allocations in stream 0 after synchronizing the device
+    CubDebugExit(cudaDeviceSynchronize());
+    CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_a, 999, 0));
+    CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_b, 999, 0));
+
+    // Check that we have 2 live blocks on the initial GPU
+    AssertEquals(allocator.live_blocks.size(), 2);
+
+    // Check that we have no cached block on the initial GPU
+    AssertEquals(allocator.cached_blocks.size(), 0);
+
+    // Free d_999B_stream0_a and d_999B_stream0_b
+    CubDebugExit(allocator.DeviceFree(d_999B_stream0_a));
+    CubDebugExit(allocator.DeviceFree(d_999B_stream0_b));
+
+    // Check that we can now use both allocations in other_stream
+    CubDebugExit(cudaDeviceSynchronize());
+    CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream_other_a, 999, other_stream));
+    CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream_other_b, 999, other_stream));
+
+    // Check that we have 2 live blocks on the initial GPU
+    AssertEquals(allocator.live_blocks.size(), 2);
+
+    // Check that we have no cached block on the initial GPU
+    AssertEquals(allocator.cached_blocks.size(), 0);
+
+    // Run some big kernel in other_stream
+    EmptyKernel<void><<<32000, 512, 1024 * 8, other_stream>>>();
+
+    // Free d_999B_stream_other_a and d_999B_stream_other_b
+    CubDebugExit(allocator.DeviceFree(d_999B_stream_other_a));
+    CubDebugExit(allocator.DeviceFree(d_999B_stream_other_b));
+
+    // Check that we can now use both allocations in stream 0 after synchronizing the device and destroying the other stream
+    CubDebugExit(cudaDeviceSynchronize());
+    CubDebugExit(cudaStreamDestroy(other_stream));
+    CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_a, 999, 0));
+    CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_b, 999, 0));
+
+    // Check that we have 2 live blocks on the initial GPU
+    AssertEquals(allocator.live_blocks.size(), 2);
+
+    // Check that we have no cached block on the initial GPU
+    AssertEquals(allocator.cached_blocks.size(), 0);
+
+    // Free d_999B_stream0_a and d_999B_stream0_b
+    CubDebugExit(allocator.DeviceFree(d_999B_stream0_a));
+    CubDebugExit(allocator.DeviceFree(d_999B_stream0_b));
+
+    // Free all cached
+    CubDebugExit(allocator.FreeAllCached());
+
+    //
+    // Test1
+    //
+
+    // Allocate 5 bytes on the current gpu
+    char *d_5B;
+    CubDebugExit(allocator.DeviceAllocate((void **) &d_5B, 5));
+
+    // Check that we have zero free bytes cached on the initial GPU
+    AssertEquals(allocator.cached_bytes[initial_gpu].free, 0);
+
+    // Check that we have 1 live block on the initial GPU
+    AssertEquals(allocator.live_blocks.size(), 1);
+
+    //
+    // Test2
+    //
+
+    // Allocate 4096 bytes on the current gpu
+    char *d_4096B;
+    CubDebugExit(allocator.DeviceAllocate((void **) &d_4096B, 4096));
+
+    // Check that we have 2 live blocks on the initial GPU
+    AssertEquals(allocator.live_blocks.size(), 2);
+
+    //
+    // Test3
+    //
+
+    // DeviceFree d_5B
+    CubDebugExit(allocator.DeviceFree(d_5B));
+
+    // Check that we have min_bin_bytes free bytes cached on the initial gpu
+    AssertEquals(allocator.cached_bytes[initial_gpu].free, allocator.min_bin_bytes);
+
+    // Check that we have 1 live block on the initial GPU
+    AssertEquals(allocator.live_blocks.size(), 1);
+
+    // Check that we have 1 cached block on the initial GPU
+    AssertEquals(allocator.cached_blocks.size(), 1);
+
+    //
+    // Test4
+    //
+
+    // DeviceFree d_4096B
+    CubDebugExit(allocator.DeviceFree(d_4096B));
+
+    // Check that we have the 4096 + min_bin free bytes cached on the initial gpu
+    AssertEquals(allocator.cached_bytes[initial_gpu].free, allocator.min_bin_bytes + 4096);
+
+    // Check that we have 0 live blocks on the initial GPU
+    AssertEquals(allocator.live_blocks.size(), 0);
+
+    // Check that we have 2 cached blocks on the initial GPU
+    AssertEquals(allocator.cached_blocks.size(), 2);
+
+    //
+    // Test5
+    //
+
+    // Allocate 768 bytes on the current gpu
+    char *d_768B;
+    CubDebugExit(allocator.DeviceAllocate((void **) &d_768B, 768));
+
+    // Check that we have the min_bin free bytes cached on the initial gpu (4096 was reused)
+    AssertEquals(allocator.cached_bytes[initial_gpu].free, allocator.min_bin_bytes);
+
+    // Check that we have 1 live block on the initial GPU
+    AssertEquals(allocator.live_blocks.size(), 1);
+
+    // Check that we have 1 cached block on the initial GPU
+    AssertEquals(allocator.cached_blocks.size(), 1);
+
+    //
+    // Test6
+    //
+
+    // Allocate max_cached_bytes on the current gpu
+    char *d_max_cached;
+    CubDebugExit(allocator.DeviceAllocate((void **) &d_max_cached, allocator.max_cached_bytes));
+
+    // DeviceFree d_max_cached
+    CubDebugExit(allocator.DeviceFree(d_max_cached));
+
+    // Check that we have the min_bin free bytes cached on the initial gpu (max cached was not returned because we went over)
+    AssertEquals(allocator.cached_bytes[initial_gpu].free, allocator.min_bin_bytes);
+
+    // Check that we have 1 live block on the initial GPU
+    AssertEquals(allocator.live_blocks.size(), 1);
+
+    // Check that we still have 1 cached block on the initial GPU
+    AssertEquals(allocator.cached_blocks.size(), 1);
+
+    //
+    // Test7
+    //
+
+    // Free all cached blocks on all GPUs
+    CubDebugExit(allocator.FreeAllCached());
+
+    // Check that we have 0 bytes cached on the initial GPU
+    AssertEquals(allocator.cached_bytes[initial_gpu].free, 0);
+
+    // Check that we have 0 cached blocks across all GPUs
+    AssertEquals(allocator.cached_blocks.size(), 0);
+
+    // Check that we still have 1 live block across all GPUs
+    AssertEquals(allocator.live_blocks.size(), 1);
+
+    //
+    // Test8
+    //
+
+    // Allocate max cached bytes + 1 on the current gpu
+    char *d_max_cached_plus;
+    CubDebugExit(allocator.DeviceAllocate((void **) &d_max_cached_plus, allocator.max_cached_bytes + 1));
+
+    // DeviceFree d_max_cached_plus
+    CubDebugExit(allocator.DeviceFree(d_max_cached_plus));
+
+    // DeviceFree d_768B
+    CubDebugExit(allocator.DeviceFree(d_768B));
+
+    unsigned int power;
+    size_t rounded_bytes;
+    allocator.NearestPowerOf(power, rounded_bytes, allocator.bin_growth, 768);
+
+    // Check that we have rounded_bytes (computed by NearestPowerOf for 768 bytes) free bytes cached on the initial gpu
+    AssertEquals(allocator.cached_bytes[initial_gpu].free, rounded_bytes);
+
+    // Check that we have 1 cached block across all GPUs
+    AssertEquals(allocator.cached_blocks.size(), 1);
+
+    // Check that we still have 0 live blocks across all GPUs
+    AssertEquals(allocator.live_blocks.size(), 0);
+
+    // BUG: find out why these tests fail when one GPU is CDP compliant and the other is not
+
+    if (num_gpus > 1)
+    {
+        printf("\nRunning multi-gpu tests...\n"); fflush(stdout);
+
+        //
+        // Test9
+        //
+
+        // Allocate 768 bytes on the next gpu
+        int next_gpu = (initial_gpu + 1) % num_gpus;
+        char *d_768B_2;
+        CubDebugExit(allocator.DeviceAllocate(next_gpu, (void **) &d_768B_2, 768));
+
+        // DeviceFree d_768B_2 on the next gpu
+        CubDebugExit(allocator.DeviceFree(next_gpu, d_768B_2));
+
+        // Re-allocate 768 bytes on the next gpu
+        CubDebugExit(allocator.DeviceAllocate(next_gpu, (void **) &d_768B_2, 768));
+
+        // Re-free d_768B_2 on the next gpu
+        CubDebugExit(allocator.DeviceFree(next_gpu, d_768B_2));
+
+        // Check that we have rounded_bytes free bytes cached on the initial gpu
+        AssertEquals(allocator.cached_bytes[initial_gpu].free, rounded_bytes);
+
+        // Check that we have rounded_bytes free bytes cached on the second gpu
+        AssertEquals(allocator.cached_bytes[next_gpu].free, rounded_bytes);
+
+        // Check that we have 2 cached blocks across all GPUs
+        AssertEquals(allocator.cached_blocks.size(), 2);
+
+        // Check that we still have 0 live blocks across all GPUs
+        AssertEquals(allocator.live_blocks.size(), 0);
+    }
+
+    //
+    // Performance
+    //
+
+    printf("\nCPU Performance (%d timing iterations, %d bytes):\n", timing_iterations, timing_bytes);
+    fflush(stdout); fflush(stderr);
+
+    // CPU performance comparisons vs cached.  Allocate and free a timing_bytes block timing_iterations times
+    CpuTimer    cpu_timer;
+    char        *d_1024MB                       = NULL;
+    allocator.debug                             = false;
+
+    // Prime the caching allocator and the kernel
+    CubDebugExit(allocator.DeviceAllocate((void **) &d_1024MB, timing_bytes));
+    CubDebugExit(allocator.DeviceFree(d_1024MB));
+    cub::EmptyKernel<void><<<1, 32>>>();
+
+    // CUDA
+    cpu_timer.Start();
+    for (int i = 0; i < timing_iterations; ++i)
+    {
+        CubDebugExit(cudaMalloc((void **) &d_1024MB, timing_bytes));
+        CubDebugExit(cudaFree(d_1024MB));
+    }
+    cpu_timer.Stop();
+    float cuda_malloc_elapsed_millis = cpu_timer.ElapsedMillis();
+
+    // CUB
+    cpu_timer.Start();
+    for (int i = 0; i < timing_iterations; ++i)
+    {
+        CubDebugExit(allocator.DeviceAllocate((void **) &d_1024MB, timing_bytes));
+        CubDebugExit(allocator.DeviceFree(d_1024MB));
+    }
+    cpu_timer.Stop();
+    float cub_calloc_elapsed_millis = cpu_timer.ElapsedMillis();
+
+    printf("\t CUB CachingDeviceAllocator allocation CPU speedup: %.2f (avg cudaMalloc %.4f ms vs. avg DeviceAllocate %.4f ms)\n",
+        cuda_malloc_elapsed_millis / cub_calloc_elapsed_millis,
+        cuda_malloc_elapsed_millis / timing_iterations,
+        cub_calloc_elapsed_millis / timing_iterations);
+
+    // GPU performance comparisons.  Allocate and free a timing_bytes block timing_iterations times
+    GpuTimer gpu_timer;
+
+    printf("\nGPU Performance (%d timing iterations, %d bytes):\n", timing_iterations, timing_bytes);
+    fflush(stdout); fflush(stderr);
+
+    // Kernel-only baseline, subtracted from both GPU timings below
+    gpu_timer.Start();
+    for (int i = 0; i < timing_iterations; ++i)
+    {
+        cub::EmptyKernel<void><<<1, 32>>>();
+    }
+    gpu_timer.Stop();
+    float cuda_empty_elapsed_millis = gpu_timer.ElapsedMillis();
+
+    // CUDA
+    gpu_timer.Start();
+    for (int i = 0; i < timing_iterations; ++i)
+    {
+        CubDebugExit(cudaMalloc((void **) &d_1024MB, timing_bytes));
+        cub::EmptyKernel<void><<<1, 32>>>();
+        CubDebugExit(cudaFree(d_1024MB));
+    }
+    gpu_timer.Stop();
+    cuda_malloc_elapsed_millis = gpu_timer.ElapsedMillis() - cuda_empty_elapsed_millis;
+
+    // CUB
+    gpu_timer.Start();
+    for (int i = 0; i < timing_iterations; ++i)
+    {
+        CubDebugExit(allocator.DeviceAllocate((void **) &d_1024MB, timing_bytes));
+        cub::EmptyKernel<void><<<1, 32>>>();
+        CubDebugExit(allocator.DeviceFree(d_1024MB));
+    }
+    gpu_timer.Stop();
+    cub_calloc_elapsed_millis = gpu_timer.ElapsedMillis() - cuda_empty_elapsed_millis;
+
+    printf("\t CUB CachingDeviceAllocator allocation GPU speedup: %.2f (avg cudaMalloc %.4f ms vs. avg DeviceAllocate %.4f ms)\n",
+        cuda_malloc_elapsed_millis / cub_calloc_elapsed_millis,
+        cuda_malloc_elapsed_millis / timing_iterations,
+        cub_calloc_elapsed_millis / timing_iterations);
+
+    printf("Success\n");
+
+    return 0;
+}
+
diff --git a/include/cub/test/test_block_radix_rank.cu b/include/cub/test/test_block_radix_rank.cu
new file mode 100644
index 0000000..7ade492
--- /dev/null
+++ b/include/cub/test/test_block_radix_rank.cu
@@ -0,0 +1,343 @@
+/******************************************************************************
+ * Copyright (c) 2011-2022, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#include "cub/util_type.cuh"
+#define CUB_STDERR
+
+#include <cub/block/block_load.cuh>
+#include <cub/block/block_radix_rank.cuh>
+#include <cub/block/block_store.cuh>
+#include <cub/block/radix_rank_sort_operations.cuh>
+#include <cub/util_allocator.cuh>
+
+#include <algorithm>
+#include <iostream>
+#include <memory>
+
+#include "test_util.h"
+#include <stdio.h>
+
+bool g_verbose = false;
+cub::CachingDeviceAllocator g_allocator(true);
+
+template <cub::RadixRankAlgorithm RankAlgorithm, // ranking algorithm under test
+          int BlockThreads,                      // threads per block (also used as the launch bound)
+          int ItemsPerThread,                    // keys ranked by each thread
+          int RadixBits,                         // digit width given to the rank type and the extractor
+          cub::BlockScanAlgorithm ScanAlgorithm, // forwarded to block_radix_rank_t
+          int Descending,                        // forwarded to block_radix_rank_t
+          typename Key>                          // key type read from d_keys
+__launch_bounds__(BlockThreads, 1) __global__ void kernel(Key *d_keys, int *d_ranks)
+{ // Test kernel: loads keys from d_keys, ranks them with RankKeys, stores the ranks to d_ranks
+  using block_radix_rank = cub::detail::
+    block_radix_rank_t<RankAlgorithm, BlockThreads, RadixBits, Descending, ScanAlgorithm>;
+
+  using storage_t = typename block_radix_rank::TempStorage;
+
+  // Allocate temp storage in shared memory
+  __shared__ storage_t temp_storage;
+
+  // Per-thread keys and the ranks computed for them
+  Key keys[ItemsPerThread];
+  int ranks[ItemsPerThread];
+  // The three RADIX_RANK_MATCH* algorithms use warp-striped loads/stores; all others use blocked
+  constexpr bool uses_warp_striped_arrangement =
+    RankAlgorithm == cub::RadixRankAlgorithm::RADIX_RANK_MATCH ||
+    RankAlgorithm == cub::RadixRankAlgorithm::RADIX_RANK_MATCH_EARLY_COUNTS_ANY ||
+    RankAlgorithm == cub::RadixRankAlgorithm::RADIX_RANK_MATCH_EARLY_COUNTS_ATOMIC_OR;
+
+  if (uses_warp_striped_arrangement)
+  {
+    cub::LoadDirectWarpStriped(threadIdx.x, d_keys, keys);
+  }
+  else
+  {
+    cub::LoadDirectBlocked(threadIdx.x, d_keys, keys);
+  }
+  // Extractor built from (0, RadixBits) -- presumably bit offset 0 and RadixBits wide; confirm in CUB
+  cub::BFEDigitExtractor<Key> extractor(0, RadixBits);
+  block_radix_rank(temp_storage).RankKeys(keys, ranks, extractor);
+  // Write ranks back in the same arrangement the keys were loaded in
+  if (uses_warp_striped_arrangement)
+  {
+    cub::StoreDirectWarpStriped(threadIdx.x, d_ranks, ranks);
+  }
+  else
+  {
+    cub::StoreDirectBlocked(threadIdx.x, d_ranks, ranks);
+  }
+}
+
+//---------------------------------------------------------------------
+// Host testing subroutines
+//---------------------------------------------------------------------
+
+/**
+ * Key paired with an int payload; Initialize() stores the key's original index in value
+ */
+template <typename Key>
+struct pair_t
+{
+  Key key;   // sort key
+  int value; // payload carried along with the key
+  // Ordering compares key only; value does not take part in the comparison
+  bool operator<(const pair_t &b) const { return (key < b.key); }
+};
+
+template <bool DESCENDING, typename Key> // DESCENDING selects a descending reference ordering
+void Initialize(GenMode gen_mode, Key *h_keys, int *h_reference_ranks, int num_items, int num_bits)
+{ // Fills h_keys with num_items masked keys and h_reference_ranks with each key's expected rank
+  std::unique_ptr<pair_t<Key>[]> h_pairs_storage(new pair_t<Key>[num_items]);
+  pair_t<Key> *h_pairs = h_pairs_storage.get();
+
+  for (int i = 0; i < num_items; ++i)
+  {
+    InitValue(gen_mode, h_keys[i], i);
+
+    // Mask off unwanted portions: copy the key bytes into a zeroed 64-bit integer, keep num_bits bits
+    std::uint64_t base = 0;
+    memcpy(&base, &h_keys[i], sizeof(Key));
+    base &= (1ull << num_bits) - 1;
+    memcpy(&h_keys[i], &base, sizeof(Key));
+
+    h_pairs[i].key   = h_keys[i];
+    h_pairs[i].value = i; // remember the key's original position
+  }
+  // reverse + stable ascending sort + reverse: descending keys, equal keys keep their original order
+  if (DESCENDING)
+  {
+    std::reverse(h_pairs, h_pairs + num_items);
+  }
+
+  std::stable_sort(h_pairs, h_pairs + num_items);
+
+  if (DESCENDING)
+  {
+    std::reverse(h_pairs, h_pairs + num_items);
+  }
+  // The sorted position of each pair is the expected rank of the key at its original index
+  for (int i = 0; i < num_items; ++i)
+  {
+    h_reference_ranks[h_pairs[i].value] = i;
+  }
+}
+
+template <cub::RadixRankAlgorithm RankAlgorithm,
+          int BlockThreads,
+          int ItemsPerThread,
+          int RadixBits,
+          cub::BlockScanAlgorithm ScanAlgorithm,
+          int Descending,
+          typename Key>
+void TestDriver(GenMode gen_mode)
+{ // Runs one single-block ranking test on keys generated with gen_mode and checks the device ranks
+  constexpr int tile_size = BlockThreads * ItemsPerThread;
+
+  // Allocate host arrays
+  std::unique_ptr<Key[]> h_keys(new Key[tile_size]);
+  std::unique_ptr<int[]> h_ranks(new int[tile_size]); // NOTE(review): not used again in this function
+  std::unique_ptr<int[]> h_reference_ranks(new int[tile_size]);
+
+  // Allocate device arrays
+  Key *d_keys  = nullptr;
+  int *d_ranks = nullptr;
+
+  CubDebugExit(g_allocator.DeviceAllocate((void **)&d_keys, sizeof(Key) * tile_size));
+  CubDebugExit(g_allocator.DeviceAllocate((void **)&d_ranks, sizeof(int) * tile_size));
+
+  // Initialize problem and solution on host
+  Initialize<Descending>(gen_mode, h_keys.get(), h_reference_ranks.get(), tile_size, RadixBits);
+
+  // Copy problem to device
+  CubDebugExit(cudaMemcpy(d_keys, h_keys.get(), sizeof(Key) * tile_size, cudaMemcpyHostToDevice));
+
+  // Run kernel on a single block of BlockThreads threads
+  kernel<RankAlgorithm, BlockThreads, ItemsPerThread, RadixBits, ScanAlgorithm, Descending, Key>
+    <<<1, BlockThreads>>>(d_keys, d_ranks);
+
+  // Flush kernel output / errors
+  CubDebugExit(cudaPeekAtLastError());
+  CubDebugExit(cudaDeviceSynchronize());
+
+  // Check device ranks against the host reference; the comparison result is asserted to equal 0
+  const bool compare =
+    CompareDeviceResults(h_reference_ranks.get(), d_ranks, tile_size, g_verbose, g_verbose);
+  AssertEquals(0, compare);
+  // Return the device buffers to the caching allocator
+  if (d_keys)
+  {
+    CubDebugExit(g_allocator.DeviceFree(d_keys));
+  }
+
+  if (d_ranks)
+  {
+    CubDebugExit(g_allocator.DeviceFree(d_ranks));
+  }
+}
+
+template <cub::RadixRankAlgorithm RankAlgorithm,
+          int BlockThreads,
+          int ItemsPerThread,
+          int RadixBits,
+          cub::BlockScanAlgorithm ScanAlgorithm,
+          int Descending,
+          typename Key>
+void TestValid(cub::Int2Type<true> /*fits_smem_capacity*/)
+{ // Overload for configurations whose TempStorage passed the size check in Test(): run both key modes
+  TestDriver<RankAlgorithm, BlockThreads, ItemsPerThread, RadixBits, ScanAlgorithm, Descending, Key>(
+    UNIFORM);
+  // Same configuration again with the INTEGER_SEED generation mode
+  TestDriver<RankAlgorithm, BlockThreads, ItemsPerThread, RadixBits, ScanAlgorithm, Descending, Key>(
+    INTEGER_SEED);
+}
+
+template <cub::RadixRankAlgorithm RankAlgorithm,
+          int BlockThreads,
+          int ItemsPerThread,
+          int RadixBits,
+          cub::BlockScanAlgorithm ScanAlgorithm,
+          int Descending,
+          typename Key>
+void TestValid(cub::Int2Type<false> /*fits_smem_capacity*/)
+{} // Intentionally empty: chosen when TempStorage exceeds the limit checked in Test(), so the case is skipped
+
+template <cub::RadixRankAlgorithm RankAlgorithm,
+          int BlockThreads,
+          int ItemsPerThread,
+          int RadixBits,
+          cub::BlockScanAlgorithm ScanAlgorithm,
+          bool Descending,
+          typename Key>
+void Test()
+{
+  // Check size of smem storage for the target arch to make sure it will fit
+  using block_radix_rank = cub::detail::
+    block_radix_rank_t<RankAlgorithm, BlockThreads, RadixBits, Descending, ScanAlgorithm>;
+  using storage_t = typename block_radix_rank::TempStorage;
+  // Compile-time tag: true when TempStorage is at most 48 * 1024 bytes; picks the TestValid overload
+  cub::Int2Type<(sizeof(storage_t) <= 48 * 1024)> fits_smem_capacity;
+
+  TestValid<RankAlgorithm, BlockThreads, ItemsPerThread, RadixBits, ScanAlgorithm, Descending, Key>(
+    fits_smem_capacity);
+}
+
+template <cub::RadixRankAlgorithm RankAlgorithm,
+          int BlockThreads,
+          int ItemsPerThread,
+          int RadixBits,
+          cub::BlockScanAlgorithm ScanAlgorithm,
+          typename Key>
+void Test()
+{ // Fan-out over sort direction: Descending = true, then false
+  Test<RankAlgorithm, BlockThreads, ItemsPerThread, RadixBits, ScanAlgorithm, true, Key>();
+  Test<RankAlgorithm, BlockThreads, ItemsPerThread, RadixBits, ScanAlgorithm, false, Key>();
+}
+
+template <cub::RadixRankAlgorithm RankAlgorithm,
+          int BlockThreads,
+          int ItemsPerThread,
+          int RadixBits,
+          cub::BlockScanAlgorithm ScanAlgorithm>
+void Test()
+{ // Fan-out over key type: std::uint8_t, then std::uint16_t
+  Test<RankAlgorithm, BlockThreads, ItemsPerThread, RadixBits, ScanAlgorithm, std::uint8_t>();
+  Test<RankAlgorithm, BlockThreads, ItemsPerThread, RadixBits, ScanAlgorithm, std::uint16_t>();
+}
+
+template <cub::RadixRankAlgorithm RankAlgorithm, int BlockThreads, int ItemsPerThread, int RadixBits>
+void Test()
+{ // Fan-out over block scan algorithm: BLOCK_SCAN_RAKING, then BLOCK_SCAN_WARP_SCANS
+  Test<RankAlgorithm, BlockThreads, ItemsPerThread, RadixBits, cub::BLOCK_SCAN_RAKING>();
+  Test<RankAlgorithm, BlockThreads, ItemsPerThread, RadixBits, cub::BLOCK_SCAN_WARP_SCANS>();
+}
+
+template <cub::RadixRankAlgorithm RankAlgorithm, int BlockThreads, int ItemsPerThread>
+void Test()
+{ // Fan-out over RadixBits: 1, then 5
+  Test<RankAlgorithm, BlockThreads, ItemsPerThread, 1>();
+  Test<RankAlgorithm, BlockThreads, ItemsPerThread, 5>();
+}
+
+template <cub::RadixRankAlgorithm RankAlgorithm, int BlockThreads>
+void Test()
+{ // Fan-out over ItemsPerThread: 1, then 4
+  Test<RankAlgorithm, BlockThreads, 1>();
+  Test<RankAlgorithm, BlockThreads, 4>();
+}
+
+template <int BlockThreads>
+void Test(cub::Int2Type<true> /* multiple of hw warp */)
+{ // Overload used when BlockThreads % 32 == 0 (tag built in Test<BlockThreads>()): run RADIX_RANK_MATCH
+  Test<cub::RadixRankAlgorithm::RADIX_RANK_MATCH, BlockThreads>();
+
+  // TODO(senior-zero):
+  // - RADIX_RANK_MATCH_EARLY_COUNTS_ANY
+  // - RADIX_RANK_MATCH_EARLY_COUNTS_ATOMIC_OR
+}
+
+template <int BlockThreads>
+void Test(cub::Int2Type<false> /* multiple of hw warp */)
+{} // Intentionally empty: RADIX_RANK_MATCH is not run when BlockThreads % 32 != 0
+
+template <int BlockThreads>
+void Test()
+{ // Top-level fan-out for one block size: BASIC and MEMOIZE always, MATCH only via the tag below
+  Test<cub::RadixRankAlgorithm::RADIX_RANK_BASIC, BlockThreads>();
+  Test<cub::RadixRankAlgorithm::RADIX_RANK_MEMOIZE, BlockThreads>();
+  // Tag dispatch on whether BlockThreads is a multiple of 32
+  Test<BlockThreads>(cub::Int2Type<(BlockThreads % 32) == 0>{});
+}
+
+int main(int argc, char **argv)
+{
+  // Initialize command line; --v turns on verbose result comparison
+  CommandLineArgs args(argc, argv);
+  g_verbose = args.CheckCmdLineFlag("v");
+
+  // Print usage
+  if (args.CheckCmdLineFlag("help"))
+  {
+    printf("%s "
+           "[--device=<device-id>] "
+           "[--v] "
+           "\n",
+           argv[0]);
+    exit(0);
+  }
+
+  // Initialize device
+  CubDebugExit(args.DeviceInit());
+  // Block sizes to test; 16 and 130 are not multiples of 32, so they skip the RADIX_RANK_MATCH path
+  Test<16>();
+  Test<32>();
+  Test<128>();
+  Test<130>();
+
+  return 0;
+}
+
diff --git a/include/cub/test/test_cdp_variant_state.cu b/include/cub/test/test_cdp_variant_state.cu
new file mode 100644
index 0000000..2a34842
--- /dev/null
+++ b/include/cub/test/test_cdp_variant_state.cu
@@ -0,0 +1,34 @@
+/*
+*  Copyright 2022 NVIDIA Corporation
+*
+*  Licensed under the Apache License, Version 2.0 (the "License");
+*  you may not use this file except in compliance with the License.
+*  You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+*  Unless required by applicable law or agreed to in writing, software
+*  distributed under the License is distributed on an "AS IS" BASIS,
+*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*  See the License for the specific language governing permissions and
+*  limitations under the License.
+*/
+
+#include <cub/detail/detect_cuda_runtime.cuh>
+
+#include <cstdlib>
+
+int main()
+{
+  // This test just checks that RDC is enabled and detected properly when using
+  // the %PARAM% system to request CDP support (see the README.md file in
+  // this directory).
+
+  // %PARAM% TEST_CDP cdp 0:1
+
+#ifdef CUB_RDC_ENABLED
+  return (TEST_CDP == 1) ? EXIT_SUCCESS : EXIT_FAILURE; // RDC detected: pass only when TEST_CDP is 1
+#else
+  return (TEST_CDP == 0) ? EXIT_SUCCESS : EXIT_FAILURE; // RDC not detected: pass only when TEST_CDP is 0
+#endif
+}
diff --git a/include/cub/test/test_device_adjacent_difference.cu b/include/cub/test/test_device_adjacent_difference.cu
new file mode 100644
index 0000000..d01a00d
--- /dev/null
+++ b/include/cub/test/test_device_adjacent_difference.cu
@@ -0,0 +1,701 @@
+/******************************************************************************
+ * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <cub/device/device_adjacent_difference.cuh>
+#include <cub/thread/thread_operators.cuh>
+#include <cub/util_allocator.cuh>
+
+#include <thrust/count.h>
+#include <thrust/device_vector.h>
+#include <thrust/fill.h>
+#include <thrust/host_vector.h>
+#include <thrust/iterator/constant_iterator.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/random.h>
+#include <thrust/sequence.h>
+#include <thrust/shuffle.h>
+
+#include <limits>
+#include <memory>
+
+#include "test_util.h"
+
+
+using namespace cub;
+
+
+constexpr bool READ_LEFT = true;   // ReadLeft argument that selects the SubtractLeft* calls
+constexpr bool READ_RIGHT = false; // ReadLeft argument that selects the SubtractRight* calls
+
+
+/**
+ * \brief Generates the integer sequence \f$S_i=i(i-1)/2\f$ from an index \f$i\f$.
+ *
+ * The adjacent difference of this sequence produces consecutive numbers.
+ * With \f$p\f$ the element at index \f$i\f$ and \f$n\f$ the one that follows:
+ * \f[
+ *   p = \frac{i(i - 1)}{2} \\
+ *   n = \frac{(i + 1) i}{2} \\
+ *   n - p = \frac{(i + 1) i - i (i - 1)}{2} = \frac{2 i}{2} = i
+ * \f]
+ *
+ * \tparam DestT Type the generated value is converted to.
+ */
+template <typename DestT>
+struct TestSequenceGenerator
+{
+  template <typename SourceT>
+  __device__ __host__ DestT operator()(SourceT index) const
+  {
+    const auto twice_value = index * (index - 1);
+    return static_cast<DestT>(twice_value / SourceT(2));
+  }
+};
+
+
+// Binary functor, callable from device code only, that subtracts `rhs` from
+// `lhs` and converts the result to OutputT.
+template <typename OutputT>
+struct CustomDifference
+{
+  template <typename InputT>
+  __device__ OutputT operator()(const InputT &lhs, const InputT &rhs)
+  { return static_cast<OutputT>(lhs - rhs); }
+};
+
+/**
+ * Runs the in-place cub::DeviceAdjacentDifference algorithm picked by ReadLeft:
+ * SubtractLeft when it is true, SubtractRight otherwise.
+ *
+ * If DifferenceOpT is cub::Difference, the algorithm is called without an
+ * explicit operator and \p difference_op is not forwarded; any other operator
+ * is passed through. The status returned by CUB is checked with CubDebugExit.
+ *
+ * \param temp_storage       Temporary storage pointer forwarded to CUB
+ * \param temp_storage_bytes Temporary storage size, forwarded by reference
+ * \param it                 Iterator to the sequence that is processed in place
+ * \param difference_op      Binary difference operator
+ * \param num_items          Number of items in the sequence
+ */
+template <bool ReadLeft,
+          typename IteratorT,
+          typename DifferenceOpT,
+          typename NumItemsT>
+void AdjacentDifference(void *temp_storage,
+                        std::size_t &temp_storage_bytes,
+                        IteratorT it,
+                        DifferenceOpT difference_op,
+                        NumItemsT num_items)
+{
+  using Algorithm = cub::DeviceAdjacentDifference;
+
+  const bool use_default_op =
+    std::is_same<DifferenceOpT, cub::Difference>::value;
+
+  cudaError_t status = cudaSuccess;
+
+  if (ReadLeft && use_default_op)
+  {
+    status =
+      Algorithm::SubtractLeft(temp_storage, temp_storage_bytes, it, num_items);
+  }
+  else if (ReadLeft)
+  {
+    status = Algorithm::SubtractLeft(temp_storage, temp_storage_bytes, it,
+                                     num_items, difference_op);
+  }
+  else if (use_default_op)
+  {
+    status =
+      Algorithm::SubtractRight(temp_storage, temp_storage_bytes, it, num_items);
+  }
+  else
+  {
+    status = Algorithm::SubtractRight(temp_storage, temp_storage_bytes, it,
+                                      num_items, difference_op);
+  }
+
+  CubDebugExit(status);
+}
+
+
+/**
+ * Runs the out-of-place cub::DeviceAdjacentDifference algorithm picked by
+ * ReadLeft: SubtractLeftCopy when it is true, SubtractRightCopy otherwise.
+ *
+ * If DifferenceOpT is cub::Difference, the algorithm is called without an
+ * explicit operator and \p difference_op is not forwarded; any other operator
+ * is passed through as the last argument. The status returned by CUB is
+ * checked with CubDebugExit.
+ *
+ * \param temp_storage       Temporary storage pointer forwarded to CUB
+ * \param temp_storage_bytes Temporary storage size, forwarded by reference
+ * \param input              Iterator to the input sequence
+ * \param output             Iterator to the output sequence
+ * \param difference_op      Binary difference operator
+ * \param num_items          Number of items in the input sequence
+ */
+template <bool ReadLeft,
+          typename InputIteratorT,
+          typename OutputIteratorT,
+          typename DifferenceOpT,
+          typename NumItemsT>
+void AdjacentDifferenceCopy(void *temp_storage,
+                            std::size_t &temp_storage_bytes,
+                            InputIteratorT input,
+                            OutputIteratorT output,
+                            DifferenceOpT difference_op,
+                            NumItemsT num_items)
+{
+  using Algorithm = cub::DeviceAdjacentDifference;
+
+  const bool use_default_op =
+    std::is_same<DifferenceOpT, cub::Difference>::value;
+
+  cudaError_t status = cudaSuccess;
+
+  if (ReadLeft && use_default_op)
+  {
+    status = Algorithm::SubtractLeftCopy(temp_storage, temp_storage_bytes,
+                                         input, output, num_items);
+  }
+  else if (ReadLeft)
+  {
+    status = Algorithm::SubtractLeftCopy(temp_storage, temp_storage_bytes,
+                                         input, output, num_items,
+                                         difference_op);
+  }
+  else if (use_default_op)
+  {
+    status = Algorithm::SubtractRightCopy(temp_storage, temp_storage_bytes,
+                                          input, output, num_items);
+  }
+  else
+  {
+    status = Algorithm::SubtractRightCopy(temp_storage, temp_storage_bytes,
+                                          input, output, num_items,
+                                          difference_op);
+  }
+
+  CubDebugExit(status);
+}
+
+// Convenience overload that owns the temporary storage: it first queries the
+// required size, then allocates that many bytes in a thrust::device_vector and
+// runs the in-place adjacent difference with it.
+template <bool ReadLeft,
+          typename IteratorT,
+          typename DifferenceOpT,
+          typename NumItemsT>
+void AdjacentDifference(IteratorT it,
+                        DifferenceOpT difference_op,
+                        NumItemsT num_items)
+{
+  // Size query: null storage pointer, size passed by reference so that the
+  // call can report the requirement
+  std::size_t required_bytes{};
+  AdjacentDifference<ReadLeft>(nullptr, required_bytes, it, difference_op,
+                               num_items);
+
+  // Actual run with device storage of the size obtained above
+  thrust::device_vector<std::uint8_t> scratch(required_bytes);
+  void *d_scratch = thrust::raw_pointer_cast(scratch.data());
+  AdjacentDifference<ReadLeft>(d_scratch, required_bytes, it, difference_op,
+                               num_items);
+}
+
+
+// Convenience overload that owns the temporary storage: it first queries the
+// required size, then allocates that many bytes in a thrust::device_vector and
+// runs the out-of-place adjacent difference from `input` to `output` with that
+// storage.
+template <bool ReadLeft,
+          typename InputIteratorT,
+          typename OutputIteratorT,
+          typename DifferenceOpT,
+          typename NumItemsT>
+void AdjacentDifferenceCopy(InputIteratorT input,
+                            OutputIteratorT output,
+                            DifferenceOpT difference_op,
+                            NumItemsT num_items)
+{
+  std::size_t required_bytes{};
+
+  // Size query: null storage pointer, size passed by reference so that the
+  // call can report the requirement
+  AdjacentDifferenceCopy<ReadLeft>(nullptr, required_bytes, input, output,
+                                   difference_op, num_items);
+
+  // Actual run: identical arguments except for the storage pointer, which now
+  // refers to device memory of the size obtained above
+  thrust::device_vector<std::uint8_t> scratch(required_bytes);
+  void *d_scratch = thrust::raw_pointer_cast(scratch.data());
+  AdjacentDifferenceCopy<ReadLeft>(d_scratch, required_bytes, input, output,
+                                   difference_op, num_items);
+}
+
+// Returns true when every element of [first_begin, first_end) equals the
+// element at the same position of the sequence starting at second_begin.
+template <typename FirstIteratorT,
+          typename SecondOperatorT>
+bool CheckResult(FirstIteratorT first_begin,
+                 FirstIteratorT first_end,
+                 SecondOperatorT second_begin)
+{
+  // thrust::mismatch yields the first differing pair; its first component
+  // equals first_end only when no difference was found
+  const auto first_difference =
+    thrust::mismatch(first_begin, first_end, second_begin);
+
+  return first_difference.first == first_end;
+}
+
+
+/**
+ * Tests SubtractLeftCopy and SubtractRightCopy on raw device pointers.
+ *
+ * The input is the sequence produced by TestSequenceGenerator, so output[i]
+ * must be i - 1 for the left variant (i > 0) and -i for the right variant
+ * (i < elements - 1). The first (left) / last (right) output is not checked.
+ */
+template <typename InputT,
+          typename OutputT,
+          typename DifferenceOpT,
+          typename NumItemsT>
+void TestCopy(NumItemsT elements, DifferenceOpT difference_op)
+{
+  using CountingIteratorT =
+    thrust::counting_iterator<OutputT, thrust::use_default, std::size_t, std::size_t>;
+
+  // For an empty input the checked sub-ranges [begin + 1, end) and
+  // [begin, end - 1) do not exist, so only the algorithm calls are exercised
+  const bool has_items = (elements != 0);
+
+  thrust::device_vector<InputT> input(elements);
+  thrust::tabulate(input.begin(), input.end(), TestSequenceGenerator<InputT>{});
+  thrust::device_vector<OutputT> output(elements, OutputT{42});
+
+  InputT *d_input   = thrust::raw_pointer_cast(input.data());
+  OutputT *d_output = thrust::raw_pointer_cast(output.data());
+
+  AdjacentDifferenceCopy<READ_LEFT>(d_input, d_output, difference_op, elements);
+  if (has_items)
+  {
+    AssertTrue(CheckResult(output.begin() + 1,
+                           output.end(),
+                           CountingIteratorT(OutputT{0})));
+  }
+
+  thrust::fill(output.begin(), output.end(), OutputT{42});
+  AdjacentDifferenceCopy<READ_RIGHT>(d_input, d_output, difference_op, elements);
+  if (has_items)
+  {
+    thrust::device_vector<OutputT> reference(input.size());
+    thrust::sequence(reference.begin(),
+                     reference.end(),
+                     static_cast<OutputT>(0),
+                     static_cast<OutputT>(-1));
+    AssertTrue(CheckResult(output.begin(), output.end() - 1, reference.begin()));
+  }
+}
+
+
+/**
+ * Tests SubtractLeftCopy and SubtractRightCopy on device_vector iterators.
+ *
+ * The input is the sequence produced by TestSequenceGenerator, so output[i]
+ * must be i - 1 for the left variant (i > 0) and -i for the right variant
+ * (i < elements - 1). The first (left) / last (right) output is not checked.
+ */
+template <typename InputT,
+          typename OutputT,
+          typename DifferenceOpT,
+          typename NumItemsT>
+void TestIteratorCopy(NumItemsT elements, DifferenceOpT difference_op)
+{
+  using CountingIteratorT =
+    thrust::counting_iterator<OutputT, thrust::use_default, std::size_t, std::size_t>;
+
+  // For an empty input the checked sub-ranges [begin + 1, end) and
+  // [begin, end - 1) do not exist, so only the algorithm calls are exercised
+  const bool has_items = (elements != 0);
+
+  thrust::device_vector<InputT> input(elements);
+  thrust::tabulate(input.begin(), input.end(), TestSequenceGenerator<InputT>{});
+  thrust::device_vector<OutputT> output(elements, OutputT{42});
+
+  AdjacentDifferenceCopy<READ_LEFT>(input.cbegin(), output.begin(),
+                                    difference_op, elements);
+  if (has_items)
+  {
+    AssertTrue(CheckResult(output.begin() + 1,
+                           output.end(),
+                           CountingIteratorT(OutputT{0})));
+  }
+
+  thrust::fill(output.begin(), output.end(), OutputT{42});
+  AdjacentDifferenceCopy<READ_RIGHT>(input.cbegin(), output.begin(),
+                                     difference_op, elements);
+  if (has_items)
+  {
+    thrust::device_vector<OutputT> reference(input.size());
+    thrust::sequence(reference.begin(), reference.end(),
+                     static_cast<OutputT>(0), static_cast<OutputT>(-1));
+    AssertTrue(CheckResult(output.begin(), output.end() - 1, reference.begin()));
+  }
+}
+
+
+// Runs the raw-pointer and iterator copy tests with both difference operators.
+template <typename InputT, typename OutputT, typename NumItemsT>
+void TestCopy(NumItemsT elements)
+{
+  cub::Difference default_op{};
+  CustomDifference<OutputT> custom_op{};
+  TestCopy<InputT, OutputT>(elements, default_op);
+  TestCopy<InputT, OutputT>(elements, custom_op);
+  TestIteratorCopy<InputT, OutputT>(elements, default_op);
+  TestIteratorCopy<InputT, OutputT>(elements, custom_op);
+}
+
+// Runs the copy tests for 64-bit and 32-bit unsigned input with signed output.
+template <typename NumItemsT>
+void TestCopy(NumItemsT elements)
+{
+  TestCopy<std::uint64_t, std::int64_t >(elements);
+  TestCopy<std::uint32_t, std::int32_t>(elements);
+}
+
+
+/**
+ * Tests the in-place SubtractLeft and SubtractRight on a raw device pointer.
+ *
+ * The data is the sequence produced by TestSequenceGenerator, so element i
+ * must become i - 1 for the left variant (i > 0) and -i, converted to T, for
+ * the right variant (i < elements - 1). The first (left) / last (right)
+ * element is not checked.
+ */
+template <typename T,
+          typename DifferenceOpT,
+          typename NumItemsT>
+void Test(NumItemsT elements, DifferenceOpT difference_op)
+{
+  using CountingIteratorT =
+    thrust::counting_iterator<T, thrust::use_default, std::size_t, std::size_t>;
+
+  // For an empty input the checked sub-ranges [begin + 1, end) and
+  // [begin, end - 1) do not exist, so only the algorithm calls are exercised
+  const bool has_items = (elements != 0);
+
+  thrust::device_vector<T> data(elements);
+  thrust::tabulate(data.begin(), data.end(), TestSequenceGenerator<T>{});
+  T *d_data = thrust::raw_pointer_cast(data.data());
+
+  AdjacentDifference<READ_LEFT>(d_data, difference_op, elements);
+  if (has_items)
+  {
+    AssertTrue(
+      CheckResult(data.begin() + 1, data.end(), CountingIteratorT(T{0})));
+  }
+
+  // Regenerate the input, which the in-place left run has overwritten
+  thrust::tabulate(data.begin(), data.end(), TestSequenceGenerator<T>{});
+  AdjacentDifference<READ_RIGHT>(d_data, difference_op, elements);
+  if (has_items)
+  {
+    thrust::device_vector<T> reference(data.size());
+    thrust::sequence(reference.begin(),
+                     reference.end(),
+                     static_cast<T>(0),
+                     static_cast<T>(-1));
+    AssertTrue(CheckResult(data.begin(), data.end() - 1, reference.begin()));
+  }
+}
+
+
+/**
+ * Tests the in-place SubtractLeft and SubtractRight on device_vector iterators.
+ *
+ * The data is the sequence produced by TestSequenceGenerator, so element i
+ * must become i - 1 for the left variant (i > 0) and -i, converted to T, for
+ * the right variant (i < elements - 1). The first (left) / last (right)
+ * element is not checked.
+ */
+template <typename T,
+          typename DifferenceOpT,
+          typename NumItemsT>
+void TestIterators(NumItemsT elements, DifferenceOpT difference_op)
+{
+  using CountingIteratorT =
+    thrust::counting_iterator<T, thrust::use_default, std::size_t, std::size_t>;
+
+  // For an empty input the checked sub-ranges [begin + 1, end) and
+  // [begin, end - 1) do not exist, so only the algorithm calls are exercised
+  const bool has_items = (elements != 0);
+
+  thrust::device_vector<T> data(elements);
+  thrust::tabulate(data.begin(), data.end(), TestSequenceGenerator<T>{});
+
+  AdjacentDifference<READ_LEFT>(data.begin(), difference_op, elements);
+  if (has_items)
+  {
+    AssertTrue(
+      CheckResult(data.begin() + 1, data.end(), CountingIteratorT(T{0})));
+  }
+
+  // Regenerate the input, which the in-place left run has overwritten
+  thrust::tabulate(data.begin(), data.end(), TestSequenceGenerator<T>{});
+  AdjacentDifference<READ_RIGHT>(data.begin(), difference_op, elements);
+  if (has_items)
+  {
+    thrust::device_vector<T> reference(data.size());
+    thrust::sequence(reference.begin(), reference.end(),
+                     static_cast<T>(0), static_cast<T>(-1));
+    AssertTrue(CheckResult(data.begin(), data.end() - 1, reference.begin()));
+  }
+}
+
+
+// Runs the in-place tests on raw pointers and on iterators, each with the
+// default cub::Difference operator and with CustomDifference<T>.
+template <typename T, typename NumItemsT>
+void Test(NumItemsT elements)
+{
+  Test<T>(elements, cub::Difference{});
+  Test<T>(elements, CustomDifference<T>{});
+  TestIterators<T>(elements, cub::Difference{});
+  TestIterators<T>(elements, CustomDifference<T>{});
+}
+
+// Runs the in-place tests for std::int32_t, std::uint32_t and std::uint64_t.
+template <typename NumItemsT>
+void Test(NumItemsT elements)
+{
+  Test<std::int32_t, NumItemsT>(elements);
+  Test<std::uint32_t, NumItemsT>(elements);
+  Test<std::uint64_t, NumItemsT>(elements);
+}
+
+
+/**
+ * Tests SubtractLeftCopy and SubtractRightCopy with counting and constant input
+ * iterators and with a discard output iterator. No-op when \p elements is zero.
+ */
+template <typename ValueT,
+          typename NumItemsT>
+void TestFancyIterators(NumItemsT elements)
+{
+  if (elements == 0)
+  {
+    return;
+  }
+
+  // Differs from the 0, 1 and -1 that the checks look for. The output is reset
+  // to it before every checked run, so a check only passes if the run wrote.
+  const ValueT sentinel{42};
+  thrust::device_vector<ValueT> output(elements, sentinel);
+
+  // Counting input 1, 2, 3, ...: every left difference, and the copied first
+  // element, is 1
+  thrust::counting_iterator<ValueT> count_iter(ValueT{1});
+  AdjacentDifferenceCopy<READ_LEFT>(count_iter, output.begin(),
+                                    cub::Difference{}, elements);
+  AssertEquals(elements,
+               static_cast<NumItemsT>(
+                 thrust::count(output.begin(), output.end(), ValueT(1))));
+
+  // Right differences of the counting input are -1; the last element is copied
+  thrust::fill(output.begin(), output.end(), sentinel);
+  AdjacentDifferenceCopy<READ_RIGHT>(count_iter, output.begin(),
+                                     cub::Difference{}, elements);
+  AssertEquals(elements - 1,
+               static_cast<NumItemsT>(
+                 thrust::count(output.begin(),
+                               output.end() - 1,
+                               static_cast<ValueT>(-1))));
+  AssertEquals(output.back(), static_cast<ValueT>(elements));
+
+  // Constant input 0: every output element of either variant is 0
+  thrust::constant_iterator<ValueT> const_iter(ValueT{});
+  thrust::fill(output.begin(), output.end(), sentinel);
+  AdjacentDifferenceCopy<READ_LEFT>(const_iter, output.begin(),
+                                    cub::Difference{}, elements);
+  AssertEquals(elements,
+               static_cast<NumItemsT>(
+                 thrust::count(output.begin(), output.end(), ValueT{})));
+
+  thrust::fill(output.begin(), output.end(), sentinel);
+  AdjacentDifferenceCopy<READ_RIGHT>(const_iter, output.begin(),
+                                     cub::Difference{}, elements);
+  AssertEquals(elements,
+               static_cast<NumItemsT>(
+                 thrust::count(output.begin(), output.end(), ValueT{})));
+
+  // Discard output: the results are thrown away, so nothing is checked here
+  AdjacentDifferenceCopy<READ_LEFT>(const_iter, thrust::make_discard_iterator(),
+                                    cub::Difference{}, elements);
+  AdjacentDifferenceCopy<READ_RIGHT>(const_iter,
+                                     thrust::make_discard_iterator(),
+                                     cub::Difference{}, elements);
+}
+
+// Runs the fancy-iterator tests with std::uint64_t values.
+template <typename NumItemsT>
+void TestFancyIterators(NumItemsT elements)
+{
+  TestFancyIterators<std::uint64_t, NumItemsT>(elements);
+}
+
+// Runs the in-place, copy and fancy-iterator tests for one problem size.
+template <typename NumItemsT>
+void TestSize(NumItemsT elements)
+{
+  Test(elements);
+  TestCopy(elements);
+  TestFancyIterators(elements);
+}
+
+/**
+ * Stand-in for an output iterator that stores nothing: incrementing,
+ * dereferencing, offsetting and indexing all return a copy of the object, and
+ * assigning a value other than 1 to it writes false to `*flag`.
+ */
+struct DetectWrongDifference
+{
+  // Written with false on a wrong value; never written with true in here
+  bool *flag;
+
+  __host__ __device__ DetectWrongDifference operator++() const { return *this; }
+  __host__ __device__ DetectWrongDifference operator*() const { return *this; }
+
+  template <typename Difference>
+  __host__ __device__ DetectWrongDifference operator+(Difference) const
+  {
+    return *this;
+  }
+
+  template <typename Index>
+  __host__ __device__ DetectWrongDifference operator[](Index) const
+  {
+    return *this;
+  }
+
+  // Device-only; the value 1 is accepted silently
+  __device__ void operator=(long long difference) const
+  {
+    if (difference != 1) { *flag = false; }
+  }
+};
+
+// Runs SubtractLeftCopy over 2^magnitude counting values (1, 2, 3, ...) and
+// asserts that no value other than 1 was assigned to the output.
+void TestAdjacentDifferenceWithBigIndexesHelper(int magnitude)
+{
+  const std::size_t elements = 1ll << magnitude;
+
+  // One-element device flag that starts out true and is cleared through `out`
+  thrust::device_vector<bool> all_differences_correct(1, true);
+  bool *d_flag = thrust::raw_pointer_cast(all_differences_correct.data());
+
+  thrust::counting_iterator<long long> in(1);
+  DetectWrongDifference out{d_flag};
+  AdjacentDifferenceCopy<READ_LEFT>(in, out, cub::Difference{}, elements);
+  AssertEquals(all_differences_correct.front(), true);
+}
+
+// Runs the big-index test for 2^30, 2^31, 2^32 and 2^33 items.
+void TestAdjacentDifferenceWithBigIndexes()
+{
+  for (int magnitude = 30; magnitude <= 33; ++magnitude)
+  {
+    TestAdjacentDifferenceWithBigIndexesHelper(magnitude);
+  }
+}
+
+// Binary operator that atomically counts its calls per first-argument value.
+struct InvocationsCounter
+{
+  int *m_d_counts{};
+  explicit InvocationsCounter(int *d_counts) : m_d_counts(d_counts) {}
+
+  __device__ int operator()(int l, int /* r */) const
+  {
+    atomicAdd(&m_d_counts[l], 1);
+    return l;
+  }
+};
+
+// Checks, per item, how often the difference operator is invoked (0 or 1 times).
+void TestAdjacentDifferenceOpInvocationsNum(int num_items)
+{
+  thrust::device_vector<int> calls(num_items, 0);
+  InvocationsCounter op{thrust::raw_pointer_cast(calls.data())};
+  auto in  = thrust::make_counting_iterator(0);
+  auto out = thrust::make_discard_iterator();
+
+  // Left variant: counters 1 .. num_items - 1 must be 1, counter 0 must stay 0
+  AdjacentDifferenceCopy<READ_LEFT>(in, out, op, num_items);
+  const auto left_hits = thrust::count(calls.begin() + 1, calls.end(), 1);
+  AssertEquals(num_items - 1, left_hits);
+  AssertEquals(0, calls[0]);
+
+  // Right variant: counters 0 .. num_items - 2 must be 1, the last must stay 0
+  thrust::fill_n(calls.begin(), num_items, 0);
+  AdjacentDifferenceCopy<READ_RIGHT>(in, out, op, num_items);
+  const auto right_hits = thrust::count(calls.begin(), calls.end() - 1, 1);
+  AssertEquals(num_items - 1, right_hits);
+  AssertEquals(0, calls[num_items - 1]);
+}
+
+void TestAdjacentDifferenceOpInvocationsNum()
+{
+  for (int exponent = 0; exponent < 12; ++exponent) // num_items = 1, 2, ..., 2048
+  {
+    TestAdjacentDifferenceOpInvocationsNum(1 << exponent);
+  }
+}
+
+int main(int argc, char** argv)
+{
+  // Initialize the device from the command-line arguments
+  CommandLineArgs cmd_line(argc, argv);
+  CubDebugExit(cmd_line.DeviceInit());
+
+  // Empty input first, then 2^2, 2^4, ..., 2^18 items
+  TestSize(0);
+  for (std::size_t exponent = 2; exponent <= 18; exponent += 2)
+  {
+    TestSize(1ull << exponent);
+  }
+
+  TestAdjacentDifferenceWithBigIndexes();
+  TestAdjacentDifferenceOpInvocationsNum();
+  return 0;
+}
diff --git a/include/cub/test/test_device_batch_copy.cu b/include/cub/test/test_device_batch_copy.cu
new file mode 100644
index 0000000..91ce6f3
--- /dev/null
+++ b/include/cub/test/test_device_batch_copy.cu
@@ -0,0 +1,523 @@
+/******************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#include <cub/device/device_copy.cuh>
+#include <cub/util_ptx.cuh>
+
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include <thrust/iterator/constant_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/iterator/transform_output_iterator.h>
+#include <thrust/logical.h>
+#include <thrust/sequence.h>
+#include <thrust/tuple.h>
+
+#include <algorithm>
+#include <cstdint>
+#include <limits>
+#include <numeric>
+#include <random>
+#include <type_traits>
+#include <vector>
+
+#include "test_util.h"
+
+/**
+ * @brief Fills a host array with uniformly distributed random integers
+ *
+ * Writes @p num_items values to @p rand_out, each drawn from the closed interval
+ * [min_rand_val, max_rand_val] using a std::mt19937 engine seeded with @p seed.
+ * The overload is only enabled for integral T of at least two bytes.
+ */
+template <typename T>
+void GenerateRandomData(
+  T *rand_out,
+  const std::size_t num_items,
+  const T min_rand_val          = std::numeric_limits<T>::min(),
+  const T max_rand_val          = std::numeric_limits<T>::max(),
+  const std::uint_fast32_t seed = 320981U,
+  typename std::enable_if<std::is_integral<T>::value && (sizeof(T) >= 2)>::type * = nullptr)
+{
+  std::mt19937 engine(seed);
+  std::uniform_int_distribution<T> distribution(min_rand_val, max_rand_val);
+
+  // One draw per output element, in index order
+  std::generate_n(rand_out, num_items, [&]() { return distribution(engine); });
+}
+
+/**
+ * @brief Used for generating a shuffled but cohesive sequence of output-range offsets for the
+ * sequence of input-ranges.
+ *
+ * The ranges are laid out back to back in a pseudo-random order (std::shuffle driven by a
+ * std::mt19937 seeded with @p seed). The offset at which each range starts in that layout is
+ * returned at the range's original index.
+ *
+ * @tparam RangeOffsetT Type used for indexing the ranges
+ * @tparam ByteOffsetT  Type of the returned offsets
+ * @tparam RangeSizeT   Type of the individual range sizes
+ *
+ * @param range_sizes Size of each range
+ * @param seed        Seed of the random number generator
+ * @return One offset per range, indexed like @p range_sizes
+ */
+template <typename RangeOffsetT, typename ByteOffsetT, typename RangeSizeT>
+std::vector<ByteOffsetT> GetShuffledRangeOffsets(const std::vector<RangeSizeT> &range_sizes,
+                                                 const std::uint_fast32_t seed = 320981U)
+{
+  const RangeOffsetT num_ranges = static_cast<RangeOffsetT>(range_sizes.size());
+
+  // layout_order[k] is the index of the range that is placed at position k of the shuffled
+  // layout
+  std::mt19937 rng(seed);
+  std::vector<RangeOffsetT> layout_order(num_ranges);
+  std::iota(layout_order.begin(), layout_order.end(), static_cast<RangeOffsetT>(0));
+  std::shuffle(layout_order.begin(), layout_order.end(), rng);
+
+  // Walk the ranges in layout order and hand each one the running total of the sizes that
+  // were placed before it
+  std::vector<ByteOffsetT> new_offsets(num_ranges);
+  ByteOffsetT next_offset = {};
+  for (const auto range_idx : layout_order)
+  {
+    new_offsets[range_idx] = next_offset;
+    next_offset += range_sizes[range_idx];
+  }
+
+  return new_offsets;
+}
+
+template <size_t n, typename... T>
+typename std::enable_if<n >= thrust::tuple_size<thrust::tuple<T...>>::value>::type
+print_tuple(std::ostream &, const thrust::tuple<T...> &)
+{} // End of the recursion: n is not a valid element index, so nothing is printed
+
+template <size_t n, typename... T>
+typename std::enable_if<n + 1 <= thrust::tuple_size<thrust::tuple<T...>>::value>::type
+print_tuple(std::ostream &os, const thrust::tuple<T...> &tup)
+{
+  if (n != 0) // every element but the first is preceded by a separator
+    os << ", ";
+  os << thrust::get<n>(tup);
+  print_tuple<n + 1>(os, tup); // continue with the next element, if any
+}
+
+// Streams a thrust::tuple as "[e0, e1, ...]" with the help of print_tuple.
+template <typename... T>
+std::ostream &operator<<(std::ostream &os, const thrust::tuple<T...> &tup)
+{
+  print_tuple<0>(os << "[", tup);
+  return os << "]";
+}
+
+// Unary function object that returns its argument unchanged; callable from
+// host and device code.
+struct Identity
+{
+  template <typename T>
+  __host__ __device__ __forceinline__ T operator()(T value)
+  { return value; }
+};
+
+/**
+ * @brief Function object class template that takes an offset and returns an output iterator
+ * located at the given offset relative to a fixed base iterator.
+ *
+ * The result is `base_it + offset` wrapped in a thrust::transform_output_iterator with Identity.
+ * @tparam IteratorT The random-access iterator type of the base iterator
+ */
+template <typename IteratorT>
+struct OffsetToIteratorOp
+{
+  IteratorT base_it;
+
+  template <typename OffsetT>
+  __host__ __device__ __forceinline__ thrust::transform_output_iterator<Identity, IteratorT>
+  operator()(OffsetT offset) const
+  { return thrust::make_transform_output_iterator(base_it + offset, Identity{}); }
+};
+
+// Function object that maps an index to a thrust::constant_iterator which
+// yields that index converted to AtomicT.
+template <typename AtomicT>
+struct RepeatIndex
+{
+  template <typename OffsetT>
+  __host__ __device__ __forceinline__ thrust::constant_iterator<AtomicT> operator()(OffsetT i)
+  { return thrust::constant_iterator<AtomicT>(static_cast<AtomicT>(i)); }
+};
+
+enum class TestDataGen // selects how RunTest lays out the output-range offsets
+{
+  // Output ranges are placed at shuffled offsets (RunTest calls GetShuffledRangeOffsets)
+  RANDOM,
+
+  // Ranges cohesively reside next to each other, in their original order
+  CONSECUTIVE
+};
+
+/**
+ * @brief Runs one cub::DeviceCopy::Batched test instance on randomly sized ranges
+ *
+ * @tparam AtomicT The type of the elements being copied
+ * @tparam RangeOffsetT Type used for indexing into the array of ranges
+ * @tparam RangeSizeT Type used for indexing into individual elements of a range (large enough to
+ * cover the max range size)
+ * @tparam ByteOffsetT Type used for indexing into elements over *all* the ranges' sizes
+ */
+template <typename AtomicT, typename RangeOffsetT, typename RangeSizeT, typename ByteOffsetT>
+void RunTest(RangeOffsetT num_ranges,
+             RangeSizeT min_range_size,
+             RangeSizeT max_range_size,
+             TestDataGen output_gen)
+{
+  using SrcPtrT = AtomicT *;
+
+  // Range segment data (their offsets and sizes)
+  std::vector<RangeSizeT> h_range_sizes(num_ranges);
+  thrust::counting_iterator<RangeOffsetT> iota(0);
+  auto d_range_srcs = thrust::make_transform_iterator(iota, RepeatIndex<AtomicT>{});
+  std::vector<ByteOffsetT> h_offsets(num_ranges + 1);
+
+  // Device-side resources
+  AtomicT *d_out            = nullptr;
+  ByteOffsetT *d_offsets    = nullptr;
+  RangeSizeT *d_range_sizes = nullptr;
+  void *d_temp_storage      = nullptr;
+  size_t temp_storage_bytes = 0;
+
+  // Generate the range sizes
+  GenerateRandomData(h_range_sizes.data(), h_range_sizes.size(), min_range_size, max_range_size);
+
+  // Compute the total bytes to be copied
+  std::partial_sum(h_range_sizes.begin(), h_range_sizes.end(), h_offsets.begin() + 1);
+  const ByteOffsetT num_total_items = h_offsets.back();
+  const ByteOffsetT num_total_bytes = num_total_items * static_cast<ByteOffsetT>(sizeof(AtomicT));
+
+  h_offsets.pop_back();
+
+  constexpr int32_t shuffle_seed = 123241;
+
+  // Shuffle output range source-offsets
+  if (output_gen == TestDataGen::RANDOM)
+  {
+    h_offsets = GetShuffledRangeOffsets<RangeOffsetT, ByteOffsetT>(h_range_sizes, shuffle_seed);
+  }
+
+  // Initialize d_range_dsts
+  OffsetToIteratorOp<AtomicT *> dst_transform_op{d_out};
+  auto d_range_dsts = thrust::make_transform_iterator(d_offsets, dst_transform_op);
+
+  // Get temporary storage requirements
+  CubDebugExit(cub::DeviceCopy::Batched(d_temp_storage,
+                                        temp_storage_bytes,
+                                        d_range_srcs,
+                                        d_range_dsts,
+                                        d_range_sizes,
+                                        num_ranges));
+
+  // Check if there's sufficient device memory to run this test
+  std::size_t total_required_mem = num_total_bytes +                         //
+                                   (num_ranges * sizeof(d_offsets[0])) +     //
+                                   (num_ranges * sizeof(d_range_sizes[0])) + //
+                                   temp_storage_bytes;                       //
+  if (TotalGlobalMem() < total_required_mem)
+  {
+    std::cout
+      << "Skipping the test due to insufficient device memory\n"                                  //
+      << " - Required: " << total_required_mem << " B, available: " << TotalGlobalMem() << " B\n" //
+      << " - Skipped test instance: "                                                             //
+      << " -> Min. range size: " << min_range_size << ", max. range size: " << max_range_size     //
+      << ", num_ranges: " << num_ranges                                                           //
+      << ", out_gen: " << ((output_gen == TestDataGen::RANDOM) ? "SHFL" : "CONSECUTIVE");
+    return;
+  }
+
+  cudaStream_t stream;
+  cudaStreamCreate(&stream);
+
+  // Allocate device memory
+  CubDebugExit(cudaMalloc(&d_out, num_total_bytes));
+  CubDebugExit(cudaMalloc(&d_offsets, num_ranges * sizeof(d_offsets[0])));
+  CubDebugExit(cudaMalloc(&d_range_sizes, num_ranges * sizeof(d_range_sizes[0])));
+  CubDebugExit(cudaMalloc(&d_temp_storage, temp_storage_bytes));
+
+  std::unique_ptr<AtomicT[]> h_out(new AtomicT[num_total_items]);
+  std::unique_ptr<AtomicT[]> h_gpu_results(new AtomicT[num_total_items]);
+
+  // Prepare d_range_dsts
+  dst_transform_op.base_it = d_out;
+  d_range_dsts             = thrust::make_transform_iterator(d_offsets, dst_transform_op);
+
+  // Prepare d_offsets
+  CubDebugExit(cudaMemcpyAsync(d_offsets,
+                               h_offsets.data(),
+                               h_offsets.size() * sizeof(h_offsets[0]),
+                               cudaMemcpyHostToDevice,
+                               stream));
+
+  // Prepare d_range_sizes
+  CubDebugExit(cudaMemcpyAsync(d_range_sizes,
+                               h_range_sizes.data(),
+                               h_range_sizes.size() * sizeof(h_range_sizes[0]),
+                               cudaMemcpyHostToDevice,
+                               stream));
+
+  // Invoke device-side algorithm being under test
+  CubDebugExit(cub::DeviceCopy::Batched(d_temp_storage,
+                                        temp_storage_bytes,
+                                        d_range_srcs,
+                                        d_range_dsts,
+                                        d_range_sizes,
+                                        num_ranges,
+                                        stream));
+
+  // Copy back the output range
+  CubDebugExit(
+    cudaMemcpyAsync(h_gpu_results.get(), d_out, num_total_bytes, cudaMemcpyDeviceToHost, stream));
+
+  // Make sure results have been copied back to the host
+  CubDebugExit(cudaStreamSynchronize(stream));
+
+  // CPU-side result generation for verification
+  for (RangeOffsetT i = 0; i < num_ranges; i++)
+  {
+    std::copy(d_range_srcs[i], d_range_srcs[i] + h_range_sizes[i], h_out.get() + h_offsets[i]);
+  }
+
+  const auto it_pair =
+    std::mismatch(h_gpu_results.get(), h_gpu_results.get() + num_total_items, h_out.get());
+
+  if (it_pair.first != h_gpu_results.get() + num_total_items)
+  {
+    std::cout << "Mismatch at index " << it_pair.first - h_gpu_results.get()
+              << ", CPU vs. GPU: " << *it_pair.second << ", " << *it_pair.first << "\n";
+  }
+  AssertEquals(it_pair.first, h_gpu_results.get() + num_total_items);
+
+  CubDebugExit(cudaFree(d_out));
+  CubDebugExit(cudaFree(d_offsets));
+  CubDebugExit(cudaFree(d_range_sizes));
+  CubDebugExit(cudaFree(d_temp_storage));
+}
+
+/**
+ * @brief Test element type with user-provided constructors and copy assignment.
+ *
+ * Both constructors stamp `magic` with MAGIC. Copy assignment transfers `field` only when the
+ * destination's `magic` already equals MAGIC; `magic` itself is never assigned.
+ */
+struct object_with_non_trivial_ctor
+{
+  static constexpr int MAGIC = 923390;
+
+  int field;
+  int magic;
+
+  // Default construction: zero payload, valid marker
+  __host__ __device__ object_with_non_trivial_ctor()
+      : field(0)
+      , magic(MAGIC)
+  {}
+
+  // Construction from a payload value: payload `f`, valid marker
+  __host__ __device__ object_with_non_trivial_ctor(int f)
+      : field(f)
+      , magic(MAGIC)
+  {}
+
+  object_with_non_trivial_ctor(const object_with_non_trivial_ctor &x) = default;
+
+  __host__ __device__ object_with_non_trivial_ctor &operator=(const object_with_non_trivial_ctor &x)
+  {
+    // A destination that does not carry the marker keeps its current payload
+    if (magic != MAGIC)
+    {
+      return *this;
+    }
+    field = x.field;
+    return *this;
+  }
+};
+
+/**
+ * @brief Copies three single-element ranges of object_with_non_trivial_ctor with
+ * cub::DeviceCopy::Batched and verifies that every destination element ends up with the same
+ * `field` value as its source element.
+ *
+ * Any CUDA error or mismatch terminates the test.
+ */
+void nontrivial_constructor_test()
+{
+  const int num_buffers = 3;
+  thrust::device_vector<object_with_non_trivial_ctor> a(num_buffers,
+                                                        object_with_non_trivial_ctor(99));
+  thrust::device_vector<object_with_non_trivial_ctor> b(num_buffers);
+  using iterator = thrust::device_vector<object_with_non_trivial_ctor>::iterator;
+
+  // One source / destination iterator per range; `sizes` makes every range one element long
+  thrust::device_vector<iterator> a_iter{a.begin(), a.begin() + 1, a.begin() + 2};
+
+  thrust::device_vector<iterator> b_iter{b.begin(), b.begin() + 1, b.begin() + 2};
+
+  auto sizes = thrust::make_constant_iterator(1);
+
+  std::uint8_t *d_temp_storage{};
+  std::size_t temp_storage_bytes{};
+
+  // First call with a null temp-storage pointer: obtains temp_storage_bytes for the allocation below
+  CubDebugExit(cub::DeviceCopy::Batched(d_temp_storage,
+                                        temp_storage_bytes,
+                                        a_iter.begin(),
+                                        b_iter.begin(),
+                                        sizes,
+                                        num_buffers));
+
+  thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
+  d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
+
+  // Second call with the allocated temp storage
+  CubDebugExit(cub::DeviceCopy::Batched(d_temp_storage,
+                                        temp_storage_bytes,
+                                        a_iter.begin(),
+                                        b_iter.begin(),
+                                        sizes,
+                                        num_buffers));
+
+  // Both vectors hold exactly num_buffers elements, so that is the bound of the comparison loop
+  for (int i = 0; i < num_buffers; i++)
+  {
+    object_with_non_trivial_ctor ha(a[i]);
+    object_with_non_trivial_ctor hb(b[i]);
+    int ia = ha.field;
+    int ib = hb.field;
+
+    if (ia != ib)
+    {
+      std::cerr << "error: " << ia << " != " << ib << "\n";
+    }
+    // Make a mismatch fail the test instead of only being logged
+    AssertEquals(ia, ib);
+  }
+}
+
+/**
+ * @brief Test driver for cub::DeviceCopy::Batched.
+ *
+ * Runs the non-trivial-constructor test, then sweeps a list of range-size intervals (fixed ones
+ * plus randomly drawn ones) for two element types and both output layouts, and finally runs one
+ * single-range test with 64-bit size and offset types.
+ */
+int main(int argc, char **argv)
+{
+  CommandLineArgs args(argc, argv);
+
+  // Initialize device
+  CubDebugExit(args.DeviceInit());
+
+  //---------------------------------------------------------------------
+  // DeviceCopy::Batched tests
+  //---------------------------------------------------------------------
+  // Element type with user-provided constructors and copy assignment
+  nontrivial_constructor_test();
+
+  // Type used for indexing into the array of ranges
+  using RangeOffsetT = uint32_t;
+
+  // Type used for the size of an individual range
+  using RangeSizeT = uint32_t;
+
+  // Type used for offsets over *all* the ranges' sizes
+  using ByteOffsetT = uint32_t;
+
+  // Volume targeted by each run; the number of ranges is derived from it below
+  const RangeOffsetT target_copy_size = 64U << 20;
+
+  // The number of randomly drawn size intervals appended to the fixed ones
+  constexpr std::size_t num_rnd_range_tests = 32;
+
+  // Every size interval is run once per output layout, in this order
+  const TestDataGen output_layouts[] = {TestDataGen::CONSECUTIVE, TestDataGen::RANDOM};
+
+  // Each range's size will be random within one of these [first, second] intervals
+  std::vector<std::pair<std::size_t, std::size_t>> size_ranges = {{0, 1},
+                                                                  {1, 2},
+                                                                  {0, 16},
+                                                                  {1, 32},
+                                                                  {1, 1024},
+                                                                  {1, 32 * 1024},
+                                                                  {128 * 1024, 256 * 1024},
+                                                                  {target_copy_size,
+                                                                   target_copy_size}};
+
+  // Append the random intervals; two draws per interval, stored smaller bound first
+  std::mt19937 rng(0);
+  std::uniform_int_distribution<std::size_t> size_dist(1, 1000000);
+  for (std::size_t test_idx = 0; test_idx < num_rnd_range_tests; test_idx++)
+  {
+    const std::size_t first_draw  = size_dist(rng);
+    const std::size_t second_draw = size_dist(rng);
+    const bool in_order           = (first_draw <= second_draw);
+    size_ranges.push_back(
+      {in_order ? first_draw : second_draw, in_order ? second_draw : first_draw});
+  }
+
+  // Sweep with a 64-bit integer as the element type
+  for (const auto &size_range : size_ranges)
+  {
+    // The most granular type being copied.
+    using AtomicCopyT = int64_t;
+    const RangeSizeT min_range_size =
+      static_cast<RangeSizeT>(CUB_ROUND_UP_NEAREST(size_range.first, sizeof(AtomicCopyT)));
+    const RangeSizeT max_range_size = static_cast<RangeSizeT>(
+      CUB_ROUND_UP_NEAREST(size_range.second, static_cast<RangeSizeT>(sizeof(AtomicCopyT))));
+    const double average_range_size = (min_range_size + max_range_size) / 2.0;
+    const RangeOffsetT target_num_ranges =
+      static_cast<RangeOffsetT>(target_copy_size / average_range_size);
+
+    // Consecutive output ranges first, then randomly shuffled ones
+    for (const TestDataGen output_gen : output_layouts)
+    {
+      RunTest<AtomicCopyT, RangeOffsetT, RangeSizeT, ByteOffsetT>(target_num_ranges,
+                                                                  min_range_size,
+                                                                  max_range_size,
+                                                                  output_gen);
+    }
+  }
+
+  // Same sweep with a tuple as the element type
+  for (const auto &size_range : size_ranges)
+  {
+    // The most granular type being copied.
+    using AtomicCopyT = thrust::tuple<int64_t, int32_t, int16_t, char, char>;
+    const RangeSizeT min_range_size =
+      static_cast<RangeSizeT>(CUB_ROUND_UP_NEAREST(size_range.first, sizeof(AtomicCopyT)));
+    const RangeSizeT max_range_size = static_cast<RangeSizeT>(
+      CUB_ROUND_UP_NEAREST(size_range.second, static_cast<RangeSizeT>(sizeof(AtomicCopyT))));
+    const double average_range_size = (min_range_size + max_range_size) / 2.0;
+    const RangeOffsetT target_num_ranges =
+      static_cast<RangeOffsetT>(target_copy_size / average_range_size);
+
+    // Consecutive output ranges first, then randomly shuffled ones
+    for (const TestDataGen output_gen : output_layouts)
+    {
+      RunTest<AtomicCopyT, RangeOffsetT, RangeSizeT, ByteOffsetT>(target_num_ranges,
+                                                                  min_range_size,
+                                                                  max_range_size,
+                                                                  output_gen);
+    }
+  }
+
+  //---------------------------------------------------------------------
+  // DeviceCopy::Batched test with 64-bit offsets
+  //---------------------------------------------------------------------
+  using ByteOffset64T = uint64_t;
+  using RangeSize64T  = uint64_t;
+
+  // One range that is 128 MiB larger than the largest value a uint32_t can hold
+  const ByteOffset64T large_target_copy_size =
+    static_cast<ByteOffset64T>(std::numeric_limits<uint32_t>::max()) + (128ULL * 1024ULL * 1024ULL);
+  constexpr RangeOffsetT single_range = 1;
+
+  // Min. and max. range size are identical, so the single range has exactly that size
+  RunTest<uint8_t, RangeOffsetT, RangeSize64T, ByteOffset64T>(single_range,
+                                                              large_target_copy_size,
+                                                              large_target_copy_size,
+                                                              TestDataGen::CONSECUTIVE);
+}
diff --git a/include/cub/test/test_device_batch_memcpy.cu b/include/cub/test/test_device_batch_memcpy.cu
new file mode 100644
index 0000000..dd5ec53
--- /dev/null
+++ b/include/cub/test/test_device_batch_memcpy.cu
@@ -0,0 +1,733 @@
+/******************************************************************************
+ * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#include <cub/device/device_memcpy.cuh>
+#include <cub/iterator/transform_input_iterator.cuh>
+#include <cub/util_ptx.cuh>
+
+#include <thrust/device_vector.h>
+#include <thrust/fill.h>
+#include <thrust/host_vector.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/logical.h>
+#include <thrust/sequence.h>
+
+#include <algorithm>
+#include <cstdint>
+#include <limits>
+#include <numeric>
+#include <random>
+#include <type_traits>
+#include <vector>
+
+#include "test_util.h"
+
+/**
+ * @brief Fills a host array with uniformly distributed random integers.
+ *
+ * The overload is only enabled for integral types that are at least two bytes wide.
+ *
+ * @param rand_out Host pointer receiving `num_items` values
+ * @param num_items Number of values to generate
+ * @param min_rand_val Smallest value that may be generated (inclusive)
+ * @param max_rand_val Largest value that may be generated (inclusive)
+ * @param seed Seed of the std::mt19937 engine; the same seed yields the same sequence
+ */
+template <typename T>
+void GenerateRandomData(
+  T *rand_out,
+  const std::size_t num_items,
+  const T min_rand_val          = std::numeric_limits<T>::min(),
+  const T max_rand_val          = std::numeric_limits<T>::max(),
+  const std::uint_fast32_t seed = 320981U,
+  typename std::enable_if<std::is_integral<T>::value && (sizeof(T) >= 2)>::type * = nullptr)
+{
+  // Seeded engine feeding a closed-interval uniform distribution
+  std::mt19937 engine(seed);
+  std::uniform_int_distribution<T> value_dist(min_rand_val, max_rand_val);
+
+  // Draw exactly num_items values, written in output order
+  std::generate_n(rand_out, num_items, [&engine, &value_dist]() { return value_dist(engine); });
+}
+
+/**
+ * @brief Baseline kernel in which each thread copies one whole buffer, one byte at a time.
+ *
+ * Threads whose global index is not below `num_buffers` do nothing.
+ *
+ * @param input_buffer_it Indexable sequence yielding the source pointer of each buffer
+ * @param output_buffer_it Indexable sequence yielding the destination pointer of each buffer
+ * @param buffer_sizes Indexable sequence yielding the number of bytes of each buffer
+ * @param num_buffers Number of buffers to copy
+ */
+template <typename InputBufferIt,
+          typename OutputBufferIt,
+          typename BufferSizeIteratorT,
+          typename BufferOffsetT>
+void __global__ BaselineBatchMemCpyKernel(InputBufferIt input_buffer_it,
+                                          OutputBufferIt output_buffer_it,
+                                          BufferSizeIteratorT buffer_sizes,
+                                          BufferOffsetT num_buffers)
+{
+  // Global thread index doubles as the index of the buffer this thread is responsible for
+  const BufferOffsetT buffer_idx = blockDim.x * blockIdx.x + threadIdx.x;
+  if (buffer_idx < num_buffers)
+  {
+    for (BufferOffsetT byte_idx = 0; byte_idx < buffer_sizes[buffer_idx]; byte_idx++)
+    {
+      reinterpret_cast<uint8_t *>(output_buffer_it[buffer_idx])[byte_idx] =
+        reinterpret_cast<uint8_t *>(input_buffer_it[buffer_idx])[byte_idx];
+    }
+  }
+}
+
+/**
+ * @brief Launches BaselineBatchMemCpyKernel with one thread per buffer, using blocks of 128
+ * threads and no explicit stream argument.
+ *
+ * The launch is asynchronous and its status is not checked here. Nothing is launched when
+ * `num_buffers` is zero.
+ *
+ * @param input_buffer_it Indexable sequence yielding the source pointer of each buffer
+ * @param output_buffer_it Indexable sequence yielding the destination pointer of each buffer
+ * @param buffer_sizes Indexable sequence yielding the number of bytes of each buffer
+ * @param num_buffers Number of buffers to copy
+ */
+template <typename InputBufferIt, typename OutputBufferIt, typename BufferSizeIteratorT>
+void InvokeBaselineBatchMemcpy(InputBufferIt input_buffer_it,
+                               OutputBufferIt output_buffer_it,
+                               BufferSizeIteratorT buffer_sizes,
+                               uint32_t num_buffers)
+{
+  // A grid of zero blocks is an invalid launch configuration; with nothing to copy, skip the launch
+  if (num_buffers == 0U)
+  {
+    return;
+  }
+
+  constexpr uint32_t block_threads = 128U;
+
+  // Ceiling division written so that it cannot wrap around for num_buffers close to UINT32_MAX
+  const uint32_t num_blocks =
+    (num_buffers / block_threads) + (((num_buffers % block_threads) != 0U) ? 1U : 0U);
+  BaselineBatchMemCpyKernel<<<num_blocks, block_threads>>>(input_buffer_it,
+                                                           output_buffer_it,
+                                                           buffer_sizes,
+                                                           num_buffers);
+}
+
+/**
+ * @brief Baseline kernel in which each thread block copies one buffer in 8-byte words.
+ *
+ * The word count is `buffer_sizes[gbid] / 8` (integer division), so trailing bytes beyond the
+ * last full 8-byte word are not copied. Blocks whose index is not below `num_buffers` do nothing.
+ * NOTE(review): the uint64_t accesses presumably require 8-byte aligned buffers - confirm with
+ * the callers.
+ *
+ * @param input_buffer_it Indexable sequence yielding the source pointer of each buffer
+ * @param output_buffer_it Indexable sequence yielding the destination pointer of each buffer
+ * @param buffer_sizes Indexable sequence yielding the number of bytes of each buffer
+ * @param num_buffers Number of buffers to copy
+ */
+template <typename InputBufferIt,
+          typename OutputBufferIt,
+          typename BufferSizeIteratorT,
+          typename BufferOffsetT>
+void __global__ BaselineBatchMemCpyPerBlockKernel(InputBufferIt input_buffer_it,
+                                                  OutputBufferIt output_buffer_it,
+                                                  BufferSizeIteratorT buffer_sizes,
+                                                  BufferOffsetT num_buffers)
+{
+  // The block index selects the buffer
+  const BufferOffsetT buffer_idx = blockIdx.x;
+  if (buffer_idx < num_buffers)
+  {
+    // The block's threads stride over the buffer's 8-byte words
+    for (BufferOffsetT word_idx = threadIdx.x; word_idx < buffer_sizes[buffer_idx] / 8;
+         word_idx += blockDim.x)
+    {
+      reinterpret_cast<uint64_t *>(output_buffer_it[buffer_idx])[word_idx] =
+        reinterpret_cast<uint64_t *>(input_buffer_it[buffer_idx])[word_idx];
+    }
+  }
+}
+
+/**
+ * @brief Computes, for every buffer, its byte offset within a gap-free layout in which the
+ * buffers appear in a pseudo-randomly shuffled order.
+ *
+ * @tparam BufferOffsetT Type used for indexing into the sequence of buffers
+ * @tparam ByteOffsetT Type of the returned byte offsets
+ * @tparam BufferSizeT Type of the buffer sizes
+ *
+ * @param buffer_sizes Size of each buffer
+ * @param seed Seed of the std::mt19937 engine that drives the shuffle
+ * @return Vector whose i-th entry is the offset assigned to the i-th buffer
+ */
+template <typename BufferOffsetT, typename ByteOffsetT, typename BufferSizeT>
+std::vector<ByteOffsetT> GetShuffledBufferOffsets(const std::vector<BufferSizeT> &buffer_sizes,
+                                                  const std::uint_fast32_t seed = 320981U)
+{
+  const BufferOffsetT num_buffers = static_cast<BufferOffsetT>(buffer_sizes.size());
+
+  // placement_order[k] names the buffer that occupies the k-th slot of the shuffled layout
+  std::vector<BufferOffsetT> placement_order(num_buffers);
+  std::iota(placement_order.begin(), placement_order.end(), static_cast<BufferOffsetT>(0));
+  std::mt19937 rng(seed);
+  std::shuffle(placement_order.begin(), placement_order.end(), rng);
+
+  // Walk the slots in layout order: each buffer starts where the previously placed one ended.
+  // placement_order is a permutation, so every entry of new_offsets is written exactly once.
+  std::vector<ByteOffsetT> new_offsets(num_buffers);
+  ByteOffsetT next_free_byte = {};
+  for (const BufferOffsetT buffer_idx : placement_order)
+  {
+    new_offsets[buffer_idx] = next_free_byte;
+    next_free_byte += buffer_sizes[buffer_idx];
+  }
+
+  return new_offsets;
+}
+
+/**
+ * @brief Function object that maps an offset to the iterator located that many positions past a
+ * fixed base iterator.
+ *
+ * @tparam IteratorT The random-access iterator type to be returned
+ */
+template <typename IteratorT>
+struct OffsetToPtrOp
+{
+  // Iterator that every offset is applied to
+  IteratorT base_it;
+
+  // Returns `base_it` advanced by `offset`
+  template <typename OffsetT>
+  __host__ __device__ __forceinline__ IteratorT operator()(OffsetT offset) const
+  {
+    const IteratorT shifted_it = base_it + offset;
+    return shifted_it;
+  }
+};
+
+/**
+ * @brief Selects how the buffers of a test instance are laid out within their data segment.
+ */
+enum class TestDataGen
+{
+  // Buffers are placed at shuffled offsets within the data segment (RunTest obtains these from
+  // GetShuffledBufferOffsets)
+  RANDOM,
+
+  // Buffers cohesively reside next to each other, in index order
+  CONSECUTIVE
+};
+
+/**
+ * @brief Runs a single cub::DeviceMemcpy::Batched test instance: generates random buffer sizes and
+ * random source data, copies all buffers on the device, and compares the device output byte by
+ * byte against a host-side std::memcpy reference.
+ *
+ * The instance is skipped (with a message on std::cout) if the device does not have enough global
+ * memory. Any CUDA error or mismatch terminates the test.
+ *
+ * @tparam AtomicT The most granular type being copied. All source and destination pointers will be
+ * aligned based on this type, the number of bytes being copied will be an integer multiple of this
+ * type's size
+ * @tparam BufferOffsetT Type used for indexing into the array of buffers
+ * @tparam BufferSizeT Type used for indexing into individual bytes of a buffer (large enough to
+ * cover the max buffer size)
+ * @tparam ByteOffsetT Type used for indexing into bytes over *all* the buffers' sizes
+ *
+ * @param num_buffers Number of buffers to copy
+ * @param min_buffer_size Lower bound (in bytes) of the randomly drawn buffer sizes
+ * @param max_buffer_size Upper bound (in bytes) of the randomly drawn buffer sizes
+ * @param input_gen Layout of the source buffers within the input data segment
+ * @param output_gen Layout of the destination buffers within the output data segment
+ */
+template <typename AtomicT, typename BufferOffsetT, typename BufferSizeT, typename ByteOffsetT>
+void RunTest(BufferOffsetT num_buffers,
+             BufferSizeT min_buffer_size,
+             BufferSizeT max_buffer_size,
+             TestDataGen input_gen,
+             TestDataGen output_gen)
+{
+  using SrcPtrT = uint8_t *;
+
+  // Buffer segment data (their offsets and sizes)
+  std::vector<BufferSizeT> h_buffer_sizes(num_buffers);
+  std::vector<ByteOffsetT> h_buffer_src_offsets(num_buffers);
+  std::vector<ByteOffsetT> h_buffer_dst_offsets(num_buffers);
+
+  // Device-side resources
+  void *d_in                        = nullptr;
+  void *d_out                       = nullptr;
+  ByteOffsetT *d_buffer_src_offsets = nullptr;
+  ByteOffsetT *d_buffer_dst_offsets = nullptr;
+  BufferSizeT *d_buffer_sizes       = nullptr;
+  void *d_temp_storage              = nullptr;
+  size_t temp_storage_bytes         = 0;
+
+  // Generate the buffer sizes
+  GenerateRandomData(h_buffer_sizes.data(), h_buffer_sizes.size(), min_buffer_size, max_buffer_size);
+
+  // Make sure buffer sizes are a multiple of the most granular unit (one AtomicT) being copied
+  // (round down)
+  for (BufferOffsetT i = 0; i < num_buffers; i++)
+  {
+    h_buffer_sizes[i] = (h_buffer_sizes[i] / sizeof(AtomicT)) * sizeof(AtomicT);
+  }
+
+  // Compute the total bytes to be copied; the running total doubles as the consecutive offset
+  ByteOffsetT num_total_bytes = 0;
+  for (BufferOffsetT i = 0; i < num_buffers; i++)
+  {
+    if (input_gen == TestDataGen::CONSECUTIVE)
+    {
+      h_buffer_src_offsets[i] = num_total_bytes;
+    }
+    if (output_gen == TestDataGen::CONSECUTIVE)
+    {
+      h_buffer_dst_offsets[i] = num_total_bytes;
+    }
+    num_total_bytes += h_buffer_sizes[i];
+  }
+
+  // Shuffle input buffer source-offsets
+  std::uint_fast32_t shuffle_seed = 320981U;
+  if (input_gen == TestDataGen::RANDOM)
+  {
+    h_buffer_src_offsets = GetShuffledBufferOffsets<BufferOffsetT, ByteOffsetT>(h_buffer_sizes,
+                                                                                shuffle_seed);
+    // Use a different seed for the destinations so they are not shuffled identically
+    shuffle_seed += 42;
+  }
+
+  // Shuffle output buffer destination-offsets
+  if (output_gen == TestDataGen::RANDOM)
+  {
+    h_buffer_dst_offsets = GetShuffledBufferOffsets<BufferOffsetT, ByteOffsetT>(h_buffer_sizes,
+                                                                                shuffle_seed);
+  }
+
+  // Get temporary storage requirements
+  CubDebugExit(cub::DeviceMemcpy::Batched(d_temp_storage,
+                                          temp_storage_bytes,
+                                          static_cast<SrcPtrT *>(nullptr),
+                                          static_cast<SrcPtrT *>(nullptr),
+                                          d_buffer_sizes,
+                                          num_buffers));
+
+  // Check if there's sufficient device memory to run this test. The data size is widened to
+  // std::size_t first so that counting it twice cannot wrap around for a 32-bit ByteOffsetT.
+  const std::size_t num_data_bytes = static_cast<std::size_t>(num_total_bytes);
+  std::size_t total_required_mem   = num_data_bytes +                                  //
+                                   num_data_bytes +                                  //
+                                   (num_buffers * sizeof(d_buffer_src_offsets[0])) + //
+                                   (num_buffers * sizeof(d_buffer_dst_offsets[0])) + //
+                                   (num_buffers * sizeof(d_buffer_sizes[0])) +       //
+                                   temp_storage_bytes;                               //
+  if (TotalGlobalMem() < total_required_mem)
+  {
+    std::cout
+      << "Skipping the test due to insufficient device memory\n"                                  //
+      << " - Required: " << total_required_mem << " B, available: " << TotalGlobalMem() << " B\n" //
+      << " - Skipped test instance: "                                                             //
+      << " -> Min. buffer size: " << min_buffer_size << ", max. buffer size: " << max_buffer_size //
+      << ", num_buffers: " << num_buffers                                                         //
+      << ", in_gen: " << ((input_gen == TestDataGen::RANDOM) ? "SHFL" : "CONSECUTIVE")            //
+      << ", out_gen: " << ((output_gen == TestDataGen::RANDOM) ? "SHFL" : "CONSECUTIVE");
+    return;
+  }
+
+  // Timing events and the stream used by this test instance (released at the end of the function)
+  cudaEvent_t events[2];
+  CubDebugExit(cudaEventCreate(&events[0]));
+  CubDebugExit(cudaEventCreate(&events[1]));
+
+  cudaStream_t stream;
+  CubDebugExit(cudaStreamCreate(&stream));
+
+  // Allocate device memory
+  CubDebugExit(cudaMalloc(&d_in, num_total_bytes));
+  CubDebugExit(cudaMalloc(&d_out, num_total_bytes));
+  CubDebugExit(cudaMalloc(&d_buffer_src_offsets, num_buffers * sizeof(d_buffer_src_offsets[0])));
+  CubDebugExit(cudaMalloc(&d_buffer_dst_offsets, num_buffers * sizeof(d_buffer_dst_offsets[0])));
+  CubDebugExit(cudaMalloc(&d_buffer_sizes, num_buffers * sizeof(d_buffer_sizes[0])));
+  CubDebugExit(cudaMalloc(&d_temp_storage, temp_storage_bytes));
+
+  // Populate the data source with random data. The data is generated in units of
+  // RandomInitAliasT, so the host input buffer is rounded up to a whole number of such units.
+  using RandomInitAliasT         = uint16_t;
+  std::size_t num_aliased_factor = sizeof(RandomInitAliasT) / sizeof(uint8_t);
+  std::size_t num_aliased_units  = CUB_QUOTIENT_CEILING(num_total_bytes, num_aliased_factor);
+  std::unique_ptr<uint8_t[]> h_in(new uint8_t[num_aliased_units * num_aliased_factor]);
+  std::unique_ptr<uint8_t[]> h_out(new uint8_t[num_total_bytes]);
+  std::unique_ptr<uint8_t[]> h_gpu_results(new uint8_t[num_total_bytes]);
+
+  // Fill the host-side input data segment with random bits
+  GenerateRandomData(reinterpret_cast<RandomInitAliasT *>(h_in.get()), num_aliased_units);
+
+  // Prepare d_buffer_srcs
+  OffsetToPtrOp<SrcPtrT> src_transform_op{static_cast<SrcPtrT>(d_in)};
+  cub::TransformInputIterator<SrcPtrT, OffsetToPtrOp<SrcPtrT>, ByteOffsetT *> d_buffer_srcs(
+    d_buffer_src_offsets,
+    src_transform_op);
+
+  // Prepare d_buffer_dsts
+  OffsetToPtrOp<SrcPtrT> dst_transform_op{static_cast<SrcPtrT>(d_out)};
+  cub::TransformInputIterator<SrcPtrT, OffsetToPtrOp<SrcPtrT>, ByteOffsetT *> d_buffer_dsts(
+    d_buffer_dst_offsets,
+    dst_transform_op);
+
+  // Prepare random data segment (which serves for the buffer sources)
+  CubDebugExit(cudaMemcpyAsync(d_in, h_in.get(), num_total_bytes, cudaMemcpyHostToDevice, stream));
+
+  // Prepare d_buffer_src_offsets
+  CubDebugExit(cudaMemcpyAsync(d_buffer_src_offsets,
+                               h_buffer_src_offsets.data(),
+                               h_buffer_src_offsets.size() * sizeof(h_buffer_src_offsets[0]),
+                               cudaMemcpyHostToDevice,
+                               stream));
+
+  // Prepare d_buffer_dst_offsets
+  CubDebugExit(cudaMemcpyAsync(d_buffer_dst_offsets,
+                               h_buffer_dst_offsets.data(),
+                               h_buffer_dst_offsets.size() * sizeof(h_buffer_dst_offsets[0]),
+                               cudaMemcpyHostToDevice,
+                               stream));
+
+  // Prepare d_buffer_sizes
+  CubDebugExit(cudaMemcpyAsync(d_buffer_sizes,
+                               h_buffer_sizes.data(),
+                               h_buffer_sizes.size() * sizeof(h_buffer_sizes[0]),
+                               cudaMemcpyHostToDevice,
+                               stream));
+
+  // Record event before algorithm
+  CubDebugExit(cudaEventRecord(events[0], stream));
+
+  // Invoke device-side algorithm being under test
+  CubDebugExit(cub::DeviceMemcpy::Batched(d_temp_storage,
+                                          temp_storage_bytes,
+                                          d_buffer_srcs,
+                                          d_buffer_dsts,
+                                          d_buffer_sizes,
+                                          num_buffers,
+                                          stream));
+
+  // Record event after algorithm
+  CubDebugExit(cudaEventRecord(events[1], stream));
+
+  // Copy back the output buffer
+  CubDebugExit(
+    cudaMemcpyAsync(h_gpu_results.get(), d_out, num_total_bytes, cudaMemcpyDeviceToHost, stream));
+
+  // Make sure results have been copied back to the host
+  CubDebugExit(cudaStreamSynchronize(stream));
+
+  // CPU-side result generation for verification
+  for (BufferOffsetT i = 0; i < num_buffers; i++)
+  {
+    std::memcpy(h_out.get() + h_buffer_dst_offsets[i],
+                h_in.get() + h_buffer_src_offsets[i],
+                h_buffer_sizes[i]);
+  }
+
+  // Both events have completed at this point because the stream was synchronized above
+  float duration = 0;
+  CubDebugExit(cudaEventElapsedTime(&duration, events[0], events[1]));
+
+#ifdef CUB_TEST_BENCHMARK
+  size_t stats_src_offsets = sizeof(ByteOffsetT) * num_buffers;
+  size_t stats_dst_offsets = sizeof(ByteOffsetT) * num_buffers;
+  size_t stats_sizes       = sizeof(BufferSizeT) * num_buffers;
+  size_t stats_data_copied = 2 * num_total_bytes;
+
+  std::cout
+    << "Min. buffer size: " << min_buffer_size << ", max. buffer size: " << max_buffer_size     //
+    << ", num_buffers: " << num_buffers                                                         //
+    << ", in_gen: " << ((input_gen == TestDataGen::RANDOM) ? "SHFL" : "CONSECUTIVE")            //
+    << ", out_gen: " << ((output_gen == TestDataGen::RANDOM) ? "SHFL" : "CONSECUTIVE")          //
+    << ", src size: " << stats_src_offsets << ", dst size: " << stats_dst_offsets               //
+    << ", sizes size: " << stats_sizes << ", cpy_data_size: " << stats_data_copied              //
+    << ", total: " << (stats_src_offsets + stats_dst_offsets + stats_sizes + stats_data_copied) //
+    << ", duration: " << duration                                                               //
+    << ", BW: "
+    << ((double)(stats_src_offsets + stats_dst_offsets + stats_sizes + stats_data_copied) /
+        1000000000.0) /
+         (duration / 1000.0)
+    << "GB/s \n";
+#endif
+
+  for (ByteOffsetT i = 0; i < num_total_bytes; i++)
+  {
+    if (h_gpu_results.get()[i] != h_out.get()[i])
+    {
+      // Values are printed in the order announced by the label: CPU reference first, then GPU
+      std::cout << "Mismatch at index " << i
+                << ", CPU vs. GPU: " << static_cast<uint16_t>(h_out.get()[i]) << ", "
+                << static_cast<uint16_t>(h_gpu_results.get()[i]) << "\n";
+    }
+    AssertEquals(h_out.get()[i], h_gpu_results.get()[i]);
+  }
+
+  CubDebugExit(cudaFree(d_in));
+  CubDebugExit(cudaFree(d_out));
+  CubDebugExit(cudaFree(d_buffer_src_offsets));
+  CubDebugExit(cudaFree(d_buffer_dst_offsets));
+  CubDebugExit(cudaFree(d_buffer_sizes));
+  CubDebugExit(cudaFree(d_temp_storage));
+
+  // Release the timing events and the stream created for this test instance
+  CubDebugExit(cudaEventDestroy(events[0]));
+  CubDebugExit(cudaEventDestroy(events[1]));
+  CubDebugExit(cudaStreamDestroy(stream));
+}
+
+/**
+ * @brief Test kernel that forwards its arguments to cub::detail::VectorizedCopy, passing
+ * threadIdx.x as the first argument.
+ *
+ * @tparam LOGICAL_WARP_SIZE Forwarded as VectorizedCopy's first template argument
+ * @tparam VectorT Forwarded as VectorizedCopy's second template argument
+ * @tparam ByteOffsetT Type of `copy_size`
+ *
+ * @param d_in Source of the copy
+ * @param d_out Destination of the copy
+ * @param copy_size Size of the copy (presumably in bytes, going by the type name - confirm against
+ * VectorizedCopy)
+ */
+template <int LOGICAL_WARP_SIZE, typename VectorT, typename ByteOffsetT>
+__global__ void TestVectorizedCopyKernel(const void *d_in, void *d_out, ByteOffsetT copy_size)
+{
+  // Note the argument order of the helper: destination and size come before the source
+  cub::detail::VectorizedCopy<LOGICAL_WARP_SIZE, VectorT>(threadIdx.x, d_out, copy_size, d_in);
+}
+
+/**
+ * @brief Predicate that returns true if the first two members of the given tuple compare equal.
+ */
+struct TupleMemberEqualityOp
+{
+  template <typename TupleT>
+  __host__ __device__ __forceinline__ bool operator()(TupleT tuple)
+  {
+    // Member 0 is compared against member 1 using operator==
+    const bool members_match = (thrust::get<0>(tuple) == thrust::get<1>(tuple));
+    return members_match;
+  }
+};
+
+/**
+ * @brief Tests the VectorizedCopy for various aligned and misaligned input and output pointers.
+ *
+ * Every combination of copy size, input offset and output offset is run with a single block of
+ * eight threads. Afterwards the copied output range must equal the input range element-wise.
+ *
+ * @tparam VectorT The vector type used for vectorized stores (i.e., one of uint4, uint2, uint32_t)
+ */
+template <typename VectorT>
+void TestVectorizedCopy()
+{
+  constexpr uint32_t threads_per_block = 8;
+
+  // Offsets are applied to the start of the respective allocation
+  const std::vector<std::size_t> in_offsets{0, 1, sizeof(uint32_t) - 1};
+  const std::vector<std::size_t> out_offsets{0, 1, sizeof(VectorT) - 1};
+  const std::vector<std::size_t> copy_sizes{0,
+                                            1,
+                                            sizeof(uint32_t),
+                                            sizeof(VectorT),
+                                            2 * threads_per_block * sizeof(VectorT)};
+
+  for (const std::size_t copy_size : copy_sizes)
+  {
+    for (const std::size_t in_offset : in_offsets)
+    {
+      for (const std::size_t out_offset : out_offsets)
+      {
+        // Allocations are just large enough to hold the offset plus the copied range
+        const std::size_t alloc_size_in  = in_offset + copy_size;
+        const std::size_t alloc_size_out = out_offset + copy_size;
+        thrust::device_vector<char> data_in(alloc_size_in);
+        thrust::device_vector<char> data_out(alloc_size_out);
+
+        // Input holds an increasing sequence, output starts out filled with 0x42
+        thrust::sequence(data_in.begin(), data_in.end(), static_cast<char>(0));
+        thrust::fill_n(data_out.begin(), alloc_size_out, static_cast<char>(0x42));
+
+        auto d_in  = thrust::raw_pointer_cast(data_in.data());
+        auto d_out = thrust::raw_pointer_cast(data_out.data());
+
+        TestVectorizedCopyKernel<threads_per_block, VectorT>
+          <<<1, threads_per_block>>>(d_in + in_offset,
+                                     d_out + out_offset,
+                                     static_cast<int>(copy_size));
+
+        // Pair up each input element with the output element it should have been copied to
+        auto zip_it = thrust::make_zip_iterator(data_in.begin() + in_offset,
+                                                data_out.begin() + out_offset);
+
+        const bool success = thrust::all_of(zip_it, zip_it + copy_size, TupleMemberEqualityOp{});
+        AssertTrue(success);
+      }
+    }
+  }
+}
+
+template <uint32_t NUM_ITEMS, uint32_t MAX_ITEM_VALUE, bool PREFER_POW2_BITS>
+__global__ void TestBitPackedCounterKernel(uint32_t *bins,
+                                           uint32_t *increments,
+                                           uint32_t *counts_out,
+                                           uint32_t num_items)
+{
+  // Accumulate every (bin, increment) pair into one value-initialized bit-packed counter
+  cub::detail::BitPackedCounter<NUM_ITEMS, MAX_ITEM_VALUE, PREFER_POW2_BITS> packed{};
+  for (uint32_t idx = 0; idx != num_items; ++idx)
+  {
+    packed.Add(bins[idx], increments[idx]);
+  }
+
+  // Write the count read back for each of the NUM_ITEMS items to counts_out
+  for (uint32_t item = 0; item != NUM_ITEMS; ++item)
+  {
+    counts_out[item] = packed.Get(item);
+  }
+}
+
+/**
+ * @brief Tests BitPackedCounter (used for the histogram of small, medium and large buffer sizes)
+ * against host-side reference counts, once per bit layout. @p seed seeds the random test data.
+ */
+template <uint32_t NUM_ITEMS, uint32_t MAX_ITEM_VALUE>
+void TestBitPackedCounter(const std::uint_fast32_t seed = 320981U)
+{
+  // Bounds passed to GenerateRandomData for the increments, and the midpoint of those bounds
+  constexpr uint32_t min_increment = 0;
+  constexpr uint32_t max_increment = 4;
+  constexpr double avg_increment   = static_cast<double>(min_increment) +
+                                   (static_cast<double>(max_increment - min_increment) / 2.0);
+  std::uint32_t num_increments = // increments of midpoint size that total MAX_ITEM_VALUE * NUM_ITEMS
+      static_cast<uint32_t>(static_cast<double>(MAX_ITEM_VALUE * NUM_ITEMS) / avg_increment);
+
+  // Host-side test input and the reference count expected for each item
+  std::array<uint64_t, NUM_ITEMS> reference_counters{};
+  thrust::host_vector<uint32_t> h_bins(num_increments);
+  thrust::host_vector<uint32_t> h_increments(num_increments);
+
+  // Generate random test input data (the bins are generated with bounds 0 and NUM_ITEMS - 1)
+  GenerateRandomData(thrust::raw_pointer_cast(h_bins.data()),
+                     num_increments,
+                     0U,
+                     NUM_ITEMS - 1U,
+                     seed);
+  GenerateRandomData(thrust::raw_pointer_cast(h_increments.data()),
+                     num_increments,
+                     min_increment,
+                     max_increment,
+                     (seed + 17));
+
+  // Make sure test data does not overflow any of the counters
+  for (std::size_t i = 0; i < num_increments; i++)
+  {
+    // Increment would bring this bin to MAX_ITEM_VALUE or beyond => zero this increment
+    if (reference_counters[h_bins[i]] + h_increments[i] >= MAX_ITEM_VALUE)
+    {
+      h_increments[i] = 0;
+    }
+    else
+    {
+      reference_counters[h_bins[i]] += h_increments[i];
+    }
+  }
+
+  // Device memory
+  thrust::device_vector<uint32_t> bins_in(num_increments);
+  thrust::device_vector<uint32_t> increments_in(num_increments);
+  thrust::device_vector<uint32_t> counts_out(NUM_ITEMS);
+
+  // Initialize device-side test data (after the overflow pass above adjusted the increments)
+  bins_in       = h_bins;
+  increments_in = h_increments;
+
+  // Memory for GPU-generated results: one count per item, the same length as counts_out
+  thrust::host_vector<uint32_t> host_counts(NUM_ITEMS);
+
+  // Overwrite the output with an arbitrary value before the run
+  thrust::fill(counts_out.begin(), counts_out.end(), 814920U);
+
+  // Run tests with densely bit-packed counters
+  TestBitPackedCounterKernel<NUM_ITEMS, MAX_ITEM_VALUE, false>
+    <<<1, 1>>>(thrust::raw_pointer_cast(bins_in.data()),
+               thrust::raw_pointer_cast(increments_in.data()),
+               thrust::raw_pointer_cast(counts_out.data()),
+               num_increments);
+
+  // Result verification: copy the counts back and compare each against its reference
+  host_counts = counts_out;
+  for (uint32_t i = 0; i < NUM_ITEMS; i++)
+  {
+    AssertEquals(reference_counters[i], host_counts[i]);
+  }
+
+  // Overwrite the output with an arbitrary value again before the second run
+  thrust::fill(counts_out.begin(), counts_out.end(), 814920U);
+
+  // Run tests with bit-packed counters, where bit-count is a power-of-two
+  TestBitPackedCounterKernel<NUM_ITEMS, MAX_ITEM_VALUE, true>
+    <<<1, 1>>>(thrust::raw_pointer_cast(bins_in.data()),
+               thrust::raw_pointer_cast(increments_in.data()),
+               thrust::raw_pointer_cast(counts_out.data()),
+               num_increments);
+
+  // Result verification: copy the counts back and compare each against its reference
+  host_counts = counts_out;
+  for (uint32_t i = 0; i < NUM_ITEMS; i++)
+  {
+    AssertEquals(reference_counters[i], host_counts[i]);
+  }
+}
+
+int main(int argc, char **argv)
+{ // Test driver: runs the VectorizedCopy, BitPackedCounter and DeviceMemcpy::Batched tests in turn
+  CommandLineArgs args(argc, argv);
+
+  // Initialize device
+  CubDebugExit(args.DeviceInit());
+
+  //---------------------------------------------------------------------
+  // VectorizedCopy tests
+  //---------------------------------------------------------------------
+  TestVectorizedCopy<uint32_t>();
+  TestVectorizedCopy<uint4>();
+
+  //---------------------------------------------------------------------
+  // BitPackedCounter tests
+  //---------------------------------------------------------------------
+  TestBitPackedCounter<1, 1>();
+  TestBitPackedCounter<1, (0x01U << 16)>();
+  TestBitPackedCounter<4, 1>();
+  TestBitPackedCounter<4, 2>();
+  TestBitPackedCounter<4, 255>();
+  TestBitPackedCounter<4, 256>();
+  TestBitPackedCounter<8, 1024>();
+  TestBitPackedCounter<32, 1>();
+  TestBitPackedCounter<32, 256>();
+
+  //---------------------------------------------------------------------
+  // DeviceMemcpy::Batched tests
+  //---------------------------------------------------------------------
+  // The most granular type being copied. Buffers will be aligned and their sizes will be an integer
+  // multiple of this type's size
+  using AtomicCopyT = uint8_t;
+
+  // Type used for indexing into the array of buffers
+  using BufferOffsetT = uint32_t;
+
+  // Type used for indexing into individual bytes of a buffer (must cover the max buffer size)
+  using BufferSizeT = uint32_t;
+
+  // Type used for indexing into bytes over *all* the buffers' sizes
+  using ByteOffsetT = uint32_t;
+
+  // Total number of bytes that are targeted to be copied on each run
+  const ByteOffsetT target_copy_size = 64U << 20;
+
+  // The number of randomly generated buffer size ranges to test on top of the fixed ranges
+  constexpr std::size_t num_rnd_buffer_range_tests = 32;
+
+  // Each buffer's size will be random within this interval
+  std::vector<std::pair<std::size_t, std::size_t>> buffer_size_ranges = {{0, 1},
+                                                                         {1, 2},
+                                                                         {0, 16},
+                                                                         {1, 32},
+                                                                         {1, 1024},
+                                                                         {1, 32 * 1024},
+                                                                         {128 * 1024, 256 * 1024},
+                                                                         {target_copy_size,
+                                                                          target_copy_size}};
+
+  std::mt19937 rng(0); // fixed seed for the additional random ranges
+  std::uniform_int_distribution<std::size_t> size_dist(1, 1000000);
+  for (std::size_t i = 0; i < num_rnd_buffer_range_tests; i++)
+  {
+    auto range_begin = size_dist(rng);
+    auto range_end   = size_dist(rng);
+    if (range_begin > range_end)
+    {
+      std::swap(range_begin, range_end);
+    }
+    buffer_size_ranges.push_back({range_begin, range_end});
+  }
+
+  for (const auto &buffer_size_range : buffer_size_ranges)
+  {
+    BufferSizeT min_buffer_size =
+      static_cast<BufferSizeT>(CUB_ROUND_UP_NEAREST(buffer_size_range.first, sizeof(AtomicCopyT)));
+    BufferSizeT max_buffer_size =
+      static_cast<BufferSizeT>(CUB_ROUND_UP_NEAREST(buffer_size_range.second,
+                                                    static_cast<BufferSizeT>(sizeof(AtomicCopyT))));
+    double average_buffer_size = (min_buffer_size + max_buffer_size) / 2.0;
+    BufferOffsetT target_num_buffers =
+      static_cast<BufferOffsetT>(target_copy_size / average_buffer_size);
+
+    // Run tests with input buffer being consecutive and output buffers being consecutive
+    RunTest<AtomicCopyT, BufferOffsetT, BufferSizeT, ByteOffsetT>(target_num_buffers,
+                                                                  min_buffer_size,
+                                                                  max_buffer_size,
+                                                                  TestDataGen::CONSECUTIVE,
+                                                                  TestDataGen::CONSECUTIVE);
+
+    // Run tests with input buffer being randomly shuffled and output buffers being randomly
+    // shuffled
+    RunTest<AtomicCopyT, BufferOffsetT, BufferSizeT, ByteOffsetT>(target_num_buffers,
+                                                                  min_buffer_size,
+                                                                  max_buffer_size,
+                                                                  TestDataGen::RANDOM,
+                                                                  TestDataGen::RANDOM);
+  }
+
+  //---------------------------------------------------------------------
+  // DeviceMemcpy::Batched test with 64-bit offsets
+  //---------------------------------------------------------------------
+  using ByteOffset64T = uint64_t;
+  using BufferSize64T = uint64_t;
+  ByteOffset64T large_target_copy_size =
+    static_cast<ByteOffset64T>(std::numeric_limits<uint32_t>::max()) + (128ULL * 1024ULL * 1024ULL);
+  // Copy one buffer of that size, which is larger than a 32-bit byte offset can represent
+  constexpr BufferOffsetT single_buffer = 1;
+
+  // Run tests with input buffer being consecutive and output buffers being consecutive
+  RunTest<AtomicCopyT, BufferOffsetT, BufferSize64T, ByteOffset64T>(single_buffer,
+                                                                    large_target_copy_size,
+                                                                    large_target_copy_size,
+                                                                    TestDataGen::CONSECUTIVE,
+                                                                    TestDataGen::CONSECUTIVE);
+}
diff --git a/include/cub/test/test_device_histogram.cu b/include/cub/test/test_device_histogram.cu
new file mode 100644
index 0000000..a97ecc3
--- /dev/null
+++ b/include/cub/test/test_device_histogram.cu
@@ -0,0 +1,1684 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2022, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Test of DeviceHistogram utilities
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <cub/device/device_histogram.cuh>
+#include <cub/iterator/constant_input_iterator.cuh>
+#include <cub/util_allocator.cuh>
+
+#include <thrust/host_vector.h>
+#include <thrust/device_vector.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
+
+#include <cuda/std/type_traits>
+
+#include <algorithm>
+#include <limits>
+#include <typeinfo>
+
+#include "test_util.h"
+
+#define TEST_HALF_T !_NVHPC_CUDA
+
+#if TEST_HALF_T 
+#include <cuda_fp16.h>
+#endif
+
+using namespace cub;
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+
+// Backends a test can dispatch through
+enum Backend
+{
+    CUB = 0,    // Dispatch to the CUB method
+    CDP = 1,    // Dispatch to the CUB method from the GPU (dynamic parallelism)
+};
+
+
+bool                    g_verbose_input     = false;   // presumably toggles printing of the input data — TODO confirm where it is read
+bool                    g_verbose           = false;   // presumably toggles verbose output — TODO confirm where it is read
+int                     g_timing_iterations = 0;       // NOTE(review): looks like the repeat count for timed runs — confirm against its users
+CachingDeviceAllocator  g_allocator(true);             // NOTE(review): `true` should be the allocator's skip_cleanup flag — confirm in the CUB docs
+
+
+//---------------------------------------------------------------------
+// Dispatch to different DeviceHistogram entrypoints
+//---------------------------------------------------------------------
+
+template <int NUM_ACTIVE_CHANNELS, int NUM_CHANNELS, int BACKEND>
+struct Dispatch; // primary template: declared only, behaviour comes from the specializations
+// CUB backend, multi-channel case: forwards to the DeviceHistogram::MultiHistogram* entrypoints
+template <int NUM_ACTIVE_CHANNELS, int NUM_CHANNELS>
+struct Dispatch<NUM_ACTIVE_CHANNELS, NUM_CHANNELS, CUB>
+{
+    /**
+     * Calls the CUB multi histogram-range entrypoint once per timing iteration; returns the last status
+     */
+    template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
+    //CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Range(
+        int                     timing_timing_iterations,                   ///< [in] How many times to invoke the entrypoint
+        size_t                  * /*d_temp_storage_bytes*/,                 ///< Unused by this backend
+        cudaError_t             * /*d_cdp_error*/,                          ///< Unused by this backend
+
+        void*               d_temp_storage,                             ///< Temporary storage pointer, forwarded to CUB unchanged
+        size_t&             temp_storage_bytes,                         ///< Temporary storage size in bytes, forwarded to CUB by reference
+        SampleIteratorT     d_samples,                                  ///< [in] Multi-channel input samples, interleaved by channel (e.g., 32-bit pixels made of four RGBA 8-bit samples)
+        CounterT            *(&d_histogram)[NUM_ACTIVE_CHANNELS],       ///< [out] One histogram counter array per active channel; the array for channel i should hold num_levels[i] - 1 counters
+        int                 *num_levels,                                ///< [in] Number of bin boundaries (levels) per active channel; channel i therefore has num_levels[i] - 1 bins
+        LevelT              *(&d_levels)[NUM_ACTIVE_CHANNELS],          ///< [in] One boundary (level) array per active channel; consecutive boundaries delimit a bin, lower inclusive and upper exclusive
+        OffsetT             num_row_pixels,                             ///< [in] Multi-channel pixels per row of the region of interest
+        OffsetT             num_rows,                                   ///< [in] Rows in the region of interest
+        OffsetT             row_stride_bytes)                           ///< [in] Bytes between the starts of consecutive rows of the region of interest
+    {
+        cudaError_t status = cudaSuccess;
+
+        for (int remaining = timing_timing_iterations; remaining > 0; --remaining)
+        {
+            status = DeviceHistogram::MultiHistogramRange<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
+                d_temp_storage,
+                temp_storage_bytes,
+                d_samples,
+                d_histogram,
+                num_levels,
+                d_levels,
+                num_row_pixels,
+                num_rows,
+                row_stride_bytes);
+        }
+        return status;
+    }
+
+#if TEST_HALF_T
+    /**
+     * half_t overload of Range: reinterprets the samples and levels as __half before dispatching
+     */
+    template <typename CounterT, typename OffsetT>
+    //CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Range(
+        int                     timing_timing_iterations,                   ///< [in] How many times to invoke the entrypoint
+        size_t                  * /*d_temp_storage_bytes*/,                 ///< Unused by this backend
+        cudaError_t             * /*d_cdp_error*/,                          ///< Unused by this backend
+
+        void*               d_temp_storage,                             ///< Temporary storage pointer, forwarded to CUB unchanged
+        size_t&             temp_storage_bytes,                         ///< Temporary storage size in bytes, forwarded to CUB by reference
+        half_t              *d_samples,                                 ///< [in] Multi-channel half_t input samples, interleaved by channel; passed on as __half*
+        CounterT            *(&d_histogram)[NUM_ACTIVE_CHANNELS],       ///< [out] One histogram counter array per active channel; the array for channel i should hold num_levels[i] - 1 counters
+        int                 *num_levels,                                ///< [in] Number of bin boundaries (levels) per active channel; channel i therefore has num_levels[i] - 1 bins
+        half_t              *(&d_levels)[NUM_ACTIVE_CHANNELS],          ///< [in] One half_t boundary (level) array per active channel; passed on as an array of __half*
+        OffsetT             num_row_pixels,                             ///< [in] Multi-channel pixels per row of the region of interest
+        OffsetT             num_rows,                                   ///< [in] Rows in the region of interest
+        OffsetT             row_stride_bytes)                           ///< [in] Bytes between the starts of consecutive rows of the region of interest
+    {
+        cudaError_t status = cudaSuccess;
+
+        for (int remaining = timing_timing_iterations; remaining > 0; --remaining)
+        {
+            status = DeviceHistogram::MultiHistogramRange<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
+                d_temp_storage,
+                temp_storage_bytes,
+                reinterpret_cast<__half*>(d_samples),
+                d_histogram,
+                num_levels,
+                reinterpret_cast<__half *(&)[NUM_ACTIVE_CHANNELS]>(d_levels),
+                num_row_pixels,
+                num_rows,
+                row_stride_bytes);
+        }
+        return status;
+    }
+#endif
+
+
+    /**
+     * Calls the CUB multi histogram-even entrypoint once per timing iteration; returns the last status
+     */
+    template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
+    //CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Even(
+        int                     timing_timing_iterations,                   ///< [in] How many times to invoke the entrypoint
+        size_t                  * /*d_temp_storage_bytes*/,                 ///< Unused by this backend
+        cudaError_t             * /*d_cdp_error*/,                          ///< Unused by this backend
+
+        void*               d_temp_storage,                             ///< Temporary storage pointer, forwarded to CUB unchanged
+        size_t&             temp_storage_bytes,                         ///< Temporary storage size in bytes, forwarded to CUB by reference
+        SampleIteratorT     d_samples,                                  ///< [in] Multi-channel input samples, interleaved by channel (e.g., 32-bit pixels made of four RGBA 8-bit samples)
+        CounterT            *(&d_histogram)[NUM_ACTIVE_CHANNELS],       ///< [out] One histogram counter array per active channel; the array for channel i should hold num_levels[i] - 1 counters
+        int                 *num_levels,                                ///< [in] Number of bin boundaries (levels) per active channel; channel i therefore has num_levels[i] - 1 bins
+        LevelT              *lower_level,                               ///< [in] Per active channel: inclusive lower sample bound of the lowest bin
+        LevelT              *upper_level,                               ///< [in] Per active channel: exclusive upper sample bound of the highest bin
+        OffsetT             num_row_pixels,                             ///< [in] Multi-channel pixels per row of the region of interest
+        OffsetT             num_rows,                                   ///< [in] Rows in the region of interest
+        OffsetT             row_stride_bytes)                           ///< [in] Bytes between the starts of consecutive rows of the region of interest
+    {
+        cudaError_t status = cudaSuccess;
+        for (int remaining = timing_timing_iterations; remaining > 0; --remaining)
+        {
+            status = DeviceHistogram::MultiHistogramEven<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
+                d_temp_storage,
+                temp_storage_bytes,
+                d_samples,
+                d_histogram,
+                num_levels,
+                lower_level,
+                upper_level,
+                num_row_pixels,
+                num_rows,
+                row_stride_bytes);
+        }
+        return status;
+    }
+
+#if TEST_HALF_T
+    /**
+     * half_t overload of Even: reinterprets the samples and both bounds as __half before dispatching
+     */
+    template <typename CounterT, typename OffsetT>
+    //CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Even(
+        int                     timing_timing_iterations,                   ///< [in] How many times to invoke the entrypoint
+        size_t                  * /*d_temp_storage_bytes*/,                 ///< Unused by this backend
+        cudaError_t             * /*d_cdp_error*/,                          ///< Unused by this backend
+
+        void*               d_temp_storage,                             ///< Temporary storage pointer, forwarded to CUB unchanged
+        size_t&             temp_storage_bytes,                         ///< Temporary storage size in bytes, forwarded to CUB by reference
+        half_t              *d_samples,                                 ///< [in] Multi-channel half_t input samples, interleaved by channel; passed on as __half*
+        CounterT            *(&d_histogram)[NUM_ACTIVE_CHANNELS],       ///< [out] One histogram counter array per active channel; the array for channel i should hold num_levels[i] - 1 counters
+        int                 *num_levels,                                ///< [in] Number of bin boundaries (levels) per active channel; channel i therefore has num_levels[i] - 1 bins
+        half_t              *lower_level,                               ///< [in] Per active channel: inclusive lower sample bound of the lowest bin; passed on as __half*
+        half_t              *upper_level,                               ///< [in] Per active channel: exclusive upper sample bound of the highest bin; passed on as __half*
+        OffsetT             num_row_pixels,                             ///< [in] Multi-channel pixels per row of the region of interest
+        OffsetT             num_rows,                                   ///< [in] Rows in the region of interest
+        OffsetT             row_stride_bytes)                           ///< [in] Bytes between the starts of consecutive rows of the region of interest
+    {
+        cudaError_t status = cudaSuccess;
+        for (int remaining = timing_timing_iterations; remaining > 0; --remaining)
+        {
+            status = DeviceHistogram::MultiHistogramEven<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
+                d_temp_storage,
+                temp_storage_bytes,
+                reinterpret_cast<__half*>(d_samples),
+                d_histogram,
+                num_levels,
+                reinterpret_cast<__half*>(lower_level),
+                reinterpret_cast<__half*>(upper_level),
+                num_row_pixels,
+                num_rows,
+                row_stride_bytes);
+        }
+        return status;
+    }
+#endif
+};
+
+
+template <>
+struct Dispatch<1, 1, CUB>
+{
+
+    /**
+     * Calls the CUB single histogram-range entrypoint once per timing iteration; returns the last status
+     */
+    template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
+    //CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Range(
+        int                     timing_timing_iterations,               ///< [in] How many times to invoke the entrypoint
+        size_t                  * /*d_temp_storage_bytes*/,             ///< Unused by this backend
+        cudaError_t             * /*d_cdp_error*/,                      ///< Unused by this backend
+
+        void*               d_temp_storage,                         ///< Temporary storage pointer, forwarded to CUB unchanged
+        size_t&             temp_storage_bytes,                     ///< Temporary storage size in bytes, forwarded to CUB by reference
+        SampleIteratorT     d_samples,                              ///< [in] Input sequence of data samples
+        CounterT*           (&d_histogram)[1],                      ///< [out] One-element array of histogram counter arrays; element 0 is forwarded and should hold num_levels[0] - 1 counters
+        int                 *num_levels,                            ///< [in] Number of bin boundaries (levels); only num_levels[0] is forwarded, giving num_levels[0] - 1 bins
+        LevelT              (&d_levels)[1],                         ///< [in] One-element array whose element 0 is forwarded as the boundaries (levels); consecutive boundaries delimit a bin, lower inclusive and upper exclusive
+        OffsetT             num_row_pixels,                         ///< [in] Pixels per row of the region of interest
+        OffsetT             num_rows,                               ///< [in] Rows in the region of interest
+        OffsetT             row_stride_bytes)                       ///< [in] Bytes between the starts of consecutive rows of the region of interest
+    {
+        cudaError_t status = cudaSuccess;
+        for (int remaining = timing_timing_iterations; remaining > 0; --remaining)
+        {
+            status = DeviceHistogram::HistogramRange(
+                d_temp_storage,
+                temp_storage_bytes,
+                d_samples,
+                d_histogram[0],
+                num_levels[0],
+                d_levels[0],
+                num_row_pixels,
+                num_rows,
+                row_stride_bytes);
+        }
+        return status;
+    }
+
+#if TEST_HALF_T
+    template <typename CounterT, typename OffsetT>
+    //CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Range(
+        int                     timing_timing_iterations,               ///< [in] Number of times the entrypoint is invoked
+        size_t                  */*d_temp_storage_bytes*/,              ///< Unnamed and unused in this overload
+        cudaError_t             */*d_cdp_error*/,                       ///< Unnamed and unused in this overload
+
+        void*               d_temp_storage,                         ///< Forwarded to HistogramRange unchanged
+        size_t&             temp_storage_bytes,                     ///< Forwarded to HistogramRange by reference
+        half_t              *d_samples,                              ///< [in] The pointer to the input sequence of half_t data samples; reinterpreted as __half* for the call.
+        CounterT*           (&d_histogram)[1],                      ///< [out] One-element array of histogram counter arrays; only d_histogram[0] is forwarded, and its allocation length should be <tt>num_levels[0]</tt> - 1.
+        int                 *num_levels,                            ///< [in] The number of boundaries (levels) for delineating histogram samples; only num_levels[0] is forwarded, implying <tt>num_levels[0]</tt> - 1 bins.
+        half_t              (&d_levels)[1],                         ///< [in] NOTE(review): declared as a reference to an array of ONE half_t value, whereas the multi-channel half_t overload takes an array of level pointers (half_t *(&)[N]) — confirm this is intended.
+        OffsetT             num_row_pixels,                         ///< [in] The number of multi-channel pixels per row in the region of interest
+        OffsetT             num_rows,                               ///< [in] The number of rows in the region of interest
+        OffsetT             row_stride_bytes)                       ///< [in] The number of bytes between starts of consecutive rows in the region of interest
+    { // half_t overload of Range: the samples are reinterpreted as __half before calling HistogramRange
+        cudaError_t error = cudaSuccess;
+        for (int i = 0; i < timing_timing_iterations; ++i)
+        {
+            error = DeviceHistogram::HistogramRange(
+                d_temp_storage,
+                temp_storage_bytes,
+                reinterpret_cast<__half*>(d_samples),
+                d_histogram[0],
+                num_levels[0],
+                d_levels[0].operator __half(), // NOTE(review): converts one half_t level to __half and passes it by value — verify against HistogramRange's d_levels parameter
+                num_row_pixels,
+                num_rows,
+                row_stride_bytes);
+        }
+        return error; // status of the last call, or cudaSuccess when no iteration ran
+    }
+#endif
+
+
+    /**
+     * Calls the CUB single histogram-even entrypoint once per timing iteration; returns the last status
+     */
+    template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
+    //CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Even(
+        int                     timing_timing_iterations,               ///< [in] How many times to invoke the entrypoint
+        size_t                  * /*d_temp_storage_bytes*/,             ///< Unused by this backend
+        cudaError_t             * /*d_cdp_error*/,                      ///< Unused by this backend
+
+        void*               d_temp_storage,                         ///< Temporary storage pointer, forwarded to CUB unchanged
+        size_t&             temp_storage_bytes,                     ///< Temporary storage size in bytes, forwarded to CUB by reference
+        SampleIteratorT     d_samples,                              ///< [in] Input sequence of data samples
+        CounterT*           (&d_histogram)[1],                      ///< [out] One-element array of histogram counter arrays; element 0 is forwarded and should hold num_levels[0] - 1 counters
+        int                 *num_levels,                            ///< [in] Number of bin boundaries (levels); only num_levels[0] is forwarded, giving num_levels[0] - 1 bins
+        LevelT              *lower_level,                           ///< [in] lower_level[0] is forwarded as the inclusive lower sample bound of the lowest bin
+        LevelT              *upper_level,                           ///< [in] upper_level[0] is forwarded as the exclusive upper sample bound of the highest bin
+        OffsetT             num_row_pixels,                         ///< [in] Pixels per row of the region of interest
+        OffsetT             num_rows,                               ///< [in] Rows in the region of interest
+        OffsetT             row_stride_bytes)                       ///< [in] Bytes between the starts of consecutive rows of the region of interest
+    {
+        cudaError_t status = cudaSuccess;
+        for (int remaining = timing_timing_iterations; remaining > 0; --remaining)
+        {
+            status = DeviceHistogram::HistogramEven(
+                d_temp_storage,
+                temp_storage_bytes,
+                d_samples,
+                d_histogram[0],
+                num_levels[0],
+                lower_level[0],
+                upper_level[0],
+                num_row_pixels,
+                num_rows,
+                row_stride_bytes);
+        }
+        return status;
+    }
+
+#if TEST_HALF_T
+    /**
+     * Single-channel histogram-even dispatch for half_t samples and levels.
+     *
+     * Same forwarding loop as the generic overload, except that the sample
+     * pointer is reinterpreted as __half* and the level bounds are converted
+     * through half_t's __half conversion operator before the call.
+     * Returns the status of the last call, or cudaSuccess when the iteration
+     * count is not positive.
+     */
+    template <typename CounterT, typename OffsetT>
+    //CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Even(
+        int                     timing_timing_iterations,               ///< [in] Number of times the histogram call is issued
+        size_t                  */*d_temp_storage_bytes*/,              ///< Unused by this overload
+        cudaError_t             */*d_cdp_error*/,                       ///< Unused by this overload
+
+        void*               d_temp_storage,                             ///< Temporary storage pointer, forwarded unchanged
+        size_t&             temp_storage_bytes,                         ///< Temporary storage size, forwarded by reference
+        half_t              *d_samples,                                 ///< [in] Input samples; reinterpreted as __half* for the call
+        CounterT*           (&d_histogram)[1],                          ///< Array holding the single histogram pointer; element 0 is forwarded
+        int                 *num_levels,                                ///< [in] Level counts; element 0 is forwarded
+        half_t              *lower_level,                               ///< [in] Lower bounds; element 0 is converted to __half and forwarded
+        half_t              *upper_level,                               ///< [in] Upper bounds; element 0 is converted to __half and forwarded
+        OffsetT             num_row_pixels,                             ///< [in] The number of pixels per row in the region of interest
+        OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
+        OffsetT             row_stride_bytes)                           ///< [in] The number of bytes between starts of consecutive rows in the region of interest
+    {
+        cudaError_t status    = cudaSuccess;
+        int         remaining = timing_timing_iterations;
+
+        // Each pass overwrites the status, so only the last call's result is reported
+        while (remaining > 0)
+        {
+            status = DeviceHistogram::HistogramEven(
+                d_temp_storage,
+                temp_storage_bytes,
+                reinterpret_cast<__half*>(d_samples),
+                d_histogram[0],
+                num_levels[0],
+                lower_level[0].operator __half(),
+                upper_level[0].operator __half(),
+                num_row_pixels,
+                num_rows,
+                row_stride_bytes);
+            --remaining;
+        }
+        return status;
+    }
+#endif
+
+};
+
+
+//---------------------------------------------------------------------
+// Test generation
+//---------------------------------------------------------------------
+
+// Locates the bin of a sample within a list of bin-boundary levels
+// (the search uses std::upper_bound, so the list must be ordered accordingly)
+template <typename LevelT>
+struct SearchTransform
+{
+    LevelT          *levels;      // Pointer to levels array
+    int             num_levels;   // Number of levels in array
+
+    // Functor for converting samples to bin-ids.
+    // A sample below the first level yields num_levels. A sample at or above
+    // the last level yields num_levels - 1, which is one past the last bin.
+    template <typename SampleT>
+    int operator()(SampleT sample)
+    {
+        LevelT *levels_end  = levels + num_levels;
+        LevelT *first_above = std::upper_bound(levels, levels_end, (LevelT) sample);
+
+        // The bin is the slot just before the first boundary greater than the sample
+        int bin = int(first_above - levels - 1);
+        return (bin < 0) ? num_levels : bin;
+    }
+};
+
+// Template to scale samples to evenly-spaced bins.
+// Declaration only: the definitions are the partial specializations that follow,
+// each selected through the defaulted second template parameter.
+template <typename LevelT, typename = void>
+struct ScaleTransform;
+
+// [Integral types] Maps samples onto evenly-spaced bins using 64-bit integer arithmetic
+template <typename LevelT>
+struct ScaleTransform<LevelT,
+                      typename ::cuda::std::enable_if<::cuda::std::is_integral<LevelT>::value>::type>
+{
+  int num_levels; // Number of bin boundaries (one more than the number of bins)
+  LevelT max;     // Upper sample bound (exclusive)
+  LevelT min;     // Lower sample bound (inclusive)
+
+  // Records the level count and the [min_, max_) sample range
+  void Init(int num_levels_, // Number of levels in array
+            LevelT max_,     // Max sample level (exclusive)
+            LevelT min_)     // Min sample level (inclusive)
+  {
+    num_levels = num_levels_;
+    max        = max_;
+    min        = min_;
+  }
+
+  // Converts a sample to its bin-id; num_levels is returned when the sample lies outside [min, max)
+  template <typename SampleT>
+  int operator()(SampleT sample)
+  {
+    if ((sample < min) || (sample >= max))
+    {
+      return num_levels;
+    }
+
+    // bin = (sample - min) * num_bins / (max - min), evaluated in uint64_t,
+    // following the arithmetic we guarantee in the HistoEven docs
+    const uint64_t offset   = static_cast<uint64_t>(sample - min);
+    const uint64_t num_bins = static_cast<uint64_t>(num_levels - 1);
+    const uint64_t range    = static_cast<uint64_t>(max - min);
+
+    return static_cast<int>((offset * num_bins) / range);
+  }
+};
+
+// [[Extended] floating point types] Maps samples onto evenly-spaced bins using a precomputed scale
+template <typename LevelT>
+struct ScaleTransform<LevelT,
+                      typename ::cuda::std::enable_if<::cuda::std::is_floating_point<LevelT>::value
+#if TEST_HALF_T
+                                                      || ::cuda::std::is_same<LevelT, half_t>::value
+#endif
+                                                      >::type>
+{
+  int num_levels; // Number of bin boundaries (one more than the number of bins)
+  LevelT max;     // Upper sample bound (exclusive)
+  LevelT min;     // Lower sample bound (inclusive)
+  LevelT scale;   // Reciprocal of the width of one bin
+
+  // Records the level count and sample range, and derives the bin scaling factor
+  void Init(int _num_levels, // Number of levels in array
+            LevelT _max,     // Max sample level (exclusive)
+            LevelT _min)     // Min sample level (inclusive)
+  {
+    num_levels = _num_levels;
+    max        = _max;
+    min        = _min;
+
+    // Width of a single bin: (max - min) split across num_levels - 1 bins
+    const LevelT bin_width =
+      static_cast<LevelT>((max - min) / static_cast<LevelT>(num_levels - 1));
+    scale = LevelT{1.0f} / bin_width;
+  }
+
+  // Converts a sample to its bin-id; num_levels is returned when the sample lies outside [min, max)
+  template <typename SampleT>
+  int operator()(SampleT sample)
+  {
+    if ((sample < min) || (sample >= max))
+    {
+      return num_levels;
+    }
+
+    // Distance from the lower bound, measured in bins and truncated to an integer
+    const auto offset = ((float)sample) - min;
+    return (int)(offset * scale);
+  }
+};
+
+/**
+ * Generate sample
+ *
+ * Obtains an unsigned word from RandomBits, normalizes it by the largest
+ * unsigned int value, multiplies the resulting fraction by max_level and
+ * stores the product, cast to T, in datum.
+ */
+template <typename T, typename LevelT>
+void Sample(T &datum, LevelT max_level, int entropy_reduction)
+{
+    // All-ones word: the divisor used to normalize the random bits
+    const unsigned int max_bits = (unsigned int) -1;
+
+    unsigned int random_bits;
+    RandomBits(random_bits, entropy_reduction);
+
+    const float unit_fraction = (float(random_bits) / max_bits);
+    datum = (T) (unit_fraction * max_level);
+}
+
+
+/**
+ * Initialize histogram samples
+ *
+ * Fills the active channels of every pixel in the region of interest with a
+ * value produced by Sample(). Channels at or beyond NUM_ACTIVE_CHANNELS are
+ * left untouched. When g_verbose_input is set, each generated value is
+ * printed, comma-separated within a pixel.
+ */
+template <
+    int             NUM_CHANNELS,
+    int             NUM_ACTIVE_CHANNELS,
+    typename        LevelT,
+    typename        SampleT,
+    typename        OffsetT>
+void InitializeSamples(
+    LevelT          max_level,              ///< [in] Scale passed to Sample() for every generated value
+    int             entropy_reduction,      ///< [in] Entropy reduction passed to Sample()
+    SampleT         *h_samples,             ///< [out] Host sample buffer to fill
+    OffsetT         num_row_pixels,         ///< [in] The number of multi-channel pixels per row in the region of interest
+    OffsetT         num_rows,               ///< [in] The number of rows in the region of interest
+    OffsetT         row_stride_bytes)       ///< [in] The number of bytes between starts of consecutive rows in the region of interest
+{
+    // Row pitch expressed in samples rather than bytes
+    const auto row_stride_samples = row_stride_bytes / sizeof(SampleT);
+
+    for (OffsetT row = 0; row < num_rows; ++row)
+    {
+        for (OffsetT pixel = 0; pixel < num_row_pixels; ++pixel)
+        {
+            for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+            {
+                // Position of this channel's sample within the interleaved buffer
+                OffsetT offset = (row * row_stride_samples) + (pixel * NUM_CHANNELS) + channel;
+                SampleT &sample = h_samples[offset];
+
+                Sample(sample, max_level, entropy_reduction);
+
+                if (!g_verbose_input)
+                {
+                    continue;
+                }
+
+                if (channel > 0) printf(", ");
+                std::cout << CoutCast(sample);
+            }
+        }
+    }
+}
+
+
+/**
+ * Initialize histogram solutions
+ *
+ * Zeroes the num_levels[c] - 1 bins of every active channel, then visits each
+ * (row, pixel, channel) sample, maps it to a bin-id with transform_op[channel]
+ * and increments that bin when the id lies in [0, num_levels[channel] - 1).
+ * Ids outside that interval are ignored. When g_verbose_input is set, the
+ * bin-id computed for every sample is printed as well.
+ */
+template <
+    int             NUM_CHANNELS,
+    int             NUM_ACTIVE_CHANNELS,
+    typename        CounterT,
+    typename        SampleIteratorT,
+    typename        TransformOp,
+    typename        OffsetT>
+void InitializeBins(
+    SampleIteratorT h_samples,                              ///< [in] Host samples, indexed as an interleaved multi-channel sequence
+    int             num_levels[NUM_ACTIVE_CHANNELS],        ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
+    TransformOp     transform_op[NUM_ACTIVE_CHANNELS],      ///< [in] Per-channel functors converting a sample to a bin-id.
+    CounterT        *h_histogram[NUM_ACTIVE_CHANNELS],      ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>h_histogram[i]</tt> should be <tt>num_levels[i]</tt> - 1.
+    OffsetT         num_row_pixels,                         ///< [in] The number of multi-channel pixels per row in the region of interest
+    OffsetT         num_rows,                               ///< [in] The number of rows in the region of interest
+    OffsetT         row_stride_bytes)                       ///< [in] The number of bytes between starts of consecutive rows in the region of interest
+{
+    using SampleT = cub::detail::value_t<SampleIteratorT>;
+
+    // Init bins
+    for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+    {
+        for (int bin = 0; bin < num_levels[CHANNEL] - 1; ++bin)
+        {
+            h_histogram[CHANNEL][bin] = 0;
+        }
+    }
+
+    // Accumulate samples into bins
+    if (g_verbose_input) printf("Samples: \n");
+    for (OffsetT row = 0; row < num_rows; ++row)
+    {
+        for (OffsetT pixel = 0; pixel < num_row_pixels; ++pixel)
+        {
+            if (g_verbose_input) printf("[");
+            for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+            {
+                // Sample offset
+                OffsetT offset = (row * (row_stride_bytes / sizeof(SampleT))) + (pixel * NUM_CHANNELS) + channel;
+
+                // Update sample bin
+                int bin = transform_op[channel](h_samples[offset]);
+                if (g_verbose_input)
+                {
+                    // Flush only when something was printed; the flush used to
+                    // sit outside the condition and ran once per sample
+                    printf(" (%d)", bin);
+                    fflush(stdout);
+                }
+                if ((bin >= 0) && (bin < num_levels[channel] - 1))
+                {
+                    // valid bin
+                    h_histogram[channel][bin]++;
+                }
+            }
+            if (g_verbose_input) printf("]");
+        }
+        if (g_verbose_input) printf("\n\n");
+    }
+}
+
+
+
+/**
+ * Test histogram-even
+ *
+ * Runs the histogram-even dispatch on d_samples and compares the device
+ * histograms with a host reference computed from h_samples:
+ *   1. Print the test configuration.
+ *   2. Build the per-channel reference histograms on the host with ScaleTransform.
+ *   3. Allocate zeroed device histograms and the CDP scratch arrays.
+ *   4. Call the dispatch with a NULL temp-storage pointer, then allocate
+ *      temp_storage_bytes plus a canary zone on either side of it.
+ *   5. Run one correctness iteration, check that both canary zones are intact
+ *      and compare each channel against the reference.
+ *   6. Time g_timing_iterations further iterations and print throughput.
+ *   7. Release every host and device allocation, then assert on the result.
+ */
+template <
+    Backend         BACKEND,
+    int             NUM_CHANNELS,
+    int             NUM_ACTIVE_CHANNELS,
+    typename        SampleT,
+    typename        CounterT,
+    typename        LevelT,
+    typename        OffsetT,
+    typename        SampleIteratorT>
+void TestEven(
+    LevelT          max_level,
+    int             entropy_reduction,
+    int             num_levels[NUM_ACTIVE_CHANNELS],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
+    LevelT          lower_level[NUM_ACTIVE_CHANNELS],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
+    LevelT          upper_level[NUM_ACTIVE_CHANNELS],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
+    OffsetT         num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
+    OffsetT         num_rows,                                   ///< [in] The number of rows in the region of interest
+    OffsetT         row_stride_bytes,                           ///< [in] The number of bytes between starts of consecutive rows in the region of interest
+    SampleIteratorT h_samples,                                  ///< [in] Host-side samples used to compute the reference histograms
+    SampleIteratorT d_samples)                                  ///< [in] Samples handed to the device histogram dispatch
+{
+    OffsetT total_samples = num_rows * (row_stride_bytes / sizeof(SampleT));
+
+    printf("\n----------------------------\n");
+    printf("%s cub::DeviceHistogram::Even (%s) "
+           "%d pixels (%d height, %d width, %d-byte row stride), "
+           "%d %d-byte %s samples (entropy reduction %d), "
+           "%s levels, %s counters, %d/%d channels, max sample ",
+        (BACKEND == CDP) ? "CDP CUB" : "CUB",
+        (std::is_pointer<SampleIteratorT>::value) ? "pointer" : "iterator",
+        (int) (num_row_pixels * num_rows),
+        (int) num_rows,
+        (int) num_row_pixels,
+        (int) row_stride_bytes,
+        (int) total_samples,
+        (int) sizeof(SampleT),
+        typeid(SampleT).name(),
+        entropy_reduction,
+        typeid(LevelT).name(),
+        typeid(CounterT).name(),
+        NUM_ACTIVE_CHANNELS,
+        NUM_CHANNELS);
+    std::cout << CoutCast(max_level) << "\n";
+    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+    {
+        std::cout << "\tChannel " << channel << ": "
+                  << num_levels[channel] - 1 << " bins "
+                  << "[" << lower_level[channel] << ", "
+                  << upper_level[channel] << ")\n";
+    }
+    fflush(stdout);
+
+    // Allocate and initialize host and device data
+
+    CounterT*                   h_histogram[NUM_ACTIVE_CHANNELS];
+    ScaleTransform<LevelT>      transform_op[NUM_ACTIVE_CHANNELS];
+
+    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+    {
+      int bins             = num_levels[channel] - 1;
+      h_histogram[channel] = new CounterT[bins];
+
+      transform_op[channel].Init(num_levels[channel], upper_level[channel], lower_level[channel]);
+    }
+
+    // Host reference solution
+    InitializeBins<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
+        h_samples, num_levels, transform_op, h_histogram, num_row_pixels, num_rows, row_stride_bytes);
+
+    // Allocate and initialize device data
+
+    CounterT* d_histogram[NUM_ACTIVE_CHANNELS];
+    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+    {
+        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_histogram[channel], sizeof(CounterT) * (num_levels[channel] - 1)));
+        CubDebugExit(cudaMemset(d_histogram[channel], 0, sizeof(CounterT) * (num_levels[channel] - 1)));
+    }
+
+    // Allocate CDP device arrays
+    size_t          *d_temp_storage_bytes = NULL;
+    cudaError_t     *d_cdp_error = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes,  sizeof(size_t) * 1));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error,           sizeof(cudaError_t) * 1));
+
+    // Allocate temporary storage
+    void            *d_temp_storage = NULL;
+    size_t          temp_storage_bytes = 0;
+
+    // First call is made with a NULL temp-storage pointer; temp_storage_bytes
+    // is passed by reference and used below to size the real allocation
+    Dispatch<NUM_ACTIVE_CHANNELS, NUM_CHANNELS, BACKEND>::Even(
+        1, d_temp_storage_bytes, d_cdp_error,
+        d_temp_storage, temp_storage_bytes,
+        d_samples, d_histogram, num_levels, lower_level, upper_level,
+        num_row_pixels, num_rows, row_stride_bytes);
+
+    // Allocate temporary storage with "canary" zones
+    int     canary_bytes    = 256;
+    char    canary_token    = 8;
+    char*   canary_zone     = new char[canary_bytes];
+
+    // The whole allocation is filled with the token; the usable region sits
+    // between a leading and a trailing zone of canary_bytes each
+    memset(canary_zone, canary_token, canary_bytes);
+    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes + (canary_bytes * 2)));
+    CubDebugExit(cudaMemset(d_temp_storage, canary_token, temp_storage_bytes + (canary_bytes * 2)));
+
+    // Run warmup/correctness iteration
+    Dispatch<NUM_ACTIVE_CHANNELS, NUM_CHANNELS, BACKEND>::Even(
+        1, d_temp_storage_bytes, d_cdp_error,
+        ((char *) d_temp_storage) + canary_bytes, temp_storage_bytes,
+        d_samples, d_histogram, num_levels, lower_level, upper_level,
+        num_row_pixels, num_rows, row_stride_bytes);
+
+    // Check canary zones
+    if (g_verbose)
+    {
+        printf("Checking leading temp_storage canary zone (token = %d)\n"
+               "------------------------------------------------------\n",
+               static_cast<int>(canary_token));
+    }
+    int error = CompareDeviceResults(canary_zone, (char *) d_temp_storage, canary_bytes, true, g_verbose);
+    AssertEquals(0, error);
+    if (g_verbose)
+    {
+        printf("Checking trailing temp_storage canary zone (token = %d)\n"
+               "-------------------------------------------------------\n",
+               static_cast<int>(canary_token));
+    }
+    error = CompareDeviceResults(canary_zone, ((char *) d_temp_storage) + canary_bytes + temp_storage_bytes, canary_bytes, true, g_verbose);
+    AssertEquals(0, error);
+
+    // Flush any stdout/stderr
+    CubDebugExit(cudaPeekAtLastError());
+    CubDebugExit(cudaDeviceSynchronize());
+    fflush(stdout);
+    fflush(stderr);
+
+    // Check for correctness (and display results, if specified)
+    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+    {
+        if (g_verbose)
+        {
+            printf("Checking histogram result (channel = %d)\n"
+                   "----------------------------------------\n",
+                   channel);
+        }
+        int channel_error = CompareDeviceResults(h_histogram[channel], d_histogram[channel], num_levels[channel] - 1, true, g_verbose);
+        printf("\tChannel %d %s", channel, channel_error ? "FAIL" : "PASS\n");
+        error |= channel_error;
+    }
+
+    // Performance
+    GpuTimer gpu_timer;
+    gpu_timer.Start();
+
+    Dispatch<NUM_ACTIVE_CHANNELS, NUM_CHANNELS, BACKEND>::Even(
+        g_timing_iterations, d_temp_storage_bytes, d_cdp_error,
+        ((char *) d_temp_storage) + canary_bytes, temp_storage_bytes,
+        d_samples, d_histogram, num_levels, lower_level, upper_level,
+        num_row_pixels, num_rows, row_stride_bytes);
+
+    gpu_timer.Stop();
+    float elapsed_millis = gpu_timer.ElapsedMillis();
+
+    // Display performance
+    if (g_timing_iterations > 0)
+    {
+        float avg_millis = elapsed_millis / g_timing_iterations;
+        float giga_rate = float(total_samples) / avg_millis / 1000.0f / 1000.0f;
+        float giga_bandwidth = giga_rate * sizeof(SampleT);
+        printf("\t%.3f avg ms, %.3f billion samples/s, %.3f billion bins/s, %.3f billion pixels/s, %.3f logical GB/s",
+            avg_millis,
+            giga_rate,
+            giga_rate * NUM_ACTIVE_CHANNELS / NUM_CHANNELS,
+            giga_rate / NUM_CHANNELS,
+            giga_bandwidth);
+    }
+
+    printf("\n\n");
+
+    // Cleanup
+    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+    {
+        if (h_histogram[channel])
+            delete[] h_histogram[channel];
+
+        if (d_histogram[channel])
+            CubDebugExit(g_allocator.DeviceFree(d_histogram[channel]));
+    }
+
+    // The host-side canary reference was previously never released
+    delete[] canary_zone;
+
+    if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
+    if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
+    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
+
+    // Correctness asserts
+    AssertEquals(0, error);
+}
+
+
+/**
+ * Test histogram-even (native pointer input)
+ *
+ * Generates random host samples with InitializeSamples, copies them into a
+ * freshly allocated device buffer of the same size, runs TestEven on the
+ * host/device pointer pair and finally releases both buffers.
+ */
+template <
+    Backend         BACKEND,
+    int             NUM_CHANNELS,
+    int             NUM_ACTIVE_CHANNELS,
+    typename        SampleT,
+    typename        CounterT,
+    typename        LevelT,
+    typename        OffsetT>
+void TestEvenNative(
+    LevelT          max_level,
+    int             entropy_reduction,
+    int             num_levels[NUM_ACTIVE_CHANNELS],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
+    LevelT          lower_level[NUM_ACTIVE_CHANNELS],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
+    LevelT          upper_level[NUM_ACTIVE_CHANNELS],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
+    OffsetT         num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
+    OffsetT         num_rows,                                   ///< [in] The number of rows in the region of interest
+    OffsetT         row_stride_bytes)                                 ///< [in] The number of bytes between starts of consecutive rows in the region of interest
+{
+    typedef SampleT Foo;        // rename type to quelch gcc warnings (bug?)
+
+    // Buffer extent: every row spans its full stride, padding included
+    const OffsetT total_samples = num_rows * (row_stride_bytes / sizeof(SampleT));
+    const auto    sample_bytes  = sizeof(SampleT) * total_samples;
+
+    // Host samples
+    SampleT* h_samples = new Foo[total_samples];
+    InitializeSamples<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
+        max_level, entropy_reduction, h_samples, num_row_pixels, num_rows, row_stride_bytes);
+
+    // Device copy of the samples
+    SampleT* d_samples = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_samples, sample_bytes));
+    CubDebugExit(cudaMemcpy(d_samples, h_samples, sample_bytes, cudaMemcpyHostToDevice));
+
+    TestEven<BACKEND, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleT, CounterT, LevelT, OffsetT>(
+        max_level, entropy_reduction, num_levels, lower_level, upper_level,
+        num_row_pixels, num_rows, row_stride_bytes,
+        h_samples, d_samples);
+
+    // Release both buffers (delete[] tolerates a null pointer, so no guard is needed)
+    delete[] h_samples;
+    if (d_samples) CubDebugExit(g_allocator.DeviceFree(d_samples));
+}
+
+
+/**
+ * Test histogram-even (iterator input)
+ *
+ * Overload chosen when the first argument is Int2Type<false>. Wraps the value
+ * lower_level[0], cast to SampleT, in a ConstantInputIterator and passes that
+ * one iterator to TestEven as both the host and the device sample source.
+ */
+template <
+    Backend         BACKEND,
+    int             NUM_CHANNELS,
+    int             NUM_ACTIVE_CHANNELS,
+    typename        SampleT,
+    typename        CounterT,
+    typename        LevelT,
+    typename        OffsetT>
+void TestEvenIterator(
+    Int2Type<false> /*is_half*/,
+    LevelT          max_level,
+    int             entropy_reduction,
+    int             num_levels[NUM_ACTIVE_CHANNELS],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
+    LevelT          lower_level[NUM_ACTIVE_CHANNELS],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
+    LevelT          upper_level[NUM_ACTIVE_CHANNELS],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
+    OffsetT         num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
+    OffsetT         num_rows,                                   ///< [in] The number of rows in the region of interest
+    OffsetT         row_stride_bytes)                                 ///< [in] The number of bytes between starts of consecutive rows in the region of interest
+{
+    // Every dereference of the iterator yields this one value
+    SampleT constant_sample = (SampleT) lower_level[0];
+    ConstantInputIterator<SampleT> constant_itr(constant_sample);
+
+    TestEven<BACKEND, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleT, CounterT, LevelT, OffsetT>(
+        max_level, entropy_reduction, num_levels, lower_level, upper_level,
+        num_row_pixels, num_rows, row_stride_bytes,
+        constant_itr, constant_itr);
+}
+
+/**
+ * Test histogram-even (iterator input), overload taking Int2Type<true>.
+ *
+ * The body is intentionally empty and every parameter is unnamed and ignored,
+ * so calling this overload performs no test.
+ */
+template <Backend,
+          int,
+          int NUM_ACTIVE_CHANNELS,
+          typename,
+          typename,
+          typename LevelT,
+          typename OffsetT>
+void TestEvenIterator(Int2Type<true> /*is_half*/,
+                      LevelT,
+                      int,
+                      int[NUM_ACTIVE_CHANNELS],
+                      LevelT[NUM_ACTIVE_CHANNELS],
+                      LevelT[NUM_ACTIVE_CHANNELS],
+                      OffsetT,
+                      OffsetT,
+                      OffsetT)
+{
+  // We have to reinterpret cast `half_t *` pointer to `__half *` in this test.
+  // Hence, iterators testing is not supported.
+}
+
+/**
+ * Test histogram-range
+ */
+template <
+    Backend         BACKEND,
+    int             NUM_CHANNELS,
+    int             NUM_ACTIVE_CHANNELS,
+    typename        SampleT,
+    typename        CounterT,
+    typename        LevelT,
+    typename        OffsetT>
+void TestRange(
+    LevelT          max_level,
+    int             entropy_reduction,
+    int             num_levels[NUM_ACTIVE_CHANNELS],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
+    LevelT*         levels[NUM_ACTIVE_CHANNELS],                ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
+    OffsetT         num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
+    OffsetT         num_rows,                                   ///< [in] The number of rows in the region of interest
+    OffsetT         row_stride_bytes)                                 ///< [in] The number of bytes between starts of consecutive rows in the region of interest
+{
+    OffsetT total_samples = num_rows * (row_stride_bytes / sizeof(SampleT));
+
+    printf("\n----------------------------\n");
+    printf("%s cub::DeviceHistogram::Range %d pixels "
+           "(%d height, %d width, %d-byte row stride), "
+           "%d %d-byte %s samples (entropy reduction %d), "
+           "%s levels, %s counters, %d/%d channels, max sample ",
+           (BACKEND == CDP) ? "CDP CUB" : "CUB",
+           (int)(num_row_pixels * num_rows),
+           (int)num_rows,
+           (int)num_row_pixels,
+           (int)row_stride_bytes,
+           (int)total_samples,
+           (int)sizeof(SampleT),
+           typeid(SampleT).name(),
+           entropy_reduction,
+           typeid(LevelT).name(),
+           typeid(CounterT).name(),
+           NUM_ACTIVE_CHANNELS,
+           NUM_CHANNELS);
+    std::cout << CoutCast(max_level) << "\n";
+    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+    {
+        printf("Channel %d: %d bins", channel, num_levels[channel] - 1);
+        if (g_verbose)
+        {
+            std::cout << "[ " << levels[channel][0];
+            for (int level = 1; level < num_levels[channel]; ++level)
+            {
+                std::cout << ", " << levels[channel][level];
+            }
+            printf("]");
+        }
+        printf("\n");
+    }
+    fflush(stdout);
+
+    // Allocate and initialize host and device data
+    typedef SampleT Foo;        // rename type to quelch gcc warnings (bug?)
+    SampleT*                    h_samples = new Foo[total_samples];
+    CounterT*                   h_histogram[NUM_ACTIVE_CHANNELS];
+    SearchTransform<LevelT>     transform_op[NUM_ACTIVE_CHANNELS];
+
+    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+    {
+        transform_op[channel].levels = levels[channel];
+        transform_op[channel].num_levels = num_levels[channel];
+
+        int bins = num_levels[channel] - 1;
+        h_histogram[channel] = new CounterT[bins];
+    }
+
+    InitializeSamples<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
+        max_level, entropy_reduction, h_samples, num_row_pixels, num_rows, row_stride_bytes);
+
+    InitializeBins<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
+        h_samples, num_levels, transform_op, h_histogram, num_row_pixels, num_rows, row_stride_bytes);
+
+    // Allocate and initialize device data
+    SampleT*        d_samples = NULL;
+    LevelT*         d_levels[NUM_ACTIVE_CHANNELS];
+    CounterT*       d_histogram[NUM_ACTIVE_CHANNELS];
+
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_samples, sizeof(SampleT) * total_samples));
+    CubDebugExit(cudaMemcpy(d_samples, h_samples, sizeof(SampleT) * total_samples, cudaMemcpyHostToDevice));
+
+    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+    {
+        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_levels[channel], sizeof(LevelT) * num_levels[channel]));
+        CubDebugExit(cudaMemcpy(d_levels[channel], levels[channel],         sizeof(LevelT) * num_levels[channel], cudaMemcpyHostToDevice));
+
+        int bins = num_levels[channel] - 1;
+        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_histogram[channel],  sizeof(CounterT) * bins));
+        CubDebugExit(cudaMemset(d_histogram[channel], 0,                        sizeof(CounterT) * bins));
+    }
+
+    // Allocate CDP device arrays
+    size_t          *d_temp_storage_bytes = NULL;
+    cudaError_t     *d_cdp_error = NULL;
+
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes,  sizeof(size_t) * 1));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error,           sizeof(cudaError_t) * 1));
+
+    // Allocate temporary storage
+    void            *d_temp_storage = NULL;
+    size_t          temp_storage_bytes = 0;
+
+    Dispatch<NUM_ACTIVE_CHANNELS, NUM_CHANNELS, BACKEND>::Range(
+        1, d_temp_storage_bytes, d_cdp_error,
+        d_temp_storage, temp_storage_bytes,
+        d_samples,
+        d_histogram,
+        num_levels, d_levels,
+        num_row_pixels, num_rows, row_stride_bytes);
+
+    // Allocate temporary storage with "canary" zones
+    int     canary_bytes    = 256;
+    char    canary_token    = 9;
+    char*   canary_zone     = new char[canary_bytes];
+
+    memset(canary_zone, canary_token, canary_bytes);
+    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes + (canary_bytes * 2)));
+    CubDebugExit(cudaMemset(d_temp_storage, canary_token, temp_storage_bytes + (canary_bytes * 2)));
+
+    // Run warmup/correctness iteration
+    Dispatch<NUM_ACTIVE_CHANNELS, NUM_CHANNELS, BACKEND>::Range(
+        1, d_temp_storage_bytes, d_cdp_error,
+        ((char *) d_temp_storage) + canary_bytes, temp_storage_bytes,
+        d_samples,
+        d_histogram,
+        num_levels, d_levels,
+        num_row_pixels, num_rows, row_stride_bytes);
+
+    // Check canary zones
+    int error = CompareDeviceResults(canary_zone, (char *) d_temp_storage, canary_bytes, true, g_verbose);
+    AssertEquals(0, error);
+    error = CompareDeviceResults(canary_zone, ((char *) d_temp_storage) + canary_bytes + temp_storage_bytes, canary_bytes, true, g_verbose);
+    AssertEquals(0, error);
+
+    // Flush any stdout/stderr
+    CubDebugExit(cudaPeekAtLastError());
+    CubDebugExit(cudaDeviceSynchronize());
+    fflush(stdout);
+    fflush(stderr);
+
+    // Check for correctness (and display results, if specified)
+    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+    {
+        int channel_error = CompareDeviceResults(h_histogram[channel], d_histogram[channel], num_levels[channel] - 1, true, g_verbose);
+        printf("\tChannel %d %s", channel, channel_error ? "FAIL" : "PASS\n");
+        error |= channel_error;
+    }
+
+    // Performance
+    GpuTimer gpu_timer;
+    gpu_timer.Start();
+
+    Dispatch<NUM_ACTIVE_CHANNELS, NUM_CHANNELS, BACKEND>::Range(
+        g_timing_iterations, d_temp_storage_bytes, d_cdp_error,
+        ((char *) d_temp_storage) + canary_bytes, temp_storage_bytes,
+        d_samples,
+        d_histogram,
+        num_levels, d_levels,
+        num_row_pixels, num_rows, row_stride_bytes);
+
+    gpu_timer.Stop();
+    float elapsed_millis = gpu_timer.ElapsedMillis();
+
+    // Display performance
+    if (g_timing_iterations > 0)
+    {
+        float avg_millis = elapsed_millis / g_timing_iterations;
+        float giga_rate = float(total_samples) / avg_millis / 1000.0f / 1000.0f;
+        float giga_bandwidth = giga_rate * sizeof(SampleT);
+        printf("\t%.3f avg ms, %.3f billion samples/s, %.3f billion bins/s, %.3f billion pixels/s, %.3f logical GB/s",
+            avg_millis,
+            giga_rate,
+            giga_rate * NUM_ACTIVE_CHANNELS / NUM_CHANNELS,
+            giga_rate / NUM_CHANNELS,
+            giga_bandwidth);
+    }
+
+    printf("\n\n");
+
+    // Cleanup
+    if (h_samples) delete[] h_samples;
+
+    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+    {
+        if (h_histogram[channel])
+            delete[] h_histogram[channel];
+
+        if (d_histogram[channel])
+            CubDebugExit(g_allocator.DeviceFree(d_histogram[channel]));
+
+        if (d_levels[channel])
+            CubDebugExit(g_allocator.DeviceFree(d_levels[channel]));
+    }
+
+    if (d_samples) CubDebugExit(g_allocator.DeviceFree(d_samples));
+    if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
+    if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
+    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
+
+    // Correctness asserts
+    AssertEquals(0, error);
+}
+
+
+/**
+ * Test histogram-even: derive per-channel lower/upper levels, then run the pointer-based and iterator-based variants
+ */
+template <
+    Backend         BACKEND,
+    typename        SampleT,
+    int             NUM_CHANNELS,
+    int             NUM_ACTIVE_CHANNELS,
+    typename        CounterT,
+    typename        LevelT,
+    typename        OffsetT>
+void TestEven(
+    OffsetT         num_row_pixels,
+    OffsetT         num_rows,
+    OffsetT         row_stride_bytes,
+    int             entropy_reduction,
+    int             num_levels[NUM_ACTIVE_CHANNELS],    // per-channel level count (bins + 1)
+    LevelT          max_level,
+    int             max_num_levels)
+{
+    LevelT lower_level[NUM_ACTIVE_CHANNELS];
+    LevelT upper_level[NUM_ACTIVE_CHANNELS];
+
+    // Smallest level increment: max_level divided across the largest bin count (max_num_levels - 1)
+    int max_bins = max_num_levels - 1;
+    LevelT min_level_increment = max_level / static_cast<LevelT>(max_bins);
+
+    // Per channel, span num_bins increments centered (up to rounding) on max_level / 2
+    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+    {
+        int num_bins = num_levels[channel] - 1;
+        lower_level[channel] = static_cast<LevelT>((max_level - (static_cast<LevelT>(num_bins) * min_level_increment)) / static_cast<LevelT>(2));
+        upper_level[channel] = static_cast<LevelT>((max_level + (static_cast<LevelT>(num_bins) * min_level_increment)) / static_cast<LevelT>(2));
+    }
+
+    // Test pointer-based samples with the requested BACKEND
+    TestEvenNative<BACKEND, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleT, CounterT, LevelT, OffsetT>(
+        max_level, entropy_reduction, num_levels, lower_level, upper_level, num_row_pixels, num_rows, row_stride_bytes);
+
+    // Test iterator-based samples (always the CUB backend, regardless of BACKEND); the tag tells the callee whether SampleT is half_t
+    TestEvenIterator<CUB, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleT, CounterT, LevelT, OffsetT>(
+        Int2Type<std::is_same<SampleT, half_t>::value>{}, max_level, entropy_reduction, num_levels, lower_level, upper_level, num_row_pixels, num_rows, row_stride_bytes);
+}
+
+
+
+/**
+ * Test histogram-range: build explicit, evenly spaced level arrays per channel and run the range test on them
+ */
+template <
+    Backend         BACKEND,
+    typename        SampleT,
+    int             NUM_CHANNELS,
+    int             NUM_ACTIVE_CHANNELS,
+    typename        CounterT,
+    typename        LevelT,
+    typename        OffsetT>
+void TestRange(
+    OffsetT         num_row_pixels,
+    OffsetT         num_rows,
+    OffsetT         row_stride_bytes,
+    int             entropy_reduction,
+    int             num_levels[NUM_ACTIVE_CHANNELS],
+    LevelT          max_level,
+    int             max_num_levels)
+{
+    // Smallest level increment: max_level divided across the largest bin count (max_num_levels - 1)
+    int max_bins = max_num_levels - 1;
+    LevelT min_level_increment = max_level / static_cast<LevelT>(max_bins);
+    // Host-side level boundaries: one heap array per channel, released at the end of this function
+    LevelT* levels[NUM_ACTIVE_CHANNELS];
+    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+    {
+        levels[channel] = new LevelT[num_levels[channel]];
+        // First boundary: half of (max_level minus the total span of num_bins increments)
+        int num_bins = num_levels[channel] - 1;
+        LevelT lower_level = (max_level - static_cast<LevelT>(num_bins * min_level_increment)) / static_cast<LevelT>(2);
+        // Remaining boundaries step up from lower_level by one increment each
+        for (int level = 0; level < num_levels[channel]; ++level)
+            levels[channel][level] = lower_level + static_cast<LevelT>(level * min_level_increment);
+    }
+    // Run the overload that takes explicit per-channel level arrays
+    TestRange<BACKEND, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleT, CounterT, LevelT, OffsetT>(
+        max_level, entropy_reduction, num_levels, levels, num_row_pixels, num_rows, row_stride_bytes);
+    // Release the host level arrays
+    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+        delete[] levels[channel];
+
+}
+
+
+
+/**
+ * Test different entrypoints: runs the histogram-even and the histogram-range tests, both with the CUB backend
+ */
+template <
+    typename        SampleT,
+    int             NUM_CHANNELS,
+    int             NUM_ACTIVE_CHANNELS,
+    typename        CounterT,
+    typename        LevelT,
+    typename        OffsetT>
+void Test(
+    OffsetT         num_row_pixels,
+    OffsetT         num_rows,
+    OffsetT         row_stride_bytes,
+    int             entropy_reduction,
+    int             num_levels[NUM_ACTIVE_CHANNELS],
+    LevelT          max_level,
+    int             max_num_levels)
+{
+    TestEven<CUB, SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
+        num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, max_num_levels);
+    // Same arguments, this time through the explicit-levels (range) entrypoint
+    TestRange<CUB, SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
+        num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, max_num_levels);
+}
+
+
+/**
+ * Test different number of levels: gives every active channel its own level count and forwards to the entrypoint test
+ */
+template <
+    typename        SampleT,
+    int             NUM_CHANNELS,
+    int             NUM_ACTIVE_CHANNELS,
+    typename        CounterT,
+    typename        LevelT,
+    typename        OffsetT>
+void Test(
+    OffsetT         num_row_pixels,
+    OffsetT         num_rows,
+    OffsetT         row_stride_bytes,
+    int             entropy_reduction,
+    LevelT          max_level,
+    int             max_num_levels)
+{
+    int num_levels[NUM_ACTIVE_CHANNELS];
+
+    // Channel 0 gets max_num_levels; each following channel gets half of the previous channel's count, plus one
+    num_levels[0] = max_num_levels;
+    for (int channel = 1; channel < NUM_ACTIVE_CHANNELS; ++channel)
+    {
+        num_levels[channel] = (num_levels[channel - 1] / 2) + 1;
+    }
+    Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
+        num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, max_num_levels);
+}
+
+
+
+/**
+ * Test different entropy-levels: repeats the level-count test with entropy_reduction set to -1, 0 and 5
+ */
+template <
+    typename        SampleT,
+    int             NUM_CHANNELS,
+    int             NUM_ACTIVE_CHANNELS,
+    typename        CounterT,
+    typename        LevelT,
+    typename        OffsetT>
+void Test(
+    OffsetT         num_row_pixels,
+    OffsetT         num_rows,
+    OffsetT         row_stride_bytes,
+    LevelT          max_level,
+    int             max_num_levels)
+{
+    // entropy_reduction = -1 -> all samples == 0
+    Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
+        num_row_pixels, num_rows, row_stride_bytes, -1,  max_level, max_num_levels);
+    // entropy_reduction = 0 (NOTE(review): presumably the unreduced, most random sample distribution - confirm in the sample generator)
+    Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
+        num_row_pixels, num_rows, row_stride_bytes, 0,  max_level, max_num_levels);
+    // entropy_reduction = 5 (NOTE(review): presumably a more skewed sample distribution - confirm in the sample generator)
+    Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
+        num_row_pixels, num_rows, row_stride_bytes, 5,   max_level, max_num_levels);
+}
+
+
+/**
+ * Test different row strides: once with tightly packed rows and once with 13 samples of padding per row
+ */
+template <
+    typename        SampleT,
+    int             NUM_CHANNELS,
+    int             NUM_ACTIVE_CHANNELS,
+    typename        CounterT,
+    typename        LevelT,
+    typename        OffsetT>
+void Test(
+    OffsetT         num_row_pixels,
+    OffsetT         num_rows,
+    LevelT          max_level,
+    int             max_num_levels)
+{
+    OffsetT row_stride_bytes = num_row_pixels * NUM_CHANNELS * sizeof(SampleT); // bytes in one unpadded row
+
+    // No padding
+    Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
+        num_row_pixels, num_rows, row_stride_bytes, max_level, max_num_levels);
+
+    // Stride enlarged by the byte size of 13 samples
+    Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
+        num_row_pixels, num_rows, row_stride_bytes + (13 * sizeof(SampleT)), max_level, max_num_levels);
+}
+
+
+/**
+ * Test different problem sizes: empty images, a tiny image, a 1920x1080 image and a few extreme aspect ratios
+ */
+template <
+    typename        SampleT,
+    int             NUM_CHANNELS,
+    int             NUM_ACTIVE_CHANNELS,
+    typename        CounterT,
+    typename        LevelT,
+    typename        OffsetT>
+void Test(
+    LevelT          max_level,
+    int             max_num_levels)
+{
+    // Empty images: 1920 columns with zero rows, then zero columns and zero rows
+    Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
+        OffsetT(1920), OffsetT(0), max_level, max_num_levels);
+    Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
+        OffsetT(0), OffsetT(0), max_level, max_num_levels);
+
+    // Small input: a single row of 15 pixels
+    Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
+      OffsetT(15), OffsetT(1), max_level, max_num_levels);
+
+    // 1080 image
+    Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
+        OffsetT(1920), OffsetT(1080), max_level, max_num_levels);
+
+    // Sample different aspect ratios; the loop bounds yield (cols, rows) = (1, 1), (1000, 1) and (1, 1000)
+    for (OffsetT rows = 1; rows < 1000000; rows *= 1000)
+    {
+        for (OffsetT cols = 1; cols < (1000000 / rows); cols *= 1000)
+        {
+            Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
+                cols, rows, max_level, max_num_levels);
+        }
+    }
+}
+
+
+
+/**
+ * Test different channel interleavings (valid specialization, basic channel configurations only)
+ */
+template <typename SampleT, typename CounterT, typename LevelT, typename OffsetT>
+void TestChannels(LevelT max_level,
+                  int max_num_levels,
+                  Int2Type<true> /*is_valid_tag*/,
+                  Int2Type<false> /*test_extra_channels*/)
+{
+  Test<SampleT, 1, 1, CounterT, LevelT, OffsetT>(max_level, max_num_levels); // NUM_CHANNELS = 1, NUM_ACTIVE_CHANNELS = 1
+  Test<SampleT, 4, 3, CounterT, LevelT, OffsetT>(max_level, max_num_levels); // NUM_CHANNELS = 4, NUM_ACTIVE_CHANNELS = 3
+}
+// Valid specialization that additionally covers the 3-of-3 and 4-of-4 channel configurations.
+template <typename SampleT, typename CounterT, typename LevelT, typename OffsetT>
+void TestChannels(LevelT max_level,
+                  int max_num_levels,
+                  Int2Type<true> /*is_valid_tag*/,
+                  Int2Type<true> /*test_extra_channels*/)
+{
+  Test<SampleT, 1, 1, CounterT, LevelT, OffsetT>(max_level, max_num_levels);
+  Test<SampleT, 4, 3, CounterT, LevelT, OffsetT>(max_level, max_num_levels);
+  Test<SampleT, 3, 3, CounterT, LevelT, OffsetT>(max_level, max_num_levels); // extra: all 3 channels active
+  Test<SampleT, 4, 4, CounterT, LevelT, OffsetT>(max_level, max_num_levels); // extra: all 4 channels active
+}
+template <typename SampleT,
+          typename CounterT,
+          typename LevelT,
+          typename OffsetT,
+          typename TestExtraChannels>
+void TestChannels(LevelT /*max_level*/,
+                  int /*max_num_levels*/,
+                  Int2Type<false> /*is_valid_tag*/,
+                  TestExtraChannels)
+{} // is_valid_tag is false: intentionally runs no tests
+
+void TestLevelsAliasing()
+{
+  constexpr int num_levels = 7;
+  // Checks that HistogramRange yields the expected counts when d_levels aliases the head of d_samples.
+  int h_histogram[num_levels - 1]{};
+  int h_samples[]{
+    0,  2,  4,  6,  8,  10, 12, // levels
+    1,                          // bin 0
+    3,  3,                      // bin 1
+    5,  5,  5,                  // bin 2
+    7,  7,  7,  7,              // bin 3
+    9,  9,  9,  9,  9,          // bin 4
+    11, 11, 11, 11, 11, 11      // bin 5
+  };
+
+  constexpr int num_samples = sizeof(h_samples) / sizeof(h_samples[0]);
+
+  int *d_histogram{};
+  int *d_samples{};
+
+  CubDebugExit(
+    g_allocator.DeviceAllocate((void **)&d_histogram, sizeof(h_histogram)));
+
+  CubDebugExit(
+    g_allocator.DeviceAllocate((void **)&d_samples, sizeof(h_samples)));
+
+  CubDebugExit(
+    cudaMemcpy(d_samples, h_samples, sizeof(h_samples), cudaMemcpyHostToDevice));
+
+  // Alias levels with samples (fancy way to `d_histogram[bin]++`).
+  int *d_levels = d_samples;
+
+  std::uint8_t *d_temp_storage{};
+  std::size_t temp_storage_bytes{};
+  // First call: d_temp_storage is null, so this only obtains temp_storage_bytes for the allocation below.
+  CubDebugExit(cub::DeviceHistogram::HistogramRange(d_temp_storage,
+                                                    temp_storage_bytes,
+                                                    d_samples,
+                                                    d_histogram,
+                                                    num_levels,
+                                                    d_levels,
+                                                    num_samples));
+
+  CubDebugExit(
+    g_allocator.DeviceAllocate((void **)&d_temp_storage, temp_storage_bytes));
+  // Second call: same arguments, now with temporary storage, computes the histogram.
+  CubDebugExit(cub::DeviceHistogram::HistogramRange(d_temp_storage,
+                                                    temp_storage_bytes,
+                                                    d_samples,
+                                                    d_histogram,
+                                                    num_levels,
+                                                    d_levels,
+                                                    num_samples));
+
+  CubDebugExit(cudaMemcpy(h_histogram,
+                          d_histogram,
+                          sizeof(h_histogram),
+                          cudaMemcpyDeviceToHost));
+
+  for (int bin = 0; bin < num_levels - 1; bin++)
+  {
+    // Each bin holds `bin + 1` dedicated samples plus the one level value that doubles as a
+    // sample; the final level (12) is expected to be counted in no bin.
+    AssertEquals(bin + 2, h_histogram[bin]);
+  }
+
+  CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
+  CubDebugExit(g_allocator.DeviceFree(d_histogram));
+  CubDebugExit(g_allocator.DeviceFree(d_samples)); // d_levels is only an alias of this allocation
+}
+
+// Regression test for NVIDIA/cub#489: integer rounding errors lead to incorrect
+// bin detection. Runs HistogramEven on int samples and compares against fixed reference counts.
+void TestIntegerBinCalcs()
+{
+  constexpr int num_levels = 8;
+  constexpr int num_bins = num_levels - 1;
+  // The reference counts sum to 9 for 10 samples: 999 exceeds upper_level and is expected in no bin.
+  int h_histogram[num_bins]{};
+  const int h_histogram_ref[num_bins]{1, 5, 0, 2, 1, 0, 0};
+  const int h_samples[]{2, 6, 7, 2, 3, 0, 2, 2, 6, 999};
+  const int lower_level = 0;
+  const int upper_level = 12;
+
+  constexpr int num_samples = sizeof(h_samples) / sizeof(h_samples[0]);
+
+  int *d_histogram{};
+  int *d_samples{};
+
+  CubDebugExit(
+    g_allocator.DeviceAllocate((void **)&d_histogram, sizeof(h_histogram)));
+
+  CubDebugExit(
+    g_allocator.DeviceAllocate((void **)&d_samples, sizeof(h_samples)));
+
+  CubDebugExit(
+    cudaMemcpy(d_samples, h_samples, sizeof(h_samples), cudaMemcpyHostToDevice));
+
+  std::uint8_t *d_temp_storage{};
+  std::size_t temp_storage_bytes{};
+  // First call: d_temp_storage is null, so this only obtains temp_storage_bytes for the allocation below.
+  CubDebugExit(cub::DeviceHistogram::HistogramEven(d_temp_storage,
+                                                   temp_storage_bytes,
+                                                   d_samples,
+                                                   d_histogram,
+                                                   num_levels,
+                                                   lower_level,
+                                                   upper_level,
+                                                   num_samples));
+
+  CubDebugExit(
+    g_allocator.DeviceAllocate((void **)&d_temp_storage, temp_storage_bytes));
+  // Second call: same arguments, now with temporary storage, computes the histogram.
+  CubDebugExit(cub::DeviceHistogram::HistogramEven(d_temp_storage,
+                                                    temp_storage_bytes,
+                                                    d_samples,
+                                                    d_histogram,
+                                                    num_levels,
+                                                    lower_level,
+                                                    upper_level,
+                                                    num_samples));
+
+  CubDebugExit(cudaMemcpy(h_histogram,
+                          d_histogram,
+                          sizeof(h_histogram),
+                          cudaMemcpyDeviceToHost));
+  // Every bin must match the hand-computed reference exactly.
+  for (int bin = 0; bin < num_bins; ++bin)
+  {
+    AssertEquals(h_histogram_ref[bin], h_histogram[bin]);
+  }
+
+  CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
+  CubDebugExit(g_allocator.DeviceFree(d_histogram));
+  CubDebugExit(g_allocator.DeviceFree(d_samples));
+}
+
+/**
+ * @brief Bin computation for HistogramEven is only guaranteed when `(max_level - min_level) * num_bins`
+ * does not overflow uint64_t arithmetic. When the bin computation could overflow, we expect
+ * cudaErrorInvalidValue to be returned.
+ */
+template<typename SampleT>
+void TestOverflow()
+{
+  using CounterT                   = uint32_t;
+  constexpr std::size_t test_cases = 2;
+
+  // Test data common across tests: the level range spans the full value range of SampleT
+  SampleT lower_level = 0;
+  SampleT upper_level = ::cuda::std::numeric_limits<SampleT>::max();
+  thrust::counting_iterator<SampleT> d_samples{0UL};
+  thrust::device_vector<CounterT> d_histo_out(1024);
+  CounterT *d_histogram = thrust::raw_pointer_cast(d_histo_out.data());
+  int num_samples       = 1000;
+
+  // Prepare per-test specific data
+  constexpr std::size_t canary_bytes = 3; // sentinel: the size query is expected to overwrite it
+  std::array<std::size_t, test_cases> temp_storage_bytes{canary_bytes, canary_bytes};
+  std::array<int, test_cases> num_bins{1, 2};
+  // Since test #1 is just a single bin, we expect it to succeed.
+  // Since we promote up to 64-bit integer arithmetic, we expect tests not to overflow for types of
+  // up to 4 bytes. For 64-bit and wider types, we do not perform further promotion to even wider
+  // types, hence we expect cudaErrorInvalidValue to be returned to indicate a potential overflow.
+  std::array<cudaError_t, test_cases> expected_status{
+    cudaSuccess, 
+    sizeof(SampleT) <= 4UL ? cudaSuccess : cudaErrorInvalidValue};
+
+  // Verify we always initialize temp_storage_bytes (the returned status is not asserted in this loop)
+  cudaError_t error{cudaSuccess};
+  for (std::size_t i = 0; i < test_cases; i++)
+  {
+    error = cub::DeviceHistogram::HistogramEven(nullptr,
+                                                temp_storage_bytes[i],
+                                                d_samples,
+                                                d_histogram,
+                                                num_bins[i] + 1,
+                                                lower_level,
+                                                upper_level,
+                                                num_samples);
+
+    // Ensure that temp_storage_bytes has been initialized even in the presence of error
+    AssertTrue(temp_storage_bytes[i] != canary_bytes);
+  }
+
+  // Allocate sufficient temporary storage for the larger of the two test cases
+  thrust::device_vector<std::uint8_t> temp_storage(
+    std::max(temp_storage_bytes[0], temp_storage_bytes[1]));
+
+  for (std::size_t i = 0; i < test_cases; i++)
+  {
+    error = cub::DeviceHistogram::HistogramEven(thrust::raw_pointer_cast(temp_storage.data()),
+                                                temp_storage_bytes[i],
+                                                d_samples,
+                                                d_histogram,
+                                                num_bins[i] + 1,
+                                                lower_level,
+                                                upper_level,
+                                                num_samples);
+
+    // Ensure the run with real temporary storage returns the status expected for this test case
+    AssertEquals(error, expected_status[i]);
+  }
+}
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+
+/**
+ * Main: parses the command line, initializes the device and runs the histogram test suite
+ */
+int main(int argc, char** argv)
+{
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    g_verbose_input = args.CheckCmdLineFlag("v2");
+
+    args.GetCmdLineArgument("i", g_timing_iterations);
+
+    // Print usage
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--i=<timing iterations>] "
+            "[--device=<device-id>] "
+            "[--v] "
+            "[--v2] "
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+    // Overflow handling of HistogramEven for sample types of 1, 2, 4 and 8 bytes
+    TestOverflow<uint8_t>();
+    TestOverflow<uint16_t>();
+    TestOverflow<uint32_t>();
+    TestOverflow<uint64_t>();
+    using true_t = Int2Type<true>;
+    using false_t = Int2Type<false>;
+
+    TestLevelsAliasing();
+    TestIntegerBinCalcs(); // regression test for NVIDIA/cub#489
+
+#if TEST_HALF_T
+    TestChannels<half_t, int, half_t, int>(256, 256 + 1, true_t{}, true_t{});
+#endif
+    // TestChannels template arguments are <SampleT, CounterT, LevelT, OffsetT>; call arguments are (max_level, max_num_levels, is_valid, test_extra_channels)
+    TestChannels <signed char,      int, int,   int>(256,   256 + 1,  true_t{}, true_t{});
+    TestChannels <unsigned short,   int, int,   int>(8192,  8192 + 1, true_t{}, false_t{});
+
+    // Make sure bin computation works fine when using int32 arithmetic
+    TestChannels <unsigned short,   int, unsigned short,   int>(std::numeric_limits<unsigned short>::max(),  std::numeric_limits<unsigned short>::max() + 1, true_t{}, false_t{});
+    // Make sure bin computation works fine when requiring int64 arithmetic
+    TestChannels <unsigned int,   int, unsigned int,   int>(std::numeric_limits<unsigned int>::max(),  8192 + 1, true_t{}, false_t{});
+#if !defined(__ICC)
+    // Fails with ICC for unknown reasons, see #332.
+    TestChannels <float,            int, float, int>(1.0,   256 + 1,  true_t{}, false_t{});
+#endif
+
+    // float samples, int levels, regression test for NVIDIA/cub#479.
+    TestChannels <float,            int, int,   int>(12,    7,        true_t{}, true_t{});
+
+    // Test down-conversion of size_t offsets to int; only enabled when size_t and int differ in size
+    TestChannels <unsigned char,    int, int,   long long>(256, 256 + 1, Int2Type<(sizeof(size_t) != sizeof(int))>{}, false_t{});
+
+    return 0;
+}
diff --git a/include/cub/test/test_device_merge_sort.cu b/include/cub/test/test_device_merge_sort.cu
new file mode 100644
index 0000000..de7d9b8
--- /dev/null
+++ b/include/cub/test/test_device_merge_sort.cu
@@ -0,0 +1,362 @@
+/******************************************************************************
+ * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Test of DeviceMergeSort utilities
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <cub/device/device_merge_sort.cuh>
+#include <cub/util_allocator.cuh>
+
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include <thrust/iterator/constant_iterator.h>
+#include <thrust/iterator/reverse_iterator.h>
+#include <thrust/random.h>
+#include <thrust/sequence.h>
+#include <thrust/shuffle.h>
+#include <thrust/sort.h>
+#include <thrust/transform.h>
+
+#include "test_util.h"
+
+#include <cstdio>
+#include <limits>
+#include <new> // for std::bad_alloc
+#include <memory>
+#include <typeinfo>
+
+using namespace cub;
+
+struct CustomLess // Less-than comparator handed to the merge-sort calls and to thrust::is_sorted in this file
+{
+  template <typename DataType>
+  __device__ bool operator()(DataType &lhs, DataType &rhs) // device-only; takes non-const references (NOTE(review): possibly deliberate - confirm)
+  {
+    return lhs < rhs;
+  }
+};
+
+template <typename DataType>
+bool CheckResult(thrust::device_vector<DataType> &d_data)
+{
+  // True when the whole device vector is ordered according to CustomLess.
+  return thrust::is_sorted(d_data.begin(), d_data.end(), CustomLess());
+}
+
+template <typename KeyType, typename ValueType>
+struct ValueToKey // Functor that turns a value into a key through implicit conversion
+{
+  __device__ __host__ KeyType operator()(const ValueType &val)
+  {
+    return val;
+  }
+};
+// Specialization for HugeDataType keys: the key is constructed explicitly from the value.
+template <typename ValueType>
+struct ValueToKey<HugeDataType, ValueType>
+{
+  __device__ __host__ HugeDataType operator()(const ValueType &val)
+  {
+    return HugeDataType(val);
+  }
+};
+
+template <typename KeyType, typename DataType>
+void Test(std::int64_t num_items, // Exercises SortPairs, SortPairsCopy and StableSortPairs on shuffled key/value pairs
+          thrust::default_random_engine &rng,
+          thrust::device_vector<KeyType> &d_keys,
+          thrust::device_vector<DataType> &d_values)
+{
+  thrust::sequence(d_values.begin(), d_values.end());
+  thrust::shuffle(d_values.begin(), d_values.end(), rng);
+  // Keys are derived from the shuffled values, so each key corresponds to its value
+  thrust::transform(d_values.begin(),
+                    d_values.end(),
+                    d_keys.begin(),
+                    ValueToKey<KeyType, DataType>());
+  // Unsorted snapshots: these feed SortPairsCopy below ...
+  thrust::device_vector<KeyType> d_keys_before_sort(d_keys);
+  thrust::device_vector<DataType> d_values_before_sort(d_values);
+  // ... and these are the untouched reference used to verify that the copy variant leaves its input intact
+  thrust::device_vector<KeyType> d_keys_before_sort_copy(d_keys);
+  thrust::device_vector<DataType> d_values_before_sort_copy(d_values);
+  // First call passes a null temp buffer; temp_size is then used to size the allocation below
+  size_t temp_size = 0;
+  CubDebugExit(cub::DeviceMergeSort::SortPairs(
+      nullptr,
+      temp_size,
+      thrust::raw_pointer_cast(d_keys.data()),
+      thrust::raw_pointer_cast(d_values.data()),
+      num_items,
+      CustomLess()));
+
+  thrust::device_vector<char> tmp(temp_size);
+  // In-place sort of the key/value pairs
+  CubDebugExit(cub::DeviceMergeSort::SortPairs(
+    thrust::raw_pointer_cast(tmp.data()),
+    temp_size,
+    thrust::raw_pointer_cast(d_keys.data()),
+    thrust::raw_pointer_cast(d_values.data()),
+    num_items,
+    CustomLess()));
+  // Keep the in-place result as the reference for the copy variant
+  thrust::device_vector<KeyType> d_keys_after_sort_copy(d_keys);
+  thrust::device_vector<DataType> d_values_after_sort_copy(d_values);
+
+  AssertTrue(CheckResult(d_values));
+  // Copy variant: reads the unsorted snapshots and writes the sorted pairs into d_keys / d_values
+  CubDebugExit(cub::DeviceMergeSort::SortPairsCopy(
+    thrust::raw_pointer_cast(tmp.data()),
+    temp_size,
+    thrust::raw_pointer_cast(d_keys_before_sort.data()),
+    thrust::raw_pointer_cast(d_values_before_sort.data()),
+    thrust::raw_pointer_cast(d_keys.data()),
+    thrust::raw_pointer_cast(d_values.data()),
+    num_items,
+    CustomLess()));
+  // The copy must match the in-place result and must not have modified its input
+  AssertEquals(d_keys, d_keys_after_sort_copy);
+  AssertEquals(d_values, d_values_after_sort_copy);
+  AssertEquals(d_keys_before_sort, d_keys_before_sort_copy);
+  AssertEquals(d_values_before_sort, d_values_before_sort_copy);
+
+  // At the moment stable sort is an alias to sort, so it's safe to use
+  // temp_size storage allocated before
+  CubDebugExit(cub::DeviceMergeSort::StableSortPairs(
+    thrust::raw_pointer_cast(tmp.data()),
+    temp_size,
+    thrust::raw_pointer_cast(d_keys.data()),
+    thrust::raw_pointer_cast(d_values.data()),
+    num_items,
+    CustomLess()));
+
+  AssertTrue(CheckResult(d_values));
+  // Iterator inputs: every key is 42 and the values count up from 0
+  CubDebugExit(cub::DeviceMergeSort::SortPairsCopy(
+    thrust::raw_pointer_cast(tmp.data()),
+    temp_size,
+    thrust::constant_iterator<KeyType>(KeyType(42)),
+    thrust::counting_iterator<DataType>(DataType(0)),
+    thrust::raw_pointer_cast(d_keys.data()),
+    thrust::raw_pointer_cast(d_values.data()),
+    num_items,
+    CustomLess()));
+  // With all keys equal, the output values are expected to keep their input order 0, 1, 2, ...
+  thrust::sequence(d_values_before_sort.begin(), d_values_before_sort.end());
+
+  AssertEquals(d_values, d_values_before_sort);
+}
+
+template <typename KeyType, typename DataType>
+void TestKeys(std::int64_t num_items, // Exercises SortKeys, SortKeysCopy, StableSortKeys and StableSortKeysCopy on shuffled keys
+              thrust::default_random_engine &rng,
+              thrust::device_vector<KeyType> &d_keys,
+              thrust::device_vector<DataType> &d_values)
+{
+  thrust::sequence(d_values.begin(), d_values.end());
+  thrust::shuffle(d_values.begin(), d_values.end(), rng);
+  // d_values only serves as scratch space to produce shuffled keys
+  thrust::transform(d_values.begin(),
+                    d_values.end(),
+                    d_keys.begin(),
+                    ValueToKey<KeyType, DataType>());
+  // d_before_sort feeds the copy variants; d_before_sort_copy is the untouched reference
+  thrust::device_vector<KeyType> d_before_sort(d_keys);
+  thrust::device_vector<KeyType> d_before_sort_copy(d_keys);
+  // Size query (null temp buffer); its status is now checked like every other CUB call in this file
+  size_t temp_size = 0;
+  CubDebugExit(cub::DeviceMergeSort::SortKeys(
+    nullptr,
+    temp_size,
+    thrust::raw_pointer_cast(d_keys.data()),
+    num_items,
+    CustomLess()));
+
+  thrust::device_vector<char> tmp(temp_size);
+  // In-place sort of the keys
+  CubDebugExit(cub::DeviceMergeSort::SortKeys(
+    thrust::raw_pointer_cast(tmp.data()),
+    temp_size,
+    thrust::raw_pointer_cast(d_keys.data()),
+    num_items,
+    CustomLess()));
+  // Keep the in-place result as the reference for the copy variants
+  thrust::device_vector<KeyType> d_after_sort(d_keys);
+
+  AssertTrue(CheckResult(d_keys));
+  // Copy variant: reads d_before_sort and writes the sorted keys into d_keys
+  CubDebugExit(cub::DeviceMergeSort::SortKeysCopy(
+    thrust::raw_pointer_cast(tmp.data()),
+    temp_size,
+    thrust::raw_pointer_cast(d_before_sort.data()),
+    thrust::raw_pointer_cast(d_keys.data()),
+    num_items,
+    CustomLess()));
+  // The copy must match the in-place result and must not have modified its input
+  AssertTrue(d_keys == d_after_sort);
+  AssertTrue(d_before_sort == d_before_sort_copy);
+
+  // At the moment stable sort is an alias to sort, so it's safe to use
+  // temp_size storage allocated before
+  CubDebugExit(cub::DeviceMergeSort::StableSortKeys(
+    thrust::raw_pointer_cast(tmp.data()),
+    temp_size,
+    thrust::raw_pointer_cast(d_keys.data()),
+    num_items,
+    CustomLess()));
+
+  AssertTrue(CheckResult(d_keys));
+  // Reset the output so stale sorted keys cannot mask a failing copy
+  thrust::fill(d_keys.begin(), d_keys.end(), KeyType{});
+  CubDebugExit(cub::DeviceMergeSort::StableSortKeysCopy(
+      thrust::raw_pointer_cast(tmp.data()),
+      temp_size,
+      thrust::raw_pointer_cast(d_before_sort.data()),
+      thrust::raw_pointer_cast(d_keys.data()),
+      num_items,
+      CustomLess()));
+
+  // Sortedness follows from equality with d_after_sort, which was verified sorted above
+  AssertTrue(d_keys == d_after_sort);
+  AssertTrue(d_before_sort == d_before_sort_copy);
+}
+
+// Runs Test and TestKeys on freshly allocated vectors; the <false> specialization below is a no-op.
+template <bool DataFitsInKey>
+struct TestHelper
+{
+  template <typename KeyType, typename DataType>
+  static void AllocateAndTest(thrust::default_random_engine &rng, unsigned int num_items)
+  {
+    thrust::device_vector<KeyType> keys(num_items);
+    thrust::device_vector<DataType> values(num_items);
+    Test<KeyType, DataType>(num_items, rng, keys, values);
+    TestKeys<KeyType, DataType>(num_items, rng, keys, values);
+  }
+};
+
+template <>
+struct TestHelper<false>
+{
+  template <typename, typename>
+  static void AllocateAndTest(thrust::default_random_engine &, unsigned int) {}
+};
+
+// Runs the tests with 8-, 32- and 64-bit unsigned keys, skipping key types smaller than DataType.
+template <typename DataType>
+void Test(thrust::default_random_engine &rng, unsigned int num_items)
+{
+  TestHelper<(sizeof(DataType) <= sizeof(std::uint8_t))>::template AllocateAndTest<std::uint8_t, DataType>(rng, num_items);
+  TestHelper<(sizeof(DataType) <= sizeof(std::uint32_t))>::template AllocateAndTest<std::uint32_t, DataType>(rng, num_items);
+  TestHelper<(sizeof(DataType) <= sizeof(std::uint64_t))>::template AllocateAndTest<std::uint64_t, DataType>(rng, num_items);
+}
+
+// Sorts pairs in place with the keys accessed through a thrust::reverse_iterator
+// instead of a raw pointer, then validates d_values with CheckResult.
+template <typename KeyType, typename DataType>
+void AllocateAndTestIterators(unsigned int num_items)
+{
+  thrust::device_vector<KeyType> d_keys(num_items);
+  thrust::device_vector<DataType> d_values(num_items);
+
+  thrust::sequence(d_keys.begin(), d_keys.end());
+  thrust::sequence(d_values.begin(), d_values.end());
+  thrust::reverse(d_values.begin(), d_values.end());
+
+  using KeyIterator = typename thrust::device_vector<KeyType>::iterator;
+  thrust::reverse_iterator<KeyIterator> reverse_iter(d_keys.end());
+
+  size_t temp_size = 0;
+  CubDebugExit(cub::DeviceMergeSort::SortPairs(
+    nullptr,
+    temp_size,
+    reverse_iter,
+    thrust::raw_pointer_cast(d_values.data()),
+    num_items,
+    CustomLess()));
+
+  thrust::device_vector<char> tmp(temp_size);
+  CubDebugExit(cub::DeviceMergeSort::SortPairs(
+    thrust::raw_pointer_cast(tmp.data()),
+    temp_size,
+    reverse_iter,
+    thrust::raw_pointer_cast(d_values.data()),
+    num_items,
+    CustomLess()));
+
+  AssertTrue(CheckResult(d_values));
+}
+
+// Runs every merge sort test for DataType at 2^9, 2^11, ..., 2^21 items.
+// A std::bad_alloc is tolerated only for the largest size; otherwise it is rethrown.
+template <typename DataType>
+void Test(thrust::default_random_engine &rng)
+{
+  for (unsigned int pow2 = 9; pow2 < 22; pow2 += 2)
+  {
+    try
+    {
+      const unsigned int num_items = 1 << pow2;
+      AllocateAndTestIterators<DataType, DataType>(num_items);
+
+      TestHelper<true>::AllocateAndTest<HugeDataType, DataType>(rng, num_items);
+      Test<DataType>(rng, num_items);
+    }
+    catch (const std::bad_alloc &e)
+    {
+      if (pow2 > 20)
+      { // Some cards don't have enough memory for large allocations, these
+        // can be skipped.
+        printf("Skipping large memory test. (num_items=2^%u): %s\n", pow2, e.what());
+      }
+      else
+      { // For smaller problem sizes, treat as an error:
+        printf("Error (num_items=2^%u): %s\n", pow2, e.what());
+        throw;
+      }
+    }
+  }
+}
+
+// Entry point: initializes the device through CommandLineArgs and runs the
+// merge sort tests for 32-bit and 64-bit signed integer data.
+int main(int argc, char **argv)
+{
+  CommandLineArgs args(argc, argv);
+  CubDebugExit(args.DeviceInit()); // Initialize device
+
+  // A single default-constructed engine is shared by all tests.
+  thrust::default_random_engine rng{};
+  Test<std::int32_t>(rng);
+  Test<std::int64_t>(rng);
+
+  return 0;
+}
diff --git a/include/cub/test/test_device_radix_sort.cu b/include/cub/test/test_device_radix_sort.cu
new file mode 100644
index 0000000..b70e957
--- /dev/null
+++ b/include/cub/test/test_device_radix_sort.cu
@@ -0,0 +1,2251 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2022, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Test of DeviceRadixSort utilities
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <algorithm>
+#include <climits>
+#include <cstdio>
+#include <limits>
+#include <memory>
+#include <random>
+#include <type_traits>
+#include <typeinfo>
+#include <vector>
+
+#if !_NVHPC_CUDA
+    #include <cuda_fp16.h>
+#endif
+
+#if !_NVHPC_CUDA
+    #include <cuda_bf16.h>
+#endif
+
+#include <cub/detail/device_synchronize.cuh>
+#include <cub/device/device_radix_sort.cuh>
+#include <cub/device/device_segmented_radix_sort.cuh>
+#include <cub/iterator/transform_input_iterator.cuh>
+#include <cub/util_allocator.cuh>
+#include <cub/util_math.cuh>
+
+#include <thrust/device_vector.h>
+#include <thrust/execution_policy.h>
+#include <thrust/host_vector.h>
+#include <thrust/mismatch.h>
+#include <thrust/sequence.h>
+#include <thrust/sort.h>
+#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
+
+#include "test_util.h"
+
+using namespace cub;
+
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+bool                    g_verbose                       = false;
+int                     g_timing_iterations             = 0;
+std::size_t             g_smallest_pre_sorted_num_items = (std::size_t(1) << 32) - 42; // 2^32 - 42
+CachingDeviceAllocator  g_allocator(true); // NOTE(review): `true` is presumably skip_cleanup - confirm in util_allocator.cuh
+
+// Dispatch types: select which Dispatch overload (sorting entry point) a run uses.
+enum Backend
+{
+    CUB,                        // DeviceRadixSort, DoubleBuffer interface (allows overwriting of input)
+    CUB_NO_OVERWRITE,           // DeviceRadixSort, separate input/output pointers (disallows overwriting of input)
+
+    CUB_SEGMENTED,              // DeviceSegmentedRadixSort, DoubleBuffer interface (allows overwriting of input)
+    CUB_SEGMENTED_NO_OVERWRITE, // DeviceSegmentedRadixSort, separate input/output pointers (disallows overwriting of input)
+
+    // Same as above, but launches kernels from device using CDP.
+    CDP,
+    CDP_NO_OVERWRITE,
+    CDP_SEGMENTED,
+    CDP_SEGMENTED_NO_OVERWRITE,
+};
+
+// Returns the name of the enumerator `b` as a string literal.
+// A value that matches none of the enumerators falls through to the end of
+// the function and yields an empty string.
+static const char *BackendToString(Backend b)
+{
+  switch (b)
+  {
+    // Host-side dispatch
+    case CUB:                        return "CUB";
+    case CUB_NO_OVERWRITE:           return "CUB_NO_OVERWRITE";
+    case CUB_SEGMENTED:              return "CUB_SEGMENTED";
+    case CUB_SEGMENTED_NO_OVERWRITE: return "CUB_SEGMENTED_NO_OVERWRITE";
+
+    // Device-side (CDP) dispatch
+    case CDP:                        return "CDP";
+    case CDP_NO_OVERWRITE:           return "CDP_NO_OVERWRITE";
+    case CDP_SEGMENTED:              return "CDP_SEGMENTED";
+    case CDP_SEGMENTED_NO_OVERWRITE: return "CDP_SEGMENTED_NO_OVERWRITE";
+
+    default:
+      break;
+  }
+
+  // Reached only through the default label.
+  return "";
+}
+
+//---------------------------------------------------------------------
+// Dispatch to different DeviceRadixSort entrypoints
+//---------------------------------------------------------------------
+
+/**
+ * CUB backend, ascending: sorts pairs with DeviceRadixSort::SortPairs through
+ * its DoubleBuffer overload. The CDP pointers and segment arguments are unused.
+ */
+template <typename KeyT,
+          typename ValueT,
+          typename BeginOffsetIteratorT,
+          typename EndOffsetIteratorT,
+          typename NumItemsT>
+CUB_RUNTIME_FUNCTION
+cudaError_t Dispatch(Int2Type<false> /*is_descending*/,
+                     Int2Type<CUB> /*dispatch_to*/,
+                     int * /*d_selector*/,
+                     size_t * /*d_temp_storage_bytes*/,
+                     cudaError_t * /*d_cdp_error*/,
+                     void *d_temp_storage,
+                     size_t &temp_storage_bytes,
+                     DoubleBuffer<KeyT> &d_keys,
+                     DoubleBuffer<ValueT> &d_values,
+                     NumItemsT num_items,
+                     int /*num_segments*/,
+                     BeginOffsetIteratorT /*d_segment_begin_offsets*/,
+                     EndOffsetIteratorT /*d_segment_end_offsets*/,
+                     int begin_bit,
+                     int end_bit)
+{
+  return DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values,
+                                    num_items, begin_bit, end_bit);
+}
+
+/**
+ * CUB_NO_OVERWRITE backend, ascending: sorts from the current buffers into the
+ * alternate buffers with the pointer overload of DeviceRadixSort::SortPairs.
+ */
+template <typename KeyT,
+          typename ValueT,
+          typename BeginOffsetIteratorT,
+          typename EndOffsetIteratorT,
+          typename NumItemsT>
+CUB_RUNTIME_FUNCTION
+cudaError_t Dispatch(Int2Type<false> /*is_descending*/,
+                     Int2Type<CUB_NO_OVERWRITE> /*dispatch_to*/,
+                     int * /*d_selector*/,
+                     size_t * /*d_temp_storage_bytes*/,
+                     cudaError_t * /*d_cdp_error*/,
+                     void *d_temp_storage,
+                     size_t &temp_storage_bytes,
+                     DoubleBuffer<KeyT> &d_keys,
+                     DoubleBuffer<ValueT> &d_values,
+                     NumItemsT num_items,
+                     int /*num_segments*/,
+                     BeginOffsetIteratorT /*d_segment_begin_offsets*/,
+                     EndOffsetIteratorT /*d_segment_end_offsets*/,
+                     int begin_bit,
+                     int end_bit)
+{
+  const KeyT   *keys_in   = d_keys.Current();
+  const ValueT *values_in = d_values.Current();
+  const cudaError_t status =
+    DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
+                               keys_in, d_keys.Alternate(), values_in, d_values.Alternate(),
+                               num_items, begin_bit, end_bit);
+  d_keys.selector ^= 1;   // flipped on every call, size queries included,
+  d_values.selector ^= 1; // so Current() now names the buffers passed as output
+  return status;
+}
+
+/**
+ * CUB backend, descending: sorts pairs with DeviceRadixSort::SortPairsDescending
+ * through its DoubleBuffer overload. CDP pointers and segment arguments are unused.
+ */
+template <typename KeyT,
+          typename ValueT,
+          typename BeginOffsetIteratorT,
+          typename EndOffsetIteratorT,
+          typename NumItemsT>
+CUB_RUNTIME_FUNCTION
+cudaError_t Dispatch(Int2Type<true> /*is_descending*/,
+                     Int2Type<CUB> /*dispatch_to*/,
+                     int * /*d_selector*/,
+                     size_t * /*d_temp_storage_bytes*/,
+                     cudaError_t * /*d_cdp_error*/,
+                     void *d_temp_storage,
+                     size_t &temp_storage_bytes,
+                     DoubleBuffer<KeyT> &d_keys,
+                     DoubleBuffer<ValueT> &d_values,
+                     NumItemsT num_items,
+                     int /*num_segments*/,
+                     BeginOffsetIteratorT /*d_segment_begin_offsets*/,
+                     EndOffsetIteratorT /*d_segment_end_offsets*/,
+                     int begin_bit,
+                     int end_bit)
+{
+  return DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values,
+                                              num_items, begin_bit, end_bit);
+}
+
+
+/**
+ * CUB_NO_OVERWRITE backend, descending: sorts from the current buffers into the
+ * alternate buffers with the pointer overload of DeviceRadixSort::SortPairsDescending.
+ */
+template <typename KeyT,
+          typename ValueT,
+          typename BeginOffsetIteratorT,
+          typename EndOffsetIteratorT,
+          typename NumItemsT>
+CUB_RUNTIME_FUNCTION
+cudaError_t Dispatch(Int2Type<true> /*is_descending*/,
+                     Int2Type<CUB_NO_OVERWRITE> /*dispatch_to*/,
+                     int * /*d_selector*/,
+                     size_t * /*d_temp_storage_bytes*/,
+                     cudaError_t * /*d_cdp_error*/,
+                     void *d_temp_storage,
+                     size_t &temp_storage_bytes,
+                     DoubleBuffer<KeyT> &d_keys,
+                     DoubleBuffer<ValueT> &d_values,
+                     NumItemsT num_items,
+                     int /*num_segments*/,
+                     BeginOffsetIteratorT /*d_segment_begin_offsets*/,
+                     EndOffsetIteratorT /*d_segment_end_offsets*/,
+                     int begin_bit,
+                     int end_bit)
+{
+  const KeyT   *keys_in   = d_keys.Current();
+  const ValueT *values_in = d_values.Current();
+  const cudaError_t status =
+    DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes,
+                                         keys_in, d_keys.Alternate(), values_in, d_values.Alternate(),
+                                         num_items, begin_bit, end_bit);
+  d_keys.selector ^= 1;   // flipped on every call, size queries included,
+  d_values.selector ^= 1; // so Current() now names the buffers passed as output
+  return status;
+}
+
+//---------------------------------------------------------------------
+// Dispatch to different DeviceSegmentedRadixSort entrypoints
+//---------------------------------------------------------------------
+
+// Returns true when `num_items` is strictly below INT_MAX; otherwise prints a
+// diagnostic and returns false (a count equal to INT_MAX is rejected as well).
+// TODO(canonizer): remove this check once num_items is templated for segmented sort.
+template <typename NumItemsT>
+__host__ __device__ bool ValidateNumItemsForSegmentedSort(NumItemsT num_items)
+{
+  const long long int requested = static_cast<long long int>(num_items);
+  const long long int limit     = static_cast<long long int>(INT_MAX);
+
+  if (requested >= limit)
+  {
+    printf("cub::DeviceSegmentedRadixSort is currently limited by %d items but "
+           "%lld were provided\n",
+           INT_MAX,
+           requested);
+    return false;
+  }
+
+  return true;
+}
+
+/**
+ * CUB_SEGMENTED backend, ascending: DeviceSegmentedRadixSort::SortPairs through
+ * its DoubleBuffer overload. Fails with cudaErrorInvalidValue if validation fails.
+ */
+template <typename KeyT,
+          typename ValueT,
+          typename BeginOffsetIteratorT,
+          typename EndOffsetIteratorT,
+          typename NumItemsT>
+CUB_RUNTIME_FUNCTION
+cudaError_t Dispatch(Int2Type<false> /*is_descending*/,
+                     Int2Type<CUB_SEGMENTED> /*dispatch_to*/,
+                     int * /*d_selector*/,
+                     size_t * /*d_temp_storage_bytes*/,
+                     cudaError_t * /*d_cdp_error*/,
+                     void *d_temp_storage,
+                     size_t &temp_storage_bytes,
+                     DoubleBuffer<KeyT> &d_keys,
+                     DoubleBuffer<ValueT> &d_values,
+                     NumItemsT num_items,
+                     int num_segments,
+                     BeginOffsetIteratorT d_segment_begin_offsets,
+                     EndOffsetIteratorT d_segment_end_offsets,
+                     int begin_bit,
+                     int end_bit)
+{
+  if (!ValidateNumItemsForSegmentedSort(num_items))
+  {
+    return cudaErrorInvalidValue;
+  }
+
+  return DeviceSegmentedRadixSort::SortPairs(d_temp_storage,
+                                             temp_storage_bytes,
+                                             d_keys, d_values,
+                                             static_cast<int>(num_items),
+                                             num_segments,
+                                             d_segment_begin_offsets,
+                                             d_segment_end_offsets,
+                                             begin_bit, end_bit);
+}
+
+/**
+ * CUB_SEGMENTED_NO_OVERWRITE backend, ascending: pointer overload of
+ * DeviceSegmentedRadixSort::SortPairs, current buffers in, alternate buffers out.
+ */
+template <typename KeyT,
+          typename ValueT,
+          typename BeginOffsetIteratorT,
+          typename EndOffsetIteratorT,
+          typename NumItemsT>
+CUB_RUNTIME_FUNCTION
+cudaError_t Dispatch(Int2Type<false> /*is_descending*/,
+                     Int2Type<CUB_SEGMENTED_NO_OVERWRITE> /*dispatch_to*/,
+                     int * /*d_selector*/,
+                     size_t * /*d_temp_storage_bytes*/,
+                     cudaError_t * /*d_cdp_error*/,
+                     void *d_temp_storage,
+                     size_t &temp_storage_bytes,
+                     DoubleBuffer<KeyT> &d_keys,
+                     DoubleBuffer<ValueT> &d_values,
+                     NumItemsT num_items,
+                     int num_segments,
+                     BeginOffsetIteratorT d_segment_begin_offsets,
+                     EndOffsetIteratorT d_segment_end_offsets,
+                     int begin_bit,
+                     int end_bit)
+{
+  if (!ValidateNumItemsForSegmentedSort(num_items))
+  {
+    return cudaErrorInvalidValue;
+  }
+
+  const KeyT   *keys_in   = d_keys.Current();
+  const ValueT *values_in = d_values.Current();
+
+  const cudaError_t status =
+    DeviceSegmentedRadixSort::SortPairs(d_temp_storage,
+                                        temp_storage_bytes,
+                                        keys_in, d_keys.Alternate(),
+                                        values_in, d_values.Alternate(),
+                                        static_cast<int>(num_items),
+                                        num_segments,
+                                        d_segment_begin_offsets,
+                                        d_segment_end_offsets,
+                                        begin_bit, end_bit);
+
+  // Flipped whenever validation passed, so Current() names the output buffers.
+  d_keys.selector ^= 1;
+  d_values.selector ^= 1;
+  return status;
+}
+
+
+/**
+ * CUB_SEGMENTED backend, descending: DeviceSegmentedRadixSort::SortPairsDescending
+ * through its DoubleBuffer overload. Fails with cudaErrorInvalidValue if validation fails.
+ */
+template <typename KeyT,
+          typename ValueT,
+          typename BeginOffsetIteratorT,
+          typename EndOffsetIteratorT,
+          typename NumItemsT>
+CUB_RUNTIME_FUNCTION
+cudaError_t Dispatch(Int2Type<true> /*is_descending*/,
+                     Int2Type<CUB_SEGMENTED> /*dispatch_to*/,
+                     int * /*d_selector*/,
+                     size_t * /*d_temp_storage_bytes*/,
+                     cudaError_t * /*d_cdp_error*/,
+                     void *d_temp_storage,
+                     size_t &temp_storage_bytes,
+                     DoubleBuffer<KeyT> &d_keys,
+                     DoubleBuffer<ValueT> &d_values,
+                     NumItemsT num_items,
+                     int num_segments,
+                     BeginOffsetIteratorT d_segment_begin_offsets,
+                     EndOffsetIteratorT d_segment_end_offsets,
+                     int begin_bit,
+                     int end_bit)
+{
+  // The segmented entry point is called with an int item count, so validate first.
+  if (!ValidateNumItemsForSegmentedSort(num_items))
+  {
+    return cudaErrorInvalidValue;
+  }
+
+  return DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage,
+                                                       temp_storage_bytes,
+                                                       d_keys, d_values,
+                                                       static_cast<int>(num_items),
+                                                       num_segments,
+                                                       d_segment_begin_offsets,
+                                                       d_segment_end_offsets,
+                                                       begin_bit, end_bit);
+}
+
+/**
+ * CUB_SEGMENTED_NO_OVERWRITE backend, descending: pointer overload of
+ * DeviceSegmentedRadixSort::SortPairsDescending, current buffers in, alternate out.
+ */
+template <typename KeyT,
+          typename ValueT,
+          typename BeginOffsetIteratorT,
+          typename EndOffsetIteratorT,
+          typename NumItemsT>
+CUB_RUNTIME_FUNCTION
+cudaError_t Dispatch(Int2Type<true> /*is_descending*/,
+                     Int2Type<CUB_SEGMENTED_NO_OVERWRITE> /*dispatch_to*/,
+                     int * /*d_selector*/,
+                     size_t * /*d_temp_storage_bytes*/,
+                     cudaError_t * /*d_cdp_error*/,
+                     void *d_temp_storage,
+                     size_t &temp_storage_bytes,
+                     DoubleBuffer<KeyT> &d_keys,
+                     DoubleBuffer<ValueT> &d_values,
+                     NumItemsT num_items,
+                     int num_segments,
+                     BeginOffsetIteratorT d_segment_begin_offsets,
+                     EndOffsetIteratorT d_segment_end_offsets,
+                     int begin_bit,
+                     int end_bit)
+{
+  if (!ValidateNumItemsForSegmentedSort(num_items))
+  {
+    return cudaErrorInvalidValue;
+  }
+
+  const KeyT   *keys_in   = d_keys.Current();
+  const ValueT *values_in = d_values.Current();
+
+  const cudaError_t status =
+    DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage,
+                                                  temp_storage_bytes,
+                                                  keys_in, d_keys.Alternate(),
+                                                  values_in, d_values.Alternate(),
+                                                  static_cast<int>(num_items),
+                                                  num_segments,
+                                                  d_segment_begin_offsets,
+                                                  d_segment_end_offsets,
+                                                  begin_bit, end_bit);
+
+  // Flipped whenever validation passed, so Current() names the output buffers.
+  d_keys.selector ^= 1;
+  d_values.selector ^= 1;
+  return status;
+}
+
+//---------------------------------------------------------------------
+// CUDA Nested Parallelism Test Kernel
+//---------------------------------------------------------------------
+
+#if TEST_CDP == 1
+
+/**
+ * Simple wrapper kernel to invoke DeviceRadixSort: forwards to Dispatch and stores
+ * its status, the temp storage size and the key selector through the output pointers.
+ */
+template <int IsDescending,
+          int CubBackend,
+          typename KeyT,
+          typename ValueT,
+          typename BeginOffsetIteratorT,
+          typename EndOffsetIteratorT,
+          typename NumItemsT>
+__global__ void CDPDispatchKernel(Int2Type<IsDescending> is_descending,
+                                  Int2Type<CubBackend> cub_backend,
+                                  int *d_selector,
+                                  size_t *d_temp_storage_bytes,
+                                  cudaError_t *d_cdp_error,
+                                  void *d_temp_storage,
+                                  size_t temp_storage_bytes,
+                                  DoubleBuffer<KeyT> d_keys,
+                                  DoubleBuffer<ValueT> d_values,
+                                  NumItemsT num_items,
+                                  int num_segments,
+                                  BeginOffsetIteratorT d_segment_begin_offsets,
+                                  EndOffsetIteratorT d_segment_end_offsets,
+                                  int begin_bit,
+                                  int end_bit)
+{
+  // temp_storage_bytes, d_keys and d_values are by-value kernel parameters;
+  // Dispatch receives them by reference and may modify these local copies.
+  const cudaError_t status = Dispatch(is_descending,
+                                      cub_backend,
+                                      d_selector,
+                                      d_temp_storage_bytes,
+                                      d_cdp_error,
+                                      d_temp_storage,
+                                      temp_storage_bytes,
+                                      d_keys, d_values,
+                                      num_items, num_segments,
+                                      d_segment_begin_offsets, d_segment_end_offsets,
+                                      begin_bit, end_bit);
+
+  *d_cdp_error          = status;
+  *d_temp_storage_bytes = temp_storage_bytes;
+  *d_selector           = d_keys.selector;
+}
+
+/**
+ * Launch kernel and dispatch on device. Should only be called from host code.
+ * The CubBackend should be one of the non-CDP CUB backends to invoke from the
+ * device. Returns the status that the kernel stored in d_cdp_error.
+ */
+template <int IsDescending,
+          int CubBackend,
+          typename KeyT,
+          typename ValueT,
+          typename BeginOffsetIteratorT,
+          typename EndOffsetIteratorT,
+          typename NumItemsT>
+cudaError_t LaunchCDPKernel(Int2Type<IsDescending> is_descending,
+                            Int2Type<CubBackend> cub_backend,
+                            int *d_selector,
+                            size_t *d_temp_storage_bytes,
+                            cudaError_t *d_cdp_error,
+                            void *d_temp_storage,
+                            size_t &temp_storage_bytes,
+                            DoubleBuffer<KeyT> &d_keys,
+                            DoubleBuffer<ValueT> &d_values,
+                            NumItemsT num_items,
+                            int num_segments,
+                            BeginOffsetIteratorT d_segment_begin_offsets,
+                            EndOffsetIteratorT d_segment_end_offsets,
+                            int begin_bit,
+                            int end_bit)
+{
+  // Invoke kernel to invoke device-side dispatch:
+  const cudaError_t launch_status =
+    thrust::cuda_cub::launcher::triple_chevron(1, 1, 0, 0)
+      .doit(CDPDispatchKernel<IsDescending,
+                              CubBackend,
+                              KeyT,
+                              ValueT,
+                              BeginOffsetIteratorT,
+                              EndOffsetIteratorT,
+                              NumItemsT>,
+            is_descending,
+            cub_backend,
+            d_selector,
+            d_temp_storage_bytes,
+            d_cdp_error,
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys,
+            d_values,
+            num_items,
+            num_segments,
+            d_segment_begin_offsets,
+            d_segment_end_offsets,
+            begin_bit,
+            end_bit);
+  CubDebugExit(launch_status);
+  CubDebugExit(cub::detail::device_synchronize());
+
+  // Copy out selector; the values buffer follows the keys buffer
+  CubDebugExit(cudaMemcpy(&d_keys.selector,
+                          d_selector,
+                          sizeof(int),
+                          cudaMemcpyDeviceToHost));
+  d_values.selector = d_keys.selector;
+
+  // Copy out temp_storage_bytes
+  CubDebugExit(cudaMemcpy(&temp_storage_bytes,
+                          d_temp_storage_bytes,
+                          sizeof(size_t),
+                          cudaMemcpyDeviceToHost));
+
+  // Copy out error reported by the device-side Dispatch
+  cudaError_t cdp_status = cudaSuccess;
+  CubDebugExit(cudaMemcpy(&cdp_status,
+                          d_cdp_error,
+                          sizeof(cudaError_t),
+                          cudaMemcpyDeviceToHost));
+
+  return cdp_status;
+}
+
+// Overloads of Dispatch that translate each CDP backend to the matching CUB
+// backend and run that CUB backend on the device through LaunchCDPKernel.
+#define DEFINE_CDP_DISPATCHER(CdpBackend, CubBackend)                          \
+  template <int IsDescending,                                                  \
+            typename KeyT,                                                     \
+            typename ValueT,                                                   \
+            typename BeginOffsetIteratorT,                                     \
+            typename EndOffsetIteratorT,                                       \
+            typename NumItemsT>                                                \
+  cudaError_t Dispatch(Int2Type<IsDescending> is_descending,                   \
+                       Int2Type<CdpBackend> /*dispatch_to*/,                   \
+                       int         *d_selector,                                \
+                       size_t      *d_temp_storage_bytes,                      \
+                       cudaError_t *d_cdp_error,                               \
+                                                                               \
+                       void                 *d_temp_storage,                   \
+                       size_t               &temp_storage_bytes,               \
+                       DoubleBuffer<KeyT>   &d_keys,                           \
+                       DoubleBuffer<ValueT> &d_values,                         \
+                       NumItemsT             num_items,                        \
+                       int                   num_segments,                     \
+                       BeginOffsetIteratorT  d_segment_begin_offsets,          \
+                       EndOffsetIteratorT    d_segment_end_offsets,            \
+                       int                   begin_bit,                        \
+                       int                   end_bit)                          \
+  {                                                                            \
+    Int2Type<CubBackend> cub_backend{};                                        \
+    return LaunchCDPKernel(is_descending,                                      \
+                           cub_backend,                                        \
+                           d_selector,                                         \
+                           d_temp_storage_bytes,                               \
+                           d_cdp_error,                                        \
+                           d_temp_storage,                                     \
+                           temp_storage_bytes,                                 \
+                           d_keys,                                             \
+                           d_values,                                           \
+                           num_items,                                          \
+                           num_segments,                                       \
+                           d_segment_begin_offsets,                            \
+                           d_segment_end_offsets,                              \
+                           begin_bit,                                          \
+                           end_bit);                                           \
+  }
+
+DEFINE_CDP_DISPATCHER(CDP, CUB)
+DEFINE_CDP_DISPATCHER(CDP_NO_OVERWRITE, CUB_NO_OVERWRITE)
+DEFINE_CDP_DISPATCHER(CDP_SEGMENTED, CUB_SEGMENTED)
+DEFINE_CDP_DISPATCHER(CDP_SEGMENTED_NO_OVERWRITE, CUB_SEGMENTED_NO_OVERWRITE)
+
+#undef DEFINE_CDP_DISPATCHER
+
+#endif // TEST_CDP
+
+//---------------------------------------------------------------------
+// Problem generation
+//---------------------------------------------------------------------
+
+
+/**
+ * Simple key-value pairing. Ordering looks at the key only; the value
+ * does not take part in the comparison.
+ */
+template <typename KeyT, typename ValueT>
+struct Pair
+{
+    KeyT   key;
+    ValueT value;
+
+    // True when this pair's key compares less than the other pair's key.
+    bool operator<(const Pair &rhs) const
+    {
+        return key < rhs.key;
+    }
+};
+
+
+/**
+ * Simple key-value pairing (specialized for bool keys): false orders before true
+ */
+template <typename ValueT>
+struct Pair<bool, ValueT>
+{
+    bool   key;
+    ValueT value;
+
+    bool operator<(const Pair &rhs) const
+    {
+        return (rhs.key && !key);
+    }
+};
+
+
+/**
+ * Initialize key data: calls InitValue(gen_mode, h_keys[i], i) for every index
+ * i in [0, num_items). The entropy_reduction argument is ignored.
+ */
+template <typename KeyT, typename NumItemsT>
+void InitializeKeyBits(GenMode gen_mode, KeyT *h_keys, NumItemsT num_items,
+                       int /*entropy_reduction*/)
+{
+    for (NumItemsT idx = 0; idx < num_items; ++idx)
+    {
+        InitValue(gen_mode, h_keys[idx], idx);
+    }
+}
+
+// Returns the object representation of key copied into an UnsignedBits value.
+template <typename KeyT, typename UnsignedBits = typename cub::Traits<KeyT>::UnsignedBits>
+UnsignedBits KeyBits(KeyT key)
+{
+    UnsignedBits raw;
+    memcpy(&raw, &key, sizeof(key));
+    return raw;
+}
+
+/** Fill h_keys with runs of equal keys in ascending twiddled-bit order; the key steps and run lengths are random. */
+template <typename KeyT, typename NumItemsT>
+void InitializeKeysSorted(
+    KeyT            *h_keys,     // output array, written through an UnsignedBits alias below
+    NumItemsT       num_items)   // number of keys to generate
+{
+    using TraitsT = cub::Traits<KeyT>;
+    using UnsignedBits = typename TraitsT::UnsignedBits;
+
+    // Bounds for the random key steps and run lengths.
+    UnsignedBits max_inc = 1 << (sizeof(UnsignedBits) < 4 ? 3 :  // largest key step: 2^3, 2^14 or 2^24 depending on key width
+                                 (sizeof(UnsignedBits) < 8 ? 14 : 24));
+    UnsignedBits min_bits = TraitsT::TwiddleIn(KeyBits(TraitsT::Lowest()));  // twiddled bits of the lowest key
+    UnsignedBits max_bits = TraitsT::TwiddleIn(KeyBits(TraitsT::Max()));     // twiddled bits of the largest key
+    NumItemsT max_run = std::max(  // cap on a run length; never below 2^14
+        NumItemsT(double(num_items) * (max_inc + 1) / max_bits),
+        NumItemsT(1 << 14));
+
+    UnsignedBits *h_key_bits = reinterpret_cast<UnsignedBits*>(h_keys);  // keys are stored as raw bit patterns
+    NumItemsT i = 0;
+    // Start from the minimum twiddled key; it is stepped before the first run is written.
+    UnsignedBits twiddled_key = min_bits;
+    while (i < num_items)
+    {
+        // Generate random increment (avoid overflow).
+        UnsignedBits inc_bits = 0;
+        RandomBits(inc_bits);
+        // twiddled_key < max_bits at this point.
+        UnsignedBits inc = static_cast<UnsignedBits>(std::min(1 + inc_bits % max_inc, max_bits - twiddled_key));  // step of at least 1, clamped so max_bits is never passed
+        twiddled_key += inc;
+
+        // Generate random run length (ensure there are enough values to fill the rest).
+        NumItemsT run_bits = 0;
+        RandomBits(run_bits);  // NOTE(review): RandomBits is not visible here; if it can yield a negative value for a signed NumItemsT, run_length may be <= 0 and this iteration writes nothing — confirm
+        NumItemsT run_length = std::min(1 + run_bits % max_run, num_items - i);
+        if (twiddled_key == max_bits) run_length = num_items - i;  // largest key reached: it fills the remainder
+        NumItemsT run_end = i + run_length;
+
+        // Convert back from the twiddled representation before storing.
+        UnsignedBits key = TraitsT::TwiddleOut(twiddled_key);
+        // Avoid -0.0 for floating-point keys.
+        UnsignedBits negative_zero = UnsignedBits(1) << UnsignedBits(sizeof(UnsignedBits) * 8 - 1);  // only the top bit set
+        if (TraitsT::CATEGORY == cub::FLOATING_POINT && key == negative_zero)
+        {
+            key = 0;  // all-zero bits are stored in place of -0.0
+        }
+
+        for (; i < run_end; ++i)  // write the run; i ends at run_end
+        {
+            h_key_bits[i] = key;
+        }
+    }
+}
+
+
+/**
+ * Initialize solution: allocates (new[]) and fills the host reference — sorted keys and, if WANT_RANKS, for each sorted position the index of that item in h_keys.
+ */
+template <bool IS_DESCENDING, bool WANT_RANKS, typename KeyT, typename NumItemsT>
+void InitializeSolution(
+    KeyT       *h_keys,              // input keys; rewritten (shuffled) in place when pre_sorted
+    NumItemsT  num_items,            // number of keys; 0 yields null references
+    int        num_segments,         // must be 1 when pre_sorted
+    bool       pre_sorted,           // true: h_keys is taken as already sorted and gets shuffled here
+    NumItemsT  *h_segment_offsets,   // num_segments + 1 offsets; read only when !pre_sorted
+    int        begin_bit,            // first key bit that takes part in the comparison
+    int        end_bit,              // one past the last key bit that takes part
+    NumItemsT  *&h_reference_ranks,  // out: new[] array when WANT_RANKS (untouched otherwise, unless num_items == 0)
+    KeyT       *&h_reference_keys)   // out: new[] array of keys in sorted order
+{
+    if (num_items == 0)
+    {
+        h_reference_ranks = nullptr;
+        h_reference_keys = nullptr;
+        return;
+    }
+
+    if (pre_sorted)
+    {
+        printf("Shuffling reference solution on CPU\n");
+        // Note: begin_bit and end_bit are ignored here, and assumed to have the
+        // default values (begin_bit == 0, end_bit == 8 * sizeof(KeyT)).
+        // Otherwise, pre-sorting won't work, as it doesn't necessarily
+        // correspond to the order of keys sorted by a subrange of bits.
+        // num_segments is also ignored as assumed to be 1, as pre-sorted tests
+        // are currently not supported for multiple segments.
+        //
+        // Pre-sorted tests with non-default begin_bit, end_bit or num_segments
+        // != 1 are skipped in TestBits() and TestSegments(), respectively.
+        AssertEquals(begin_bit, 0);
+        AssertEquals(end_bit, static_cast<int>(8 * sizeof(KeyT)));
+        AssertEquals(num_segments, 1);
+
+        // Copy to the reference solution.
+        h_reference_keys = new KeyT[num_items];
+        if (IS_DESCENDING)
+        {
+            // Copy in reverse.
+            for (NumItemsT i = 0; i < num_items; ++i)
+            {
+                h_reference_keys[i] = h_keys[num_items - 1 - i];
+            }
+            // Copy back.
+            memcpy(h_keys, h_reference_keys, num_items * sizeof(KeyT));
+        }
+        else
+        {
+            memcpy(h_reference_keys, h_keys, num_items * sizeof(KeyT));
+        }
+
+        // Summarize the pre-sorted array (element, 1st position, count).
+        struct Element
+        {
+            KeyT key;         // the distinct key value
+            NumItemsT num;    // copies of it not yet placed
+            NumItemsT index;  // next unassigned position of it in the reference
+        };
+
+        std::vector<Element> summary;
+        KeyT cur_key = h_reference_keys[0];
+        summary.push_back(Element{cur_key, 1, 0});
+        for (NumItemsT i = 1; i < num_items; ++i)
+        {
+            KeyT key = h_reference_keys[i];
+            if (key == cur_key)
+            {
+                // Same key.
+                summary.back().num++;
+                continue;
+            }
+
+            // Different key.
+            cur_key = key;
+            summary.push_back(Element{cur_key, 1, i});
+        }
+
+        // Generate a random permutation from the summary. Such a complicated
+        // approach is used to permute the array and compute ranks in a
+        // cache-friendly way and in a short time.
+        if (WANT_RANKS)
+        {
+            h_reference_ranks = new NumItemsT[num_items];
+        }
+        NumItemsT max_run = 32;
+        NumItemsT run = 0;
+        NumItemsT i = 0;  // next write position in the shuffled h_keys
+        while (summary.size() > 0)
+        {
+            // Pick up a random element and a run.
+            NumItemsT bits = 0;
+            RandomBits(bits);
+            NumItemsT summary_id = bits % summary.size();
+            Element& element = summary[summary_id];
+            run = std::min(1 + bits % (max_run - 1), element.num);  // NOTE(review): if RandomBits can yield a negative value for a signed NumItemsT, run can be <= 0 here — confirm
+            for (NumItemsT j = 0; j < run; ++j)
+            {
+                h_keys[i + j] = element.key;
+                if (WANT_RANKS)
+                {
+                    h_reference_ranks[element.index + j] = i + j;  // sorted position -> position in shuffled input
+                }
+            }
+            i += run;
+            element.index += run;
+            element.num -= run;
+            if (element.num == 0)
+            {
+                // Remove the empty entry.
+                std::swap(summary[summary_id], summary.back());
+                summary.pop_back();
+            }
+        }
+        printf(" Done.\n");
+    }
+    else
+    {
+        typedef Pair<KeyT, NumItemsT> PairT;  // (masked key, original index)
+
+        PairT *h_pairs = new PairT[num_items];
+
+        int num_bits = end_bit - begin_bit;
+        for (NumItemsT i = 0; i < num_items; ++i)
+        {
+
+            // Mask off unwanted portions
+            if (num_bits < static_cast<int>(sizeof(KeyT) * 8))
+            {
+                using UnsignedBits = typename cub::Traits<KeyT>::UnsignedBits;
+
+                UnsignedBits base = 0;
+                memcpy(&base, &h_keys[i], sizeof(KeyT));
+                base &= ((UnsignedBits{1} << num_bits) - 1) << begin_bit;  // keep only bits [begin_bit, end_bit)
+                memcpy(&h_pairs[i].key, &base, sizeof(KeyT));
+            }
+            else
+            {
+                h_pairs[i].key = h_keys[i];
+            }
+
+            h_pairs[i].value = i;
+        }
+
+        printf("\nSorting reference solution on CPU "
+               "(%zu items, %d segments, %zu items/seg)...",
+               static_cast<std::size_t>(num_items),
+               num_segments,
+               static_cast<std::size_t>(num_items / num_segments));
+        fflush(stdout);
+
+        for (int i = 0; i < num_segments; ++i)  // reverse / stable_sort / reverse keeps equal keys in input order for descending output too
+        {
+            if (IS_DESCENDING) std::reverse(h_pairs + h_segment_offsets[i], h_pairs + h_segment_offsets[i + 1]);
+            std::stable_sort(               h_pairs + h_segment_offsets[i], h_pairs + h_segment_offsets[i + 1]);
+            if (IS_DESCENDING) std::reverse(h_pairs + h_segment_offsets[i], h_pairs + h_segment_offsets[i + 1]);
+        }
+
+        printf(" Done.\n"); fflush(stdout);
+
+        if (WANT_RANKS)
+        {
+            h_reference_ranks  = new NumItemsT[num_items];
+        }
+        h_reference_keys   = new KeyT[num_items];
+
+        for (NumItemsT i = 0; i < num_items; ++i)
+        {
+            if (WANT_RANKS)
+            {
+                h_reference_ranks[i]    = h_pairs[i].value;
+            }
+            h_reference_keys[i]     = h_keys[h_pairs[i].value];  // unmasked key taken from the original input
+        }
+
+        delete[] h_pairs;  // new[] result; delete[] needs no null guard
+    }
+}
+
+// Restores h_keys from reference_keys for pre-sorted tests (reversed when
+// IS_DESCENDING); for non-pre-sorted tests this is a no-op.
+template <bool IS_DESCENDING, typename KeyT, typename NumItemsT>
+void ResetKeys(KeyT *h_keys, NumItemsT num_items, bool pre_sorted, KeyT *reference_keys)
+{
+    if (pre_sorted)
+    {
+        if (!IS_DESCENDING)
+        {
+            // Same order as the reference: plain bulk copy.
+            memcpy(h_keys, reference_keys, num_items * sizeof(KeyT));
+            return;
+        }
+
+        // Descending reference: walk it backwards.
+        for (NumItemsT dst = 0; dst < num_items; ++dst)
+            h_keys[dst] = reference_keys[num_items - 1 - dst];
+    }
+}
+
+
+//---------------------------------------------------------------------
+// Test generation
+//---------------------------------------------------------------------
+
+// Type map used for the device key buffers: identity by default; unless
+// _NVHPC_CUDA is set, half_t maps to __half and bfloat16_t to __nv_bfloat16.
+template <typename T>
+struct UnwrapHalfAndBfloat16 {
+    using Type = T;
+};
+
+#if !_NVHPC_CUDA
+template <>
+struct UnwrapHalfAndBfloat16<half_t> {
+    using Type = __half;
+};
+
+template <>
+struct UnwrapHalfAndBfloat16<bfloat16_t> {
+    using Type = __nv_bfloat16;
+};
+#endif
+
+// Uploads host_reference into d_tmp_buffer and compares it with d_data on the device; returns 0 on a full match.
+template <class T, class NumItemsT>
+int compare_device_arrays(T* host_reference, T* d_tmp_buffer, T* d_data, NumItemsT num_items)
+{
+    CubDebugExit(cudaMemcpy(d_tmp_buffer, host_reference, sizeof(T) * num_items, cudaMemcpyHostToDevice));
+
+    auto d_expected_first = d_tmp_buffer;
+    auto d_expected_last  = d_tmp_buffer + num_items;
+    auto first_diff = thrust::mismatch(thrust::device, d_expected_first, d_expected_last, d_data);
+
+    // Whole range matched: nothing to report.
+    if (first_diff.first == d_expected_last) return 0;
+
+    // Hand the first differing element to CompareDeviceResults and return its verdict.
+    const auto offset = thrust::distance(d_expected_first, first_diff.first);
+    return CompareDeviceResults(host_reference + offset, d_data + offset, 1, true, g_verbose);
+}
+
+// Overload for NullType pointers (the value type of keys-only tests): nothing to compare, always returns 0.
+template <class NumItemsT>
+int compare_device_arrays(CUB_NS_QUALIFIER::NullType * /* h_reference */,
+                          CUB_NS_QUALIFIER::NullType * /* d_tmp_buffer */,
+                          CUB_NS_QUALIFIER::NullType * /* d_data */,
+                          NumItemsT /* num_items */)
+{
+    return 0;
+}
+
+/**
+ * Test DeviceRadixSort: size the temp storage, sort once and check against the host reference, run the optional timing iterations, free device memory, then assert.
+ */
+template <
+    Backend     BACKEND,
+    bool        IS_DESCENDING,
+    typename    KeyT,
+    typename    ValueT,
+    typename    BeginOffsetIteratorT,
+    typename    EndOffsetIteratorT,
+    typename    NumItemsT>
+void Test(
+    KeyT                 *h_keys,                  // host input keys
+    ValueT               *h_values,                // host input values (not read when ValueT is NullType)
+    NumItemsT            num_items,                // number of key (and value) items
+    int                  num_segments,             // segment count forwarded to Dispatch
+    BeginOffsetIteratorT d_segment_begin_offsets,  // forwarded to Dispatch
+    EndOffsetIteratorT   d_segment_end_offsets,    // forwarded to Dispatch
+    int                  begin_bit,                // forwarded to Dispatch
+    int                  end_bit,                  // forwarded to Dispatch
+    KeyT                 *h_reference_keys,        // expected keys after sorting
+    ValueT               *h_reference_values)      // expected values after sorting
+{
+    // Key alias type
+    using KeyAliasT = typename UnwrapHalfAndBfloat16<KeyT>::Type;
+
+    const bool KEYS_ONLY = std::is_same<ValueT, NullType>::value;
+
+    printf("%s %s cub::DeviceRadixSort %zu items, %d segments, "
+           "%d-byte keys (%s) %d-byte values (%s), %d-byte num_items (%s), "
+           "descending %d, begin_bit %d, end_bit %d\n",
+           BackendToString(BACKEND),
+           (KEYS_ONLY) ? "keys-only" : "key-value",
+           static_cast<std::size_t>(num_items),
+           num_segments,
+           static_cast<int>(sizeof(KeyT)),
+           typeid(KeyT).name(),
+           (KEYS_ONLY) ? 0 : static_cast<int>(sizeof(ValueT)),
+           typeid(ValueT).name(),
+           static_cast<int>(sizeof(NumItemsT)),
+           typeid(NumItemsT).name(),
+           IS_DESCENDING,
+           begin_bit,
+           end_bit);
+
+    if (g_verbose)
+    {
+        printf("Input keys:\n");
+        DisplayResults(h_keys, num_items);
+        printf("\n\n");
+    }
+
+    // Allocate device arrays
+    DoubleBuffer<KeyAliasT> d_keys;
+    DoubleBuffer<ValueT>    d_values;
+    int                     *d_selector;            // single int on the device, passed to Dispatch
+    size_t                  *d_temp_storage_bytes;  // single size_t on the device, passed to Dispatch
+    cudaError_t             *d_cdp_error;           // single cudaError_t on the device, passed to Dispatch
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys.d_buffers[0], sizeof(KeyT) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys.d_buffers[1], sizeof(KeyT) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_selector, sizeof(int) * 1));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes, sizeof(size_t) * 1));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error, sizeof(cudaError_t) * 1));
+    if (!KEYS_ONLY)
+    {
+        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[0], sizeof(ValueT) * num_items));
+        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[1], sizeof(ValueT) * num_items));
+    }
+
+    // Allocate temporary storage (and make it un-aligned)
+    size_t  temp_storage_bytes  = 0;
+    void    *d_temp_storage     = NULL;
+    CubDebugExit(Dispatch(  // called with d_temp_storage == NULL; temp_storage_bytes is used as the allocation size below
+        Int2Type<IS_DESCENDING>(), Int2Type<BACKEND>(), d_selector, d_temp_storage_bytes, d_cdp_error,
+        d_temp_storage, temp_storage_bytes, d_keys, d_values,
+        num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets,
+        begin_bit, end_bit));
+
+    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes + 1));  // one spare byte for the offset below
+    void* mis_aligned_temp = static_cast<char*>(d_temp_storage) + 1;                    // the sort is handed storage starting one byte in
+
+    // Initialize/clear device arrays
+    d_keys.selector = 0;
+    CubDebugExit(cudaMemcpy(d_keys.d_buffers[0], h_keys, sizeof(KeyT) * num_items, cudaMemcpyHostToDevice));
+    CubDebugExit(cudaMemset(d_keys.d_buffers[1], 0, sizeof(KeyT) * num_items));
+    if (!KEYS_ONLY)
+    {
+        d_values.selector = 0;
+        CubDebugExit(cudaMemcpy(d_values.d_buffers[0], h_values, sizeof(ValueT) * num_items, cudaMemcpyHostToDevice));
+        CubDebugExit(cudaMemset(d_values.d_buffers[1], 0, sizeof(ValueT) * num_items));
+    }
+
+    // Run warmup/correctness iteration
+    CubDebugExit(Dispatch(
+        Int2Type<IS_DESCENDING>(), Int2Type<BACKEND>(), d_selector, d_temp_storage_bytes, d_cdp_error,
+        mis_aligned_temp, temp_storage_bytes, d_keys, d_values,
+        num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets,
+        begin_bit, end_bit));
+
+    // Flush any stdout/stderr
+    fflush(stdout);
+    fflush(stderr);
+
+    // Check for correctness (and display results, if specified)
+    printf("Warmup done.  Checking results:\n"); fflush(stdout);
+
+    int compare = 0;  // accumulates every check below; 0 means all passed
+
+    // If in/out API is used, we are not allowed to overwrite the input.
+    // Let's check that the input buffer is not overwritten by the algorithm.
+    if (BACKEND == CUB_NO_OVERWRITE)
+    {
+        KeyT *d_input_keys = reinterpret_cast<KeyT*>(d_keys.d_buffers[0]);
+
+        if (temp_storage_bytes < sizeof(KeyT) * num_items)
+        {
+            // For small input sizes, temporary storage is not large enough to fit keys.
+            compare = CompareDeviceResults(h_keys, d_input_keys, num_items, true, g_verbose);
+        }
+        else
+        {
+            // If overwrite is not allowed, temporary storage is large enough to fit keys.
+            KeyT* temp_keys = reinterpret_cast<KeyT*>(d_temp_storage);  // temp storage reused as scratch for the comparison
+
+            compare = compare_device_arrays(h_keys, temp_keys, d_input_keys, num_items);
+            printf("\t Compare input keys: %s ", compare ? "FAIL" : "PASS"); fflush(stdout);
+        }
+    }
+
+    // After the previous check is done, we can safely reuse alternative buffer to store
+    // the reference results and compare current output.
+    compare |= compare_device_arrays(h_reference_keys,
+                                     reinterpret_cast<KeyT *>(d_keys.Alternate()),
+                                     reinterpret_cast<KeyT *>(d_keys.Current()),
+                                     num_items);
+
+    printf("\t Compare keys (selector %d): %s ", d_keys.selector, compare ? "FAIL" : "PASS"); fflush(stdout);  // also reports FAIL if the input-keys check above failed
+
+    if (!KEYS_ONLY)
+    {
+        int values_compare = compare_device_arrays(h_reference_values,
+                                                   d_values.Alternate(),
+                                                   d_values.Current(),
+                                                   num_items);
+
+        compare |= values_compare;
+        printf("\t Compare values (selector %d): %s ", d_values.selector, values_compare ? "FAIL" : "PASS"); fflush(stdout);
+    }
+
+    // Performance
+    if (g_timing_iterations) { printf("\nPerforming timing iterations:\n"); }
+    fflush(stdout);  // runs whether or not the banner was printed
+
+    GpuTimer gpu_timer;
+    float elapsed_millis = 0.0f;
+    for (int i = 0; i < g_timing_iterations; ++i)
+    {
+        // Initialize/clear device arrays
+        CubDebugExit(cudaMemcpy(d_keys.d_buffers[d_keys.selector], h_keys, sizeof(KeyT) * num_items, cudaMemcpyHostToDevice));
+        CubDebugExit(cudaMemset(d_keys.d_buffers[d_keys.selector ^ 1], 0, sizeof(KeyT) * num_items));
+        if (!KEYS_ONLY)
+        {
+            CubDebugExit(cudaMemcpy(d_values.d_buffers[d_values.selector], h_values, sizeof(ValueT) * num_items, cudaMemcpyHostToDevice));
+            CubDebugExit(cudaMemset(d_values.d_buffers[d_values.selector ^ 1], 0, sizeof(ValueT) * num_items));
+        }
+
+        gpu_timer.Start();  // only the Dispatch call is timed, not the re-initialization above
+        CubDebugExit(Dispatch(
+            Int2Type<IS_DESCENDING>(), Int2Type<BACKEND>(), d_selector, d_temp_storage_bytes, d_cdp_error,
+            mis_aligned_temp, temp_storage_bytes, d_keys, d_values,
+            num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets,
+            begin_bit, end_bit));
+        gpu_timer.Stop();
+        elapsed_millis += gpu_timer.ElapsedMillis();
+    }
+
+    // Display performance
+    if (g_timing_iterations > 0)
+    {
+        float avg_millis = elapsed_millis / g_timing_iterations;
+        float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f;  // items per ms / 1e6 = billions of items per second
+        float giga_bandwidth = (KEYS_ONLY) ?  // bytes per item counted twice (presumably one read plus one write)
+            giga_rate * sizeof(KeyT) * 2 :
+            giga_rate * (sizeof(KeyT) + sizeof(ValueT)) * 2;
+        printf("\n%.3f elapsed ms, %.3f avg ms, %.3f billion items/s, %.3f logical GB/s", elapsed_millis, avg_millis, giga_rate, giga_bandwidth);
+    }
+
+    printf("\n\n");
+
+    // Cleanup (done before the assert below)
+    if (d_keys.d_buffers[0]) CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[0]));
+    if (d_keys.d_buffers[1]) CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[1]));
+    if (d_values.d_buffers[0]) CubDebugExit(g_allocator.DeviceFree(d_values.d_buffers[0]));
+    if (d_values.d_buffers[1]) CubDebugExit(g_allocator.DeviceFree(d_values.d_buffers[1]));
+    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
+    if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
+    if (d_selector) CubDebugExit(g_allocator.DeviceFree(d_selector));
+    if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
+
+    // Correctness asserts
+    AssertEquals(0, compare);
+}
+
+// Estimates the footprint of one test as items * (key + value bytes) * factor
+// (2.25 with overwrite, 3.25 without) and checks it is below TotalGlobalMem().
+template <typename KeyT, typename ValueT>
+bool HasEnoughMemory(std::size_t num_items, bool overwrite)
+{
+    const bool        keys_only      = std::is_same<ValueT, NullType>::value;
+    const std::size_t bytes_per_item = sizeof(KeyT) + (keys_only ? 0 : sizeof(ValueT));
+    const double      safety_factor  = overwrite ? 2.25 : 3.25;
+
+    const std::size_t available = TotalGlobalMem();
+    const std::size_t required  =
+        static_cast<std::size_t>(num_items * bytes_per_item * safety_factor);
+    return required < available;
+}
+
+
+/**
+ * Test backend: builds input/expected values from the ranks, then runs the segmented backends (int NumItemsT only) and, for a single segment, the non-segmented ones.
+ */
+template <bool IS_DESCENDING,
+          typename KeyT,
+          typename ValueT,
+          typename BeginOffsetIteratorT,
+          typename EndOffsetIteratorT,
+          typename NumItemsT>
+void TestBackend(KeyT                *h_keys,                   // host input keys
+                 NumItemsT            num_items,                // number of items
+                 int                  num_segments,             // segment count
+                 BeginOffsetIteratorT d_segment_begin_offsets,  // forwarded to Test
+                 EndOffsetIteratorT   d_segment_end_offsets,    // forwarded to Test
+                 int                  begin_bit,                // forwarded to Test
+                 int                  end_bit,                  // forwarded to Test
+                 KeyT                *h_reference_keys,         // expected sorted keys
+                 NumItemsT           *h_reference_ranks)        // read only when ValueT is not NullType
+{
+#if TEST_CDP == 0
+  constexpr auto NonSegmentedOverwrite   = CUB;
+  constexpr auto NonSegmentedNoOverwrite = CUB_NO_OVERWRITE;
+  constexpr auto SegmentedOverwrite      = CUB_SEGMENTED;
+  constexpr auto SegmentedNoOverwrite    = CUB_SEGMENTED_NO_OVERWRITE;
+#else  // TEST_CDP
+  constexpr auto NonSegmentedOverwrite   = CDP;
+  constexpr auto NonSegmentedNoOverwrite = CDP_NO_OVERWRITE;
+  constexpr auto SegmentedOverwrite      = CDP_SEGMENTED;
+  constexpr auto SegmentedNoOverwrite    = CDP_SEGMENTED_NO_OVERWRITE;
+#endif // TEST_CDP
+
+  const bool KEYS_ONLY = std::is_same<ValueT, NullType>::value;
+
+  // A conservative check assuming overwrite is allowed.
+  if (!HasEnoughMemory<KeyT, ValueT>(static_cast<std::size_t>(num_items), true))
+  {
+    printf("Skipping the test due to insufficient device memory\n");
+    return;
+  }
+
+  std::unique_ptr<ValueT[]> h_value_data{};  // owns both value arrays; stays empty for keys-only
+
+  ValueT *h_values           = nullptr;
+  ValueT *h_reference_values = nullptr;
+
+  if (!KEYS_ONLY)
+  {
+    h_value_data.reset(new ValueT[2 * static_cast<std::size_t>(num_items)]);  // one allocation: inputs first, expected values second
+    h_values           = h_value_data.get();
+    h_reference_values = h_value_data.get() + num_items;
+
+    for (NumItemsT i = 0; i < num_items; ++i)
+    {
+      InitValue(INTEGER_SEED, h_values[i], i);                               // input value derived from the item index
+      InitValue(INTEGER_SEED, h_reference_values[i], h_reference_ranks[i]);  // expected value derived from the rank at position i
+    }
+  }
+
+  // Skip segmented sort if num_items isn't int.
+  // TODO(64bit-seg-sort): re-enable these tests once num_items is templated for
+  // segmented sort.
+  if (std::is_same<NumItemsT, int>::value)
+  {
+    printf("Testing segmented sort with overwrite\n");
+    Test<SegmentedOverwrite, IS_DESCENDING>(h_keys,
+                                            h_values,
+                                            num_items,
+                                            num_segments,
+                                            d_segment_begin_offsets,
+                                            d_segment_end_offsets,
+                                            begin_bit,
+                                            end_bit,
+                                            h_reference_keys,
+                                            h_reference_values);
+    printf("Testing segmented sort with no overwrite\n");
+    Test<SegmentedNoOverwrite, IS_DESCENDING>(h_keys,
+                                              h_values,
+                                              num_items,
+                                              num_segments,
+                                              d_segment_begin_offsets,
+                                              d_segment_end_offsets,
+                                              begin_bit,
+                                              end_bit,
+                                              h_reference_keys,
+                                              h_reference_values);
+  }
+  else
+  {
+    printf("Skipping segmented sort tests (NumItemsT != int)\n");
+  }
+
+  if (num_segments == 1)  // non-segmented backends run only for a single segment
+  {
+    printf("Testing non-segmented sort with overwrite\n");
+    Test<NonSegmentedOverwrite, IS_DESCENDING>(h_keys,
+                                               h_values,
+                                               num_items,
+                                               num_segments,
+                                               d_segment_begin_offsets,
+                                               d_segment_end_offsets,
+                                               begin_bit,
+                                               end_bit,
+                                               h_reference_keys,
+                                               h_reference_values);
+    if (HasEnoughMemory<KeyT, ValueT>(static_cast<std::size_t>(num_items),
+                                      false))  // stricter estimate for the no-overwrite variant
+    {
+      printf("Testing non-segmented sort with no overwrite\n");
+      Test<NonSegmentedNoOverwrite, IS_DESCENDING>(h_keys,
+                                                   h_values,
+                                                   num_items,
+                                                   num_segments,
+                                                   d_segment_begin_offsets,
+                                                   d_segment_end_offsets,
+                                                   begin_bit,
+                                                   end_bit,
+                                                   h_reference_keys,
+                                                   h_reference_values);
+    }
+    else
+    {
+      printf("Skipping no-overwrite tests with %zu items due to "
+             "insufficient memory\n",
+             static_cast<std::size_t>(num_items));
+    }
+  }
+}
+
+
+// Smallest value type for TEST_VALUE_TYPE.
+// Unless TEST_VALUE_TYPE == 3, this is the only value type tested.
+#if TEST_VALUE_TYPE == 0
+// Test keys-only
+using SmallestValueT = NullType;
+#elif TEST_VALUE_TYPE == 1
+// Test with 8b value
+using SmallestValueT = unsigned char;
+#elif TEST_VALUE_TYPE == 2
+// Test with 32b value
+using SmallestValueT = unsigned int;
+#elif TEST_VALUE_TYPE == 3
+// Test with 64b value
+using SmallestValueT = unsigned long long;
+#endif
+
+
+/**
+ * Builds the reference solution once, runs TestBackend for each value type
+ * selected by TEST_VALUE_TYPE, then restores h_keys and frees the reference.
+ */
+template <bool IS_DESCENDING, typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT, typename NumItemsT>
+void TestValueTypes(KeyT *h_keys, NumItemsT num_items, int num_segments, bool pre_sorted,
+                    NumItemsT *h_segment_offsets,
+                    BeginOffsetIteratorT d_segment_begin_offsets,
+                    EndOffsetIteratorT d_segment_end_offsets,
+                    int begin_bit, int end_bit)
+{
+    // Ranks are only needed when values are checked, so they are not
+    // computed (and stay null) for keys-only builds (TEST_VALUE_TYPE == 0).
+    constexpr bool want_ranks = (TEST_VALUE_TYPE != 0);
+    KeyT      *h_reference_keys  = nullptr;
+    NumItemsT *h_reference_ranks = nullptr;
+    InitializeSolution<IS_DESCENDING, want_ranks>(
+        h_keys, num_items, num_segments, pre_sorted, h_segment_offsets,
+        begin_bit, end_bit, h_reference_ranks, h_reference_keys);
+
+    TestBackend<IS_DESCENDING, KeyT, SmallestValueT>(
+        h_keys, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets,
+        begin_bit, end_bit, h_reference_keys, h_reference_ranks);
+
+#if TEST_VALUE_TYPE == 3
+    // Non-trivially-constructible value type; cheap to build, so it rides
+    // along with the 64b value configuration.
+    TestBackend<IS_DESCENDING, KeyT, TestBar>(
+        h_keys, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets,
+        begin_bit, end_bit, h_reference_keys, h_reference_ranks);
+#endif
+
+    ResetKeys<IS_DESCENDING>(h_keys, num_items, pre_sorted, h_reference_keys);
+    delete[] h_reference_ranks;
+    delete[] h_reference_keys;
+}
+
+
+
+/**
+ * Runs the value-type tests twice for the same input: first descending
+ * (IS_DESCENDING = true), then ascending.
+ */
+template <typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT,
+          typename NumItemsT>
+void TestDirection(KeyT *h_keys, NumItemsT num_items, int num_segments, bool pre_sorted,
+                   NumItemsT *h_segment_offsets,
+                   BeginOffsetIteratorT d_segment_begin_offsets,
+                   EndOffsetIteratorT d_segment_end_offsets,
+                   int begin_bit, int end_bit)
+{
+    // Descending pass.
+    TestValueTypes<true>(h_keys, num_items, num_segments, pre_sorted, h_segment_offsets,
+                         d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit);
+    // Ascending pass.
+    TestValueTypes<false>(h_keys, num_items, num_segments, pre_sorted, h_segment_offsets,
+                          d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit);
+}
+
+
+/**
+ * Test different bit ranges: the full key width always and, for unsigned
+ * non-bool keys that are not pre-sorted, three partial ranges before it.
+ */
+template <typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT,
+          typename NumItemsT>
+void TestBits(KeyT *h_keys, NumItemsT num_items, int num_segments, bool pre_sorted,
+              NumItemsT *h_segment_offsets,
+              BeginOffsetIteratorT d_segment_begin_offsets,
+              EndOffsetIteratorT d_segment_end_offsets)
+{
+    const int key_bits = static_cast<int>(sizeof(KeyT) * 8);
+
+    // Partial-word sorting is skipped for boolean, fp and signed types (the
+    // bit-flipping techniques get in the way) and for pre-sorted keys.
+    const bool partial_ok = (Traits<KeyT>::CATEGORY == UNSIGNED_INTEGER)
+                         && !std::is_same<KeyT, bool>::value
+                         && !pre_sorted;
+    if (partial_ok)
+    {
+        // In order: all but the lowest and highest bit, an empty range, and a
+        // two-bit window across the middle of the key.
+        const int half = key_bits / 2;
+        const int ranges[3][2] = {{1, key_bits - 1}, {0, 0}, {half - 1, half + 1}};
+        for (int r = 0; r < 3; ++r)
+        {
+            const int begin_bit = ranges[r][0];
+            const int end_bit   = ranges[r][1];
+            printf("Testing key bits [%d,%d)\n", begin_bit, end_bit); fflush(stdout);
+            TestDirection(h_keys, num_items, num_segments, pre_sorted, h_segment_offsets,
+                          d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit);
+        }
+    }
+
+    // Full key width.
+    printf("Testing key bits [%d,%d)\n", 0, key_bits); fflush(stdout);
+    TestDirection(h_keys, num_items, num_segments, pre_sorted, h_segment_offsets,
+                  d_segment_begin_offsets, d_segment_end_offsets, 0, key_bits);
+}
+
+
+// Identity functor on OffsetT. A separate type from TransformFunctor2 so the
+// begin and end offset iterators built from them differ in type.
+template <typename OffsetT>
+struct TransformFunctor1
+{
+    __host__ __device__ __forceinline__
+    OffsetT operator()(OffsetT offset) const { return offset; }
+};
+
+// Identity functor on OffsetT. A separate type from TransformFunctor1 so the
+// begin and end offset iterators built from them differ in type.
+template <typename OffsetT>
+struct TransformFunctor2
+{
+    __host__ __device__ __forceinline__
+    OffsetT operator()(OffsetT offset) const { return offset; }
+};
+
+
+/**
+ * Test different segment iterators. Calls InitializeSegments and copies the
+ * offsets to the device, then runs TestBits with raw offset pointers and,
+ * for more than one segment, with two differently-typed transform iterators.
+ */
+template <typename KeyT, typename NumItemsT>
+void TestSegmentIterators(KeyT *h_keys, NumItemsT num_items, int num_segments, bool pre_sorted,
+                          NumItemsT *h_segment_offsets, NumItemsT *d_segment_offsets)
+{
+    InitializeSegments(num_items, num_segments, h_segment_offsets);
+    const size_t offsets_bytes = sizeof(NumItemsT) * (num_segments + 1);
+    CubDebugExit(cudaMemcpy(d_segment_offsets, h_segment_offsets, offsets_bytes, cudaMemcpyHostToDevice));
+
+    // Plain pointers; this path also covers the non-segmented sort.
+    NumItemsT *d_begin_ptr = d_segment_offsets;
+    NumItemsT *d_end_ptr   = d_segment_offsets + 1;
+    TestBits(h_keys, num_items, num_segments, pre_sorted, h_segment_offsets, d_begin_ptr, d_end_ptr);
+
+    if (num_segments <= 1) return;
+
+    // Begin/end iterators built from different functor types.
+    using BeginFunctorT = TransformFunctor1<NumItemsT>;
+    using EndFunctorT   = TransformFunctor2<NumItemsT>;
+    using BeginItT = TransformInputIterator<NumItemsT, BeginFunctorT, NumItemsT*, NumItemsT>;
+    using EndItT   = TransformInputIterator<NumItemsT, EndFunctorT, NumItemsT*, NumItemsT>;
+
+    BeginItT d_begin_itr(d_segment_offsets, BeginFunctorT());
+    EndItT   d_end_itr(d_segment_offsets + 1, EndFunctorT());
+
+    TestBits(h_keys, num_items, num_segments, pre_sorted, h_segment_offsets, d_begin_itr, d_end_itr);
+}
+
+
+/**
+ * Test different segment compositions: sweep the segment count down from max_segments, then test a single segment
+ */
+template <typename KeyT, typename NumItemsT>
+void TestSegments(
+    KeyT         *h_keys,
+    NumItemsT    num_items,
+    int          max_segments,
+    bool         pre_sorted)
+{
+    max_segments = static_cast<int>(CUB_MIN(num_items, static_cast<NumItemsT>(max_segments))); // never more segments than items
+    NumItemsT *h_segment_offsets = new NumItemsT[max_segments + 1];
+    // Device-side counterpart of h_segment_offsets (max_segments + 1 entries); both are reused by every call below
+    NumItemsT *d_segment_offsets = nullptr;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_segment_offsets, sizeof(NumItemsT) * (max_segments + 1)));
+
+    for (int num_segments = max_segments; num_segments > 1; num_segments = cub::DivideAndRoundUp(num_segments, 64))
+    {
+        // Pre-sorted tests are not supported for segmented sort, so multi-segment runs require !pre_sorted
+        if (num_items / num_segments < 128 * 1000 && !pre_sorted) {
+            // Right now we assign a single thread block to each segment, so let's keep it to under 128K items per segment
+            TestSegmentIterators(h_keys, num_items, num_segments, pre_sorted, h_segment_offsets, d_segment_offsets);
+        }
+    }
+
+    // Test single segment (skipped for empty inputs)
+    if (num_items > 0)
+    {
+      if (num_items < 128 * 1000 || pre_sorted)
+      {
+        // Right now we assign a single thread block to each segment, so let's
+        // keep it to under 128K items per segment (pre-sorted inputs bypass this cap)
+        TestSegmentIterators(h_keys,
+                             num_items,
+                             1,
+                             pre_sorted,
+                             h_segment_offsets,
+                             d_segment_offsets);
+      }
+    }
+
+    if (h_segment_offsets) delete[] h_segment_offsets;
+    if (d_segment_offsets) CubDebugExit(g_allocator.DeviceFree(d_segment_offsets));
+}
+
+/**
+ * Run TestSegments with int (unsorted input) or std::uint32_t (pre-sorted input) counts when num_items fits, then always with std::size_t
+ */
+template <typename KeyT>
+void TestNumItems(KeyT         *h_keys,
+                  std::size_t  num_items,
+                  int          max_segments,
+                  bool         pre_sorted)
+{
+    const bool fits_int    = num_items <= std::size_t(std::numeric_limits<int>::max());
+    const bool fits_uint32 = num_items <= std::size_t(std::numeric_limits<std::uint32_t>::max());
+    if (fits_int && !pre_sorted) {
+        TestSegments<KeyT, int>(h_keys, static_cast<int>(num_items), max_segments, pre_sorted);
+    }
+    if (fits_uint32 && pre_sorted) {
+        TestSegments<KeyT, std::uint32_t>(h_keys, static_cast<std::uint32_t>(num_items), max_segments, pre_sorted);
+    }
+    TestSegments<KeyT, std::size_t>(h_keys, num_items, max_segments, pre_sorted);
+}
+
+
+/**
+ * Run TestNumItems over a series of problem sizes no larger than max_items
+ */
+template <typename KeyT>
+void TestSizes(KeyT* h_keys,
+               std::size_t max_items,
+               int max_segments,
+               bool pre_sorted)
+{
+    if (!pre_sorted)
+    {
+        // Start at max_items and shrink with cub::DivideAndRoundUp(size, 64) while more than one item remains.
+        std::size_t current = max_items;
+        while (current > 1)
+        {
+            TestNumItems(h_keys, current, max_segments, pre_sorted);
+            current = cub::DivideAndRoundUp(current, 64);
+        }
+        return;
+    }
+
+    // Pre-sorted input: walk a fixed list of sizes and stop at the first one above max_items.
+    const std::size_t fixed_sizes[] = {g_smallest_pre_sorted_num_items, 4350000007ull};
+    for (const std::size_t candidate : fixed_sizes)
+    {
+        if (candidate > max_items) return;
+        TestNumItems(h_keys, candidate, max_segments, pre_sorted);
+    }
+}
+
+/**
+ * Test key sampling distributions (random, +/-0 for floating point, uniform, natural numbers, optionally pre-sorted)
+ */
+template <typename KeyT, bool WITH_PRE_SORTED>
+void TestGen(
+    std::size_t     max_items,
+    int             max_segments)
+{
+    if (max_items == ~std::size_t(0)) // sentinel: use the default item count
+    {
+        max_items = 8000003;
+    }
+
+    if (max_segments < 0) // sentinel: use the default segment count
+    {
+        max_segments = 5003;
+    }
+    // Allocate at least one key: the trivial tests below write and sort h_keys[0] even when max_items is 0.
+    std::unique_ptr<KeyT[]> h_keys(new KeyT[max_items > 0 ? max_items : 1]);
+
+    // Test trivial problem sizes (0 and 1 items)
+    h_keys[0] = static_cast<KeyT>(42);
+    TestNumItems(h_keys.get(), 0, 0, false);
+    TestNumItems(h_keys.get(), 1, 1, false);
+
+    for (int entropy_reduction = 0; entropy_reduction <= 6; entropy_reduction += 6)
+    {
+        printf("\nTesting random %s keys with entropy reduction factor %d\n", typeid(KeyT).name(), entropy_reduction); fflush(stdout);
+        InitializeKeyBits(RANDOM, h_keys.get(), max_items, entropy_reduction);
+        TestSizes(h_keys.get(), max_items, max_segments, false);
+    }
+
+    if (cub::Traits<KeyT>::CATEGORY == cub::FLOATING_POINT)
+    {
+        printf("\nTesting random %s keys with some replaced with -0.0 or +0.0 \n", typeid(KeyT).name());
+        fflush(stdout);
+        InitializeKeyBits(RANDOM_MINUS_PLUS_ZERO, h_keys.get(), max_items, 0);
+        // This just tests +/- 0 handling -- don't need to test multiple sizes
+        TestNumItems(h_keys.get(), max_items, max_segments, false);
+    }
+
+    printf("\nTesting uniform %s keys\n", typeid(KeyT).name()); fflush(stdout);
+    InitializeKeyBits(UNIFORM, h_keys.get(), max_items, 0);
+    TestSizes(h_keys.get(), max_items, max_segments, false);
+
+    printf("\nTesting natural number %s keys\n", typeid(KeyT).name()); fflush(stdout);
+    InitializeKeyBits(INTEGER_SEED, h_keys.get(), max_items, 0);
+    TestSizes(h_keys.get(), max_items, max_segments, false);
+
+    if (WITH_PRE_SORTED)
+    {
+        // Presorting is only used for testing large input arrays.
+        const std::size_t large_num_items = std::size_t(4350000007ull);
+
+        // A conservative check for memory, as we don't know ValueT or whether
+        // the overwrite is allowed until later.
+        // For ValueT, the check is actually exact unless TEST_VALUE_TYPE == 3.
+        if (!HasEnoughMemory<KeyT, SmallestValueT>(large_num_items, true))
+        {
+            printf("Skipping the permutation-based test due to insufficient device memory\n");
+            return;
+        }
+
+        h_keys.reset(nullptr); // Explicitly free old buffer before allocating.
+        h_keys.reset(new KeyT[large_num_items]);
+
+        printf("\nTesting pre-sorted and randomly permuted %s keys\n", typeid(KeyT).name());
+        fflush(stdout);
+        InitializeKeysSorted(h_keys.get(), large_num_items);
+        fflush(stdout);
+        TestSizes(h_keys.get(), large_num_items, max_segments, true);
+        fflush(stdout);
+    }
+}
+
+
+//---------------------------------------------------------------------
+// Simple test: one sort with a caller-chosen size, segment count, key generator and bit range
+//---------------------------------------------------------------------
+
+template <
+    Backend     BACKEND,
+    typename    KeyT,
+    typename    ValueT,
+    bool        IS_DESCENDING>
+void Test(
+    std::size_t num_items,
+    int         num_segments,
+    GenMode     gen_mode,
+    int         entropy_reduction,
+    int         begin_bit,
+    int         end_bit)
+{
+    const bool KEYS_ONLY = std::is_same<ValueT, NullType>::value;
+
+    KeyT         *h_keys             = new KeyT[num_items];
+    std::size_t  *h_reference_ranks  = NULL;
+    KeyT         *h_reference_keys   = NULL;
+    ValueT       *h_values           = NULL;
+    ValueT       *h_reference_values = NULL;
+    size_t       *h_segment_offsets  = new std::size_t[num_segments + 1];
+    // Device copy of the num_segments + 1 segment offsets
+    std::size_t* d_segment_offsets = nullptr;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_segment_offsets, sizeof(std::size_t) * (num_segments + 1)));
+    // A negative end_bit selects the full key width
+    if (end_bit < 0)
+        end_bit = sizeof(KeyT) * 8;
+    // Generate keys and segment offsets, upload the offsets, then compute the host-side reference
+    InitializeKeyBits(gen_mode, h_keys, num_items, entropy_reduction);
+    InitializeSegments(num_items, num_segments, h_segment_offsets);
+    CubDebugExit(cudaMemcpy(d_segment_offsets, h_segment_offsets, sizeof(std::size_t) * (num_segments + 1), cudaMemcpyHostToDevice));
+    InitializeSolution<IS_DESCENDING, !KEYS_ONLY>(
+        h_keys, num_items, num_segments, false, h_segment_offsets,
+        begin_bit, end_bit, h_reference_ranks, h_reference_keys);
+    // Pairs only: value i is seeded from i, reference value i from h_reference_ranks[i]
+    if (!KEYS_ONLY)
+    {
+        h_values            = new ValueT[num_items];
+        h_reference_values  = new ValueT[num_items];
+
+        for (std::size_t i = 0; i < num_items; ++i)
+        {
+            InitValue(INTEGER_SEED, h_values[i], i);
+            InitValue(INTEGER_SEED, h_reference_values[i], h_reference_ranks[i]);
+        }
+    }
+    if (h_reference_ranks) delete[] h_reference_ranks;
+
+    printf("\nTesting bits [%d,%d) of %s keys with gen-mode %d\n", begin_bit, end_bit, typeid(KeyT).name(), gen_mode); fflush(stdout);
+    Test<BACKEND, IS_DESCENDING>(
+        h_keys, h_values,
+        num_items, num_segments, d_segment_offsets, d_segment_offsets + 1,
+        begin_bit, end_bit, h_reference_keys, h_reference_values);
+    // Release the host buffers and the device offsets
+    if (h_keys)             delete[] h_keys;
+    if (h_reference_keys)   delete[] h_reference_keys;
+    if (h_values)           delete[] h_values;
+    if (h_reference_values) delete[] h_reference_values;
+    if (h_segment_offsets)  delete[] h_segment_offsets;
+    if (d_segment_offsets) CubDebugExit(g_allocator.DeviceFree(d_segment_offsets));
+}
+// TestUnspecifiedRanges: segmented SortPairs where the begin/end offsets may cover only part of the input
+#if TEST_VALUE_TYPE == 0
+void TestUnspecifiedRanges()
+{
+  const std::size_t num_items = 1024 * 1024;
+  const std::size_t max_segments = 42;
+  const std::size_t avg_segment_size = num_items / max_segments;
+
+  for (int iteration = 0; iteration < 4; iteration++)
+  {
+    thrust::host_vector<int> h_offsets_begin;
+    thrust::host_vector<int> h_offsets_end;
+
+    h_offsets_begin.reserve(max_segments + 1);
+    h_offsets_end.reserve(max_segments + 1);
+    // Draw up to max_segments candidate segments and record only the ones marked as utilized
+    {
+      int offset = 0;
+
+      for (std::size_t sid = 0; sid < max_segments; sid++)
+      {
+        const int segment_size = 
+          static_cast<int>(RandomValue(avg_segment_size));
+
+        const bool segment_is_utilized = segment_size > 0 
+                                       && RandomValue(100) > 60;
+
+        if (segment_is_utilized)
+        {
+          h_offsets_begin.push_back(offset);
+          h_offsets_end.push_back(offset + segment_size);
+        }
+        // Advance even when the segment is skipped, which leaves a gap between recorded segments
+        offset += segment_size;
+      }
+      // Guarantee at least one segment
+      if (h_offsets_begin.empty())
+      {
+        h_offsets_begin.push_back(avg_segment_size);
+        h_offsets_end.push_back(num_items);
+      }
+    }
+    // Keys and values are both filled by thrust::sequence through reverse iterators
+    thrust::device_vector<int> keys(num_items);
+    thrust::device_vector<int> values(num_items);
+
+    thrust::sequence(keys.rbegin(), keys.rend());
+    thrust::sequence(values.rbegin(), values.rend());
+
+    thrust::device_vector<int> d_offsets_begin = h_offsets_begin;
+    thrust::device_vector<int> d_offsets_end = h_offsets_end;
+
+    thrust::device_vector<int> expected_keys = keys;
+    thrust::device_vector<int> expected_values = values;
+
+    const int num_segments = static_cast<int>(h_offsets_begin.size());
+
+    thrust::device_vector<int> result_keys = keys;
+    thrust::device_vector<int> result_values = values;
+    // Reference: sort each recorded segment independently with thrust::sort_by_key
+    for (int sid = 0; sid < num_segments; sid++)
+    {
+      const int segment_begin = h_offsets_begin[sid];
+      const int segment_end = h_offsets_end[sid];
+
+      thrust::sort_by_key(expected_keys.begin() + segment_begin,
+                          expected_keys.begin() + segment_end,
+                          expected_values.begin() + segment_begin);
+    }
+    // Variant 1: the DoubleBuffer overload of SortPairs
+    {
+      cub::DoubleBuffer<int> keys_buffer(
+          thrust::raw_pointer_cast(keys.data()), 
+          thrust::raw_pointer_cast(result_keys.data()));
+
+      cub::DoubleBuffer<int> values_buffer(
+          thrust::raw_pointer_cast(values.data()), 
+          thrust::raw_pointer_cast(result_values.data()));
+
+      std::size_t temp_storage_bytes{};
+      std::uint8_t *d_temp_storage{nullptr};
+      // First call passes a null d_temp_storage; temp_storage_bytes then sizes the allocation below
+      CubDebugExit(cub::DeviceSegmentedRadixSort::SortPairs(
+          d_temp_storage, temp_storage_bytes, 
+          keys_buffer, values_buffer, 
+          num_items, num_segments, 
+          thrust::raw_pointer_cast(d_offsets_begin.data()),
+          thrust::raw_pointer_cast(d_offsets_end.data()),
+          0, sizeof(int) * 8));
+
+      thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
+      d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
+      // Second call, now with the allocated temporary storage
+      CubDebugExit(cub::DeviceSegmentedRadixSort::SortPairs(
+          d_temp_storage, temp_storage_bytes, 
+          keys_buffer, values_buffer, 
+          num_items, num_segments, 
+          thrust::raw_pointer_cast(d_offsets_begin.data()),
+          thrust::raw_pointer_cast(d_offsets_end.data()),
+          0, sizeof(int) * 8));
+      // When a buffer's selector is 0, copy each segment from keys/values into result_keys/result_values
+      for (int sid = 0; sid < num_segments; sid++)
+      {
+        const int segment_begin = h_offsets_begin[sid];
+        const int segment_end = h_offsets_end[sid];
+
+        if (keys_buffer.selector == 0)
+        {
+          thrust::copy(
+              keys.begin() + segment_begin,
+              keys.begin() + segment_end,
+              result_keys.begin() + segment_begin);
+        }
+                       
+        if (values_buffer.selector == 0)
+        {
+          thrust::copy(
+              values.begin() + segment_begin,
+              values.begin() + segment_end,
+              result_values.begin() + segment_begin);
+        }
+      }
+    }
+
+    AssertEquals(result_keys, expected_keys); 
+    AssertEquals(result_values, expected_values);
+    // Reset the inputs and repeat with the separate input/output pointer overload
+    thrust::sequence(keys.rbegin(), keys.rend());
+    thrust::sequence(values.rbegin(), values.rend());
+
+    result_keys = keys;
+    result_values = values;
+    // Variant 2: keys/values are the inputs, result_keys/result_values the outputs
+    {
+      std::size_t temp_storage_bytes{};
+      std::uint8_t *d_temp_storage{};
+
+      CubDebugExit(cub::DeviceSegmentedRadixSort::SortPairs(
+          d_temp_storage, temp_storage_bytes, 
+          thrust::raw_pointer_cast(keys.data()), 
+          thrust::raw_pointer_cast(result_keys.data()), 
+          thrust::raw_pointer_cast(values.data()), 
+          thrust::raw_pointer_cast(result_values.data()), 
+          num_items, num_segments, 
+          thrust::raw_pointer_cast(d_offsets_begin.data()),
+          thrust::raw_pointer_cast(d_offsets_end.data()),
+          0, sizeof(int) * 8));
+
+      thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
+      d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
+
+      CubDebugExit(cub::DeviceSegmentedRadixSort::SortPairs(
+          d_temp_storage, temp_storage_bytes, 
+          thrust::raw_pointer_cast(keys.data()), 
+          thrust::raw_pointer_cast(result_keys.data()), 
+          thrust::raw_pointer_cast(values.data()), 
+          thrust::raw_pointer_cast(result_values.data()), 
+          num_items, num_segments, 
+          thrust::raw_pointer_cast(d_offsets_begin.data()),
+          thrust::raw_pointer_cast(d_offsets_end.data()),
+          0, sizeof(int) * 8));
+    }
+
+    AssertEquals(result_values, expected_values);
+    AssertEquals(result_keys, expected_keys);
+  }
+}
+#endif
+
+#if TEST_KEY_BYTES == 4
+// The following tests check that the new decomposer API doesn't break the old API.
+// They are disabled because some compilers don't like the implicit conversions that
+// the tests require. Once we figure out how to temporarily enable those conversions, we can
+// re-enable the tests.
+#define ENABLING_CONVERSION_IS_FIGURED_OUT 0
+#if ENABLING_CONVERSION_IS_FIGURED_OUT 
+struct bit_selector
+{
+  int bit; // wrapped bit index
+  // Implicit conversion back to the wrapped int
+  operator int() const
+  {
+    return bit;
+  }
+};
+// Calls DeviceRadixSort::SortKeys and SortKeysDescending with caller-typed bit arguments (zero items, null temp storage)
+template <class BeginBitT, class EndBitT, class... Ts>
+void device_radix_sort_keys_allows_implicit_conversions_for_bits_helper(BeginBitT begin_bit, EndBitT end_bit, Ts... args)
+{
+  const int num_items = 0;
+  // SortKeys with begin_bit and end_bit
+  {
+    std::size_t temp_storage_bytes = 0;
+    std::uint8_t *d_temp_storage   = nullptr;
+
+    cub::DeviceRadixSort::SortKeys(d_temp_storage,
+                                   temp_storage_bytes,
+                                   args...,
+                                   num_items,
+                                   begin_bit,
+                                   end_bit);
+  }
+  // SortKeys with begin_bit only
+  {
+    std::size_t temp_storage_bytes = 0;
+    std::uint8_t *d_temp_storage   = nullptr;
+
+    cub::DeviceRadixSort::SortKeys(d_temp_storage,
+                                   temp_storage_bytes,
+                                   args...,
+                                   num_items,
+                                   begin_bit);
+  }
+  // SortKeysDescending with begin_bit and end_bit
+  {
+    std::size_t temp_storage_bytes = 0;
+    std::uint8_t *d_temp_storage   = nullptr;
+
+    cub::DeviceRadixSort::SortKeysDescending(d_temp_storage,
+                                             temp_storage_bytes,
+                                             args...,
+                                             num_items,
+                                             begin_bit,
+                                             end_bit);
+  }
+  // SortKeysDescending with begin_bit only
+  {
+    std::size_t temp_storage_bytes = 0;
+    std::uint8_t *d_temp_storage   = nullptr;
+
+    cub::DeviceRadixSort::SortKeysDescending(d_temp_storage,
+                                             temp_storage_bytes,
+                                             args...,
+                                             num_items,
+                                             begin_bit);
+  }
+}
+// Calls DeviceRadixSort::SortPairs and SortPairsDescending with caller-typed bit arguments (zero items, null temp storage)
+template <class BeginBitT, class EndBitT, class... Ts>
+void device_radix_sort_pairs_allows_implicit_conversions_for_bits_helper(BeginBitT begin_bit,
+                                                                         EndBitT end_bit,
+                                                                         Ts... args)
+{
+  const int num_items = 0;
+  // SortPairs with begin_bit and end_bit
+  {
+    std::size_t temp_storage_bytes = 0;
+    std::uint8_t *d_temp_storage   = nullptr;
+
+    cub::DeviceRadixSort::SortPairs(d_temp_storage,
+                                    temp_storage_bytes,
+                                    args...,
+                                    num_items,
+                                    begin_bit,
+                                    end_bit);
+  }
+  // SortPairs with begin_bit only
+  {
+    std::size_t temp_storage_bytes = 0;
+    std::uint8_t *d_temp_storage   = nullptr;
+
+    cub::DeviceRadixSort::SortPairs(d_temp_storage,
+                                    temp_storage_bytes,
+                                    args...,
+                                    num_items,
+                                    begin_bit);
+  }
+  // SortPairsDescending with begin_bit and end_bit
+  {
+    std::size_t temp_storage_bytes = 0;
+    std::uint8_t *d_temp_storage   = nullptr;
+
+    cub::DeviceRadixSort::SortPairsDescending(d_temp_storage,
+                                              temp_storage_bytes,
+                                              args...,
+                                              num_items,
+                                              begin_bit,
+                                              end_bit);
+  }
+  // SortPairsDescending with begin_bit only
+  {
+    std::size_t temp_storage_bytes = 0;
+    std::uint8_t *d_temp_storage   = nullptr;
+
+    cub::DeviceRadixSort::SortPairsDescending(d_temp_storage,
+                                              temp_storage_bytes,
+                                              args...,
+                                              num_items,
+                                              begin_bit);
+  }
+}
+// Runs the keys helper (and, when TEST_VALUE_TYPE == 2, the pairs helper) with pointer and DoubleBuffer arguments
+template <class BeginBitT, class EndBitT>
+void device_radix_sort_allows_implicit_conversions_for_bits(BeginBitT begin_bit, EndBitT end_bit)
+{
+  int *d_i_ptr = nullptr;
+  // Keys through separate input/output pointers
+  device_radix_sort_keys_allows_implicit_conversions_for_bits_helper(begin_bit,
+                                                                     end_bit,
+                                                                     d_i_ptr,
+                                                                     d_i_ptr);
+  // Keys through a DoubleBuffer
+  cub::DoubleBuffer<int> keys(d_i_ptr, d_i_ptr);
+  device_radix_sort_keys_allows_implicit_conversions_for_bits_helper(begin_bit, end_bit, keys);
+
+  #if TEST_VALUE_TYPE == 2
+  unsigned int *d_u_ptr = nullptr;
+  // Pairs through separate input/output pointers
+  device_radix_sort_pairs_allows_implicit_conversions_for_bits_helper(begin_bit,
+                                                                      end_bit,
+                                                                      d_i_ptr,
+                                                                      d_i_ptr,
+                                                                      d_u_ptr,
+                                                                      d_u_ptr);
+  // Pairs through DoubleBuffers
+  cub::DoubleBuffer<unsigned int> pairs(d_u_ptr, d_u_ptr);
+  device_radix_sort_pairs_allows_implicit_conversions_for_bits_helper(begin_bit,
+                                                                      end_bit,
+                                                                      keys,
+                                                                      pairs);
+  #endif
+}
+// Tries every pairing of int, long long int and bit_selector for (begin_bit, end_bit)
+void device_radix_sort_allows_implicit_conversions_for_bits()
+{
+  int begin_i = 0;
+  long long int begin_lli = 0;
+  bit_selector begin_bs{0};
+
+  int end_i = 2;
+  long long int end_lli = 2;
+  bit_selector end_bs{2};
+  // 3 begin types x 3 end types
+  device_radix_sort_allows_implicit_conversions_for_bits(begin_i, end_i);
+  device_radix_sort_allows_implicit_conversions_for_bits(begin_i, end_lli);
+  device_radix_sort_allows_implicit_conversions_for_bits(begin_i, end_bs);
+  device_radix_sort_allows_implicit_conversions_for_bits(begin_lli, end_i);
+  device_radix_sort_allows_implicit_conversions_for_bits(begin_lli, end_lli);
+  device_radix_sort_allows_implicit_conversions_for_bits(begin_lli, end_bs);
+  device_radix_sort_allows_implicit_conversions_for_bits(begin_bs, end_i);
+  device_radix_sort_allows_implicit_conversions_for_bits(begin_bs, end_lli);
+  device_radix_sort_allows_implicit_conversions_for_bits(begin_bs, end_bs);
+}
+#endif // ENABLING_CONVERSION_IS_FIGURED_OUT 
+#endif // TEST_KEY_BYTES == 4
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+
+/**
+ * Main: parse the command line, initialize the device, and run the TestGen sweeps selected by TEST_KEY_BYTES / TEST_VALUE_TYPE
+ */
+int main(int argc, char** argv)
+{
+    std::size_t num_items = ~std::size_t(0);  // sentinel: TestGen substitutes its default item count
+    int num_segments = -1;                    // sentinel: TestGen substitutes its default segment count
+
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    args.GetCmdLineArgument("n", num_items);
+    args.GetCmdLineArgument("s", num_segments);
+    args.GetCmdLineArgument("i", g_timing_iterations);
+
+    // Print usage (every optional argument is enclosed in balanced brackets)
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--n=<input items>] "
+            "[--s=<num segments>] "
+            "[--i=<timing iterations>] "
+            "[--device=<device-id>] "
+            "[--v] "
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+    // %PARAM% TEST_CDP cdp 0:1
+    // %PARAM% TEST_KEY_BYTES bytes 1:2:4:8:16
+    // %PARAM% TEST_VALUE_TYPE pairs 0:1:2:3
+    //   0->Keys only
+    //   1->uchar
+    //   2->uint
+    //   3->[ull,TestBar] (TestBar is cheap to build, included here to
+    //                     reduce total number of targets)
+
+    // To reduce testing time, some key types are only tested when not
+    // testing pairs:
+#if TEST_VALUE_TYPE == 0
+#define TEST_EXTENDED_KEY_TYPES
+#endif
+
+    // Compile/run thorough tests
+#if TEST_KEY_BYTES == 1
+
+    TestGen<signed char, true>        (num_items, num_segments);
+
+#ifdef TEST_EXTENDED_KEY_TYPES
+    TestGen<bool, false>              (num_items, num_segments);
+    TestGen<signed char, false>       (num_items, num_segments);
+    TestGen<unsigned char, false>     (num_items, num_segments);
+#endif // TEST_EXTENDED_KEY_TYPES
+
+#elif TEST_KEY_BYTES == 2
+    TestGen<unsigned short, true>     (num_items, num_segments);
+
+#ifdef TEST_EXTENDED_KEY_TYPES
+    TestGen<short, false>             (num_items, num_segments);
+
+#if !_NVHPC_CUDA
+    TestGen<half_t, false>            (num_items, num_segments);
+#endif // !_NVHPC_CUDA
+
+#if !_NVHPC_CUDA
+#if !defined(__ICC)
+    // Fails with `-0 != 0` with ICC for unknown reasons. See #333.
+    TestGen<bfloat16_t, false>        (num_items, num_segments);
+#endif // !ICC
+#endif // !_NVHPC_CUDA
+
+#endif // TEST_EXTENDED_KEY_TYPES
+
+#elif TEST_KEY_BYTES == 4
+
+    TestGen<int, true>                (num_items, num_segments);
+
+#if TEST_VALUE_TYPE == 0
+    TestUnspecifiedRanges();
+#endif
+
+#if ENABLING_CONVERSION_IS_FIGURED_OUT
+    device_radix_sort_allows_implicit_conversions_for_bits();
+#endif
+
+#ifdef TEST_EXTENDED_KEY_TYPES
+    TestGen<float, false>             (num_items, num_segments);
+    TestGen<unsigned int, false>      (num_items, num_segments);
+#endif // TEST_EXTENDED_KEY_TYPES
+
+#elif TEST_KEY_BYTES == 8
+
+    TestGen<double, true>             (num_items, num_segments);
+
+#ifdef TEST_EXTENDED_KEY_TYPES
+    TestGen<long long, false>         (num_items, num_segments);
+    TestGen<unsigned long long, false>(num_items, num_segments);
+#endif // TEST_EXTENDED_KEY_TYPES
+
+#elif TEST_KEY_BYTES == 16
+
+#if CUB_IS_INT128_ENABLED
+    TestGen<__int128_t,  false>(num_items, num_segments);
+    TestGen<__uint128_t, false>(num_items, num_segments);
+#else
+    // Fix unused static function for MSVC
+    BackendToString(CUB);
+#endif
+
+#endif // TEST_KEY_BYTES switch
+
+    return 0;
+}
diff --git a/include/cub/test/test_device_reduce.cu b/include/cub/test/test_device_reduce.cu
new file mode 100644
index 0000000..869fde2
--- /dev/null
+++ b/include/cub/test/test_device_reduce.cu
@@ -0,0 +1,1916 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Test of DeviceReduce utilities
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <cub/device/device_reduce.cuh>
+#include <cub/device/device_segmented_reduce.cuh>
+#include <cub/iterator/constant_input_iterator.cuh>
+#include <cub/iterator/discard_output_iterator.cuh>
+#include <cub/iterator/transform_input_iterator.cuh>
+#include <cub/thread/thread_operators.cuh>
+#include <cub/util_allocator.cuh>
+#include <cub/util_math.cuh>
+#include <cub/util_type.cuh>
+
+#include <thrust/device_vector.h>
+#include <thrust/iterator/constant_iterator.h>
+#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
+
+#include <cstdio>
+#include <limits>
+#include <type_traits>
+#include <typeinfo>
+
+#include "test_util.h"
+#include <nv/target>
+
+#define TEST_HALF_T !_NVHPC_CUDA
+
+#define TEST_BF_T !_NVHPC_CUDA
+
+#if TEST_HALF_T
+#include <cuda_fp16.h>
+
+// Half support is provided by SM53+. We currently test against a few older architectures.
+// The specializations below can be removed once we drop these architectures.
+namespace cub {
+// Min for __half: direct CUB_MIN where NV_PROVIDES_SM_53 holds, otherwise CUB_MIN on the float conversions
+template <>
+__host__ __device__ __forceinline__ //
+__half Min::operator()(__half& a, __half& b) const 
+{
+    NV_IF_TARGET(NV_PROVIDES_SM_53, 
+                    (return CUB_MIN(a, b);),
+                    (return CUB_MIN(__half2float(a), __half2float(b));));
+}
+// ArgMin for __half: values are compared as float; on equal values the pair with the lower key is returned
+template <>
+__host__ __device__ __forceinline__ //
+KeyValuePair<int, __half> 
+ArgMin::operator()(const KeyValuePair<int, __half> &a, 
+                   const KeyValuePair<int, __half> &b) const 
+{
+    const float av = __half2float(a.value);
+    const float bv = __half2float(b.value);
+
+    if ((bv < av) || ((av == bv) && (b.key < a.key)))
+    {
+      return b;
+    }
+
+    return a;
+}
+// Max for __half: direct CUB_MAX where NV_PROVIDES_SM_53 holds, otherwise CUB_MAX on the float conversions
+template <>
+__host__ __device__ __forceinline__ //
+__half Max::operator()(__half& a, __half& b) const 
+{
+    NV_IF_TARGET(NV_PROVIDES_SM_53, 
+                    (return CUB_MAX(a, b);),
+                    (return CUB_MAX(__half2float(a), __half2float(b));));
+}
+// ArgMax for __half: values are compared as float; on equal values the pair with the lower key is returned
+template <>
+__host__ __device__ __forceinline__ //
+KeyValuePair<int, __half> 
+ArgMax::operator()(const KeyValuePair<int, __half> &a, 
+                   const KeyValuePair<int, __half> &b) const 
+{
+    const float av = __half2float(a.value);
+    const float bv = __half2float(b.value);
+
+    if ((bv > av) || ((av == bv) && (b.key < a.key)))
+    {
+      return b;
+    }
+
+    return a;
+}
+
+} // namespace cub
+#endif
+
+#if TEST_BF_T
+#include <cuda_bf16.h>
+#endif
+
+using namespace cub;
+
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+int                     g_ptx_version;
+int                     g_sm_count;
+double                  g_device_giga_bandwidth;
+bool                    g_verbose           = false;
+bool                    g_verbose_input     = false;
+int                     g_timing_iterations = 0;
+CachingDeviceAllocator  g_allocator(true);
+
+
+// Dispatch types: each enumerator is a backend tag (the Dispatch overloads take it as Int2Type<...>)
+enum Backend
+{
+    CUB,            // CUB method
+    CUB_SEGMENTED,  // CUB segmented method
+    CDP,            // GPU-based (dynamic parallelism) dispatch to CUB method
+    CDP_SEGMENTED,  // GPU-based segmented method
+};
+
+inline const char* BackendToString(Backend b)
+{
+  // Printable name of a dispatch backend. Values outside the
+  // enumeration keep the empty string.
+  const char* label = "";
+
+  switch (b)
+  {
+    case CUB:           label = "CUB";           break;
+    case CUB_SEGMENTED: label = "CUB_SEGMENTED"; break;
+    case CDP:           label = "CDP";           break;
+    case CDP_SEGMENTED: label = "CDP_SEGMENTED"; break;
+    default:
+      break;
+  }
+
+  return label;
+}
+
+// Custom max functor: a reduction operator defined by the test that returns CUB_MAX(a, b)
+struct CustomMax
+{
+    /// Generic max operator; the return type is the accumulator type of cub::Max for (T, C)
+    template <typename T, typename C>
+    __host__ __device__ auto operator()(T&& a, C&& b) 
+      -> cub::detail::accumulator_t<cub::Max, T, C>
+    {
+        return CUB_MAX(a, b);
+    }
+    // __half overload: delegates to cub::Max
+#if TEST_HALF_T
+    __host__ __device__ __half operator()(__half& a, __half& b) 
+    {
+        return cub::Max{}(a, b);
+    }
+#endif
+};
+
+// Comparing sums computed on the CPU and the GPU is impractical for extended floating-point types.
+// For instance, with a constant iterator yielding two, the accumulator of the sequential (CPU)
+// reference computation gets stuck at 4096 and never changes again (`4096 + 2 = 4096`).
+// Meanwhile, per-thread aggregates (`2 * 16 = 32`) are accumulated within and among thread blocks,
+// yielding `inf` as a result. No reasonable epsilon can be selected to compare `inf` with `4096`.
+// To make `__half` and `__nv_bfloat16` addition associative, the function object below adds the
+// raw 16-bit representations as integers instead. This allows us to test large inputs with few
+// code-path differences in device algorithms.
+struct ExtendedFloatSum
+{
+    template <class T>
+    __host__ __device__ T operator()(T a, T b) const
+    {
+        T result{};
+        result.__x = a.raw() + b.raw(); // sum of the raw representations, stored directly in the result
+        return result;
+    }
+
+#if TEST_HALF_T
+    __host__ __device__ __half operator()(__half a, __half b) const
+    {
+        uint16_t result = this->operator()(half_t{a}, half_t(b)).raw(); // reuse the generic overload through half_t
+        return reinterpret_cast<__half &>(result);
+    }
+#endif
+
+#if TEST_BF_T
+    __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const // note: __device__ only, unlike the __half overload
+    {
+        uint16_t result = this->operator()(bfloat16_t{a}, bfloat16_t(b)).raw(); // reuse the generic overload through bfloat16_t
+        return reinterpret_cast<__nv_bfloat16 &>(result);
+    }
+#endif
+};
+
+//---------------------------------------------------------------------
+// Dispatch to different CUB DeviceReduce entrypoints
+//---------------------------------------------------------------------
+
+/**
+ * CUB backend, generic operator (custom-max): runs DeviceReduce::Reduce seeded with the lowest InputT value
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT, typename ReductionOpT>
+CUB_RUNTIME_FUNCTION
+cudaError_t Dispatch(
+    Int2Type<CUB>        /*dispatch_to*/,
+    int                  timing_iterations,
+    size_t*              /*d_temp_storage_bytes*/,
+    cudaError_t*         /*d_cdp_error*/,
+
+    void*                d_temp_storage,
+    size_t&              temp_storage_bytes,
+    InputIteratorT       d_in,
+    OutputIteratorT      d_out,
+    int                  num_items,
+    int                  /*max_segments*/,
+    BeginOffsetIteratorT /*d_segment_begin_offsets*/,
+    EndOffsetIteratorT   /*d_segment_end_offsets*/,
+    ReductionOpT         reduction_op)
+{
+    // Element type read through the input iterator
+    using InputT = cub::detail::value_t<InputIteratorT>;
+
+    // Element type of the output, derived from the output iterator and InputT
+    using OutputT = cub::detail::non_void_value_t<OutputIteratorT, InputT>;
+
+    // Seed of the reduction: the identity of max
+    OutputT identity = Traits<InputT>::Lowest(); // replace with std::numeric_limits<OutputT>::lowest() when C++ support is more prevalent
+    cudaError_t status = cudaSuccess;
+
+    // One DeviceReduce::Reduce call per timing iteration; only the last status is kept
+    for (int remaining = timing_iterations; remaining > 0; --remaining)
+    {
+        status = DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, reduction_op, identity);
+    }
+
+    return status;
+}
+
+/**
+ * CUB backend, cub::Sum: runs DeviceReduce::Sum once per timing iteration
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
+CUB_RUNTIME_FUNCTION
+cudaError_t Dispatch(
+    Int2Type<CUB>        /*dispatch_to*/,
+    int                  timing_iterations,
+    size_t*              /*d_temp_storage_bytes*/,
+    cudaError_t*         /*d_cdp_error*/,
+
+    void*                d_temp_storage,
+    size_t&              temp_storage_bytes,
+    InputIteratorT       d_in,
+    OutputIteratorT      d_out,
+    int                  num_items,
+    int                  /*max_segments*/,
+    BeginOffsetIteratorT /*d_segment_begin_offsets*/,
+    EndOffsetIteratorT   /*d_segment_end_offsets*/,
+    cub::Sum             /*reduction_op*/)
+{
+    cudaError_t status = cudaSuccess;
+
+    // One DeviceReduce::Sum call per timing iteration; only the last status is kept
+    for (int remaining = timing_iterations; remaining > 0; --remaining)
+    {
+        status = DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+    }
+    return status;
+}
+
+/**
+ * CUB backend, ExtendedFloatSum: runs DeviceReduce::Reduce seeded with a value-initialised OutputT
+ */
+template <typename InputIteratorT,
+          typename OutputIteratorT,
+          typename BeginOffsetIteratorT,
+          typename EndOffsetIteratorT>
+CUB_RUNTIME_FUNCTION cudaError_t Dispatch(Int2Type<CUB> /*dispatch_to*/,
+                                          int timing_iterations,
+                                          size_t * /*d_temp_storage_bytes*/,
+                                          cudaError_t * /*d_cdp_error*/,
+
+                                          void *d_temp_storage,
+                                          size_t &temp_storage_bytes,
+                                          InputIteratorT d_in,
+                                          OutputIteratorT d_out,
+                                          int num_items,
+                                          int /*max_segments*/,
+                                          BeginOffsetIteratorT /*d_segment_begin_offsets*/,
+                                          EndOffsetIteratorT /*d_segment_end_offsets*/,
+                                          ExtendedFloatSum reduction_op)
+{
+    // A value-initialised OutputT is the starting value handed to the reduction
+    using InputT  = cub::detail::value_t<InputIteratorT>;
+    using OutputT = cub::detail::non_void_value_t<OutputIteratorT, InputT>;
+    OutputT identity{};
+
+    cudaError_t status = cudaSuccess;
+
+    // One DeviceReduce::Reduce call per timing iteration; only the last status is kept
+    for (int remaining = timing_iterations; remaining > 0; --remaining)
+    {
+        status = DeviceReduce::Reduce(d_temp_storage,
+                                      temp_storage_bytes,
+                                      d_in,
+                                      d_out,
+                                      num_items,
+                                      reduction_op,
+                                      identity);
+    }
+    return status;
+}
+
+/**
+ * CUB backend, cub::Min: runs DeviceReduce::Min once per timing iteration
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
+CUB_RUNTIME_FUNCTION
+cudaError_t Dispatch(
+    Int2Type<CUB>        /*dispatch_to*/,
+    int                  timing_iterations,
+    size_t*              /*d_temp_storage_bytes*/,
+    cudaError_t*         /*d_cdp_error*/,
+
+    void*                d_temp_storage,
+    size_t&              temp_storage_bytes,
+    InputIteratorT       d_in,
+    OutputIteratorT      d_out,
+    int                  num_items,
+    int                  /*max_segments*/,
+    BeginOffsetIteratorT /*d_segment_begin_offsets*/,
+    EndOffsetIteratorT   /*d_segment_end_offsets*/,
+    cub::Min             /*reduction_op*/)
+{
+    cudaError_t status = cudaSuccess;
+
+    // One DeviceReduce::Min call per timing iteration; only the last status is kept
+    for (int remaining = timing_iterations; remaining > 0; --remaining)
+    {
+        status = DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+    }
+    return status;
+}
+
+/**
+ * CUB backend, cub::Max: runs DeviceReduce::Max once per timing iteration
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
+CUB_RUNTIME_FUNCTION
+cudaError_t Dispatch(
+    Int2Type<CUB>        /*dispatch_to*/,
+    int                  timing_iterations,
+    size_t*              /*d_temp_storage_bytes*/,
+    cudaError_t*         /*d_cdp_error*/,
+
+    void*                d_temp_storage,
+    size_t&              temp_storage_bytes,
+    InputIteratorT       d_in,
+    OutputIteratorT      d_out,
+    int                  num_items,
+    int                  /*max_segments*/,
+    BeginOffsetIteratorT /*d_segment_begin_offsets*/,
+    EndOffsetIteratorT   /*d_segment_end_offsets*/,
+    cub::Max             /*reduction_op*/)
+{
+    cudaError_t status = cudaSuccess;
+
+    // One DeviceReduce::Max call per timing iteration; only the last status is kept
+    for (int remaining = timing_iterations; remaining > 0; --remaining)
+    {
+        status = DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+    }
+    return status;
+}
+
+/**
+ * CUB backend, cub::ArgMin: runs DeviceReduce::ArgMin once per timing iteration
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
+CUB_RUNTIME_FUNCTION
+cudaError_t Dispatch(
+    Int2Type<CUB>        /*dispatch_to*/,
+    int                  timing_iterations,
+    size_t*              /*d_temp_storage_bytes*/,
+    cudaError_t*         /*d_cdp_error*/,
+
+    void*                d_temp_storage,
+    size_t&              temp_storage_bytes,
+    InputIteratorT       d_in,
+    OutputIteratorT      d_out,
+    int                  num_items,
+    int                  /*max_segments*/,
+    BeginOffsetIteratorT /*d_segment_begin_offsets*/,
+    EndOffsetIteratorT   /*d_segment_end_offsets*/,
+    cub::ArgMin          /*reduction_op*/)
+{
+    cudaError_t status = cudaSuccess;
+
+    // One DeviceReduce::ArgMin call per timing iteration; only the last status is kept
+    for (int remaining = timing_iterations; remaining > 0; --remaining)
+    {
+        status = DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+    }
+    return status;
+}
+
+/**
+ * CUB backend, cub::ArgMax: runs DeviceReduce::ArgMax once per timing iteration
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
+CUB_RUNTIME_FUNCTION
+cudaError_t Dispatch(
+    Int2Type<CUB>        /*dispatch_to*/,
+    int                  timing_iterations,
+    size_t*              /*d_temp_storage_bytes*/,
+    cudaError_t*         /*d_cdp_error*/,
+
+    void*                d_temp_storage,
+    size_t&              temp_storage_bytes,
+    InputIteratorT       d_in,
+    OutputIteratorT      d_out,
+    int                  num_items,
+    int                  /*max_segments*/,
+    BeginOffsetIteratorT /*d_segment_begin_offsets*/,
+    EndOffsetIteratorT   /*d_segment_end_offsets*/,
+    cub::ArgMax          /*reduction_op*/)
+{
+    cudaError_t status = cudaSuccess;
+
+    // One DeviceReduce::ArgMax call per timing iteration; only the last status is kept
+    for (int remaining = timing_iterations; remaining > 0; --remaining)
+    {
+        status = DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+    }
+    return status;
+}
+
+
+//---------------------------------------------------------------------
+// Dispatch to different CUB DeviceSegmentedReduce entrypoints
+//---------------------------------------------------------------------
+
+/**
+ * Segmented CUB backend, generic operator (custom-max): runs DeviceSegmentedReduce::Reduce seeded with the lowest InputT value
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT, typename ReductionOpT>
+CUB_RUNTIME_FUNCTION
+cudaError_t Dispatch(
+    Int2Type<CUB_SEGMENTED> /*dispatch_to*/,
+    int                  timing_iterations,
+    size_t*              /*d_temp_storage_bytes*/,
+    cudaError_t*         /*d_cdp_error*/,
+
+    void*                d_temp_storage,
+    size_t&              temp_storage_bytes,
+    InputIteratorT       d_in,
+    OutputIteratorT      d_out,
+    int                  /*num_items*/,
+    int                  max_segments,
+    BeginOffsetIteratorT d_segment_begin_offsets,
+    EndOffsetIteratorT   d_segment_end_offsets,
+    ReductionOpT         reduction_op)
+{
+    // Element type read through the input iterator
+    using InputT = cub::detail::value_t<InputIteratorT>;
+
+    // Element type of the output, derived from the output iterator and InputT
+    using OutputT = cub::detail::non_void_value_t<OutputIteratorT, InputT>;
+
+    // Seed of every segment's reduction: the identity of max
+    OutputT identity = Traits<InputT>::Lowest(); // replace with std::numeric_limits<OutputT>::lowest() when C++ support is more prevalent
+    cudaError_t status = cudaSuccess;
+
+    // One DeviceSegmentedReduce::Reduce call per timing iteration; only the last status is kept
+    for (int remaining = timing_iterations; remaining > 0; --remaining)
+    {
+        status = DeviceSegmentedReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out,
+            max_segments, d_segment_begin_offsets, d_segment_end_offsets, reduction_op, identity);
+    }
+    return status;
+}
+
+/**
+ * Segmented CUB backend, cub::Sum: runs DeviceSegmentedReduce::Sum once per timing iteration
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
+CUB_RUNTIME_FUNCTION
+cudaError_t Dispatch(
+    Int2Type<CUB_SEGMENTED> /*dispatch_to*/,
+    int                  timing_iterations,
+    size_t*              /*d_temp_storage_bytes*/,
+    cudaError_t*         /*d_cdp_error*/,
+
+    void*                d_temp_storage,
+    size_t&              temp_storage_bytes,
+    InputIteratorT       d_in,
+    OutputIteratorT      d_out,
+    int                  /*num_items*/,
+    int                  max_segments,
+    BeginOffsetIteratorT d_segment_begin_offsets,
+    EndOffsetIteratorT   d_segment_end_offsets,
+    cub::Sum             /*reduction_op*/)
+{
+    // One DeviceSegmentedReduce::Sum call per timing iteration; only the last status is kept
+    cudaError_t status = cudaSuccess;
+    for (int remaining = timing_iterations; remaining > 0; --remaining)
+    {
+        status = DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out,
+            max_segments, d_segment_begin_offsets, d_segment_end_offsets);
+    }
+    return status;
+}
+
+/**
+ * Segmented CUB backend, ExtendedFloatSum: runs DeviceSegmentedReduce::Reduce seeded with a value-initialised OutputT
+ */
+template <typename InputIteratorT,
+          typename OutputIteratorT,
+          typename BeginOffsetIteratorT,
+          typename EndOffsetIteratorT>
+CUB_RUNTIME_FUNCTION cudaError_t Dispatch(Int2Type<CUB_SEGMENTED> /*dispatch_to*/,
+                                          int timing_iterations,
+                                          size_t * /*d_temp_storage_bytes*/,
+                                          cudaError_t * /*d_cdp_error*/,
+
+                                          void *d_temp_storage,
+                                          size_t &temp_storage_bytes,
+                                          InputIteratorT d_in,
+                                          OutputIteratorT d_out,
+                                          int /*num_items*/,
+                                          int max_segments,
+                                          BeginOffsetIteratorT d_segment_begin_offsets,
+                                          EndOffsetIteratorT d_segment_end_offsets,
+                                          ExtendedFloatSum reduction_op)
+{
+    // A value-initialised OutputT is the starting value handed to the reduction
+    using InputT  = cub::detail::value_t<InputIteratorT>;
+    using OutputT = cub::detail::non_void_value_t<OutputIteratorT, InputT>;
+    OutputT identity{};
+
+    cudaError_t status = cudaSuccess;
+
+    // One DeviceSegmentedReduce::Reduce call per timing iteration; only the last status is kept
+    for (int remaining = timing_iterations; remaining > 0; --remaining)
+    {
+        status = DeviceSegmentedReduce::Reduce(d_temp_storage,
+                                               temp_storage_bytes,
+                                               d_in,
+                                               d_out,
+                                               max_segments,
+                                               d_segment_begin_offsets,
+                                               d_segment_end_offsets,
+                                               reduction_op,
+                                               identity);
+    }
+    return status;
+}
+
+/**
+ * Segmented CUB backend, cub::Min: runs DeviceSegmentedReduce::Min once per timing iteration
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
+CUB_RUNTIME_FUNCTION
+cudaError_t Dispatch(
+    Int2Type<CUB_SEGMENTED> /*dispatch_to*/,
+    int                  timing_iterations,
+    size_t*              /*d_temp_storage_bytes*/,
+    cudaError_t*         /*d_cdp_error*/,
+
+    void*                d_temp_storage,
+    size_t&              temp_storage_bytes,
+    InputIteratorT       d_in,
+    OutputIteratorT      d_out,
+    int                  /*num_items*/,
+    int                  max_segments,
+    BeginOffsetIteratorT d_segment_begin_offsets,
+    EndOffsetIteratorT   d_segment_end_offsets,
+    cub::Min             /*reduction_op*/)
+{
+    // One DeviceSegmentedReduce::Min call per timing iteration; only the last status is kept
+    cudaError_t status = cudaSuccess;
+    for (int remaining = timing_iterations; remaining > 0; --remaining)
+    {
+        status = DeviceSegmentedReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out,
+            max_segments, d_segment_begin_offsets, d_segment_end_offsets);
+    }
+    return status;
+}
+
+/**
+ * Segmented CUB backend, cub::Max: runs DeviceSegmentedReduce::Max once per timing iteration
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
+CUB_RUNTIME_FUNCTION
+cudaError_t Dispatch(
+    Int2Type<CUB_SEGMENTED> /*dispatch_to*/,
+    int                  timing_iterations,
+    size_t*              /*d_temp_storage_bytes*/,
+    cudaError_t*         /*d_cdp_error*/,
+
+    void*                d_temp_storage,
+    size_t&              temp_storage_bytes,
+    InputIteratorT       d_in,
+    OutputIteratorT      d_out,
+    int                  /*num_items*/,
+    int                  max_segments,
+    BeginOffsetIteratorT d_segment_begin_offsets,
+    EndOffsetIteratorT   d_segment_end_offsets,
+    cub::Max             /*reduction_op*/)
+{
+    // One DeviceSegmentedReduce::Max call per timing iteration; only the last status is kept
+    cudaError_t status = cudaSuccess;
+    for (int remaining = timing_iterations; remaining > 0; --remaining)
+    {
+        status = DeviceSegmentedReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out,
+            max_segments, d_segment_begin_offsets, d_segment_end_offsets);
+    }
+    return status;
+}
+
+/**
+ * Segmented CUB backend, cub::ArgMin: runs DeviceSegmentedReduce::ArgMin once per timing iteration
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
+CUB_RUNTIME_FUNCTION
+cudaError_t Dispatch(
+    Int2Type<CUB_SEGMENTED> /*dispatch_to*/,
+    int                  timing_iterations,
+    size_t*              /*d_temp_storage_bytes*/,
+    cudaError_t*         /*d_cdp_error*/,
+
+    void*                d_temp_storage,
+    size_t&              temp_storage_bytes,
+    InputIteratorT       d_in,
+    OutputIteratorT      d_out,
+    int                  /*num_items*/,
+    int                  max_segments,
+    BeginOffsetIteratorT d_segment_begin_offsets,
+    EndOffsetIteratorT   d_segment_end_offsets,
+    cub::ArgMin          /*reduction_op*/)
+{
+    // One DeviceSegmentedReduce::ArgMin call per timing iteration; only the last status is kept
+    cudaError_t status = cudaSuccess;
+    for (int remaining = timing_iterations; remaining > 0; --remaining)
+    {
+        status = DeviceSegmentedReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out,
+            max_segments, d_segment_begin_offsets, d_segment_end_offsets);
+    }
+    return status;
+}
+
+/**
+ * Segmented CUB backend, cub::ArgMax: runs DeviceSegmentedReduce::ArgMax once per timing iteration
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
+CUB_RUNTIME_FUNCTION
+cudaError_t Dispatch(
+    Int2Type<CUB_SEGMENTED> /*dispatch_to*/,
+    int                  timing_iterations,
+    size_t*              /*d_temp_storage_bytes*/,
+    cudaError_t*         /*d_cdp_error*/,
+
+    void*                d_temp_storage,
+    size_t&              temp_storage_bytes,
+    InputIteratorT       d_in,
+    OutputIteratorT      d_out,
+    int                  /*num_items*/,
+    int                  max_segments,
+    BeginOffsetIteratorT d_segment_begin_offsets,
+    EndOffsetIteratorT   d_segment_end_offsets,
+    cub::ArgMax          /*reduction_op*/)
+{
+    // One DeviceSegmentedReduce::ArgMax call per timing iteration; only the last status is kept
+    cudaError_t status = cudaSuccess;
+    for (int remaining = timing_iterations; remaining > 0; --remaining)
+    {
+        status = DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out,
+            max_segments, d_segment_begin_offsets, d_segment_end_offsets);
+    }
+    return status;
+}
+
+
+//---------------------------------------------------------------------
+// CUDA nested-parallelism test kernel
+//---------------------------------------------------------------------
+
+#if TEST_CDP == 1
+
+/**
+ * Wrapper kernel: forwards its arguments to the Dispatch() overload selected by cub_backend, then stores the returned status in *d_cdp_error and the size in *d_temp_storage_bytes
+ */
+template <int CubBackend,
+          typename InputIteratorT,
+          typename OutputIteratorT,
+          typename BeginOffsetIteratorT,
+          typename EndOffsetIteratorT,
+          typename ReductionOpT>
+__global__ void CDPDispatchKernel(Int2Type<CubBackend> cub_backend,
+                                  int                  timing_iterations,
+                                  size_t              *d_temp_storage_bytes,
+                                  cudaError_t         *d_cdp_error,
+
+                                  void                *d_temp_storage,
+                                  size_t               temp_storage_bytes, // by value: a kernel-local copy that Dispatch() receives as size_t&
+                                  InputIteratorT       d_in,
+                                  OutputIteratorT      d_out,
+                                  int                  num_items,
+                                  int                  max_segments,
+                                  BeginOffsetIteratorT d_segment_begin_offsets,
+                                  EndOffsetIteratorT   d_segment_end_offsets,
+                                  ReductionOpT         reduction_op)
+{
+  *d_cdp_error = Dispatch(cub_backend, // the returned status is written out through d_cdp_error
+                          timing_iterations,
+                          d_temp_storage_bytes,
+                          d_cdp_error,
+                          d_temp_storage,
+                          temp_storage_bytes,
+                          d_in,
+                          d_out,
+                          num_items,
+                          max_segments,
+                          d_segment_begin_offsets,
+                          d_segment_end_offsets,
+                          reduction_op);
+
+  *d_temp_storage_bytes = temp_storage_bytes; // write the local size out through d_temp_storage_bytes
+}
+
+/**
+ * Launch CDPDispatchKernel and dispatch on device. Should only be called from host code.
+ * The CubBackend should be one of the non-CDP CUB backends to invoke from the
+ * device. After synchronizing, temp_storage_bytes is refreshed from
+ * *d_temp_storage_bytes and the status read back from *d_cdp_error is returned.
+ */
+template <int CubBackend,
+          typename InputIteratorT,
+          typename OutputIteratorT,
+          typename BeginOffsetIteratorT,
+          typename EndOffsetIteratorT,
+          typename ReductionOpT>
+cudaError_t LaunchCDPKernel(Int2Type<CubBackend> cub_backend,
+                            int                  timing_iterations,
+                            size_t              *d_temp_storage_bytes,
+                            cudaError_t         *d_cdp_error,
+
+                            void                *d_temp_storage,
+                            size_t              &temp_storage_bytes,
+                            InputIteratorT       d_in,
+                            OutputIteratorT      d_out,
+                            int                  num_items,
+                            int                  max_segments,
+                            BeginOffsetIteratorT d_segment_begin_offsets,
+                            EndOffsetIteratorT   d_segment_end_offsets,
+                            ReductionOpT         reduction_op)
+{
+  cudaError_t retval =
+    thrust::cuda_cub::launcher::triple_chevron(1, 1, 0, 0) // NOTE(review): presumably grid=1, block=1, shared=0, stream=0 — confirm against thrust
+      .doit(CDPDispatchKernel<CubBackend,
+                              InputIteratorT,
+                              OutputIteratorT,
+                              BeginOffsetIteratorT,
+                              EndOffsetIteratorT,
+                              ReductionOpT>,
+            cub_backend,
+            timing_iterations,
+            d_temp_storage_bytes,
+            d_cdp_error,
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_out,
+            num_items,
+            max_segments,
+            d_segment_begin_offsets,
+            d_segment_end_offsets,
+            reduction_op);
+  CubDebugExit(retval); // check the status returned by the launch itself
+  CubDebugExit(cub::detail::device_synchronize()); // synchronize before the results are copied back below
+
+  // Copy out temp_storage_bytes
+  CubDebugExit(cudaMemcpy(&temp_storage_bytes,
+                          d_temp_storage_bytes,
+                          sizeof(size_t) * 1,
+                          cudaMemcpyDeviceToHost));
+
+  // Copy out error
+  CubDebugExit(cudaMemcpy(&retval, // retval is overwritten with the status stored in *d_cdp_error
+                          d_cdp_error,
+                          sizeof(cudaError_t) * 1,
+                          cudaMemcpyDeviceToHost));
+
+  return retval;
+}
+
+// Dispatch overloads for the CDP backends: each generated overload takes Int2Type<CdpBackend>, builds the matching
+// Int2Type<CubBackend> tag and forwards every argument to LaunchCDPKernel. The generated overloads are not CUB_RUNTIME_FUNCTION.
+#define DEFINE_CDP_DISPATCHER(CdpBackend, CubBackend)                          \
+  template <typename InputIteratorT,                                           \
+            typename OutputIteratorT,                                          \
+            typename BeginOffsetIteratorT,                                     \
+            typename EndOffsetIteratorT,                                       \
+            typename ReductionOpT>                                             \
+  cudaError_t Dispatch(Int2Type<CdpBackend>,                                   \
+                       int          timing_iterations,                         \
+                       size_t      *d_temp_storage_bytes,                      \
+                       cudaError_t *d_cdp_error,                               \
+                                                                               \
+                       void                *d_temp_storage,                    \
+                       size_t              &temp_storage_bytes,                \
+                       InputIteratorT       d_in,                              \
+                       OutputIteratorT      d_out,                             \
+                       int                  num_items,                         \
+                       int                  max_segments,                      \
+                       BeginOffsetIteratorT d_segment_begin_offsets,           \
+                       EndOffsetIteratorT   d_segment_end_offsets,             \
+                       ReductionOpT         reduction_op)                      \
+  {                                                                            \
+    Int2Type<CubBackend> cub_backend{};                                        \
+    return LaunchCDPKernel(cub_backend,                                        \
+                           timing_iterations,                                  \
+                           d_temp_storage_bytes,                               \
+                           d_cdp_error,                                        \
+                           d_temp_storage,                                     \
+                           temp_storage_bytes,                                 \
+                           d_in,                                               \
+                           d_out,                                              \
+                           num_items,                                          \
+                           max_segments,                                       \
+                           d_segment_begin_offsets,                            \
+                           d_segment_end_offsets,                              \
+                           reduction_op);                                      \
+  }
+
+DEFINE_CDP_DISPATCHER(CDP, CUB) // CDP -> device-side CUB dispatch
+DEFINE_CDP_DISPATCHER(CDP_SEGMENTED, CUB_SEGMENTED) // CDP_SEGMENTED -> device-side CUB_SEGMENTED dispatch
+
+#undef DEFINE_CDP_DISPATCHER
+
+#endif // TEST_CDP
+
+//---------------------------------------------------------------------
+// Problem generation
+//---------------------------------------------------------------------
+/// Fill h_in[0, num_items) through InitValue() and, when g_verbose_input is set, print the generated input
+template <typename InputT>
+void Initialize(
+    GenMode         gen_mode,
+    InputT          *h_in,
+    int             num_items)
+{
+    int idx = 0;
+    while (idx < num_items)
+    {
+        InitValue(gen_mode, h_in[idx], idx);
+        ++idx;
+    }
+    if (!g_verbose_input) return;
+
+    printf("Input:\n");
+    DisplayResults(h_in, num_items);
+    printf("\n\n");
+}
+
+
+/// Host reference (max/custom-max functor): folds every segment with reduction_op, starting from the lowest InputT value
+template <typename ReductionOpT, typename InputT, typename _OutputT>
+struct Solution
+{
+    using OutputT = _OutputT;
+    using InitT = OutputT;
+    using AccumT = cub::detail::accumulator_t<ReductionOpT, InitT, InputT>;
+
+    template <typename HostInputIteratorT, typename OffsetT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
+    static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments,
+        BeginOffsetIteratorT h_segment_begin_offsets, EndOffsetIteratorT h_segment_end_offsets, ReductionOpT reduction_op)
+    {
+        for (int seg = 0; seg < num_segments; ++seg)
+        {
+            AccumT running = Traits<InputT>::Lowest(); // replace with std::numeric_limits<OutputT>::lowest() when C++ support is more prevalent
+            int pos = h_segment_begin_offsets[seg];
+            for (; pos < h_segment_end_offsets[seg]; ++pos) running = reduction_op(running, OutputT(h_in[pos]));
+            h_reference[seg] = running;
+        }
+    }
+};
+
+/// Host reference (min functor): folds every segment with cub::Min, starting from the largest InputT value
+template <typename InputT, typename _OutputT>
+struct Solution<cub::Min, InputT, _OutputT>
+{
+    using OutputT = _OutputT;
+    using InitT = OutputT;
+    using AccumT = cub::detail::accumulator_t<cub::Min, InitT, InputT>;
+
+    template <typename HostInputIteratorT, typename OffsetT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
+    static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments,
+        BeginOffsetIteratorT h_segment_begin_offsets, EndOffsetIteratorT h_segment_end_offsets, cub::Min reduction_op)
+    {
+        for (int seg = 0; seg < num_segments; ++seg)
+        {
+            AccumT running = Traits<InputT>::Max();    // replace with std::numeric_limits<OutputT>::max() when C++ support is more prevalent
+            int pos = h_segment_begin_offsets[seg];
+            for (; pos < h_segment_end_offsets[seg]; ++pos) running = reduction_op(running, OutputT(h_in[pos]));
+            h_reference[seg] = running;
+        }
+    }
+};
+
+
+/// Host reference (sum functor): folds every segment with reduction_op, starting from a value-initialised AccumT
+template <typename InputT, typename _OutputT>
+struct Solution<cub::Sum, InputT, _OutputT>
+{
+    using OutputT = _OutputT;
+    using InitT = OutputT;
+    using AccumT = cub::detail::accumulator_t<cub::Sum, InitT, InputT>;
+
+    template <typename HostInputIteratorT, typename OffsetT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT, typename ReductionOpT>
+    static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments,
+        BeginOffsetIteratorT h_segment_begin_offsets, EndOffsetIteratorT h_segment_end_offsets, ReductionOpT reduction_op)
+    {
+        for (int seg = 0; seg < num_segments; ++seg)
+        {
+            AccumT running{};
+            int pos = h_segment_begin_offsets[seg];
+            for (; pos < h_segment_end_offsets[seg]; ++pos) running = reduction_op(running, h_in[pos]);
+            h_reference[seg] = static_cast<OutputT>(running);
+        }
+    }
+};
+
+/// Host reference (ExtendedFloatSum functor): folds every segment with reduction_op, starting from a value-initialised AccumT
+template <typename InputT, typename _OutputT>
+struct Solution<ExtendedFloatSum, InputT, _OutputT>
+{
+    using OutputT = _OutputT;
+    using InitT   = OutputT;
+    using AccumT  = cub::detail::accumulator_t<cub::Sum, InitT, InputT>;
+
+    template <typename HostInputIteratorT,
+              typename OffsetT,
+              typename BeginOffsetIteratorT,
+              typename EndOffsetIteratorT,
+              typename ReductionOpT>
+    static void Solve(HostInputIteratorT h_in,
+                      OutputT *h_reference,
+                      OffsetT num_segments,
+                      BeginOffsetIteratorT h_segment_begin_offsets,
+                      EndOffsetIteratorT h_segment_end_offsets,
+                      ReductionOpT reduction_op)
+    {
+        for (int seg = 0; seg < num_segments; ++seg)
+        {
+            AccumT running{};
+            for (int pos = h_segment_begin_offsets[seg]; pos < h_segment_end_offsets[seg]; ++pos) running = reduction_op(running, h_in[pos]);
+            h_reference[seg] = static_cast<OutputT>(running);
+        }
+    }
+};
+
+/// Host reference for cub::ArgMin: one (segment-relative index, value) pair per segment
+template <typename InputValueT, typename OutputValueT>
+struct Solution<cub::ArgMin, InputValueT, OutputValueT>
+{
+    typedef KeyValuePair<int, OutputValueT> OutputT;
+
+    template <typename HostInputIteratorT, typename OffsetT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
+    static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments,
+        BeginOffsetIteratorT h_segment_begin_offsets, EndOffsetIteratorT h_segment_end_offsets, cub::ArgMin reduction_op)
+    {
+        for (int segment = 0; segment < num_segments; ++segment)
+        {
+            const auto first = h_segment_begin_offsets[segment];
+            const auto last = h_segment_end_offsets[segment];
+
+            if (!(first < last))
+            {
+                // Guaranteed output for empty segments (replace with std::numeric_limits<OutputT>::max() when C++ support is more prevalent)
+                OutputT empty_result(1, Traits<InputValueT>::Max());
+                h_reference[segment] = empty_result;
+            }
+            else
+            {
+                OutputT best(0, OutputValueT(h_in[first])); // seed with the segment's first item (relative index 0)
+                for (int item = first + 1; item < last; ++item)
+                {
+                    OutputT candidate(item - first, OutputValueT(h_in[item]));
+                    best = reduction_op(best, candidate);
+                }
+                h_reference[segment] = best;
+            }
+        }
+    }
+};
+
+
+/// Host reference for cub::ArgMax: one (segment-relative index, value) pair per segment
+template <typename InputValueT, typename OutputValueT>
+struct Solution<cub::ArgMax, InputValueT, OutputValueT>
+{
+    typedef KeyValuePair<int, OutputValueT> OutputT;
+
+    template <typename HostInputIteratorT, typename OffsetT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
+    static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments,
+        BeginOffsetIteratorT h_segment_begin_offsets, EndOffsetIteratorT h_segment_end_offsets, cub::ArgMax reduction_op)
+    {
+        for (int segment = 0; segment < num_segments; ++segment)
+        {
+            const auto first = h_segment_begin_offsets[segment];
+            const auto last = h_segment_end_offsets[segment];
+
+            if (!(first < last))
+            {
+                OutputT empty_result(1, Traits<InputValueT>::Lowest()); // expected output for an empty segment
+                h_reference[segment] = empty_result;
+            }
+            else
+            {
+                OutputT best(0, OutputValueT(h_in[first])); // seed with the segment's first item (relative index 0)
+                for (int item = first + 1; item < last; ++item)
+                {
+                    OutputT candidate(item - first, OutputValueT(h_in[item]));
+                    best = reduction_op(best, candidate);
+                }
+                h_reference[segment] = best;
+            }
+        }
+    }
+};
+
+
+//---------------------------------------------------------------------
+// Problem generation
+//---------------------------------------------------------------------
+
+/// Generic fallback: hands the iterator back unchanged.
+template <class It>
+It unwrap_it(It it)
+{ return it; }
+
+#if TEST_HALF_T
+/// Reinterprets a half_t pointer as a pointer to __half.
+__half* unwrap_it(half_t *it)
+{ return reinterpret_cast<__half*>(it); }
+
+/// Rebuilds a constant iterator over half_t as a constant iterator over the converted __half value.
+template <class OffsetT>
+ConstantInputIterator<__half, OffsetT> unwrap_it(ConstantInputIterator<half_t, OffsetT> it) {
+    half_t wrapped = *it;
+    return ConstantInputIterator<__half, OffsetT>(wrapped.operator __half());
+}
+#endif
+
+#if TEST_BF_T
+/// Reinterprets a bfloat16_t pointer as a pointer to __nv_bfloat16.
+__nv_bfloat16* unwrap_it(bfloat16_t* it)
+{ return reinterpret_cast<__nv_bfloat16*>(it); }
+
+/// Rebuilds a constant iterator over bfloat16_t as a constant iterator over the converted __nv_bfloat16 value.
+template <class OffsetT>
+ConstantInputIterator<__nv_bfloat16, OffsetT> unwrap_it(ConstantInputIterator<bfloat16_t, OffsetT> it) {
+    bfloat16_t wrapped = *it;
+    return ConstantInputIterator<__nv_bfloat16, OffsetT>(wrapped.operator __nv_bfloat16());
+}
+#endif
+
+/// Tag helper: std::true_type when unwrap_it changes the iterator's type, std::false_type when it does not.
+template <class WrappedItT, class ItT = decltype(unwrap_it(std::declval<WrappedItT>()))>
+std::integral_constant<bool, !std::is_same<WrappedItT, ItT>::value>
+reference_extended_fp(WrappedItT)
+{
+    return std::integral_constant<bool, !std::is_same<WrappedItT, ItT>::value>{};
+}
+
+/// With a std::true_type tag, cub::Sum is replaced by ExtendedFloatSum.
+ExtendedFloatSum
+unwrap_op(std::true_type /* extended float */, cub::Sum)
+{ return ExtendedFloatSum{}; }
+
+/// Base case for every other (tag, operator) pair:
+/// the reduction operator is handed back as given.
+template <bool V, class OpT>
+OpT unwrap_op(std::integral_constant<bool, V> /* base case */, OpT op)
+{ return op; }
+
+/// Test DeviceReduce for a given problem input: dispatch through `backend`, compare with h_reference, optionally time it
+template <
+    typename                BackendT,
+    typename                DeviceWrappedInputIteratorT,
+    typename                DeviceWrappedOutputIteratorT,
+    typename                HostReferenceIteratorT,
+    typename                OffsetT,
+    typename                BeginOffsetIteratorT,
+    typename                EndOffsetIteratorT,
+    typename                ReductionOpT>
+void Test(
+    BackendT                        backend,
+    DeviceWrappedInputIteratorT     d_wrapped_in,
+    DeviceWrappedOutputIteratorT    d_wrapped_out,
+    OffsetT                         num_items,
+    OffsetT                         num_segments,
+    BeginOffsetIteratorT            d_segment_begin_offsets,
+    EndOffsetIteratorT              d_segment_end_offsets,
+    ReductionOpT                    reduction_op,
+    HostReferenceIteratorT          h_reference)
+{
+    // Unwrap the device iterators (unwrap_it hands ordinary iterators back unchanged)
+    auto d_in = unwrap_it(d_wrapped_in);
+    auto d_out = unwrap_it(d_wrapped_out);
+
+    // Allocate CDP device arrays for temp storage size and error
+    size_t          *d_temp_storage_bytes = NULL;
+    cudaError_t     *d_cdp_error = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes,  sizeof(size_t) * 1));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error,           sizeof(cudaError_t) * 1));
+
+    // Inquire temp device storage (d_temp_storage is still NULL for this first dispatch)
+    void            *d_temp_storage = NULL;
+    size_t          temp_storage_bytes = 0;
+    CubDebugExit(Dispatch(backend, 1,
+        d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
+        d_in, d_out, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets,
+        reduction_op));
+
+    // Allocate temp device storage
+    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
+
+    // Run warmup/correctness iteration
+    CubDebugExit(Dispatch(backend, 1,
+        d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
+        d_in, d_out, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets,
+        reduction_op));
+
+    // Check for correctness (and display results, if specified); the wrapped output iterator is what gets compared
+    int compare = CompareDeviceResults(h_reference, d_wrapped_out, num_segments, g_verbose, g_verbose);
+
+    printf("\t%s", compare ? "FAIL" : "PASS"); // a non-zero compare result is reported as a failure
+
+    // Flush any stdout/stderr
+    fflush(stdout);
+    fflush(stderr);
+
+    // Performance (only when timing iterations were requested)
+    if (g_timing_iterations > 0)
+    {
+        GpuTimer gpu_timer;
+        gpu_timer.Start();
+
+        CubDebugExit(Dispatch(backend, g_timing_iterations,
+            d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
+            d_in, d_out, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets,
+            reduction_op));
+
+        gpu_timer.Stop();
+        float elapsed_millis = gpu_timer.ElapsedMillis();
+
+        // Display performance; the "logical" bandwidth multiplies the item rate by the size of one h_reference element
+        float avg_millis = elapsed_millis / g_timing_iterations;
+        float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f;
+        float giga_bandwidth = giga_rate * sizeof(h_reference[0]);
+        printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s, %.1f%% peak",
+            avg_millis, giga_rate, giga_bandwidth, giga_bandwidth / g_device_giga_bandwidth * 100.0);
+
+    }
+
+    // Release the device allocations before asserting on the comparison result
+    if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
+    if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
+    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
+
+    // Correctness asserts
+    AssertEquals(0, compare);
+}
+
+/// Compute the host reference for one operator, then run Test() twice: with a discard iterator and with a device buffer
+template <Backend BACKEND,
+          typename OutputValueT,
+          typename HostInputIteratorT,
+          typename DeviceInputIteratorT,
+          typename OffsetT,
+          typename BeginOffsetIteratorT,
+          typename EndOffsetIteratorT,
+          typename ReductionOpT>
+void SolveAndTest(HostInputIteratorT h_in,
+                  DeviceInputIteratorT d_in,
+                  OffsetT num_items,
+                  OffsetT num_segments,
+                  BeginOffsetIteratorT h_segment_begin_offsets,
+                  EndOffsetIteratorT h_segment_end_offsets,
+                  BeginOffsetIteratorT d_segment_begin_offsets,
+                  EndOffsetIteratorT d_segment_end_offsets,
+                  ReductionOpT wrapped_reduction_op)
+{
+    // When unwrap_it changes the type of d_in, cub::Sum becomes ExtendedFloatSum; otherwise the operator is kept
+    auto reduction_op = unwrap_op(reference_extended_fp(d_in), wrapped_reduction_op);
+
+    using InputValueT = cub::detail::value_t<DeviceInputIteratorT>;
+    using SolutionT   = Solution<decltype(reduction_op), InputValueT, OutputValueT>;
+    using OutputT     = typename SolutionT::OutputT;
+
+    printf("\n\n%s cub::DeviceReduce<%s> %d items (%s), %d segments\n",
+           BackendToString(BACKEND), typeid(ReductionOpT).name(),
+           num_items, typeid(HostInputIteratorT).name(),
+           num_segments);
+    fflush(stdout);
+
+    // Host-side expected results, one entry per segment
+    OutputT *h_expected = new OutputT[num_segments];
+    SolutionT::Solve(h_in, h_expected, num_segments, h_segment_begin_offsets, h_segment_end_offsets, reduction_op);
+
+    // Pass 1: results go to a discard iterator
+    DiscardOutputIterator<OffsetT> discard_itr;
+    Test(Int2Type<BACKEND>(), d_in, discard_itr, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, reduction_op, h_expected);
+
+    // Pass 2: results go to a zero-filled device buffer
+    OutputT *d_out = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(OutputT) * num_segments));
+    CubDebugExit(cudaMemset(d_out, 0, sizeof(OutputT) * num_segments));
+    Test(Int2Type<BACKEND>(), d_in, d_out, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, reduction_op, h_expected);
+
+    // Release the device buffer and the host reference (delete[] accepts a null pointer)
+    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
+    delete[] h_expected;
+}
+
+/// Test specific problem type: builds one host/device data set (values plus segment offsets) and runs SolveAndTest on it
+template <
+    Backend         BACKEND,
+    typename        InputT,
+    typename        OutputT,
+    typename        OffsetT,
+    typename        ReductionOpT>
+void TestProblem(
+    OffsetT         num_items,
+    OffsetT         num_segments,
+    GenMode         gen_mode,
+    ReductionOpT    reduction_op)
+{
+    printf("\n\nInitializing %d %s->%s (gen mode %d)... ", num_items, typeid(InputT).name(), typeid(OutputT).name(), gen_mode);
+    fflush(stdout); // one flush is enough to get the progress line out before the work starts
+
+    // Initialize value data
+    InputT* h_in = new InputT[num_items];
+    Initialize(gen_mode, h_in, num_items);
+
+    // Initialize segment data (num_segments + 1 offsets)
+    OffsetT *h_segment_offsets = new OffsetT[num_segments + 1];
+    InitializeSegments(num_items, num_segments, h_segment_offsets, g_verbose_input);
+
+    // Initialize device data: copies of the values and of the offsets
+    OffsetT *d_segment_offsets      = NULL;
+    InputT  *d_in                   = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in,              sizeof(InputT) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_segment_offsets, sizeof(OffsetT) * (num_segments + 1)));
+    CubDebugExit(cudaMemcpy(d_in,               h_in,                   sizeof(InputT) * num_items, cudaMemcpyHostToDevice));
+    CubDebugExit(cudaMemcpy(d_segment_offsets,  h_segment_offsets,      sizeof(OffsetT) * (num_segments + 1), cudaMemcpyHostToDevice));
+    // Begin and end offsets are the same array, with the end view shifted by one element
+    SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments,
+        h_segment_offsets, h_segment_offsets + 1, d_segment_offsets, d_segment_offsets + 1, reduction_op);
+    // Cleanup (delete[] accepts a null pointer, so the host arrays need no guard)
+    delete[] h_segment_offsets;
+    if (d_segment_offsets)  CubDebugExit(g_allocator.DeviceFree(d_segment_offsets));
+    delete[] h_in;
+    if (d_in)               CubDebugExit(g_allocator.DeviceFree(d_in));
+}
+
+
+/// Run SolveAndTest once per operator, in this order: CustomMax, Sum, Min, ArgMin, Max, ArgMax
+template <
+    Backend             BACKEND,
+    typename            OutputT,
+    typename            HostInputIteratorT,
+    typename            DeviceInputIteratorT,
+    typename            OffsetT,
+    typename            BeginOffsetIteratorT,
+    typename            EndOffsetIteratorT>
+void TestByOp(
+    HostInputIteratorT      h_in,
+    DeviceInputIteratorT    d_in,
+    OffsetT                 num_items,
+    OffsetT                 num_segments,
+    BeginOffsetIteratorT    h_segment_begin_offsets,
+    EndOffsetIteratorT      h_segment_end_offsets,
+    BeginOffsetIteratorT    d_segment_begin_offsets,
+    EndOffsetIteratorT      d_segment_end_offsets)
+{
+    SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments, h_segment_begin_offsets, h_segment_end_offsets,
+                                   d_segment_begin_offsets, d_segment_end_offsets, CustomMax());
+    SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments, h_segment_begin_offsets, h_segment_end_offsets,
+                                   d_segment_begin_offsets, d_segment_end_offsets, Sum());
+    SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments, h_segment_begin_offsets, h_segment_end_offsets,
+                                   d_segment_begin_offsets, d_segment_end_offsets, Min());
+    SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments, h_segment_begin_offsets, h_segment_end_offsets,
+                                   d_segment_begin_offsets, d_segment_end_offsets, ArgMin());
+    SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments, h_segment_begin_offsets, h_segment_end_offsets,
+                                   d_segment_begin_offsets, d_segment_end_offsets, Max());
+    SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments, h_segment_begin_offsets, h_segment_end_offsets,
+                                   d_segment_begin_offsets, d_segment_end_offsets, ArgMax());
+}
+
+/// Identity functor on offsets. It is a separate type from TransformFunctor2
+/// so that begin/end offset iterators built from the two differ in type.
+template<typename OffsetT>
+struct TransformFunctor1
+{
+    __host__ __device__ __forceinline__
+    OffsetT operator()(OffsetT offset) const { return offset; }
+};
+
+/// Identity functor on offsets. It is a separate type from TransformFunctor1
+/// so that begin/end offset iterators built from the two differ in type.
+template<typename OffsetT>
+struct TransformFunctor2
+{
+    __host__ __device__ __forceinline__
+    OffsetT operator()(OffsetT offset) const { return offset; }
+};
+
+/// Test different backends: non-segmented (single segment) first, then segmented with a growing number of segments
+template <
+    typename    InputT,
+    typename    OutputT,
+    typename    OffsetT>
+void TestByBackend(
+    OffsetT     num_items,
+    OffsetT     max_segments,
+    GenMode     gen_mode)
+{
+#if TEST_CDP == 0
+  constexpr auto NonSegmentedBackend   = CUB;
+  constexpr auto SegmentedBackend      = CUB_SEGMENTED;
+#else  // TEST_CDP
+  constexpr auto NonSegmentedBackend   = CDP;
+  constexpr auto SegmentedBackend      = CDP_SEGMENTED;
+#endif // TEST_CDP
+
+    // Initialize host data
+    printf("\n\nInitializing %d %s -> %s (gen mode %d)... ",
+        num_items, typeid(InputT).name(), typeid(OutputT).name(), gen_mode); fflush(stdout);
+
+    InputT  *h_in               = new InputT[num_items];
+    OffsetT *h_segment_offsets  = new OffsetT[max_segments + 1]; // sized for the largest segment count used below
+    Initialize(gen_mode, h_in, num_items);
+
+    // Initialize device data (the device offsets are filled later, once per segment count)
+    InputT  *d_in               = NULL;
+    OffsetT *d_segment_offsets  = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(InputT) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_segment_offsets, sizeof(OffsetT) * (max_segments + 1)));
+    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(InputT) * num_items, cudaMemcpyHostToDevice));
+
+    //
+    // Test single-segment implementations
+    //
+
+    InitializeSegments(num_items, 1, h_segment_offsets, g_verbose_input);
+
+    // Page-aligned-input tests (no device offsets are passed to the non-segmented backend)
+    TestByOp<NonSegmentedBackend, OutputT>(h_in, d_in, num_items, 1,
+        h_segment_offsets, h_segment_offsets + 1, (OffsetT*) NULL, (OffsetT*)NULL);
+
+    // Non-page-aligned-input tests: same data with both input pointers advanced by one element
+    if (num_items > 1)
+    {
+        InitializeSegments(num_items - 1, 1, h_segment_offsets, g_verbose_input);
+        TestByOp<NonSegmentedBackend, OutputT>(h_in + 1, d_in + 1, num_items - 1, 1,
+            h_segment_offsets, h_segment_offsets + 1, (OffsetT*) NULL, (OffsetT*)NULL);
+    }
+
+    //
+    // Test segmented implementation
+    //
+
+    // Right now we assign a single thread block to each segment, so lets keep it to under 128K items per segment
+    int max_items_per_segment = 128000;
+
+    // Segment counts start at ceil(num_items / max_items_per_segment) and grow as n -> 32 * n + 1 while below max_segments
+    for (int num_segments = cub::DivideAndRoundUp(num_items, max_items_per_segment);
+        num_segments < max_segments;
+        num_segments = (num_segments * 32) + 1)
+    {
+        // Test with segment pointer
+        InitializeSegments(num_items, num_segments, h_segment_offsets, g_verbose_input);
+        CubDebugExit(cudaMemcpy(d_segment_offsets, h_segment_offsets, sizeof(OffsetT) * (num_segments + 1), cudaMemcpyHostToDevice));
+        TestByOp<SegmentedBackend, OutputT>(h_in, d_in, num_items, num_segments,
+            h_segment_offsets, h_segment_offsets + 1, d_segment_offsets, d_segment_offsets + 1);
+
+        // Test with segment iterator (the same offsets wrapped in a TransformInputIterator using CastOp<OffsetT>)
+        typedef CastOp<OffsetT> IdentityOpT;
+        IdentityOpT identity_op;
+        TransformInputIterator<OffsetT, IdentityOpT, OffsetT*, OffsetT> h_segment_offsets_itr(
+            h_segment_offsets,
+            identity_op);
+        TransformInputIterator<OffsetT, IdentityOpT, OffsetT*, OffsetT> d_segment_offsets_itr(
+            d_segment_offsets,
+            identity_op);
+
+        TestByOp<SegmentedBackend, OutputT>(h_in, d_in, num_items, num_segments,
+            h_segment_offsets_itr, h_segment_offsets_itr + 1, d_segment_offsets_itr, d_segment_offsets_itr + 1);
+
+        // Test with transform iterators of different types
+
+        typedef TransformFunctor1<OffsetT> TransformFunctor1T;
+        typedef TransformFunctor2<OffsetT> TransformFunctor2T;
+
+        TransformInputIterator<OffsetT, TransformFunctor1T, OffsetT*, OffsetT> h_segment_begin_offsets_itr(h_segment_offsets, TransformFunctor1T());
+        TransformInputIterator<OffsetT, TransformFunctor2T, OffsetT*, OffsetT> h_segment_end_offsets_itr(h_segment_offsets + 1, TransformFunctor2T());
+
+        TransformInputIterator<OffsetT, TransformFunctor1T, OffsetT*, OffsetT> d_segment_begin_offsets_itr(d_segment_offsets, TransformFunctor1T());
+        TransformInputIterator<OffsetT, TransformFunctor2T, OffsetT*, OffsetT> d_segment_end_offsets_itr(d_segment_offsets + 1, TransformFunctor2T());
+
+        TestByOp<SegmentedBackend, OutputT>(h_in, d_in, num_items, num_segments,
+            h_segment_begin_offsets_itr, h_segment_end_offsets_itr,
+            d_segment_begin_offsets_itr, d_segment_end_offsets_itr);
+    }
+
+    // Release host and device buffers
+    if (h_in)               delete[] h_in;
+    if (h_segment_offsets)  delete[] h_segment_offsets;
+    if (d_in)               CubDebugExit(g_allocator.DeviceFree(d_in));
+    if (d_segment_offsets)  CubDebugExit(g_allocator.DeviceFree(d_segment_offsets));
+}
+
+
+/// Test different input-generation modes, then iterator input via a constant iterator and Sum
+template <
+    typename InputT,
+    typename OutputT,
+    typename OffsetT>
+void TestByGenMode(
+    OffsetT num_items,
+    OffsetT max_segments)
+{
+    //
+    // Pointer support: one full backend sweep per input-generation mode
+    //
+    TestByBackend<InputT, OutputT>(num_items, max_segments, UNIFORM);
+    TestByBackend<InputT, OutputT>(num_items, max_segments, INTEGER_SEED);
+    TestByBackend<InputT, OutputT>(num_items, max_segments, RANDOM);
+
+    //
+    // Iterator support: a constant iterator reduced with Sum
+    //
+
+    // The constant is whatever InitValue(UNIFORM, ..., 0) produces for InputT
+    InputT constant_value;
+    InitValue(UNIFORM, constant_value, 0);
+    ConstantInputIterator<InputT, OffsetT> constant_itr(constant_value);
+
+    // Host offsets for a single segment (two entries: begin and end)
+    OffsetT *h_segment_offsets = new OffsetT[1 + 1];
+    InitializeSegments(num_items, 1, h_segment_offsets, g_verbose_input);
+
+#if TEST_CDP == 0
+    constexpr auto NonSegmentedBackend = CUB;
+#else  // TEST_CDP
+    constexpr auto NonSegmentedBackend = CDP;
+#endif // TEST_CDP
+    // The same constant iterator serves as host and device input; no device offsets are passed
+    SolveAndTest<NonSegmentedBackend, OutputT>(constant_itr, constant_itr, num_items, 1,
+        h_segment_offsets, h_segment_offsets + 1, (OffsetT*) NULL, (OffsetT*)NULL, Sum());
+    delete[] h_segment_offsets; // delete[] accepts a null pointer, so no guard is needed
+}
+
+/// Test different problem sizes: 0, 1, max_items, eight random sizes, and sizes around tile/occupancy boundaries
+template <typename InputT, typename OutputT, typename OffsetT>
+void TestBySize(OffsetT max_items, OffsetT max_segments, OffsetT tile_size)
+{
+  // Test 0, 1, many
+  TestByGenMode<InputT, OutputT>(0, max_segments);
+  TestByGenMode<InputT, OutputT>(1, max_segments);
+  TestByGenMode<InputT, OutputT>(max_items, max_segments);
+
+  // Test random problem sizes from a log-distribution [8, max_items-ish)
+  int    num_iterations = 8;
+  double max_exp        = log(double(max_items)) / log(double(2.0));
+  for (int i = 0; i < num_iterations; ++i)
+  {
+    OffsetT num_items = (OffsetT)pow(2.0, RandomValue(max_exp - 3.0) + 3.0);
+    TestByGenMode<InputT, OutputT>(num_items, max_segments);
+  }
+
+  //
+  // White-box testing of single-segment problems around specific sizes
+  //
+
+  // Named so that it does not hide the `Backend` enum type inside this function
+#if TEST_CDP == 0
+  constexpr auto NonSegmentedBackend = CUB;
+#else  // TEST_CDP
+  constexpr auto NonSegmentedBackend = CDP;
+#endif // TEST_CDP
+  // Tile-boundaries: multiple blocks, one tile per block
+  TestProblem<NonSegmentedBackend, InputT, OutputT>(tile_size * 4, 1, RANDOM, Sum());
+  TestProblem<NonSegmentedBackend, InputT, OutputT>(tile_size * 4 + 1, 1, RANDOM, Sum());
+  TestProblem<NonSegmentedBackend, InputT, OutputT>(tile_size * 4 - 1, 1, RANDOM, Sum());
+
+  // Tile-boundaries: multiple blocks, multiple tiles per block
+  OffsetT sm_occupancy = 32;
+  OffsetT occupancy    = tile_size * sm_occupancy * g_sm_count;
+  TestProblem<NonSegmentedBackend, InputT, OutputT>(occupancy, 1, RANDOM, Sum());
+  TestProblem<NonSegmentedBackend, InputT, OutputT>(occupancy + 1, 1, RANDOM, Sum());
+  TestProblem<NonSegmentedBackend, InputT, OutputT>(occupancy - 1, 1, RANDOM, Sum());
+}
+
+/// Input element type of the accumulator-type test: stores one char and exposes it as an int.
+class CustomInputT
+{
+  char m_val{};
+
+public:
+  __host__ __device__ explicit CustomInputT(char val) : m_val(val) {}
+
+  /// Stored value converted to int.
+  __host__ __device__ int get() const { return static_cast<int>(m_val); }
+};
+
+class CustomAccumulatorT // accumulator type that carries a validity marker next to its value
+{
+  int m_val{0};
+  int m_magic_value{42}; // is_valid() checks for exactly this value
+
+  __host__ __device__ CustomAccumulatorT(int val) // private; used by the operator+ overloads below
+      : m_val(val)
+  {}
+
+public:
+  __host__ __device__ CustomAccumulatorT() // value 0, valid (default member initializers apply)
+  {}
+
+  __host__ __device__ CustomAccumulatorT(const CustomAccumulatorT &in)
+    : m_val(in.is_valid() * in.get())   // value becomes 0 when the source is not valid
+    , m_magic_value(in.is_valid() * 42) // the copy is valid only if the source was
+  {}
+
+  __host__ __device__ void operator=(const CustomInputT &in) // returns void, so assignments cannot be chained
+  {
+    if (this->is_valid()) // an invalid target is left untouched
+    {
+      m_val = in.get();
+    }
+  }
+
+  __host__ __device__ void operator=(const CustomAccumulatorT &in)
+  {
+    if (this->is_valid() && in.is_valid()) // only the value is copied, and only when both sides are valid
+    {
+      m_val = in.get();
+    }
+  }
+
+  __host__ __device__ CustomAccumulatorT 
+  operator+(const CustomInputT &in) const
+  {
+    const int multiplier = this->is_valid(); // 1 when valid, 0 otherwise
+    return {(m_val + in.get()) * multiplier}; // sum is forced to 0 when *this is not valid
+  }
+
+  __host__ __device__ CustomAccumulatorT
+  operator+(const CustomAccumulatorT &in) const
+  {
+    const int multiplier = this->is_valid() && in.is_valid();
+    return {(m_val + in.get()) * multiplier}; // sum is forced to 0 unless both operands are valid
+  }
+
+  __host__ __device__ int get() const { return m_val; }
+
+  __host__ __device__ bool is_valid() const { return m_magic_value == 42; }
+};
+
+/// Output sink: assigning an accumulator records in *m_d_flag whether it is valid and equal to m_expected.
+class CustomOutputT
+{
+  bool *m_d_flag{};
+  int m_expected{};
+
+public:
+  __host__ __device__ CustomOutputT(bool *d_flag, int expected) : m_d_flag(d_flag), m_expected(expected) {}
+
+  /// Writes the verdict through the stored flag pointer.
+  __host__ __device__ void operator=(const CustomAccumulatorT &accum) const
+  {
+    const bool matches = accum.is_valid() && (accum.get() == m_expected);
+    *m_d_flag = matches;
+  }
+};
+
+/// Device-side setup: every input element becomes CustomInputT(1); thread 0 also builds the output sink.
+__global__ void InitializeTestAccumulatorTypes(int num_items, int expected, bool *d_flag,
+                                               CustomInputT *d_in, CustomOutputT *d_out)
+{
+  const int global_tid = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
+
+  // One thread per input element; threads past the end skip the write
+  if (global_tid < num_items)
+  {
+    d_in[global_tid] = CustomInputT(1);
+  }
+
+  // Exactly one thread initializes the single output object
+  if (global_tid == 0)
+  {
+    *d_out = CustomOutputT{d_flag, expected};
+  }
+}
+
+/// Sums num_items copies of T{1} read from a constant iterator and expects the total to equal num_items.
+template <typename T, typename OffsetT>
+void TestBigIndicesHelper(OffsetT num_items)
+{
+  thrust::constant_iterator<T> ones(T{1});
+  thrust::device_vector<std::size_t> d_total(1);
+  std::size_t *d_out = thrust::raw_pointer_cast(d_total.data());
+
+  // First call: d_scratch is still null, scratch_bytes receives the required size
+  std::uint8_t *d_scratch{};
+  std::size_t scratch_bytes{};
+
+  CubDebugExit(cub::DeviceReduce::Sum(d_scratch, scratch_bytes, ones, d_out, num_items));
+
+  thrust::device_vector<std::uint8_t> scratch(scratch_bytes);
+  d_scratch = thrust::raw_pointer_cast(scratch.data());
+
+  // Second call: the actual reduction
+  CubDebugExit(cub::DeviceReduce::Sum(d_scratch, scratch_bytes, ones, d_out, num_items));
+  std::size_t total = d_total[0];
+
+  AssertEquals(total, num_items);
+}
+
+/// Runs the big-index sum at 2^30, 2^31 and 2^32 - 1 items (32-bit offsets) and at 2^33 items (64-bit offsets).
+template <typename T> void TestBigIndices()
+{
+  TestBigIndicesHelper<T, std::uint32_t>(1ull << 30);
+  TestBigIndicesHelper<T, std::uint32_t>(1ull << 31);
+  TestBigIndicesHelper<T, std::uint32_t>((1ull << 32) - 1); // largest value a std::uint32_t can hold
+  TestBigIndicesHelper<T, std::uint64_t>(1ull << 33);
+}
+
+#if TEST_TYPES == 3
+/**
+ * Reduce 2M CustomInputT(1) elements with cub::Sum, seeded with a default
+ * CustomAccumulatorT, into a CustomOutputT sink; the sink records in *d_flag
+ * whether the accumulator it received was valid and equal to num_items.
+ */
+void TestAccumulatorTypes()
+{
+  const int num_items  = 2 * 1024 * 1024;
+  const int expected   = num_items;
+  const int block_size = 256;
+  const int grid_size  = (num_items + block_size - 1) / block_size;
+
+  CustomInputT *d_in{};
+  CustomOutputT *d_out{};
+  CustomAccumulatorT init{};
+  bool *d_flag{};
+
+  CubDebugExit(g_allocator.DeviceAllocate((void **)&d_out, sizeof(CustomOutputT)));
+  CubDebugExit(g_allocator.DeviceAllocate((void **)&d_flag, sizeof(bool)));
+  CubDebugExit(g_allocator.DeviceAllocate((void **)&d_in, sizeof(CustomInputT) * num_items));
+
+  // Start from "false" so that a reduction which never writes its result cannot pass by accident
+  CubDebugExit(cudaMemset(d_flag, 0, sizeof(bool)));
+
+  InitializeTestAccumulatorTypes<<<grid_size, block_size>>>(num_items, expected, d_flag,
+                                                            d_in, d_out);
+  CubDebugExit(cudaPeekAtLastError()); // report a failed kernel launch right away
+
+  std::uint8_t *d_temp_storage{};
+  std::size_t temp_storage_bytes{};
+
+  // Size query: d_temp_storage is still null here
+  CubDebugExit(cub::DeviceReduce::Reduce(d_temp_storage,
+                                         temp_storage_bytes,
+                                         d_in, d_out, num_items, cub::Sum{}, init));
+
+  // Allocate the scratch space and fill it with non-zero bytes before the real run
+  CubDebugExit(g_allocator.DeviceAllocate((void **)&d_temp_storage, temp_storage_bytes));
+  CubDebugExit(cudaMemset(d_temp_storage, 1,  temp_storage_bytes));
+
+  CubDebugExit(cub::DeviceReduce::Reduce(d_temp_storage,
+                                         temp_storage_bytes,
+                                         d_in, d_out, num_items, cub::Sum{}, init));
+
+  bool ok{};
+  CubDebugExit(cudaMemcpy(&ok, d_flag, sizeof(bool), cudaMemcpyDeviceToHost));
+
+  // Release every device allocation made above before asserting on the result
+  CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
+  CubDebugExit(g_allocator.DeviceFree(d_flag));
+  CubDebugExit(g_allocator.DeviceFree(d_out));
+  CubDebugExit(g_allocator.DeviceFree(d_in));
+
+  AssertTrue(ok);
+}
+
+/**
+ * Regression test for ArgMin on an input made only of +infinity. ArgMin is
+ * expected to report the type's max value for empty input, which could be
+ * confused with infinite inputs; here the result must be (key 0, value +inf).
+ */
+void TestFloatInfInArgMin()
+{
+  using in_t     = float;
+  using offset_t = int;
+  using out_t    = cub::KeyValuePair<offset_t, in_t>;
+
+  const int num_items = 10;
+  const float pos_inf = ::cuda::std::numeric_limits<float>::infinity();
+
+  thrust::device_vector<in_t> values(num_items, pos_inf);
+  thrust::device_vector<out_t> reduced(1);
+
+  const in_t *d_in = thrust::raw_pointer_cast(values.data());
+  out_t *d_out     = thrust::raw_pointer_cast(reduced.data());
+
+  // Query the scratch size (null scratch pointer), allocate it, then run the reduction
+  std::uint8_t *d_scratch{};
+  std::size_t scratch_bytes{};
+  CubDebugExit(cub::DeviceReduce::ArgMin(d_scratch, scratch_bytes, d_in, d_out, num_items));
+  thrust::device_vector<std::uint8_t> scratch(scratch_bytes);
+  d_scratch = thrust::raw_pointer_cast(scratch.data());
+  CubDebugExit(cub::DeviceReduce::ArgMin(d_scratch, scratch_bytes, d_in, d_out, num_items));
+
+  const out_t winner = reduced[0];
+  AssertEquals(winner.key, 0);
+  AssertEquals(winner.value, pos_inf);
+}
+
+/**
+ * Regression test for ArgMax on an input made only of -infinity. ArgMax is
+ * expected to report the type's lowest value for empty input, which could be
+ * confused with infinite inputs; here the result must be (key 0, value -inf).
+ */
+void TestFloatInfInArgMax()
+{
+  using in_t     = float;
+  using offset_t = int;
+  using out_t    = cub::KeyValuePair<offset_t, in_t>;
+
+  const int num_items = 10;
+  const float pos_inf = ::cuda::std::numeric_limits<float>::infinity();
+
+  thrust::device_vector<in_t> values(num_items, -pos_inf);
+  thrust::device_vector<out_t> reduced(1);
+
+  const in_t *d_in = thrust::raw_pointer_cast(values.data());
+  out_t *d_out     = thrust::raw_pointer_cast(reduced.data());
+
+  // Query the scratch size (null scratch pointer), allocate it, then run the reduction
+  std::uint8_t *d_scratch{};
+  std::size_t scratch_bytes{};
+  CubDebugExit(cub::DeviceReduce::ArgMax(d_scratch, scratch_bytes, d_in, d_out, num_items));
+  thrust::device_vector<std::uint8_t> scratch(scratch_bytes);
+  d_scratch = thrust::raw_pointer_cast(scratch.data());
+  CubDebugExit(cub::DeviceReduce::ArgMax(d_scratch, scratch_bytes, d_in, d_out, num_items));
+
+  const out_t winner = reduced[0];
+  AssertEquals(winner.key, 0);
+  AssertEquals(winner.value, -pos_inf);
+}
+
+/// Runs both infinity-handling tests: ArgMin first, then ArgMax.
+void TestFloatInfInArg() {
+  TestFloatInfInArgMin();
+  TestFloatInfInArgMax();
+}
+#endif
+
+/// Policy-dispatch functor that records the tile size (BLOCK_THREADS * ITEMS_PER_THREAD) of the active ReducePolicy
+template <typename InputT, typename OutputT, typename OffsetT>
+struct GetTileSize
+{
+  OffsetT max_items{};
+  OffsetT max_segments{};
+  OffsetT tile_size{}; // written by Invoke(); value-initialized until then
+
+  GetTileSize(OffsetT max_items, OffsetT max_segments) : max_items(max_items), max_segments(max_segments) {}
+
+  /// Stores the tile size of ActivePolicyT::ReducePolicy and reports success.
+  template <typename ActivePolicyT>
+  CUB_RUNTIME_FUNCTION cudaError_t Invoke()
+  {
+    using ReducePolicyT = typename ActivePolicyT::ReducePolicy;
+
+    this->tile_size = ReducePolicyT::BLOCK_THREADS * ReducePolicyT::ITEMS_PER_THREAD;
+    return cudaSuccess;
+  }
+};
+
+/// Test problem type: look up the tile size for this (InputT, OffsetT) pair, then run the size sweep
+template <typename InputT, typename OutputT, typename OffsetT>
+void TestType(OffsetT max_items, OffsetT max_segments)
+{
+  // Let the tuning-policy chain invoke GetTileSize for g_ptx_version so that it records the tile size
+  using PolicyHubT = DeviceReducePolicy<InputT, OffsetT, cub::Sum>;
+  using MaxPolicyT = typename PolicyHubT::MaxPolicy;
+  GetTileSize<InputT, OutputT, OffsetT> tile_probe(max_items, max_segments);
+  CubDebugExit(MaxPolicyT::Invoke(g_ptx_version, tile_probe));
+
+  TestBySize<InputT, OutputT>(max_items, max_segments, tile_probe.tile_size);
+}
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+
+
+/**
+ * Main: parse the command line, initialize the device, then run the type set selected by TEST_TYPES
+ */
+int main(int argc, char** argv)
+{
+    typedef int OffsetT;
+
+    OffsetT max_items       = 27000000;
+    OffsetT max_segments    = 34000;
+
+    // Initialize command line (--n, --s and --i override the defaults above / the timing-iteration count)
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    g_verbose_input = args.CheckCmdLineFlag("v2");
+    args.GetCmdLineArgument("n", max_items);
+    args.GetCmdLineArgument("s", max_segments);
+    args.GetCmdLineArgument("i", g_timing_iterations);
+
+    // Print usage (every optional argument is bracketed; --v2 is listed because it is parsed above)
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--n=<input items>] "
+            "[--s=<num segments>] "
+            "[--i=<timing iterations>] "
+            "[--device=<device-id>] "
+            "[--v] [--v2] "
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+    g_device_giga_bandwidth = args.device_giga_bandwidth;
+
+    // Get ptx version
+    CubDebugExit(PtxVersion(g_ptx_version));
+
+    // Get SM count
+    g_sm_count = args.deviceProp.multiProcessorCount;
+
+    // %PARAM% TEST_CDP cdp 0:1
+    // %PARAM% TEST_TYPES types 0:1:2:3:4
+
+#if TEST_TYPES == 0
+    TestType<signed char, signed char>(max_items, max_segments);
+    TestType<unsigned char, unsigned char>(max_items, max_segments);
+    TestType<signed char, int>(max_items, max_segments);
+#elif TEST_TYPES == 1
+    TestType<short, short>(max_items, max_segments);
+    TestType<int, int>(max_items, max_segments);
+    TestType<long, long>(max_items, max_segments);
+    TestType<long long, long long>(max_items, max_segments);
+#elif TEST_TYPES == 2
+    TestType<uchar2, uchar2>(max_items, max_segments);
+    TestType<uint2, uint2>(max_items, max_segments);
+    TestType<ulonglong2, ulonglong2>(max_items, max_segments);
+    TestType<ulonglong4, ulonglong4>(max_items, max_segments);
+#elif TEST_TYPES == 3
+    TestType<TestFoo, TestFoo>(max_items, max_segments);
+    TestAccumulatorTypes();
+    TestFloatInfInArg();
+
+#if TEST_HALF_T
+    TestType<half_t, half_t>(max_items, max_segments);
+#endif
+#else // TEST_TYPES == 4
+    TestType<TestBar, TestBar>(max_items, max_segments);
+    TestBigIndices<std::size_t>();
+
+#if TEST_BF_T
+    TestType<bfloat16_t, bfloat16_t>(max_items, max_segments);
+#endif
+#endif
+    printf("\n");
+    return 0;
+}
+
+
+
diff --git a/include/cub/test/test_device_reduce_by_key.cu b/include/cub/test/test_device_reduce_by_key.cu
new file mode 100644
index 0000000..797b038
--- /dev/null
+++ b/include/cub/test/test_device_reduce_by_key.cu
@@ -0,0 +1,747 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Test of DeviceReduce::ReduceByKey utilities
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <cub/device/device_reduce.cuh>
+#include <cub/device/device_run_length_encode.cuh>
+#include <cub/iterator/constant_input_iterator.cuh>
+#include <cub/thread/thread_operators.cuh>
+#include <cub/util_allocator.cuh>
+
+#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
+
+#include "test_util.h"
+
+#include <cstdio>
+#include <typeinfo>
+
+using namespace cub;
+
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+bool                    g_verbose           = false;
+int                     g_timing_iterations = 0;
+CachingDeviceAllocator  g_allocator(true);
+
+// Selects how a test reaches cub::DeviceReduce::ReduceByKey; used as the Int2Type<> tag that picks a Dispatch overload
+enum Backend
+{
+    CUB,        // Call the CUB entrypoint directly from the calling code
+    CDP,        // Launch a kernel that calls the CUB entrypoint from device code (CUDA dynamic parallelism)
+};
+
+
+//---------------------------------------------------------------------
+// Dispatch to different CUB entrypoints
+//---------------------------------------------------------------------
+
+/**
+ * Direct (CUB backend) dispatch: calls DeviceReduce::ReduceByKey timing_timing_iterations times and returns the status of the last call (cudaSuccess when the count is <= 0)
+ */
+template <
+    typename                    KeyInputIteratorT,
+    typename                    KeyOutputIteratorT,
+    typename                    ValueInputIteratorT,
+    typename                    ValueOutputIteratorT,
+    typename                    NumRunsIteratorT,
+    typename                    EqualityOpT,
+    typename                    ReductionOpT,
+    typename                    OffsetT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CUB>               /*dispatch_to*/,
+    int                         timing_timing_iterations,
+    size_t                      */*d_temp_storage_bytes*/,
+    cudaError_t                 */*d_cdp_error*/,
+    // The two unnamed pointers above are unused here; they keep this signature parallel to the CDP overload
+    void                        *d_temp_storage,
+    size_t                      &temp_storage_bytes,
+    KeyInputIteratorT           d_keys_in,
+    KeyOutputIteratorT          d_keys_out,
+    ValueInputIteratorT         d_values_in,
+    ValueOutputIteratorT        d_values_out,
+    NumRunsIteratorT            d_num_runs,
+    EqualityOpT                  /*equality_op*/,    // accepted but not forwarded: the ReduceByKey call below takes no equality functor
+    ReductionOpT                 reduction_op,
+    OffsetT                     num_items)
+{
+    cudaError_t error = cudaSuccess;
+    for (int i = 0; i < timing_timing_iterations; ++i)    // each pass overwrites error, so only the last status survives
+    {
+        error = DeviceReduce::ReduceByKey(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys_in,
+            d_keys_out,
+            d_values_in,
+            d_values_out,
+            d_num_runs,
+            reduction_op,
+            num_items);
+    }
+    return error;
+}
+
+//---------------------------------------------------------------------
+// CUDA Nested Parallelism Test Kernel
+//---------------------------------------------------------------------
+
+#if TEST_CDP == 1
+
+/**
+ * Wrapper kernel that invokes Dispatch (and through it DeviceReduce::ReduceByKey) from device code, reporting the status and temp-storage size through device pointers
+ */
+template <int CubBackend,
+          typename KeyInputIteratorT,
+          typename KeyOutputIteratorT,
+          typename ValueInputIteratorT,
+          typename ValueOutputIteratorT,
+          typename NumRunsIteratorT,
+          typename EqualityOpT,
+          typename ReductionOpT,
+          typename OffsetT>
+__global__ void CDPDispatchKernel(Int2Type<CubBackend> cub_backend,
+                                  int                  timing_timing_iterations,
+                                  size_t              *d_temp_storage_bytes,
+                                  cudaError_t         *d_cdp_error,
+
+                                  void                *d_temp_storage,
+                                  size_t               temp_storage_bytes,
+                                  KeyInputIteratorT    d_keys_in,
+                                  KeyOutputIteratorT   d_keys_out,
+                                  ValueInputIteratorT  d_values_in,
+                                  ValueOutputIteratorT d_values_out,
+                                  NumRunsIteratorT     d_num_runs,
+                                  EqualityOpT          equality_op,
+                                  ReductionOpT         reduction_op,
+                                  OffsetT              num_items)
+{
+  // Run the selected backend's Dispatch on the device and store its status where the host can copy it out
+  *d_cdp_error = Dispatch(cub_backend,
+                          timing_timing_iterations,
+                          d_temp_storage_bytes,
+                          d_cdp_error,
+                          d_temp_storage,
+                          temp_storage_bytes,
+                          d_keys_in,
+                          d_keys_out,
+                          d_values_in,
+                          d_values_out,
+                          d_num_runs,
+                          equality_op,
+                          reduction_op,
+                          num_items);
+  // temp_storage_bytes is a by-value copy in this kernel; Dispatch receives it by reference, so export whatever it now holds
+  *d_temp_storage_bytes = temp_storage_bytes;
+}
+
+/**
+ * CDP backend dispatch: launches CDPDispatchKernel, then copies the temp-storage size and the device-side status back to the host and returns that status
+ */
+template <typename KeyInputIteratorT,
+          typename KeyOutputIteratorT,
+          typename ValueInputIteratorT,
+          typename ValueOutputIteratorT,
+          typename NumRunsIteratorT,
+          typename EqualityOpT,
+          typename ReductionOpT,
+          typename OffsetT>
+__forceinline__ cudaError_t
+Dispatch(Int2Type<CDP> /*dispatch_to*/,
+         int          timing_timing_iterations,
+         size_t      *d_temp_storage_bytes,
+         cudaError_t *d_cdp_error,
+
+         void                *d_temp_storage,
+         size_t              &temp_storage_bytes,
+         KeyInputIteratorT    d_keys_in,
+         KeyOutputIteratorT   d_keys_out,
+         ValueInputIteratorT  d_values_in,
+         ValueOutputIteratorT d_values_out,
+         NumRunsIteratorT     d_num_runs,
+         EqualityOpT          equality_op,
+         ReductionOpT         reduction_op,
+         OffsetT              num_items)
+{
+  // Launch the wrapper kernel, instantiated for the CUB backend so the device side takes the direct Dispatch overload
+  // (launch configuration (1, 1, 0, 0): presumably one block of one thread, no dynamic shared memory, default stream -- confirm against triple_chevron)
+  cudaError_t retval =
+    thrust::cuda_cub::launcher::triple_chevron(1, 1, 0, 0)
+      .doit(CDPDispatchKernel<CUB,
+                              KeyInputIteratorT,
+                              KeyOutputIteratorT,
+                              ValueInputIteratorT,
+                              ValueOutputIteratorT,
+                              NumRunsIteratorT,
+                              EqualityOpT,
+                              ReductionOpT,
+                              OffsetT>,
+            Int2Type<CUB>{},
+            timing_timing_iterations,
+            d_temp_storage_bytes,
+            d_cdp_error,
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys_in,
+            d_keys_out,
+            d_values_in,
+            d_values_out,
+            d_num_runs,
+            equality_op,
+            reduction_op,
+            num_items);
+  CubDebugExit(retval);
+
+  // Copy the size the kernel exported back into the caller's by-reference temp_storage_bytes
+  CubDebugExit(cudaMemcpy(&temp_storage_bytes,
+                          d_temp_storage_bytes,
+                          sizeof(size_t) * 1,
+                          cudaMemcpyDeviceToHost));
+
+  // Copy out the status the device-side Dispatch stored; retval is reused and becomes the return value
+  CubDebugExit(cudaMemcpy(&retval,
+                          d_cdp_error,
+                          sizeof(cudaError_t) * 1,
+                          cudaMemcpyDeviceToHost));
+  return retval;
+}
+
+#endif // TEST_CDP
+
+//---------------------------------------------------------------------
+// Test generation
+//---------------------------------------------------------------------
+
+
+/**
+ * Fill h_in[0..num_items) with runs of identical values; run k is initialized from key k via InitValue, and max_segment selects how run lengths are chosen
+ */
+template <typename T>
+void Initialize(
+    int         entropy_reduction,
+    T           *h_in,
+    int         num_items,
+    int         max_segment)
+{
+    unsigned int max_int = (unsigned int) -1;   // largest unsigned int; divisor used to scale the random bits below
+
+    int key = 0;
+    int i = 0;
+    while (i < num_items)
+    {
+        // Select number of repeating occurrences (the length of the next run)
+        // NOTE(review): repeat is a signed int scaled by the unsigned maximum; if RandomBits can set the sign bit, negative draws clamp to 1 -- confirm this is intended
+        int repeat;
+
+        if (max_segment < 0)
+        {
+            repeat = num_items;     // negative max_segment: a single run covers the whole input
+        }
+        else if (max_segment < 2)
+        {
+            repeat = 1;             // max_segment of 0 or 1: every run has length one
+        }
+        else
+        {
+            RandomBits(repeat, entropy_reduction);
+            repeat = (int) ((double(repeat) * double(max_segment)) / double(max_int));
+            repeat = CUB_MAX(1, repeat);    // never produce an empty run
+        }
+
+        int j = i;
+        while (j < CUB_MIN(i + repeat, num_items))     // the final run is cut off at the end of the input
+        {
+            InitValue(INTEGER_SEED, h_in[j], key);
+            j++;
+        }
+
+        i = j;
+        key++;
+    }
+
+    if (g_verbose)
+    {
+        printf("Input:\n");
+        DisplayResults(h_in, num_items);
+        printf("\n\n");
+    }
+}
+
+
+/**
+ * Host reference solution: collapses each run of consecutive equal keys into one (key, reduced value) pair.  Returns the number of runs written (0 for empty input)
+ */
+template <
+    typename        KeyInputIteratorT,
+    typename        ValueInputIteratorT,
+    typename        KeyT,
+    typename        ValueT,
+    typename        EqualityOpT,
+    typename        ReductionOpT>
+int Solve(
+    KeyInputIteratorT       h_keys_in,
+    KeyT                    *h_keys_reference,
+    ValueInputIteratorT     h_values_in,
+    ValueT                  *h_values_reference,
+    EqualityOpT             equality_op,
+    ReductionOpT            reduction_op,
+    int                     num_items)
+{
+    using AccumT = cub::detail::accumulator_t<ReductionOpT, ValueT, ValueT>;
+    // Empty input has no runs; return before element 0 is read or reference slot 0 is written
+    if (num_items <= 0) return 0;
+
+    // First item opens the first run
+    KeyT previous        = h_keys_in[0];
+    AccumT aggregate     = h_values_in[0];
+    int num_segments    = 0;
+
+    // Subsequent items either extend the open run or close it and start the next one
+    for (int i = 1; i < num_items; ++i)
+    {
+        if (!equality_op(previous, h_keys_in[i]))
+        {
+            h_keys_reference[num_segments] = previous;
+            h_values_reference[num_segments] = static_cast<ValueT>(aggregate);
+            num_segments++;
+            aggregate = h_values_in[i];
+        }
+        else
+        {
+            aggregate = static_cast<ValueT>(reduction_op(aggregate, h_values_in[i]));
+        }
+        previous = h_keys_in[i];
+    }
+
+    h_keys_reference[num_segments] = previous;      // flush the run still open at the end of the input
+    h_values_reference[num_segments] = static_cast<ValueT>(aggregate);
+    return num_segments + 1;
+}
+
+
+
+/**
+ * Run DeviceReduce::ReduceByKey on the given device inputs, compare keys, values and run count against the host reference, then time g_timing_iterations repetitions
+ */
+template <
+    Backend             BACKEND,
+    typename            DeviceKeyInputIteratorT,
+    typename            DeviceValueInputIteratorT,
+    typename            KeyT,
+    typename            ValueT,
+    typename            EqualityOpT,
+    typename            ReductionOpT>
+void Test(
+    DeviceKeyInputIteratorT     d_keys_in,
+    DeviceValueInputIteratorT   d_values_in,
+    KeyT*                       h_keys_reference,
+    ValueT*                     h_values_reference,
+    EqualityOpT                 equality_op,
+    ReductionOpT                reduction_op,
+    int                         num_segments,
+    int                         num_items)
+{
+    // Allocate device output arrays (sized for the worst case of one run per item) and the run counter
+    KeyT*   d_keys_out             = NULL;
+    ValueT* d_values_out           = NULL;
+    int*    d_num_runs         = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys_out, sizeof(KeyT) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values_out, sizeof(ValueT) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_runs, sizeof(int)));
+
+    // Allocate CDP device arrays (only read back by the CDP Dispatch overload)
+    size_t          *d_temp_storage_bytes = NULL;
+    cudaError_t     *d_cdp_error = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes,  sizeof(size_t) * 1));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error,           sizeof(cudaError_t) * 1));
+
+    // Allocate temporary storage: the first Dispatch runs with d_temp_storage == NULL to obtain temp_storage_bytes
+    void            *d_temp_storage = NULL;
+    size_t          temp_storage_bytes = 0;
+    CubDebugExit(Dispatch(Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_runs, equality_op, reduction_op, num_items));
+    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
+
+    // Clear device output arrays
+    CubDebugExit(cudaMemset(d_keys_out, 0, sizeof(KeyT) * num_items));
+    CubDebugExit(cudaMemset(d_values_out, 0, sizeof(ValueT) * num_items));
+    CubDebugExit(cudaMemset(d_num_runs, 0, sizeof(int)));
+
+    // Run warmup/correctness iteration
+    CubDebugExit(Dispatch(Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_runs, equality_op, reduction_op, num_items));
+
+    // Check for correctness (and display results, if specified); a non-zero compare result is reported as FAIL
+    int compare1 = CompareDeviceResults(h_keys_reference, d_keys_out, num_segments, true, g_verbose);
+    printf("\t Keys %s ", compare1 ? "FAIL" : "PASS");
+
+    int compare2 = CompareDeviceResults(h_values_reference, d_values_out, num_segments, true, g_verbose);
+    printf("\t Values %s ", compare2 ? "FAIL" : "PASS");
+
+    int compare3 = CompareDeviceResults(&num_segments, d_num_runs, 1, true, g_verbose);
+    printf("\t Count %s ", compare3 ? "FAIL" : "PASS");
+
+    // Flush any stdout/stderr
+    fflush(stdout);
+    fflush(stderr);
+
+    // Performance: time g_timing_iterations back-to-back invocations
+    GpuTimer gpu_timer;
+    gpu_timer.Start();
+    CubDebugExit(Dispatch(Int2Type<BACKEND>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_runs, equality_op, reduction_op, num_items));
+    gpu_timer.Stop();
+    float elapsed_millis = gpu_timer.ElapsedMillis();
+
+    // Display performance (byte count is computed in size_t so that large inputs cannot overflow an int)
+    if (g_timing_iterations > 0)
+    {
+        float   avg_millis  = elapsed_millis / g_timing_iterations;
+        float   giga_rate   = float(num_items) / avg_millis / 1000.0f / 1000.0f;
+        size_t  bytes_moved = (size_t(num_items) + size_t(num_segments)) * (sizeof(KeyT) + sizeof(ValueT));
+        float   giga_bandwidth  = float(bytes_moved) / avg_millis / 1000.0f / 1000.0f;
+        printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s", avg_millis, giga_rate, giga_bandwidth);
+    }
+    printf("\n\n");
+
+    // Flush any stdout/stderr
+    fflush(stdout);
+    fflush(stderr);
+
+    // Cleanup
+    if (d_keys_out) CubDebugExit(g_allocator.DeviceFree(d_keys_out));
+    if (d_values_out) CubDebugExit(g_allocator.DeviceFree(d_values_out));
+    if (d_num_runs) CubDebugExit(g_allocator.DeviceFree(d_num_runs));
+    if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
+    if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
+    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
+
+    // Correctness asserts (checked after cleanup so that the device buffers are released first)
+    AssertEquals(0, compare1 | compare2 | compare3);
+}
+
+
+/**
+ * Test DeviceReduce::ReduceByKey with keys and values both read from device pointers; every value is set by InitValue(INTEGER_SEED, ..., 1)
+ */
+template <
+    Backend         BACKEND,
+    typename        KeyT,
+    typename        ValueT,
+    typename        ReductionOpT>
+void TestPointer(
+    int             num_items,
+    int             entropy_reduction,
+    int             max_segment,
+    ReductionOpT    reduction_op)
+{
+    // Allocate host arrays (reference arrays are sized for the worst case of one run per item)
+    KeyT* h_keys_in        = new KeyT[num_items];
+    KeyT* h_keys_reference = new KeyT[num_items];
+
+    ValueT* h_values_in        = new ValueT[num_items];
+    ValueT* h_values_reference = new ValueT[num_items];
+
+    for (int i = 0; i < num_items; ++i)
+        InitValue(INTEGER_SEED, h_values_in[i], 1);
+
+    // Initialize problem (key runs) and compute the host reference solution
+    Equality equality_op;
+    Initialize(entropy_reduction, h_keys_in, num_items, max_segment);
+    int num_segments = Solve(h_keys_in, h_keys_reference, h_values_in, h_values_reference, equality_op, reduction_op, num_items);
+
+    printf("\nPointer %s cub::DeviceReduce::ReduceByKey %s reduction of %d items, %d segments (avg run length %.3f), {%s,%s} key value pairs, max_segment %d, entropy_reduction %d\n",
+        (BACKEND == CDP) ? "CDP CUB" : "CUB",
+        (std::is_same<ReductionOpT, Sum>::value) ? "Sum" : "Max",
+        num_items, num_segments, float(num_items) / num_segments,
+        typeid(KeyT).name(), typeid(ValueT).name(),
+        max_segment, entropy_reduction);
+    fflush(stdout);
+
+    // Allocate problem device arrays
+    KeyT     *d_keys_in = NULL;
+    ValueT   *d_values_in = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys_in, sizeof(KeyT) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values_in, sizeof(ValueT) * num_items));
+
+    // Initialize device input
+    CubDebugExit(cudaMemcpy(d_keys_in, h_keys_in, sizeof(KeyT) * num_items, cudaMemcpyHostToDevice));
+    CubDebugExit(cudaMemcpy(d_values_in, h_values_in, sizeof(ValueT) * num_items, cudaMemcpyHostToDevice));
+
+    // Run Test (KeyT, ValueT and the functor types are deduced from the arguments)
+    Test<BACKEND>(d_keys_in, d_values_in, h_keys_reference, h_values_reference, equality_op, reduction_op, num_segments, num_items);
+
+    // Cleanup
+    if (h_keys_in) delete[] h_keys_in;
+    if (h_values_in) delete[] h_values_in;
+    if (h_keys_reference) delete[] h_keys_reference;
+    if (h_values_reference) delete[] h_values_reference;
+    if (d_keys_in) CubDebugExit(g_allocator.DeviceFree(d_keys_in));
+    if (d_values_in) CubDebugExit(g_allocator.DeviceFree(d_values_in));
+}
+
+
+/**
+ * Test DeviceReduce::ReduceByKey with keys read from a device pointer and values produced by a ConstantInputIterator.  No Test driver in this file calls it
+ */
+template <
+    Backend         BACKEND,
+    typename        KeyT,
+    typename        ValueT,
+    typename        ReductionOpT>
+void TestIterator(
+    int             num_items,
+    int             entropy_reduction,
+    int             max_segment,
+    ReductionOpT    reduction_op)
+{
+    // Allocate host arrays
+    KeyT* h_keys_in        = new KeyT[num_items];
+    KeyT* h_keys_reference = new KeyT[num_items];
+
+    ValueT one_val;
+    InitValue(INTEGER_SEED, one_val, 1);
+    ConstantInputIterator<ValueT, int> h_values_in(one_val);    // serves as both the host-side and the device-side value input
+    ValueT* h_values_reference = new ValueT[num_items];
+
+    // Initialize problem (key runs) and compute the host reference solution
+    Equality equality_op;
+    Initialize(entropy_reduction, h_keys_in, num_items, max_segment);
+    int num_segments = Solve(h_keys_in, h_keys_reference, h_values_in, h_values_reference, equality_op, reduction_op, num_items);
+
+    printf("\nIterator %s cub::DeviceReduce::ReduceByKey %s reduction of %d items, %d segments (avg run length %.3f), {%s,%s} key value pairs, max_segment %d, entropy_reduction %d\n",
+        (BACKEND == CDP) ? "CDP CUB" : "CUB",
+        (std::is_same<ReductionOpT, Sum>::value) ? "Sum" : "Max",
+        num_items, num_segments, float(num_items) / num_segments,
+        typeid(KeyT).name(), typeid(ValueT).name(),
+        max_segment, entropy_reduction);
+    fflush(stdout);
+
+    // Allocate problem device arrays (keys only; the values come from the iterator)
+    KeyT     *d_keys_in = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys_in, sizeof(KeyT) * num_items));
+
+    // Initialize device input
+    CubDebugExit(cudaMemcpy(d_keys_in, h_keys_in, sizeof(KeyT) * num_items, cudaMemcpyHostToDevice));
+
+    // Run Test, passing the constant iterator where the pointer variant passes a device value array
+    Test<BACKEND>(d_keys_in, h_values_in, h_keys_reference, h_values_reference, equality_op, reduction_op, num_segments, num_items);
+
+    // Cleanup
+    if (h_keys_in) delete[] h_keys_in;
+    if (h_keys_reference) delete[] h_keys_reference;
+    if (h_values_reference) delete[] h_values_reference;
+    if (d_keys_in) CubDebugExit(g_allocator.DeviceFree(d_keys_in));
+}
+
+
+/**
+ * Run the pointer-based test at several key-bit entropy reduction levels for one max_segment setting
+ */
+template <
+    Backend         BACKEND,
+    typename        KeyT,
+    typename        ValueT,
+    typename        ReductionOpT>
+void Test(
+    int             num_items,
+    ReductionOpT    reduction_op,
+    int             max_segment)
+{
+    // Baseline: no entropy reduction rounds
+    TestPointer<BACKEND, KeyT, ValueT>(num_items, 0, max_segment, reduction_op);
+
+    // Initialize() only draws random run lengths when max_segment >= 2, so the extra rounds apply only then
+    if (max_segment <= 1) return;
+
+    // 2 entropy reduction rounds
+    TestPointer<BACKEND, KeyT, ValueT>(num_items, 2, max_segment, reduction_op);
+
+    // 7 entropy reduction rounds
+    TestPointer<BACKEND, KeyT, ValueT>(num_items, 7, max_segment, reduction_op);
+}
+
+
+/**
+ * Sweep maximum run lengths: one run over the whole input (-1), unit-length runs (1), then 3, 33, 363, ...
+ */
+template <
+    Backend         BACKEND,
+    typename        KeyT,
+    typename        ValueT,
+    typename        ReductionOpT>
+void Test(
+    int             num_items,
+    ReductionOpT    reduction_op)
+{
+    // Fixed layouts first
+    Test<BACKEND, KeyT, ValueT>(num_items, reduction_op, -1);
+    Test<BACKEND, KeyT, ValueT>(num_items, reduction_op, 1);
+
+    // Then grow max_segment by a factor of 11 while it stays below min(num_items, largest unsigned short)
+    const int segment_limit = CUB_MIN(num_items, (unsigned short) -1);
+    for (int max_segment = 3; max_segment < segment_limit; max_segment *= 11)
+        Test<BACKEND, KeyT, ValueT>(num_items, reduction_op, max_segment);
+}
+
+
+
+/**
+ * Run the test sweep through the backend selected at compile time by TEST_CDP (0: direct CUB call, 1: CDP kernel); any other value compiles to an empty body
+ */
+template <
+    typename        KeyT,
+    typename        ValueT,
+    typename        ReductionOpT>
+void TestDispatch(
+    int             num_items,
+    ReductionOpT    reduction_op)
+{
+#if TEST_CDP == 0
+    Test<CUB, KeyT, ValueT>(num_items, reduction_op);
+#elif TEST_CDP == 1
+    Test<CDP, KeyT, ValueT>(num_items, reduction_op);
+#endif // TEST_CDP
+}
+
+
+/**
+ * Choose the problem sizes to test: the caller's num_items when it is non-negative, otherwise a built-in ladder of sizes
+ */
+template <
+    typename        KeyT,
+    typename        ValueT,
+    typename        ReductionOpT>
+void TestSize(
+    int             num_items,
+    ReductionOpT    reduction_op)
+{
+    if (num_items >= 0)
+    {
+        TestDispatch<KeyT, ValueT>(num_items, reduction_op);
+        return;
+    }
+
+    // No explicit size requested (negative num_items): cover a spread of magnitudes in increasing order
+    const int default_sizes[] = {1, 100, 10000, 1000000};
+    for (int size : default_sizes)
+    {
+        TestDispatch<KeyT, ValueT>(size, reduction_op);
+    }
+}
+
+// Run the full size sweep twice for one key/value type pair: first with cub::Sum, then with cub::Max
+template <
+    typename        KeyT,
+    typename        ValueT>
+void TestOp(
+    int             num_items)
+{
+    TestSize<KeyT, ValueT>(num_items, cub::Sum());
+    TestSize<KeyT, ValueT>(num_items, cub::Max());
+}
+
+
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+
+/**
+ * Main: parses the command line, initializes the device, then runs the Sum and Max ReduceByKey tests for each key/value type pair.  Returns 0
+ */
+int main(int argc, char** argv)
+{
+    int num_items           = -1;
+    int entropy_reduction   = 0;
+    int maxseg              = 1000;
+
+    // Initialize command line (maxseg and entropy are parsed but not forwarded to any test below)
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    args.GetCmdLineArgument("n", num_items);
+    args.GetCmdLineArgument("i", g_timing_iterations);
+    args.GetCmdLineArgument("maxseg", maxseg);
+    args.GetCmdLineArgument("entropy", entropy_reduction);
+
+    // Print usage (each option is bracketed and separated by a single space)
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--n=<input items>] "
+            "[--i=<timing iterations>] "
+            "[--device=<device-id>] "
+            "[--maxseg=<max segment length>] "
+            "[--entropy=<segment length bit entropy reduction rounds>] "
+            "[--v] "
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+    printf("\n");
+
+    // Get ptx version (the value is not used below; the call is still error-checked)
+    int ptx_version = 0;
+    CubDebugExit(PtxVersion(ptx_version));
+
+    // %PARAM% TEST_CDP cdp 0:1
+
+    // Test different input types: int keys with built-in value types
+    TestOp<int, signed char>(num_items);
+    TestOp<int, short>(num_items);
+    TestOp<int, int>(num_items);
+    TestOp<int, long>(num_items);
+    TestOp<int, long long>(num_items);
+    TestOp<int, float>(num_items);
+    TestOp<int, double>(num_items);
+
+    TestOp<int, uchar2>(num_items);
+    TestOp<int, uint2>(num_items);
+    TestOp<int, uint3>(num_items);
+    TestOp<int, uint4>(num_items);
+    TestOp<int, ulonglong4>(num_items);
+    TestOp<int, TestFoo>(num_items);
+    TestOp<int, TestBar>(num_items);
+
+    TestOp<signed char, int>(num_items);
+    TestOp<long long, int>(num_items);
+    TestOp<TestFoo, int>(num_items);
+    TestOp<TestBar, int>(num_items);
+
+    return 0;
+}
+
+
+
diff --git a/include/cub/test/test_device_run_length_encode.cu b/include/cub/test/test_device_run_length_encode.cu
new file mode 100644
index 0000000..d3fbad6
--- /dev/null
+++ b/include/cub/test/test_device_run_length_encode.cu
@@ -0,0 +1,839 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Test of DeviceRunLengthEncode utilities
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <cub/device/device_run_length_encode.cuh>
+#include <cub/iterator/constant_input_iterator.cuh>
+#include <cub/thread/thread_operators.cuh>
+#include <cub/util_allocator.cuh>
+
+#include <thrust/device_vector.h>
+#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
+
+#include <cstdio>
+#include <typeinfo>
+
+#include "test_util.h"
+
+using namespace cub;
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+bool g_verbose          = false; // Set from the --v flag in main(); when true, Initialize() prints the generated input
+int g_timing_iterations = 0;     // Set from --i in main(); iteration count passed to the timed Dispatch() call in Test()
+CachingDeviceAllocator g_allocator(true); // Allocator used for every device buffer allocated by this test
+
+// Dispatch types: selects (via Int2Type<BACKEND>) which Dispatch() overload Test() invokes
+enum Backend
+{
+  CUB, // CUB method: the DeviceRunLengthEncode entrypoint is called by the Dispatch() overload itself
+  CDP, // GPU-based (dynamic parallelism) dispatch to CUB method, through CDPDispatchKernel
+};
+
+// Operation types: selects which DeviceRunLengthEncode entrypoint is exercised
+enum RleMethod
+{
+  RLE,         // Run length encode (DeviceRunLengthEncode::Encode)
+  NON_TRIVIAL, // DeviceRunLengthEncode::NonTrivialRuns; Solve() records only runs longer than one item for it
+};
+
+//---------------------------------------------------------------------
+// Dispatch to different CUB entrypoints
+//---------------------------------------------------------------------
+
+/**
+ * Dispatch to run-length encode entrypoint: calls DeviceRunLengthEncode::Encode timing_timing_iterations times and returns the status of the last call
+ */
+template <typename InputIteratorT,
+          typename UniqueOutputIteratorT,
+          typename OffsetsOutputIteratorT,
+          typename LengthsOutputIteratorT,
+          typename NumRunsIterator,
+          typename OffsetT>
+CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Dispatch(Int2Type<RLE> /*method*/,
+                                                          Int2Type<CUB> /*dispatch_to*/,
+                                                          int timing_timing_iterations,
+                                                          size_t * /*d_temp_storage_bytes*/,
+                                                          cudaError_t * /*d_cdp_error*/,
+
+                                                          void *d_temp_storage,
+                                                          size_t &temp_storage_bytes,
+                                                          InputIteratorT d_in,
+                                                          UniqueOutputIteratorT d_unique_out,
+                                                          OffsetsOutputIteratorT /*d_offsets_out*/,
+                                                          LengthsOutputIteratorT d_lengths_out,
+                                                          NumRunsIterator d_num_runs,
+                                                          cub::Equality /*equality_op*/,
+                                                          OffsetT num_items)
+{
+  cudaError_t status = cudaSuccess; // stays cudaSuccess when no iteration is requested
+  for (int remaining = timing_timing_iterations; remaining > 0; --remaining)
+  {
+    status = DeviceRunLengthEncode::Encode(d_temp_storage,
+                                           temp_storage_bytes,
+                                           d_in,
+                                           d_unique_out,
+                                           d_lengths_out,
+                                           d_num_runs,
+                                           num_items);
+  }
+  return status;
+}
+
+/**
+ * Dispatch to non-trivial runs entrypoint: calls DeviceRunLengthEncode::NonTrivialRuns timing_timing_iterations times and returns the status of the last call
+ */
+template <typename InputIteratorT,
+          typename UniqueOutputIteratorT,
+          typename OffsetsOutputIteratorT,
+          typename LengthsOutputIteratorT,
+          typename NumRunsIterator,
+          typename OffsetT>
+CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Dispatch(Int2Type<NON_TRIVIAL> /*method*/,
+                                                          Int2Type<CUB> /*dispatch_to*/,
+                                                          int timing_timing_iterations,
+                                                          size_t * /*d_temp_storage_bytes*/,
+                                                          cudaError_t * /*d_cdp_error*/,
+
+                                                          void *d_temp_storage,
+                                                          size_t &temp_storage_bytes,
+                                                          InputIteratorT d_in,
+                                                          UniqueOutputIteratorT /*d_unique_out*/,
+                                                          OffsetsOutputIteratorT d_offsets_out,
+                                                          LengthsOutputIteratorT d_lengths_out,
+                                                          NumRunsIterator d_num_runs,
+                                                          cub::Equality /*equality_op*/,
+                                                          OffsetT num_items)
+{
+  cudaError_t status = cudaSuccess; // stays cudaSuccess when no iteration is requested
+  for (int remaining = timing_timing_iterations; remaining > 0; --remaining)
+  {
+    status = DeviceRunLengthEncode::NonTrivialRuns(d_temp_storage,
+                                                   temp_storage_bytes,
+                                                   d_in,
+                                                   d_offsets_out,
+                                                   d_lengths_out,
+                                                   d_num_runs,
+                                                   num_items);
+  }
+  return status;
+}
+
+//---------------------------------------------------------------------
+// CUDA Nested Parallelism Test Kernel
+//---------------------------------------------------------------------
+
+#if TEST_CDP == 1
+
+/**
+ * Wrapper kernel: forwards its arguments to Dispatch() on the device, then stores the returned status and temp_storage_bytes through the two output pointers
+ */
+template <int RLE_METHOD,
+          int CubBackend,
+          typename InputIteratorT,
+          typename UniqueOutputIteratorT,
+          typename OffsetsOutputIteratorT,
+          typename LengthsOutputIteratorT,
+          typename NumRunsIterator,
+          typename EqualityOp,
+          typename OffsetT>
+__global__ void CDPDispatchKernel(Int2Type<RLE_METHOD> method,
+                                  Int2Type<CubBackend> cub_backend,
+                                  int timing_timing_iterations,
+                                  size_t *d_temp_storage_bytes,
+                                  cudaError_t *d_cdp_error,
+
+                                  void *d_temp_storage,
+                                  size_t temp_storage_bytes,
+                                  InputIteratorT d_in,
+                                  UniqueOutputIteratorT d_unique_out,
+                                  OffsetsOutputIteratorT d_offsets_out,
+                                  LengthsOutputIteratorT d_lengths_out,
+                                  NumRunsIterator d_num_runs,
+                                  cub::Equality equality_op,
+                                  OffsetT num_items)
+{
+  const cudaError_t status = Dispatch(method,
+                                      cub_backend,
+                                      timing_timing_iterations,
+                                      d_temp_storage_bytes,
+                                      d_cdp_error,
+                                      d_temp_storage,
+                                      temp_storage_bytes,
+                                      d_in,
+                                      d_unique_out,
+                                      d_offsets_out,
+                                      d_lengths_out,
+                                      d_num_runs,
+                                      equality_op,
+                                      num_items);
+  *d_cdp_error          = status;
+  *d_temp_storage_bytes = temp_storage_bytes; // by-value kernel copy, passed by reference to Dispatch() above
+}
+
+/**
+ * Dispatch to CDP kernel: launches CDPDispatchKernel, then copies temp_storage_bytes and the device-side status back to the host and returns that status
+ */
+template <int RLE_METHOD,
+          typename InputIteratorT,
+          typename UniqueOutputIteratorT,
+          typename OffsetsOutputIteratorT,
+          typename LengthsOutputIteratorT,
+          typename NumRunsIterator,
+          typename EqualityOp,
+          typename OffsetT>
+__forceinline__ cudaError_t Dispatch(Int2Type<RLE_METHOD> method,
+                                     Int2Type<CDP> /*dispatch_to*/,
+                                     int timing_timing_iterations,
+                                     size_t *d_temp_storage_bytes,
+                                     cudaError_t *d_cdp_error,
+
+                                     void *d_temp_storage,
+                                     size_t &temp_storage_bytes,
+                                     InputIteratorT d_in,
+                                     UniqueOutputIteratorT d_unique_out,
+                                     OffsetsOutputIteratorT d_offsets_out,
+                                     LengthsOutputIteratorT d_lengths_out,
+                                     NumRunsIterator d_num_runs,
+                                     EqualityOp equality_op,
+                                     OffsetT num_items)
+{
+  // Pick the wrapper-kernel instantiation, then launch it with triple_chevron(1, 1, 0, 0)
+  auto wrapper_kernel = CDPDispatchKernel<RLE_METHOD,
+                                          CUB,
+                                          InputIteratorT,
+                                          UniqueOutputIteratorT,
+                                          OffsetsOutputIteratorT,
+                                          LengthsOutputIteratorT,
+                                          NumRunsIterator,
+                                          EqualityOp,
+                                          OffsetT>;
+  cudaError_t status = thrust::cuda_cub::launcher::triple_chevron(1, 1, 0, 0)
+                         .doit(wrapper_kernel,
+                               method,
+                               Int2Type<CUB>{},
+                               timing_timing_iterations,
+                               d_temp_storage_bytes,
+                               d_cdp_error,
+                               d_temp_storage,
+                               temp_storage_bytes,
+                               d_in,
+                               d_unique_out,
+                               d_offsets_out,
+                               d_lengths_out,
+                               d_num_runs,
+                               equality_op,
+                               num_items);
+  CubDebugExit(status);
+
+  // Read back the temp storage size the kernel stored
+  CubDebugExit(cudaMemcpy(&temp_storage_bytes,
+                          d_temp_storage_bytes,
+                          sizeof(size_t),
+                          cudaMemcpyDeviceToHost));
+  // Read back the status the kernel stored; it replaces the launch status as the return value
+  CubDebugExit(cudaMemcpy(&status, d_cdp_error, sizeof(cudaError_t), cudaMemcpyDeviceToHost));
+  return status;
+}
+
+#endif // TEST_CDP
+
+//---------------------------------------------------------------------
+// Test generation
+//---------------------------------------------------------------------
+
+/**
+ * Initialize problem: fills h_in[0, num_items) with consecutive runs, each run holding one key value; run lengths are governed by max_segment
+ */
+template <typename T>
+void Initialize(int entropy_reduction, T *h_in, int num_items, int max_segment)
+{
+  const unsigned int max_int = (unsigned int)-1;
+
+  int key = 0;
+  int pos = 0;
+  while (pos < num_items)
+  {
+    // Choose the length of the next run: random when max_segment >= 2, one item for 0 or 1, the whole input when negative
+    int repeat;
+    if (max_segment >= 2)
+    {
+      RandomBits(repeat, entropy_reduction);
+      repeat = (int)((double(repeat) * double(max_segment)) / double(max_int));
+      repeat = CUB_MAX(1, repeat);
+    }
+    else if (max_segment >= 0)
+    {
+      repeat = 1;
+    }
+    else
+    {
+      repeat = num_items;
+    }
+
+    // Write the current key into [pos, run_end); the run is clipped at the end of the input
+    const int run_end = CUB_MIN(pos + repeat, num_items);
+    for (; pos < run_end; ++pos)
+    {
+      InitValue(INTEGER_SEED, h_in[pos], key);
+    }
+
+    // The following run uses the next key
+    key++;
+  }
+
+  // Dump the generated input only in verbose mode
+  if (!g_verbose)
+    return;
+  printf("Input:\n");
+  DisplayResults(h_in, num_items);
+  printf("\n\n");
+}
+
+/**
+ * Solve problem on the host: records each run's value, start offset and length in the reference arrays (for NON_TRIVIAL only runs longer than 1). Returns the number of runs recorded
+ */
+template <RleMethod RLE_METHOD,
+          typename InputIteratorT,
+          typename T,
+          typename OffsetT,
+          typename LengthT,
+          typename EqualityOp>
+int Solve(InputIteratorT h_in,
+          T *h_unique_reference,
+          OffsetT *h_offsets_reference,
+          LengthT *h_lengths_reference,
+          EqualityOp equality_op,
+          int num_items)
+{
+  if (num_items == 0)
+    return 0;
+
+  // State of the run being scanned, seeded from the first item
+  T prev_item        = h_in[0];
+  LengthT run_length = 1;
+  int runs_recorded  = 0;
+  int run_start      = 0;
+
+  // Walk the remaining items; a run is closed whenever an item differs from its predecessor
+  for (int idx = 1; idx < num_items; ++idx)
+  {
+    if (equality_op(prev_item, h_in[idx]))
+    {
+      ++run_length;
+    }
+    else
+    {
+      if ((RLE_METHOD != NON_TRIVIAL) || (run_length > 1))
+      {
+        h_unique_reference[runs_recorded]  = prev_item;
+        h_offsets_reference[runs_recorded] = run_start;
+        h_lengths_reference[runs_recorded] = run_length;
+        ++runs_recorded;
+      }
+      run_start  = idx;
+      run_length = 1;
+    }
+    prev_item = h_in[idx];
+  }
+
+  if ((RLE_METHOD != NON_TRIVIAL) || (run_length > 1)) // close the run that reaches the end of the input
+  {
+    h_unique_reference[runs_recorded]  = prev_item;
+    h_offsets_reference[runs_recorded] = run_start;
+    h_lengths_reference[runs_recorded] = run_length;
+    ++runs_recorded;
+  }
+
+  return runs_recorded;
+}
+
+/**
+ * Test DeviceRunLengthEncode for a given problem input: runs the selected entrypoint on d_in, compares the device outputs with the host references, optionally times it, and asserts that every comparison passed
+ */
+template <RleMethod RLE_METHOD,
+          Backend BACKEND,
+          typename DeviceInputIteratorT,
+          typename T,
+          typename OffsetT,
+          typename LengthT,
+          typename EqualityOp>
+void Test(DeviceInputIteratorT d_in,
+          T *h_unique_reference,
+          OffsetT *h_offsets_reference,
+          LengthT *h_lengths_reference,
+          EqualityOp equality_op,
+          int num_runs,
+          int num_items)
+{
+  // Allocate device output arrays and number of segments (element types match the host references they are compared against)
+  T *d_unique_out        = NULL;
+  OffsetT *d_offsets_out = NULL;
+  LengthT *d_lengths_out = NULL;
+  int *d_num_runs        = NULL;
+
+  if (RLE_METHOD == RLE)
+    CubDebugExit(g_allocator.DeviceAllocate((void **)&d_unique_out, sizeof(T) * num_items));
+  if (RLE_METHOD == NON_TRIVIAL)
+    CubDebugExit(g_allocator.DeviceAllocate((void **)&d_offsets_out, sizeof(OffsetT) * num_items));
+  CubDebugExit(g_allocator.DeviceAllocate((void **)&d_lengths_out, sizeof(LengthT) * num_items));
+  CubDebugExit(g_allocator.DeviceAllocate((void **)&d_num_runs, sizeof(int)));
+
+  // Allocate CDP device arrays
+  size_t *d_temp_storage_bytes = NULL;
+  cudaError_t *d_cdp_error     = NULL;
+  CubDebugExit(g_allocator.DeviceAllocate((void **)&d_temp_storage_bytes, sizeof(size_t) * 1));
+  CubDebugExit(g_allocator.DeviceAllocate((void **)&d_cdp_error, sizeof(cudaError_t) * 1));
+
+  // Allocate temporary storage: the first Dispatch is issued with d_temp_storage == NULL and its temp_storage_bytes result sizes the allocation
+  void *d_temp_storage      = NULL;
+  size_t temp_storage_bytes = 0;
+  CubDebugExit(Dispatch(Int2Type<RLE_METHOD>(),
+                        Int2Type<BACKEND>(),
+                        1,
+                        d_temp_storage_bytes,
+                        d_cdp_error,
+                        d_temp_storage,
+                        temp_storage_bytes,
+                        d_in,
+                        d_unique_out,
+                        d_offsets_out,
+                        d_lengths_out,
+                        d_num_runs,
+                        equality_op,
+                        num_items));
+  CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
+
+  // Clear device output arrays
+  if (RLE_METHOD == RLE)
+    CubDebugExit(cudaMemset(d_unique_out, 0, sizeof(T) * num_items));
+  if (RLE_METHOD == NON_TRIVIAL)
+    CubDebugExit(cudaMemset(d_offsets_out, 0, sizeof(OffsetT) * num_items));
+  CubDebugExit(cudaMemset(d_lengths_out, 0, sizeof(LengthT) * num_items));
+  CubDebugExit(cudaMemset(d_num_runs, 0, sizeof(int)));
+
+  // Run warmup/correctness iteration
+  CubDebugExit(Dispatch(Int2Type<RLE_METHOD>(),
+                        Int2Type<BACKEND>(),
+                        1,
+                        d_temp_storage_bytes,
+                        d_cdp_error,
+                        d_temp_storage,
+                        temp_storage_bytes,
+                        d_in,
+                        d_unique_out,
+                        d_offsets_out,
+                        d_lengths_out,
+                        d_num_runs,
+                        equality_op,
+                        num_items));
+
+  // Check for correctness (and display results, if specified); a non-zero compare value is reported as FAIL
+  int compare0 = 0;
+  int compare1 = 0;
+  int compare2 = 0;
+  int compare3 = 0;
+
+  if (RLE_METHOD == RLE)
+  {
+    compare0 = CompareDeviceResults(h_unique_reference, d_unique_out, num_runs, true, g_verbose);
+    printf("\t Keys %s\n", compare0 ? "FAIL" : "PASS");
+  }
+
+  if (RLE_METHOD != RLE)
+  {
+    compare1 = CompareDeviceResults(h_offsets_reference, d_offsets_out, num_runs, true, g_verbose);
+    printf("\t Offsets %s\n", compare1 ? "FAIL" : "PASS");
+  }
+
+  compare2 = CompareDeviceResults(h_lengths_reference, d_lengths_out, num_runs, true, g_verbose);
+  printf("\t Lengths %s\n", compare2 ? "FAIL" : "PASS");
+
+  compare3 = CompareDeviceResults(&num_runs, d_num_runs, 1, true, g_verbose);
+  printf("\t Count %s\n", compare3 ? "FAIL" : "PASS");
+
+  // Flush any stdout/stderr
+  fflush(stdout);
+  fflush(stderr);
+
+  // Performance
+  GpuTimer gpu_timer;
+  gpu_timer.Start();
+  CubDebugExit(Dispatch(Int2Type<RLE_METHOD>(),
+                        Int2Type<BACKEND>(),
+                        g_timing_iterations,
+                        d_temp_storage_bytes,
+                        d_cdp_error,
+                        d_temp_storage,
+                        temp_storage_bytes,
+                        d_in,
+                        d_unique_out,
+                        d_offsets_out,
+                        d_lengths_out,
+                        d_num_runs,
+                        equality_op,
+                        num_items));
+  gpu_timer.Stop();
+  float elapsed_millis = gpu_timer.ElapsedMillis();
+
+  // Display performance (byte count kept in size_t so it is not narrowed to int for large inputs)
+  if (g_timing_iterations > 0)
+  {
+    float avg_millis = elapsed_millis / g_timing_iterations;
+    float giga_rate  = float(num_items) / avg_millis / 1000.0f / 1000.0f;
+    size_t bytes_moved = (num_items * sizeof(T)) + (num_runs * (sizeof(OffsetT) + sizeof(LengthT)));
+    float giga_bandwidth = float(bytes_moved) / avg_millis / 1000.0f / 1000.0f;
+    printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s",
+           avg_millis,
+           giga_rate,
+           giga_bandwidth);
+  }
+  printf("\n\n");
+
+  // Flush any stdout/stderr
+  fflush(stdout);
+  fflush(stderr);
+
+  // Cleanup
+  if (d_unique_out)
+    CubDebugExit(g_allocator.DeviceFree(d_unique_out));
+  if (d_offsets_out)
+    CubDebugExit(g_allocator.DeviceFree(d_offsets_out));
+  if (d_lengths_out)
+    CubDebugExit(g_allocator.DeviceFree(d_lengths_out));
+  if (d_num_runs)
+    CubDebugExit(g_allocator.DeviceFree(d_num_runs));
+  if (d_temp_storage_bytes)
+    CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
+  if (d_cdp_error)
+    CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
+  if (d_temp_storage)
+    CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
+
+  // Correctness asserts (done after cleanup so the device buffers are released first)
+  AssertEquals(0, compare0 | compare1 | compare2 | compare3);
+}
+
+/**
+ * Test DeviceRunLengthEncode on pointer type: generates a host input with Initialize(), solves it on the host, copies the input to the device and runs Test()
+ */
+template <RleMethod RLE_METHOD, Backend BACKEND, typename T, typename OffsetT, typename LengthT>
+void TestPointer(int num_items, int entropy_reduction, int max_segment)
+{
+  // Allocate host arrays
+  T *h_in                      = new T[num_items];
+  T *h_unique_reference        = new T[num_items];
+  OffsetT *h_offsets_reference = new OffsetT[num_items];
+  LengthT *h_lengths_reference = new LengthT[num_items];
+
+  for (int i = 0; i < num_items; ++i)
+    InitValue(INTEGER_SEED, h_offsets_reference[i], 1);
+
+  // Initialize problem and solution
+  Equality equality_op;
+  Initialize(entropy_reduction, h_in, num_items, max_segment);
+
+  int num_runs = Solve<RLE_METHOD>(h_in,
+                                   h_unique_reference,
+                                   h_offsets_reference,
+                                   h_lengths_reference,
+                                   equality_op,
+                                   num_items);
+
+  printf("\nPointer %s cub::%s on %d items, %d segments (avg run length %.3f), {%s key, %s offset, "
+         "%s length}, max_segment %d, entropy_reduction %d\n",
+         (RLE_METHOD == RLE)           ? "DeviceRunLengthEncode::Encode"
+         : (RLE_METHOD == NON_TRIVIAL) ? "DeviceRunLengthEncode::NonTrivialRuns"
+                                       : "Other",
+         (BACKEND == CDP) ? "CDP CUB" : "CUB",
+         num_items,
+         num_runs,
+         (num_runs > 0) ? float(num_items) / num_runs : 0.0f, // no runs (e.g. NON_TRIVIAL with only unit runs): report 0 instead of dividing by zero
+         typeid(T).name(),
+         typeid(OffsetT).name(),
+         typeid(LengthT).name(),
+         max_segment,
+         entropy_reduction);
+  fflush(stdout);
+
+  // Allocate problem device arrays
+  T *d_in = NULL;
+  CubDebugExit(g_allocator.DeviceAllocate((void **)&d_in, sizeof(T) * num_items));
+
+  // Initialize device input
+  CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * num_items, cudaMemcpyHostToDevice));
+
+  // Run Test
+  Test<RLE_METHOD, BACKEND>(d_in,
+                            h_unique_reference,
+                            h_offsets_reference,
+                            h_lengths_reference,
+                            equality_op,
+                            num_runs,
+                            num_items);
+
+  // Cleanup
+  if (h_in)
+    delete[] h_in;
+  if (h_unique_reference)
+    delete[] h_unique_reference;
+  if (h_offsets_reference)
+    delete[] h_offsets_reference;
+  if (h_lengths_reference)
+    delete[] h_lengths_reference;
+  if (d_in)
+    CubDebugExit(g_allocator.DeviceFree(d_in));
+}
+
+/**
+ * Test on iterator type: the input is a ConstantInputIterator yielding one value, solved on the host and then passed to Test() as the device input
+ */
+template <RleMethod RLE_METHOD, Backend BACKEND, typename T, typename OffsetT, typename LengthT>
+void TestIterator(int num_items, Int2Type<true> /*is_primitive*/)
+{
+  // Allocate host arrays
+  T *h_unique_reference        = new T[num_items];
+  OffsetT *h_offsets_reference = new OffsetT[num_items];
+  LengthT *h_lengths_reference = new LengthT[num_items];
+
+  T one_val;
+  InitValue(INTEGER_SEED, one_val, 1);
+  ConstantInputIterator<T, int> h_in(one_val);
+
+  // Initialize problem and solution
+  Equality equality_op;
+  int num_runs = Solve<RLE_METHOD>(h_in,
+                                   h_unique_reference,
+                                   h_offsets_reference,
+                                   h_lengths_reference,
+                                   equality_op,
+                                   num_items);
+
+  printf("\nIterator %s cub::%s on %d items, %d segments (avg run length %.3f), {%s key, %s "
+         "offset, %s length}\n",
+         (RLE_METHOD == RLE)           ? "DeviceRunLengthEncode::Encode"
+         : (RLE_METHOD == NON_TRIVIAL) ? "DeviceRunLengthEncode::NonTrivialRuns"
+                                       : "Other",
+         (BACKEND == CDP) ? "CDP CUB" : "CUB",
+         num_items,
+         num_runs,
+         (num_runs > 0) ? float(num_items) / num_runs : 0.0f, // empty input (or a lone item under NON_TRIVIAL) has no runs: report 0 instead of dividing by zero
+         typeid(T).name(),
+         typeid(OffsetT).name(),
+         typeid(LengthT).name());
+  fflush(stdout);
+
+  // Run Test
+  Test<RLE_METHOD, BACKEND>(h_in,
+                            h_unique_reference,
+                            h_offsets_reference,
+                            h_lengths_reference,
+                            equality_op,
+                            num_runs,
+                            num_items);
+
+  // Cleanup
+  if (h_unique_reference)
+    delete[] h_unique_reference;
+  if (h_offsets_reference)
+    delete[] h_offsets_reference;
+  if (h_lengths_reference)
+    delete[] h_lengths_reference;
+}
+
+template <RleMethod RLE_METHOD, Backend BACKEND, typename T, typename OffsetT, typename LengthT>
+void TestIterator(int /*num_items*/, Int2Type<false> /*is_primitive*/)
+{} // Intentionally empty: chosen when Traits<T>::PRIMITIVE is false, so the iterator test is skipped for those types
+
+/**
+ * Test different gen modes: the constant-iterator input first, then pointer inputs generated with increasing max_segment and entropy_reduction
+ */
+template <RleMethod RLE_METHOD, Backend BACKEND, typename T, typename OffsetT, typename LengthT>
+void Test(int num_items)
+{
+  // Constant-iterator input (a single run); a no-op overload is picked when T is not primitive
+  TestIterator<RLE_METHOD, BACKEND, T, OffsetT, LengthT>(num_items,
+                                                         Int2Type<Traits<T>::PRIMITIVE>());
+
+  // Sweep max_segment over 1, 16, 256, ... (shifted left by 4 each step) up to min(num_items, 1 << 16)
+  const int segment_shift = 4;
+  const int segment_cap   = CUB_MIN(num_items, 1 << 16);
+  int entropy_reduction   = 0;
+  for (int max_segment = 1; max_segment <= segment_cap; max_segment <<= segment_shift)
+  {
+    TestPointer<RLE_METHOD, BACKEND, T, OffsetT, LengthT>(num_items, entropy_reduction, CUB_MAX(1, max_segment));
+    ++entropy_reduction;
+  }
+}
+
+/**
+ * Test different dispatch: runs both RleMethod variants (RLE, NON_TRIVIAL) with the backend chosen at compile time by TEST_CDP
+ */
+template <typename T, typename OffsetT, typename LengthT>
+void TestDispatch(int num_items)
+{
+#if TEST_CDP == 0 // CUB backend
+  Test<RLE, CUB, T, OffsetT, LengthT>(num_items);
+  Test<NON_TRIVIAL, CUB, T, OffsetT, LengthT>(num_items);
+#elif TEST_CDP == 1 // CDP backend (dispatch through CDPDispatchKernel)
+  Test<RLE, CDP, T, OffsetT, LengthT>(num_items);
+  Test<NON_TRIVIAL, CDP, T, OffsetT, LengthT>(num_items);
+#endif // any other TEST_CDP value leaves this function body empty
+}
+
+/**
+ * Test different input sizes: a negative num_items runs a built-in sweep of sizes, otherwise only num_items itself is tested
+ */
+template <typename T, typename OffsetT, typename LengthT>
+void TestSize(int num_items)
+{
+  // An explicit (non-negative) size was requested: test exactly that size
+  if (num_items >= 0)
+  {
+    TestDispatch<T, OffsetT, LengthT>(num_items);
+    return;
+  }
+  // No size requested: sweep the default sizes, including the empty and single-item inputs
+  const int default_sizes[] = {0, 1, 100, 10000, 1000000};
+  for (int size : default_sizes)
+  {
+    TestDispatch<T, OffsetT, LengthT>(size);
+  }
+}
+
+/**
+ * @brief Test with NaNs as input to ensure we don't create an additional (invalid) run on the first
+ * item when (item[0] == item[0]) is false. Input is three quiet NaNs; runs RLE with the CUB backend.
+ */
+template <typename T>
+void TestNaNs()
+{
+  using OffsetT = int32_t;
+  using LengthT = int32_t;
+
+  const auto quiet_nan = std::numeric_limits<T>::quiet_NaN();
+
+  // Allocate host data; the reference buffers are std::vectors so they are released when this function returns
+  std::vector<T> h_in{quiet_nan, quiet_nan, quiet_nan};
+  const OffsetT num_items = static_cast<OffsetT>(h_in.size());
+  std::vector<T> h_unique_reference(num_items);
+  std::vector<OffsetT> h_offsets_reference(num_items);
+  std::vector<LengthT> h_lengths_reference(num_items);
+
+  // Initialize problem and solution
+  Equality equality_op;
+
+  // Get host-side data for verification
+  int num_runs = Solve<RLE>(h_in.data(),
+                            h_unique_reference.data(),
+                            h_offsets_reference.data(),
+                            h_lengths_reference.data(),
+                            equality_op,
+                            num_items);
+
+  // Allocate problem device arrays
+  thrust::device_vector<T> d_in(num_items);
+  d_in = h_in;
+
+  // Run Test
+  Test<RLE, CUB>(thrust::raw_pointer_cast(d_in.data()),
+                 h_unique_reference.data(),
+                 h_offsets_reference.data(),
+                 h_lengths_reference.data(),
+                 equality_op,
+                 num_runs,
+                 num_items);
+}
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+
+/**
+ * Main: parses --v, --n and --i (and prints usage for --help), initializes the device, then runs the NaN tests followed by the size tests for each key type
+ */
+int main(int argc, char **argv)
+{
+  int num_items = -1; // negative: TestSize() runs its built-in sweep of sizes
+
+  // Initialize command line
+  CommandLineArgs args(argc, argv);
+  g_verbose = args.CheckCmdLineFlag("v");
+  args.GetCmdLineArgument("n", num_items);
+  args.GetCmdLineArgument("i", g_timing_iterations);
+
+  // Print usage (each optional argument is enclosed in its own pair of brackets)
+  if (args.CheckCmdLineFlag("help"))
+  {
+    printf("%s "
+           "[--n=<input items>] "
+           "[--i=<timing iterations>] "
+           "[--device=<device-id>] "
+           "[--v] "
+           "\n",
+           argv[0]);
+    exit(0);
+  }
+
+  // Initialize device
+  CubDebugExit(args.DeviceInit());
+  printf("\n");
+
+  // %PARAM% TEST_CDP cdp 0:1
+
+  // Run tests with leading/trailing NaNs, where (NaN == NaN) is false
+  TestNaNs<float>();
+  TestNaNs<double>();
+
+  // Test different input types
+  TestSize<signed char, int, int>(num_items);
+  TestSize<short, int, int>(num_items);
+  TestSize<int, int, int>(num_items);
+  TestSize<long, int, int>(num_items);
+  TestSize<long long, int, int>(num_items);
+  TestSize<float, int, int>(num_items);
+  TestSize<double, int, int>(num_items);
+
+  TestSize<uchar2, int, int>(num_items);
+  TestSize<uint2, int, int>(num_items);
+  TestSize<uint3, int, int>(num_items);
+  TestSize<uint4, int, int>(num_items);
+  TestSize<ulonglong4, int, int>(num_items);
+  TestSize<TestFoo, int, int>(num_items);
+  TestSize<TestBar, int, int>(num_items);
+
+  return 0;
+}
diff --git a/include/cub/test/test_device_scan.cu b/include/cub/test/test_device_scan.cu
new file mode 100644
index 0000000..93c6bcc
--- /dev/null
+++ b/include/cub/test/test_device_scan.cu
@@ -0,0 +1,1275 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Test of DeviceScan utilities
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <cub/device/device_scan.cuh>
+#include <cub/iterator/constant_input_iterator.cuh>
+#include <cub/iterator/discard_output_iterator.cuh>
+#include <cub/util_allocator.cuh>
+#include <cub/util_type.cuh>
+
+#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
+
+#include "test_util.h"
+
+#include <cstdio>
+#include <typeinfo>
+
+using namespace cub;
+
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+bool                    g_verbose           = false; // when true, Initialize() echoes the input; also forwarded to CompareDeviceResults
+int                     g_timing_iterations = 0; // Test() runs its timed pass only when this is > 0
+double                  g_device_giga_bandwidth; // divisor for the "% peak" figure in Test(); NOTE(review): assigned outside this section - confirm
+CachingDeviceAllocator  g_allocator(true); // allocator behind the DeviceAllocate/DeviceFree calls below
+
+// Dispatch types (wrapped in Int2Type<> and used as tags to pick a Dispatch overload)
+enum Backend
+{
+    CUB,        // CUB method
+    CDP,        // GPU-based (dynamic parallelism) dispatch to CUB method
+};
+
+
+/**
+ * \brief WrapperFunctor (for precluding test-specialized dispatch to *Sum variants): same operation as OpT, but a distinct type
+ */
+template<typename OpT>
+struct WrapperFunctor
+{
+    OpT op; // wrapped binary operator
+
+    WrapperFunctor(OpT op) : op(op) {}
+
+    template <typename T, typename U>
+    __host__ __device__ __forceinline__ auto operator()(const T &a, const U &b) const
+      -> decltype(op(a, b))
+    {
+        return static_cast<T>(op(a, b)); // result is cast to T (left operand type) before converting to the declared return type
+    }
+};
+
+
+//---------------------------------------------------------------------
+// Dispatch to different CUB DeviceScan entrypoints
+//---------------------------------------------------------------------
+
+/**
+ * Dispatch to exclusive scan entrypoint (in-place overload: d_out is ignored and the call is made on d_in alone)
+ */
+template <typename IsPrimitiveT,
+          typename InputIteratorT,
+          typename OutputIteratorT,
+          typename ScanOpT,
+          typename InitialValueT,
+          typename OffsetT>
+CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t
+Dispatch(Int2Type<true> /*in_place*/,
+         Int2Type<CUB> /*dispatch_to*/,
+         IsPrimitiveT /*is_primitive*/,
+         int timing_timing_iterations,
+         size_t * /* d_temp_storage_bytes */,
+         cudaError_t * /* d_cdp_error */,
+         void *d_temp_storage,
+         size_t &temp_storage_bytes,
+         InputIteratorT d_in,
+         OutputIteratorT /* d_out */,
+         ScanOpT scan_op,
+         InitialValueT initial_value,
+         OffsetT num_items)
+{
+  cudaError_t error = cudaSuccess; // stays cudaSuccess when timing_timing_iterations <= 0
+  for (int i = 0; i < timing_timing_iterations; ++i)
+  {
+    error = DeviceScan::ExclusiveScan(d_temp_storage,
+                                      temp_storage_bytes,
+                                      d_in,
+                                      scan_op,
+                                      initial_value,
+                                      num_items);
+  }
+  return error; // status of the last iteration only
+}
+
+template <typename IsPrimitiveT,
+          typename InputIteratorT,
+          typename OutputIteratorT,
+          typename ScanOpT,
+          typename InitialValueT,
+          typename OffsetT>
+CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t
+Dispatch(Int2Type<false> /*in_place*/, // out-of-place exclusive scan: results are written to d_out
+         Int2Type<CUB> /*dispatch_to*/,
+         IsPrimitiveT /*is_primitive*/,
+         int timing_timing_iterations,
+         size_t * /*d_temp_storage_bytes*/,
+         cudaError_t * /*d_cdp_error*/,
+         void *d_temp_storage,
+         size_t &temp_storage_bytes,
+         InputIteratorT d_in,
+         OutputIteratorT d_out,
+         ScanOpT scan_op,
+         InitialValueT initial_value,
+         OffsetT num_items)
+{
+  cudaError_t error = cudaSuccess; // stays cudaSuccess when timing_timing_iterations <= 0
+  for (int i = 0; i < timing_timing_iterations; ++i)
+  {
+    error = DeviceScan::ExclusiveScan(d_temp_storage,
+                                      temp_storage_bytes,
+                                      d_in,
+                                      d_out,
+                                      scan_op,
+                                      initial_value,
+                                      num_items);
+  }
+  return error; // status of the last iteration only
+}
+
+/**
+ * Dispatch to exclusive sum entrypoint (in-place overload; taken for the Int2Type<true> primitive tag with a Sum operator)
+ */
+template <typename InputIteratorT,
+          typename OutputIteratorT,
+          typename InitialValueT,
+          typename OffsetT>
+CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t
+Dispatch(Int2Type<true> /*in_place*/,
+         Int2Type<CUB> /*dispatch_to*/,
+         Int2Type<true> /*is_primitive*/,
+         int timing_timing_iterations,
+         size_t * /*d_temp_storage_bytes*/,
+         cudaError_t * /*d_cdp_error*/,
+         void *d_temp_storage,
+         size_t &temp_storage_bytes,
+         InputIteratorT d_in,
+         OutputIteratorT /* d_out */,
+         Sum /*scan_op*/,
+         InitialValueT /*initial_value*/, // not forwarded: ExclusiveSum is called without an initial value
+         OffsetT num_items)
+{
+  cudaError_t error = cudaSuccess; // stays cudaSuccess when timing_timing_iterations <= 0
+  for (int i = 0; i < timing_timing_iterations; ++i)
+  {
+    error = DeviceScan::ExclusiveSum(d_temp_storage,
+                                     temp_storage_bytes,
+                                     d_in,
+                                     num_items);
+  }
+  return error; // status of the last iteration only
+}
+
+template <typename InputIteratorT,
+          typename OutputIteratorT,
+          typename InitialValueT,
+          typename OffsetT>
+CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t
+Dispatch(Int2Type<false> /*in_place*/, // out-of-place exclusive sum: results are written to d_out
+         Int2Type<CUB> /*dispatch_to*/,
+         Int2Type<true> /*is_primitive*/,
+         int timing_timing_iterations,
+         size_t * /*d_temp_storage_bytes*/,
+         cudaError_t * /*d_cdp_error*/,
+         void *d_temp_storage,
+         size_t &temp_storage_bytes,
+         InputIteratorT d_in,
+         OutputIteratorT d_out,
+         Sum /*scan_op*/,
+         InitialValueT /*initial_value*/, // not forwarded: ExclusiveSum is called without an initial value
+         OffsetT num_items)
+{
+  cudaError_t error = cudaSuccess; // stays cudaSuccess when timing_timing_iterations <= 0
+  for (int i = 0; i < timing_timing_iterations; ++i)
+  {
+    error = DeviceScan::ExclusiveSum(d_temp_storage,
+                                     temp_storage_bytes,
+                                     d_in,
+                                     d_out,
+                                     num_items);
+  }
+  return error; // status of the last iteration only
+}
+
+/**
+ * Dispatch to inclusive scan entrypoint (in-place overload; selected by a NullType initial value)
+ */
+template <typename IsPrimitiveT,
+          typename InputIteratorT,
+          typename OutputIteratorT,
+          typename ScanOpT,
+          typename OffsetT>
+CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t
+Dispatch(Int2Type<true> /*in_place*/,
+         Int2Type<CUB> /*dispatch_to*/,
+         IsPrimitiveT /*is_primitive*/,
+         int timing_timing_iterations,
+         size_t * /*d_temp_storage_bytes*/,
+         cudaError_t * /*d_cdp_error*/,
+         void *d_temp_storage,
+         size_t &temp_storage_bytes,
+         InputIteratorT d_in,
+         OutputIteratorT /* d_out */,
+         ScanOpT scan_op,
+         NullType /* initial_value */,
+         OffsetT num_items)
+{
+  cudaError_t error = cudaSuccess; // stays cudaSuccess when timing_timing_iterations <= 0
+  for (int i = 0; i < timing_timing_iterations; ++i)
+  {
+    error = DeviceScan::InclusiveScan(d_temp_storage,
+                                      temp_storage_bytes,
+                                      d_in,
+                                      scan_op,
+                                      num_items);
+  }
+  return error; // status of the last iteration only
+}
+
+template <typename IsPrimitiveT,
+          typename InputIteratorT,
+          typename OutputIteratorT,
+          typename ScanOpT,
+          typename OffsetT>
+CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t
+Dispatch(Int2Type<false> /*in_place*/, // out-of-place inclusive scan: results are written to d_out
+         Int2Type<CUB> /*dispatch_to*/,
+         IsPrimitiveT /*is_primitive*/,
+         int timing_timing_iterations,
+         size_t * /*d_temp_storage_bytes*/,
+         cudaError_t * /*d_cdp_error*/,
+         void *d_temp_storage,
+         size_t &temp_storage_bytes,
+         InputIteratorT d_in,
+         OutputIteratorT d_out,
+         ScanOpT scan_op,
+         NullType /*initial_value*/,
+         OffsetT num_items)
+{
+  cudaError_t error = cudaSuccess; // stays cudaSuccess when timing_timing_iterations <= 0
+  for (int i = 0; i < timing_timing_iterations; ++i)
+  {
+    error = DeviceScan::InclusiveScan(d_temp_storage,
+                                      temp_storage_bytes,
+                                      d_in,
+                                      d_out,
+                                      scan_op,
+                                      num_items);
+  }
+  return error; // status of the last iteration only
+}
+
+/**
+ * Dispatch to inclusive sum entrypoint (in-place overload; primitive tag, Sum operator, NullType initial value)
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename OffsetT>
+CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t
+Dispatch(Int2Type<true> /*in_place*/,
+         Int2Type<CUB> /*dispatch_to*/,
+         Int2Type<true> /*is_primitive*/,
+         int timing_timing_iterations,
+         size_t * /*d_temp_storage_bytes*/,
+         cudaError_t * /*d_cdp_error*/,
+         void *d_temp_storage,
+         size_t &temp_storage_bytes,
+         InputIteratorT d_in,
+         OutputIteratorT /* d_out */,
+         Sum /*scan_op*/,
+         NullType /*initial_value*/,
+         OffsetT num_items)
+{
+  cudaError_t error = cudaSuccess; // stays cudaSuccess when timing_timing_iterations <= 0
+  for (int i = 0; i < timing_timing_iterations; ++i)
+  {
+    error = DeviceScan::InclusiveSum(d_temp_storage,
+                                     temp_storage_bytes,
+                                     d_in,
+                                     num_items);
+  }
+  return error; // status of the last iteration only
+}
+
+template <typename InputIteratorT, typename OutputIteratorT, typename OffsetT>
+CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t
+Dispatch(Int2Type<false> /*in_place*/, // out-of-place inclusive sum: results are written to d_out
+         Int2Type<CUB> /*dispatch_to*/,
+         Int2Type<true> /*is_primitive*/,
+         int timing_timing_iterations,
+         size_t * /*d_temp_storage_bytes*/,
+         cudaError_t * /*d_cdp_error*/,
+         void *d_temp_storage,
+         size_t &temp_storage_bytes,
+         InputIteratorT d_in,
+         OutputIteratorT d_out,
+         Sum /*scan_op*/,
+         NullType /*initial_value*/,
+         OffsetT num_items)
+{
+  cudaError_t error = cudaSuccess; // stays cudaSuccess when timing_timing_iterations <= 0
+  for (int i = 0; i < timing_timing_iterations; ++i)
+  {
+    error = DeviceScan::InclusiveSum(d_temp_storage,
+                                     temp_storage_bytes,
+                                     d_in,
+                                     d_out,
+                                     num_items);
+  }
+  return error; // status of the last iteration only
+}
+
+//---------------------------------------------------------------------
+// CUDA Nested Parallelism Test Kernel
+//---------------------------------------------------------------------
+
+#if TEST_CDP == 1
+
+/**
+ * Simple wrapper kernel to invoke DeviceScan: forwards every argument to Dispatch and publishes status and temp-storage size
+ */
+template <typename InPlaceT,
+          typename CubBackendT,
+          typename IsPrimitiveT,
+          typename InputIteratorT,
+          typename OutputIteratorT,
+          typename ScanOpT,
+          typename InitialValueT,
+          typename OffsetT>
+__global__ void CDPDispatchKernel(InPlaceT     in_place,
+                                  CubBackendT  cub_backend,
+                                  IsPrimitiveT is_primitive,
+                                  int          timing_timing_iterations,
+                                  size_t      *d_temp_storage_bytes,
+                                  cudaError_t *d_cdp_error,
+
+                                  void           *d_temp_storage,
+                                  size_t          temp_storage_bytes, // by value here; Dispatch takes it by reference and may update this local copy
+                                  InputIteratorT  d_in,
+                                  OutputIteratorT d_out,
+                                  ScanOpT         scan_op,
+                                  InitialValueT   initial_value,
+                                  OffsetT         num_items)
+{
+  *d_cdp_error = Dispatch(in_place, // status is stored for the host to copy back
+                          cub_backend,
+                          is_primitive,
+                          timing_timing_iterations,
+                          d_temp_storage_bytes,
+                          d_cdp_error,
+                          d_temp_storage,
+                          temp_storage_bytes,
+                          d_in,
+                          d_out,
+                          scan_op,
+                          initial_value,
+                          num_items);
+
+  *d_temp_storage_bytes = temp_storage_bytes; // local copy after Dispatch, stored for the host to copy back
+}
+
+/**
+ * Dispatch to CDP kernel: launch CDPDispatchKernel, then copy temp_storage_bytes and the device-side status back to the host
+ */
+template <typename InPlaceT,
+          typename IsPrimitiveT,
+          typename InputIteratorT,
+          typename OutputIteratorT,
+          typename ScanOpT,
+          typename InitialValueT,
+          typename OffsetT>
+cudaError_t Dispatch(InPlaceT      in_place,
+                     Int2Type<CDP> dispatch_to,
+                     IsPrimitiveT  is_primitive,
+                     int           timing_timing_iterations,
+                     size_t       *d_temp_storage_bytes,
+                     cudaError_t  *d_cdp_error,
+
+                     void           *d_temp_storage,
+                     size_t         &temp_storage_bytes,
+                     InputIteratorT  d_in,
+                     OutputIteratorT d_out,
+                     ScanOpT         scan_op,
+                     InitialValueT   initial_value,
+                     OffsetT         num_items)
+{
+  // Invoke kernel to invoke device-side dispatch to CUB backend:
+  (void)dispatch_to; // tag only selects this overload
+  using CubBackendT = Int2Type<CUB>;
+  CubBackendT cub_backend; // the kernel itself calls the CUB-tagged Dispatch overloads
+  cudaError_t   retval =
+    thrust::cuda_cub::launcher::triple_chevron(1, 1, 0, 0) // presumably grid=1, block=1, no shared memory, default stream - TODO confirm
+      .doit(CDPDispatchKernel<InPlaceT,
+                              CubBackendT,
+                              IsPrimitiveT,
+                              InputIteratorT,
+                              OutputIteratorT,
+                              ScanOpT,
+                              InitialValueT,
+                              OffsetT>,
+            in_place,
+            cub_backend,
+            is_primitive,
+            timing_timing_iterations,
+            d_temp_storage_bytes,
+            d_cdp_error,
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_out,
+            scan_op,
+            initial_value,
+            num_items);
+  CubDebugExit(retval);
+
+  // Copy out temp_storage_bytes
+  CubDebugExit(cudaMemcpy(&temp_storage_bytes,
+                          d_temp_storage_bytes,
+                          sizeof(size_t) * 1,
+                          cudaMemcpyDeviceToHost));
+
+  // Copy out error (overwrites the launch status held in retval with the device-side Dispatch status)
+  CubDebugExit(cudaMemcpy(&retval,
+                          d_cdp_error,
+                          sizeof(cudaError_t) * 1,
+                          cudaMemcpyDeviceToHost));
+  return retval;
+}
+
+#endif // TEST_CDP
+
+//---------------------------------------------------------------------
+// Test generation
+//---------------------------------------------------------------------
+
+
+/**
+ * Populate the host input array (and echo it when running verbosely)
+ */
+template <typename T>
+void Initialize(
+    GenMode      gen_mode,
+    T            *h_in,
+    int          num_items)
+{
+    // Generate one value per slot; the slot index is handed to InitValue too
+    for (int idx = 0; idx < num_items; idx++)
+        InitValue(gen_mode, h_in[idx], idx);
+
+    // Nothing more to do unless verbose output was requested
+    if (!g_verbose)
+        return;
+
+    printf("Input:\n");
+    DisplayResults(h_in, num_items);
+    printf("\n\n");
+}
+
+/**
+ * Solve exclusive-scan problem on the host: h_reference[i] = initial_value combined with h_in[0..i-1]
+ */
+template <
+    typename        InputIteratorT,
+    typename        OutputT,
+    typename        ScanOpT,
+    typename        InitialValueT>
+void Solve(
+    InputIteratorT  h_in,
+    OutputT         *h_reference,
+    int             num_items,
+    ScanOpT         scan_op,
+    InitialValueT   initial_value)
+{
+    using AccumT = // accumulator type derived from the op, the initial value type and the input value type
+      cub::detail::accumulator_t<
+        ScanOpT, 
+        InitialValueT, 
+        cub::detail::value_t<InputIteratorT>>;
+
+    if (num_items > 0) // empty problems leave h_reference untouched
+    {
+        AccumT val         = static_cast<AccumT>(h_in[0]);
+        h_reference[0]     = initial_value; // first exclusive output is the initial value itself
+        AccumT inclusive   = static_cast<AccumT>(scan_op(initial_value, val));
+
+        for (int i = 1; i < num_items; ++i)
+        {
+            val = static_cast<AccumT>(h_in[i]);
+            h_reference[i] = static_cast<OutputT>(inclusive); // running total before item i is folded in
+            inclusive = static_cast<AccumT>(scan_op(inclusive, val));
+        }
+    }
+}
+
+
+/**
+ * Solve inclusive-scan problem on the host (overload selected by a NullType initial value): h_reference[i] = h_in[0..i] combined
+ */
+template <
+    typename        InputIteratorT,
+    typename        OutputT,
+    typename        ScanOpT>
+void Solve(
+    InputIteratorT  h_in,
+    OutputT         *h_reference,
+    int             num_items,
+    ScanOpT         scan_op,
+    NullType)
+{
+    using AccumT = // with no initial value, the input value type is used in its place
+      cub::detail::accumulator_t<
+        ScanOpT, 
+        cub::detail::value_t<InputIteratorT>, 
+        cub::detail::value_t<InputIteratorT>>;
+
+    if (num_items > 0) // empty problems leave h_reference untouched
+    {
+        AccumT inclusive    = h_in[0];
+        h_reference[0]      = static_cast<OutputT>(inclusive); // first output is the first input
+
+        for (int i = 1; i < num_items; ++i)
+        {
+            AccumT val = h_in[i];
+            inclusive = static_cast<AccumT>(scan_op(inclusive, val));
+            h_reference[i] = static_cast<OutputT>(inclusive); // running total including item i
+        }
+    }
+}
+
+template<typename OutputT, typename DeviceInputIteratorT, bool InPlace> // primary template: always allocates a separate output buffer
+struct AllocateOutput {
+    static void run(OutputT *&d_out, DeviceInputIteratorT, int num_items) { // the input iterator argument is unused here
+        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(OutputT) * num_items));
+    }
+};
+
+template<typename OutputT> // specialization for in-place runs whose input is already an OutputT pointer
+struct AllocateOutput<OutputT, OutputT *, true> {
+    static void run(OutputT *&d_out, OutputT *d_in, int /* num_items */) {
+        d_out = d_in; // no allocation: the output aliases the input buffer
+    }
+};
+
+/**
+ * Test DeviceScan for a given problem input: size temp storage, run once, compare with h_reference, optionally time, clean up
+ */
+template <
+    Backend             BACKEND,
+    typename            DeviceInputIteratorT,
+    typename            OutputT,
+    typename            ScanOpT,
+    typename            InitialValueT,
+    bool                InPlace=false>
+void Test(
+    DeviceInputIteratorT    d_in,
+    OutputT                 *h_reference,
+    int                     num_items,
+    ScanOpT                 scan_op,
+    InitialValueT           initial_value)
+{
+    using InputT = cub::detail::value_t<DeviceInputIteratorT>;
+
+    // Allocate device output array (the in-place specialization aliases d_out to d_in instead)
+    OutputT *d_out = NULL;
+    AllocateOutput<OutputT, DeviceInputIteratorT, InPlace>::run(d_out, d_in, num_items);
+
+    // Allocate CDP device arrays (allocated for every backend; the CUB Dispatch overloads leave them unused)
+    size_t          *d_temp_storage_bytes = NULL;
+    cudaError_t     *d_cdp_error = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes,  sizeof(size_t) * 1));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error,   sizeof(cudaError_t) * 1));
+
+    // Allocate temporary storage: the first Dispatch runs with d_temp_storage == NULL; the size it leaves in temp_storage_bytes is then allocated
+    void            *d_temp_storage = NULL;
+    size_t          temp_storage_bytes = 0;
+    CubDebugExit(Dispatch(
+        Int2Type<InPlace>(),
+        Int2Type<BACKEND>(),
+        Int2Type<Traits<OutputT>::PRIMITIVE>(),
+        1,
+        d_temp_storage_bytes,
+        d_cdp_error,
+        d_temp_storage,
+        temp_storage_bytes,
+        d_in,
+        d_out,
+        scan_op,
+        initial_value,
+        num_items));
+    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
+
+    if (!InPlace) // in-place runs alias d_out to d_in, so clearing would wipe the input
+    {
+      // Clear device output array
+      CubDebugExit(cudaMemset(d_out, 0, sizeof(OutputT) * num_items));
+    }
+
+    // Run warmup/correctness iteration
+    CubDebugExit(Dispatch(
+        Int2Type<InPlace>(),
+        Int2Type<BACKEND>(),
+        Int2Type<Traits<OutputT>::PRIMITIVE>(),
+        1,
+        d_temp_storage_bytes,
+        d_cdp_error,
+        d_temp_storage,
+        temp_storage_bytes,
+        d_in,
+        d_out,
+        scan_op,
+        initial_value,
+        num_items));
+
+    // Check for correctness (and display results, if specified); a nonzero compare is reported as FAIL
+    int compare = CompareDeviceResults(h_reference, d_out, num_items, true, g_verbose);
+    printf("\t%s", compare ? "FAIL" : "PASS");
+
+    // Flush any stdout/stderr
+    fflush(stdout);
+    fflush(stderr);
+
+    // Performance (timed pass runs g_timing_iterations scans inside a single Dispatch call)
+    if (g_timing_iterations > 0)
+    {
+      GpuTimer gpu_timer;
+      gpu_timer.Start();
+      CubDebugExit(Dispatch(Int2Type<InPlace>(),
+                            Int2Type<BACKEND>(),
+                            Int2Type<Traits<OutputT>::PRIMITIVE>(),
+                            g_timing_iterations,
+                            d_temp_storage_bytes,
+                            d_cdp_error,
+                            d_temp_storage,
+                            temp_storage_bytes,
+                            d_in,
+                            d_out,
+                            scan_op,
+                            initial_value,
+                            num_items));
+      gpu_timer.Stop();
+      float elapsed_millis = gpu_timer.ElapsedMillis();
+
+      // Display performance
+      float avg_millis     = elapsed_millis / g_timing_iterations;
+      float giga_rate      = float(num_items) / avg_millis / 1000.0f / 1000.0f; // items per ms, scaled down by 1e6
+      float giga_bandwidth = giga_rate * (sizeof(InputT) + sizeof(OutputT)); // one input read plus one output write per item
+      printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s, %.1f%% "
+             "peak",
+             avg_millis,
+             giga_rate,
+             giga_bandwidth,
+             giga_bandwidth / g_device_giga_bandwidth * 100.0);
+    }
+
+    printf("\n\n");
+
+    // Cleanup (an in-place d_out is the caller's d_in and is not freed here)
+    if (!InPlace)
+    {
+      if (d_out)
+      {
+        CubDebugExit(g_allocator.DeviceFree(d_out));
+      }
+    }
+
+    if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
+    if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
+    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
+
+    // Correctness asserts (checked last, after the device buffers have been released)
+    AssertEquals(0, compare);
+}
+
+template <typename InitialValueT> // kernel that stores initial_value at *ptr
+__global__ void FillInitValue(InitialValueT *ptr, InitialValueT initial_value) {
+    *ptr = initial_value; // every launched thread performs this same store
+}
+
+template <
+    Backend             BACKEND,
+    typename            DeviceInputIteratorT,
+    typename            OutputT,
+    typename            ScanOpT,
+    typename            InitialValueT>
+typename std::enable_if<!std::is_same<InitialValueT, cub::NullType>::value>::type // enabled only when a real initial value exists
+TestFutureInitValue(
+    DeviceInputIteratorT    d_in,
+    OutputT                 *h_reference,
+    int                     num_items,
+    ScanOpT                 scan_op,
+    InitialValueT           initial_value)
+{
+    // Stage initial_value in device memory with a one-thread kernel, then check the launch
+    InitialValueT *d_initial_value = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_initial_value, sizeof(InitialValueT)));
+    FillInitValue<<<1, 1>>>(d_initial_value, initial_value);
+    CubDebugExit(cudaPeekAtLastError()); // report a failed launch here rather than as a result mismatch later
+    // Run the regular test, passing the initial value as a FutureValue over the device pointer
+    auto future_init_value = cub::FutureValue<InitialValueT>(d_initial_value);
+    Test<BACKEND>(d_in, h_reference, num_items, scan_op, future_init_value);
+
+    // Cleanup
+    if (d_initial_value) CubDebugExit(g_allocator.DeviceFree(d_initial_value));
+}
+
+template <
+    Backend             BACKEND,
+    typename            DeviceInputIteratorT,
+    typename            OutputT,
+    typename            ScanOpT,
+    typename            InitialValueT>
+typename std::enable_if<std::is_same<InitialValueT, cub::NullType>::value>::type // enabled only for a NullType initial value
+TestFutureInitValue(
+    DeviceInputIteratorT,
+    OutputT *,
+    int,
+    ScanOpT,
+    InitialValueT)
+{
+    // cub::NullType does not have device pointer, so nothing to do here (intentional no-op overload)
+}
+
+template <
+  Backend             BACKEND,
+  typename            DeviceInputIteratorT,
+  typename            OutputT,
+  typename            ScanOpT,
+  typename            InitialValueT>
+typename std::enable_if<!std::is_same<InitialValueT, cub::NullType>::value>::type // enabled only when a real initial value exists
+TestFutureInitValueIter(
+    DeviceInputIteratorT    d_in,
+    OutputT                 *h_reference,
+    int                     num_items,
+    ScanOpT                 scan_op,
+    InitialValueT           initial_value)
+{
+    using IterT = cub::ConstantInputIterator<InitialValueT>;
+    IterT iter(initial_value); // iterator constructed from initial_value; no device allocation is made in this variant
+    auto future_init_value = cub::FutureValue<InitialValueT, IterT>(iter);
+    Test<BACKEND>(d_in, h_reference, num_items, scan_op, future_init_value); // same test, initial value passed as a FutureValue
+}
+
+template <
+    Backend             BACKEND,
+    typename            DeviceInputIteratorT,
+    typename            OutputT,
+    typename            ScanOpT,
+    typename            InitialValueT>
+typename std::enable_if<std::is_same<InitialValueT, cub::NullType>::value>::type // enabled only for a NullType initial value
+TestFutureInitValueIter(
+    DeviceInputIteratorT,
+    OutputT *,
+    int,
+    ScanOpT,
+    InitialValueT)
+{
+    // A cub::NullType initial value means there is nothing to wrap in a FutureValue, so nothing to do here
+}
+
+template <Backend BACKEND, // in-place variant: applies when the device input is already an OutputT pointer
+          typename OutputT,
+          typename ScanOpT,
+          typename InitialValueT>
+void TestInplace(OutputT *d_in,
+                 OutputT *h_reference,
+                 int num_items,
+                 ScanOpT scan_op,
+                 InitialValueT initial_value)
+{
+  Test<BACKEND, OutputT *, OutputT, ScanOpT, InitialValueT, true>(d_in, // InPlace=true: output aliases d_in
+                                                                  h_reference,
+                                                                  num_items,
+                                                                  scan_op,
+                                                                  initial_value);
+}
+
+template <Backend BACKEND, // catch-all overload: accepts any input type and intentionally does nothing
+          typename DeviceInputIteratorT,
+          typename OutputT,
+          typename ScanOpT,
+          typename InitialValueT>
+void TestInplace(DeviceInputIteratorT, OutputT *, int, ScanOpT, InitialValueT)
+{}
+
+/**
+ * Test DeviceScan on pointer type: generate host input, solve on the host, copy to the device, then run the test variants
+ */
+template <
+    Backend         BACKEND,
+    typename        InputT,
+    typename        OutputT,
+    typename        ScanOpT,
+    typename        InitialValueT>
+void TestPointer(
+    int             num_items,
+    GenMode         gen_mode,
+    ScanOpT         scan_op,
+    InitialValueT   initial_value)
+{
+    printf("\nPointer %s %s cub::DeviceScan::%s %d items, %s->%s (%d->%d bytes) , gen-mode %s\n",
+        (BACKEND == CDP) ? "CDP CUB" : "CUB",
+        (std::is_same<InitialValueT, NullType>::value) ? "Inclusive" : "Exclusive",
+        (std::is_same<ScanOpT, Sum>::value) ? "Sum" : "Scan",
+        num_items,
+        typeid(InputT).name(), typeid(OutputT).name(), (int) sizeof(InputT), (int) sizeof(OutputT),
+        (gen_mode == RANDOM) ? "RANDOM" : (gen_mode == INTEGER_SEED) ? "SEQUENTIAL" : "HOMOGENOUS");
+    fflush(stdout);
+
+    // Allocate host arrays
+    InputT*     h_in        = new InputT[num_items];
+    OutputT*    h_reference = new OutputT[num_items];
+
+    // Initialize problem and solution
+    Initialize(gen_mode, h_in, num_items);
+
+    // If the output type is primitive and the operator is cub::Sum, the test
+    // dispatcher throws away scan_op and initial_value for exclusive scan.
+    // Without an initial_value arg, the accumulator switches to the input value
+    // type.
+    // Do the same thing here:
+    if (Traits<OutputT>::PRIMITIVE &&
+        std::is_same<ScanOpT, cub::Sum>::value &&
+        !std::is_same<InitialValueT, NullType>::value)
+    {
+      Solve(h_in, h_reference, num_items, cub::Sum{}, InputT{});
+    }
+    else
+    {
+      Solve(h_in, h_reference, num_items, scan_op, initial_value);
+    }
+
+    // Allocate problem device arrays
+    InputT *d_in = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(InputT) * num_items));
+
+    // Initialize device input
+    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(InputT) * num_items, cudaMemcpyHostToDevice));
+
+    // Run Test (TestInplace goes last: its in-place overload writes the scan output over d_in)
+    Test<BACKEND>(d_in, h_reference, num_items, scan_op, initial_value);
+    TestFutureInitValue<BACKEND>(d_in, h_reference, num_items, scan_op, initial_value);
+    TestInplace<BACKEND>(d_in, h_reference, num_items, scan_op, initial_value);
+
+    // Cleanup (delete[] on a null pointer is a no-op, so the host arrays need no guard)
+    delete[] h_in;
+    delete[] h_reference;
+    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
+}
+
+
+/**
+ * Test DeviceScan on iterator type: the input is a ConstantInputIterator over a value-initialized InputT
+ */
+template <
+    Backend         BACKEND,
+    typename        InputT,
+    typename        OutputT,
+    typename        ScanOpT,
+    typename        InitialValueT>
+void TestIterator(
+    int             num_items,
+    ScanOpT         scan_op,
+    InitialValueT   initial_value)
+{
+    printf("\nIterator %s %s cub::DeviceScan::%s %d items, %s->%s (%d->%d bytes)\n",
+        (BACKEND == CDP) ? "CDP CUB" : "CUB",
+        (std::is_same<InitialValueT, NullType>::value) ? "Inclusive" : "Exclusive",
+        (std::is_same<ScanOpT, Sum>::value) ? "Sum" : "Scan",
+        num_items,
+        typeid(InputT).name(), typeid(OutputT).name(), (int) sizeof(InputT), (int) sizeof(OutputT));
+    fflush(stdout);
+
+    // Use a constant iterator as the input (the same object feeds the host Solve and the device Test)
+    InputT val = InputT();
+    ConstantInputIterator<InputT, int> h_in(val);
+
+    // Allocate host arrays
+    OutputT*  h_reference = new OutputT[num_items];
+
+    // Initialize problem and solution
+    Solve(h_in, h_reference, num_items, scan_op, initial_value);
+
+    // Run Test
+    Test<BACKEND>(h_in, h_reference, num_items, scan_op, initial_value);
+    TestFutureInitValueIter<BACKEND>(h_in, h_reference, num_items, scan_op, initial_value);
+
+    // Cleanup (delete[] on a null pointer is a no-op, so no guard is needed)
+    delete[] h_reference;
+}
+
+
+/**
+ * Test different gen modes: pointer input with UNIFORM and RANDOM data, then a constant-iterator input
+ */
+template <
+    Backend         BACKEND,
+    typename        InputT,
+    typename        OutputT,
+    typename        ScanOpT,
+    typename        InitialValueT>
+void Test(
+    int             num_items,
+    ScanOpT         scan_op,
+    InitialValueT   initial_value)
+{
+    TestPointer<BACKEND, InputT, OutputT>(  num_items, UNIFORM, scan_op, initial_value);
+    TestPointer<BACKEND, InputT, OutputT>(  num_items, RANDOM,  scan_op, initial_value);
+    TestIterator<BACKEND, InputT, OutputT>( num_items, scan_op, initial_value);
+}
+
+
+/**
+ * Pick the dispatch backend at compile time from the TEST_CDP macro:
+ * CDP when it is 1, host-side CUB when it is 0. Any other value compiles
+ * to an empty function.
+ */
+template <typename InputT,
+          typename OutputT,
+          typename ScanOpT,
+          typename InitialValueT>
+void Test(int num_items, ScanOpT scan_op, InitialValueT initial_value)
+{
+#if TEST_CDP == 1
+    Test<CDP, InputT, OutputT>(num_items, scan_op, initial_value);
+#elif TEST_CDP == 0
+    Test<CUB, InputT, OutputT>(num_items, scan_op, initial_value);
+#endif // TEST_CDP
+}
+
+
+/**
+ * Run the test for each combination of scan operator (Sum, Max) and scan
+ * flavour (exclusive with identity, exclusive with a real initial value,
+ * inclusive).
+ */
+template <typename InputT, typename OutputT>
+void TestOp(int num_items, OutputT identity, OutputT initial_value)
+{
+    cub::Sum sum_op{};
+    cub::Max max_op{};
+
+    // Exclusive (use identity as initial value because it will dispatch to *Sum variants that don't take initial values)
+    Test<InputT, OutputT>(num_items, sum_op, identity);
+    Test<InputT, OutputT>(num_items, max_op, identity);
+
+    // Exclusive (non-specialized, so we can test initial-value)
+    Test<InputT, OutputT>(num_items, WrapperFunctor<cub::Sum>(sum_op), initial_value);
+    Test<InputT, OutputT>(num_items, WrapperFunctor<cub::Max>(max_op), initial_value);
+
+    // Inclusive (NullType stands in for "no initial value")
+    Test<InputT, OutputT>(num_items, sum_op, NullType());
+    Test<InputT, OutputT>(num_items, max_op, NullType());
+}
+
+
+/**
+ * Run TestOp() for the requested problem size, or, when num_items is
+ * negative, for a fixed sweep of sizes (0, 1, 100, 10000, 1000000).
+ */
+template <typename InputT, typename OutputT>
+void TestSize(int num_items, OutputT identity, OutputT initial_value)
+{
+    // An explicit size was given: test only that one
+    if (num_items >= 0)
+    {
+        TestOp<InputT>(num_items, identity, initial_value);
+        return;
+    }
+
+    // No size given: sweep the built-in list, smallest first
+    const int default_sizes[] = {0, 1, 100, 10000, 1000000};
+    for (int size : default_sizes)
+    {
+        TestOp<InputT>(size, identity, initial_value);
+    }
+}
+
+/**
+ * Input element type for TestAccumulatorTypes(): wraps a single char that is
+ * set through the explicit constructor and read back, widened to int, by get().
+ */
+class CustomInputT
+{
+public:
+  __host__ __device__ explicit CustomInputT(char val)
+      : m_val{val}
+  {}
+
+  /// Stored value converted to int
+  __host__ __device__ int get() const
+  {
+    return static_cast<int>(m_val);
+  }
+
+private:
+  char m_val{};
+};
+
+/**
+ * Accumulator type for TestAccumulatorTypes().
+ *
+ * Besides the running value it carries a "magic" marker; is_valid() is true
+ * only while that marker equals 42. Every operation below consults is_valid()
+ * and either zeroes its result or refuses to update when an operand is not
+ * valid.
+ * NOTE(review): presumably this makes accumulators that live in storage which
+ * was never constructed show up as wrong results - confirm intent.
+ */
+class CustomAccumulatorT
+{
+  // Running value
+  int m_val{0};
+  // Validity marker; only the value 42 counts as valid (see is_valid())
+  int m_magic_value{42};
+
+  // Private: builds a valid accumulator (marker keeps its default of 42) from
+  // a raw int. Used by the operator+ overloads through `return {...}`.
+  __host__ __device__ CustomAccumulatorT(int val)
+      : m_val(val)
+  {}
+
+public:
+  // Default: value 0, valid
+  __host__ __device__ CustomAccumulatorT()
+  {}
+
+  // Copy: is_valid() converts to 0 or 1, so copying an invalid source yields
+  // value 0 and marker 0, i.e. the copy is invalid as well
+  __host__ __device__ CustomAccumulatorT(const CustomAccumulatorT &in)
+    : m_val(in.is_valid() * in.get())
+    , m_magic_value(in.is_valid() * 42)
+  {}
+
+  // Conversion from an input element: always produces a valid accumulator
+  __host__ __device__ CustomAccumulatorT(const CustomInputT &in)
+    : m_val(in.get())
+    , m_magic_value(42)
+  {}
+
+  // Assignment from an input element: takes effect only if *this is already
+  // valid; the marker is never written. Returns void, so it cannot be chained.
+  __host__ __device__ void operator=(const CustomInputT &in)
+  {
+    if (this->is_valid())
+    {
+      m_val = in.get();
+    }
+  }
+
+  // Assignment from another accumulator: takes effect only if both sides are
+  // valid; the marker is never written, so an invalid destination stays invalid
+  __host__ __device__ void operator=(const CustomAccumulatorT &in)
+  {
+    if (this->is_valid() && in.is_valid())
+    {
+      m_val = in.get();
+    }
+  }
+
+  // accumulator + input: the sum is multiplied by 0 when *this is invalid.
+  // The result is built with the private int constructor, so its marker is 42.
+  __host__ __device__ CustomAccumulatorT 
+  operator+(const CustomInputT &in) const
+  {
+    const int multiplier = this->is_valid();
+    return {(m_val + in.get()) * multiplier};
+  }
+
+  // accumulator + accumulator: the sum is multiplied by 0 unless both operands
+  // are valid. As above, the result itself carries marker 42.
+  __host__ __device__ CustomAccumulatorT
+  operator+(const CustomAccumulatorT &in) const
+  {
+    const int multiplier = this->is_valid() && in.is_valid();
+    return {(m_val + in.get()) * multiplier};
+  }
+
+  // Current running value
+  __host__ __device__ int get() const { return m_val; }
+
+  // True while the marker still holds 42
+  __host__ __device__ bool is_valid() const { return m_magic_value == 42; }
+};
+
+/**
+ * Output element type for TestAccumulatorTypes().
+ *
+ * Instead of storing the scanned value, assigning an accumulator to an element
+ * compares it with the element's expected value and atomically adds 1 to the
+ * shared counter when the accumulator is valid and matches (0 otherwise).
+ */
+class CustomOutputT
+{
+public:
+  __host__ __device__ CustomOutputT(int *d_ok_count, int expected)
+      : m_d_ok_count{d_ok_count}
+      , m_expected{expected}
+  {}
+
+  __device__ void operator=(const CustomAccumulatorT &accum) const
+  {
+    const bool matches = accum.is_valid() && (accum.get() == m_expected);
+    atomicAdd(m_d_ok_count, matches ? 1 : 0);
+  }
+
+private:
+  int *m_d_ok_count{}; // counter shared by all output elements
+  int m_expected{};    // value the assigned accumulator must hold
+};
+
+/**
+ * Fill the input with CustomInputT(1) and give every output element the shared
+ * counter plus its own index as the expected value. One thread per element.
+ */
+__global__ void InitializeTestAccumulatorTypes(int num_items,
+                                               int *d_ok_count,
+                                               CustomInputT *d_in,
+                                               CustomOutputT *d_out)
+{
+  const int global_idx =
+    static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
+
+  // Threads past the end of the arrays have nothing to do
+  if (global_idx >= num_items)
+  {
+    return;
+  }
+
+  d_in[global_idx]  = CustomInputT(1);
+  d_out[global_idx] = CustomOutputT{d_ok_count, global_idx};
+}
+
+/**
+ * Check that DeviceScan::ExclusiveScan works with distinct input, accumulator
+ * and output types (CustomInputT / CustomAccumulatorT / CustomOutputT).
+ *
+ * Every input is 1 and the initial value is a default-constructed accumulator
+ * (value 0), so output element i expects the value i. Each output assignment
+ * adds 1 to a device counter when the value is valid and as expected; the test
+ * passes when the counter equals num_items.
+ */
+void TestAccumulatorTypes()
+{
+  const int num_items  = 2 * 1024 * 1024;
+  const int block_size = 256;
+  const int grid_size  = (num_items + block_size - 1) / block_size;
+
+  CustomInputT *d_in{};
+  CustomOutputT *d_out{};
+  CustomAccumulatorT init{};
+  int *d_ok_count{};
+
+  CubDebugExit(g_allocator.DeviceAllocate((void **)&d_ok_count, sizeof(int)));
+  CubDebugExit(g_allocator.DeviceAllocate((void **)&d_out,
+                                          sizeof(CustomOutputT) * num_items));
+  CubDebugExit(g_allocator.DeviceAllocate((void **)&d_in,
+                                          sizeof(CustomInputT) * num_items));
+
+  // The counter is only ever incremented with atomicAdd, so it has to start at
+  // zero; freshly allocated (or recycled) device memory has unspecified content.
+  CubDebugExit(cudaMemset(d_ok_count, 0, sizeof(int)));
+
+  InitializeTestAccumulatorTypes<<<grid_size, block_size>>>(num_items,
+                                                            d_ok_count,
+                                                            d_in,
+                                                            d_out);
+  // Surface launch-configuration errors right away
+  CubDebugExit(cudaPeekAtLastError());
+
+  std::uint8_t *d_temp_storage{};
+  std::size_t temp_storage_bytes{};
+
+  // First call with null temp storage: only computes temp_storage_bytes
+  CubDebugExit(cub::DeviceScan::ExclusiveScan(d_temp_storage,
+                                              temp_storage_bytes,
+                                              d_in,
+                                              d_out,
+                                              cub::Sum{},
+                                              init,
+                                              num_items));
+
+  CubDebugExit(
+    g_allocator.DeviceAllocate((void **)&d_temp_storage, temp_storage_bytes));
+  // Fill the temp storage with non-zero bytes before the real run.
+  // NOTE(review): presumably so the scan cannot rely on zeroed storage - confirm.
+  CubDebugExit(cudaMemset(d_temp_storage, 1, temp_storage_bytes));
+
+  // Second call: performs the scan
+  CubDebugExit(cub::DeviceScan::ExclusiveScan(d_temp_storage,
+                                              temp_storage_bytes,
+                                              d_in,
+                                              d_out,
+                                              cub::Sum{},
+                                              init,
+                                              num_items));
+
+  int ok{};
+  CubDebugExit(cudaMemcpy(&ok, d_ok_count, sizeof(int), cudaMemcpyDeviceToHost));
+
+  AssertEquals(ok, num_items);
+
+  // Release everything that was allocated above, temp storage included
+  CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
+  CubDebugExit(g_allocator.DeviceFree(d_out));
+  CubDebugExit(g_allocator.DeviceFree(d_in));
+  CubDebugExit(g_allocator.DeviceFree(d_ok_count));
+}
+
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+
+/**
+ * Main
+ *
+ * Parses the command line (--n, --i, --v, --device, --help), initializes the
+ * device, then runs the scan tests for the value types selected at compile
+ * time through TEST_VALUE_TYPES.
+ */
+int main(int argc, char** argv)
+{
+    // Negative (the default) makes TestSize() sweep its built-in list of sizes
+    int num_items = -1;
+
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    args.GetCmdLineArgument("n", num_items);
+    args.GetCmdLineArgument("i", g_timing_iterations);
+
+    // Print usage
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--n=<input items>] "
+            "[--i=<timing iterations>] "
+            "[--device=<device-id>] "
+            "[--v] "
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+    g_device_giga_bandwidth = args.device_giga_bandwidth;
+    printf("\n");
+
+    // The two %PARAM% lines below are kept verbatim.
+    // NOTE(review): they look like markers read by the build system - confirm before editing.
+    // %PARAM% TEST_CDP cdp 0:1
+    // %PARAM% TEST_VALUE_TYPES types 0:1:2
+
+#if TEST_VALUE_TYPES == 0
+
+    // Test different input+output data types
+    TestSize<unsigned char>(num_items, (int)0, (int)99);
+
+    // Test same input+output data types
+    TestSize<unsigned char>(num_items, (unsigned char)0, (unsigned char)99);
+    TestSize<signed char>(num_items, (char)0, (char)99);
+    TestSize<unsigned short>(num_items, (unsigned short)0, (unsigned short)99);
+    TestSize<unsigned int>(num_items, (unsigned int)0, (unsigned int)99);
+    TestSize<unsigned long long>(num_items,
+                                 (unsigned long long)0,
+                                 (unsigned long long)99);
+
+#elif TEST_VALUE_TYPES == 1
+
+    // Two-component vector types, plus uchar4
+    TestSize<uchar2>(num_items, make_uchar2(0, 0), make_uchar2(17, 21));
+    TestSize<char2>(num_items, make_char2(0, 0), make_char2(17, 21));
+    TestSize<ushort2>(num_items, make_ushort2(0, 0), make_ushort2(17, 21));
+    TestSize<uint2>(num_items, make_uint2(0, 0), make_uint2(17, 21));
+    TestSize<ulonglong2>(num_items,
+                         make_ulonglong2(0, 0),
+                         make_ulonglong2(17, 21));
+    TestSize<uchar4>(num_items,
+                     make_uchar4(0, 0, 0, 0),
+                     make_uchar4(17, 21, 32, 85));
+#elif TEST_VALUE_TYPES == 2
+    // Remaining four-component vector types
+    TestSize<char4>(num_items,
+                    make_char4(0, 0, 0, 0),
+                    make_char4(17, 21, 32, 85));
+
+    TestSize<ushort4>(num_items,
+                      make_ushort4(0, 0, 0, 0),
+                      make_ushort4(17, 21, 32, 85));
+    TestSize<uint4>(num_items,
+                    make_uint4(0, 0, 0, 0),
+                    make_uint4(17, 21, 32, 85));
+    TestSize<ulonglong4>(num_items,
+                         make_ulonglong4(0, 0, 0, 0),
+                         make_ulonglong4(17, 21, 32, 85));
+
+    // User-defined aggregate types
+    TestSize<TestFoo>(num_items,
+                      TestFoo::MakeTestFoo(0, 0, 0, 0),
+                      TestFoo::MakeTestFoo(std::numeric_limits<TestFoo::x_t>::max(),
+                                           std::numeric_limits<TestFoo::y_t>::max(),
+                                           std::numeric_limits<TestFoo::z_t>::max(),
+                                           std::numeric_limits<TestFoo::w_t>::max()));
+
+    TestSize<TestBar>(num_items,
+                      TestBar(0, 0),
+                      TestBar(std::numeric_limits<long long>::max(),
+                              std::numeric_limits<int>::max()));
+
+    // Distinct input / accumulator / output types
+    TestAccumulatorTypes();
+#endif
+
+    return 0;
+}
+
diff --git a/include/cub/test/test_device_scan_by_key.cu b/include/cub/test/test_device_scan_by_key.cu
new file mode 100644
index 0000000..c1cde25
--- /dev/null
+++ b/include/cub/test/test_device_scan_by_key.cu
@@ -0,0 +1,1099 @@
+/******************************************************************************
+ * Copyright (c) 2021 NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Test of the DeviceScan by-key utilities (ExclusiveSumByKey, ExclusiveScanByKey,
+ * InclusiveSumByKey, InclusiveScanByKey)
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <cub/device/device_scan.cuh>
+#include <cub/iterator/constant_input_iterator.cuh>
+#include <cub/iterator/counting_input_iterator.cuh>
+#include <cub/iterator/discard_output_iterator.cuh>
+#include <cub/iterator/transform_input_iterator.cuh>
+#include <cub/util_allocator.cuh>
+
+#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
+
+#include "test_util.h"
+
+#include <cstdio>
+#include <limits>
+#include <typeinfo>
+
+using namespace cub;
+
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+// When true, Initialize() prints the generated input and Test() passes the
+// flag on to CompareDeviceResults()
+bool                    g_verbose           = false;
+// Iteration count for the timed Dispatch() call in Test(); timing is skipped
+// unless this is greater than zero
+int                     g_timing_iterations = 0;
+// Divisor for the "% peak" figure printed by Test(); no initializer here
+double                  g_device_giga_bandwidth;
+// Allocator used for all device buffers in this file.
+// NOTE(review): the `true` argument is presumably the allocator's
+// skip-cleanup flag - confirm against CachingDeviceAllocator's constructor.
+CachingDeviceAllocator  g_allocator(true);
+
+// Dispatch types. The enumerators are wrapped in Int2Type<> to select one of
+// the Dispatch() overloads defined further down in this file.
+enum Backend
+{
+    CUB,        // CUB method
+    CDP,        // GPU-based (dynamic parallelism) dispatch to CUB method
+};
+
+
+// Selects where Test() writes the scan output: into a buffer it allocates
+// (and later frees) itself, or in place over one of the input arrays.
+enum AliasMode
+{
+  AliasNone,  // output is allocated
+  AliasKeys,  // output is an alias of input keys
+  AliasValues // output is an alias of input values
+};
+
+
+/**
+ * \brief WrapperFunctor: forwards to a wrapped binary operator while hiding
+ *        its concrete type (for precluding test-specialized dispatch to *Sum
+ *        variants)
+ */
+template <typename OpT>
+struct WrapperFunctor
+{
+    // Declared before operator() because the trailing return type below names it
+    OpT op;
+
+    WrapperFunctor(OpT wrapped_op)
+        : op(wrapped_op)
+    {}
+
+    template <typename T, typename U>
+    __host__ __device__ __forceinline__ auto operator()(const T &a, const U &b) const
+      -> decltype(op(a, b))
+    {
+        // The wrapped result is cast to T first, then converted to the
+        // declared return type on return
+        T as_first_operand_type = static_cast<T>(op(a, b));
+        return as_first_operand_type;
+    }
+};
+
+/**
+ * \brief DivideByFiveFunctor: unary functor returning its argument divided by
+ *        five, converted to OutputT (used by TestIterator)
+ */
+template <typename OutputT>
+struct DivideByFiveFunctor
+{
+    template <typename T>
+    __host__ __device__ __forceinline__ OutputT operator()(const T &a) const
+    {
+        // The division happens in T's arithmetic; only the quotient is converted
+        auto quotient = a / 5;
+        return static_cast<OutputT>(quotient);
+    }
+};
+
+/**
+ * \brief Mod2Equality (used for non-bool keys to make keys more likely to equal each other)
+ *
+ * Two keys compare equal when they have the same remainder modulo 2. The
+ * result is a truth value, so the functor returns bool rather than the key
+ * type T; call sites that test the result in a condition are unaffected.
+ */
+struct Mod2Equality
+{
+    template <typename T>
+    __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const
+    {
+        return (a % 2) == (b % 2);
+    }
+};
+
+
+//---------------------------------------------------------------------
+// Dispatch to different CUB DeviceScan entrypoints
+//---------------------------------------------------------------------
+
+/**
+ * Dispatch to exclusive scan entrypoint
+ *
+ * Calls DeviceScan::ExclusiveScanByKey `timing_timing_iterations` times and
+ * returns the status of the last call (cudaSuccess when the count is not
+ * positive). The two CDP-only pointer arguments are ignored.
+ */
+template <typename IsPrimitiveT,
+          typename KeysInputIteratorT,
+          typename ValuesInputIteratorT,
+          typename ValuesOutputIteratorT,
+          typename ScanOpT,
+          typename InitialValueT,
+          typename OffsetT,
+          typename EqualityOpT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CUB>         /*dispatch_to*/,
+    IsPrimitiveT          /*is_primitive*/,
+    int                   timing_timing_iterations,
+    size_t *              /*d_temp_storage_bytes*/,
+    cudaError_t *         /*d_cdp_error*/,
+
+    void*                 d_temp_storage,
+    size_t&               temp_storage_bytes,
+    KeysInputIteratorT    d_keys_in,
+    ValuesInputIteratorT  d_values_in,
+    ValuesOutputIteratorT d_values_out,
+    ScanOpT               scan_op,
+    InitialValueT         initial_value,
+    OffsetT               num_items,
+    EqualityOpT           equality_op)
+{
+    cudaError_t status = cudaSuccess;
+
+    for (int remaining = timing_timing_iterations; remaining > 0; --remaining)
+    {
+        status = DeviceScan::ExclusiveScanByKey(d_temp_storage,
+                                                temp_storage_bytes,
+                                                d_keys_in,
+                                                d_values_in,
+                                                d_values_out,
+                                                scan_op,
+                                                initial_value,
+                                                num_items,
+                                                equality_op);
+    }
+
+    return status;
+}
+
+
+/**
+ * Dispatch to exclusive sum entrypoint
+ *
+ * Overload for primitive output types with cub::Sum: calls
+ * DeviceScan::ExclusiveSumByKey `timing_timing_iterations` times and returns
+ * the status of the last call (cudaSuccess when the count is not positive).
+ * The scan operator, the initial value and the two CDP-only pointer arguments
+ * are ignored.
+ */
+template <typename KeysInputIteratorT,
+          typename ValuesInputIteratorT,
+          typename ValuesOutputIteratorT,
+          typename InitialValueT,
+          typename OffsetT,
+          typename EqualityOpT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CUB>         /*dispatch_to*/,
+    Int2Type<true>        /*is_primitive*/,
+    int                   timing_timing_iterations,
+    size_t *              /*d_temp_storage_bytes*/,
+    cudaError_t *         /*d_cdp_error*/,
+
+    void*                 d_temp_storage,
+    size_t&               temp_storage_bytes,
+    KeysInputIteratorT    d_keys_in,
+    ValuesInputIteratorT  d_values_in,
+    ValuesOutputIteratorT d_values_out,
+    Sum                   /*scan_op*/,
+    InitialValueT         /*initial_value*/,
+    OffsetT               num_items,
+    EqualityOpT           equality_op)
+{
+    cudaError_t status = cudaSuccess;
+
+    for (int remaining = timing_timing_iterations; remaining > 0; --remaining)
+    {
+        status = DeviceScan::ExclusiveSumByKey(d_temp_storage,
+                                               temp_storage_bytes,
+                                               d_keys_in,
+                                               d_values_in,
+                                               d_values_out,
+                                               num_items,
+                                               equality_op);
+    }
+
+    return status;
+}
+
+
+/**
+ * Dispatch to inclusive scan entrypoint
+ *
+ * Selected when the initial value is NullType: calls
+ * DeviceScan::InclusiveScanByKey `timing_timing_iterations` times and returns
+ * the status of the last call (cudaSuccess when the count is not positive).
+ * The two CDP-only pointer arguments are ignored.
+ */
+template <typename IsPrimitiveT,
+          typename KeysInputIteratorT,
+          typename ValuesInputIteratorT,
+          typename ValuesOutputIteratorT,
+          typename ScanOpT,
+          typename OffsetT,
+          typename EqualityOpT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CUB>         /*dispatch_to*/,
+    IsPrimitiveT          /*is_primitive*/,
+    int                   timing_timing_iterations,
+    size_t *              /*d_temp_storage_bytes*/,
+    cudaError_t *         /*d_cdp_error*/,
+
+    void*                 d_temp_storage,
+    size_t&               temp_storage_bytes,
+    KeysInputIteratorT    d_keys_in,
+    ValuesInputIteratorT  d_values_in,
+    ValuesOutputIteratorT d_values_out,
+    ScanOpT               scan_op,
+    NullType              /*initial_value*/,
+    OffsetT               num_items,
+    EqualityOpT           equality_op)
+{
+    cudaError_t status = cudaSuccess;
+
+    for (int remaining = timing_timing_iterations; remaining > 0; --remaining)
+    {
+        status = DeviceScan::InclusiveScanByKey(d_temp_storage,
+                                                temp_storage_bytes,
+                                                d_keys_in,
+                                                d_values_in,
+                                                d_values_out,
+                                                scan_op,
+                                                num_items,
+                                                equality_op);
+    }
+
+    return status;
+}
+
+/**
+ * Dispatch to inclusive sum entrypoint
+ *
+ * Overload for primitive output types with cub::Sum and a NullType initial
+ * value: calls DeviceScan::InclusiveSumByKey `timing_timing_iterations` times
+ * and returns the status of the last call (cudaSuccess when the count is not
+ * positive). The scan operator and the two CDP-only pointer arguments are
+ * ignored.
+ */
+template <typename KeysInputIteratorT,
+          typename ValuesInputIteratorT,
+          typename ValuesOutputIteratorT,
+          typename OffsetT,
+          typename EqualityOpT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CUB>         /*dispatch_to*/,
+    Int2Type<true>        /*is_primitive*/,
+    int                   timing_timing_iterations,
+    size_t *              /*d_temp_storage_bytes*/,
+    cudaError_t *         /*d_cdp_error*/,
+
+    void*                 d_temp_storage,
+    size_t&               temp_storage_bytes,
+    KeysInputIteratorT    d_keys_in,
+    ValuesInputIteratorT  d_values_in,
+    ValuesOutputIteratorT d_values_out,
+    Sum                   /*scan_op*/,
+    NullType              /*initial_value*/,
+    OffsetT               num_items,
+    EqualityOpT           equality_op)
+{
+    cudaError_t status = cudaSuccess;
+
+    for (int remaining = timing_timing_iterations; remaining > 0; --remaining)
+    {
+        status = DeviceScan::InclusiveSumByKey(d_temp_storage,
+                                               temp_storage_bytes,
+                                               d_keys_in,
+                                               d_values_in,
+                                               d_values_out,
+                                               num_items,
+                                               equality_op);
+    }
+
+    return status;
+}
+
+//---------------------------------------------------------------------
+// CUDA Nested Parallelism Test Kernel
+//---------------------------------------------------------------------
+
+#if TEST_CDP == 1
+
+/**
+ * Simple wrapper kernel to invoke DeviceScan
+ *
+ * Forwards all arguments to the Dispatch() overload selected by `cub_backend`
+ * and `is_primitive`, then writes the returned status to `*d_cdp_error` and
+ * the (possibly updated) temp storage size to `*d_temp_storage_bytes`.
+ */
+template <int CubBackend,
+          typename IsPrimitiveT,
+          typename KeysInputIteratorT,
+          typename ValuesInputIteratorT,
+          typename ValuesOutputIteratorT,
+          typename ScanOpT,
+          typename InitialValueT,
+          typename OffsetT,
+          typename EqualityOpT>
+__global__ void CDPDispatchKernel(Int2Type<CubBackend> cub_backend,
+                                  IsPrimitiveT         is_primitive,
+                                  int                  timing_timing_iterations,
+                                  size_t              *d_temp_storage_bytes,
+                                  cudaError_t         *d_cdp_error,
+
+                                  void                 *d_temp_storage,
+                                  size_t                temp_storage_bytes,
+                                  KeysInputIteratorT    d_keys_in,
+                                  ValuesInputIteratorT  d_values_in,
+                                  ValuesOutputIteratorT d_values_out,
+                                  ScanOpT               scan_op,
+                                  InitialValueT         initial_value,
+                                  OffsetT               num_items,
+                                  EqualityOpT           equality_op)
+{
+  // temp_storage_bytes is a by-value copy here; Dispatch() takes it by
+  // reference and may update it
+  const cudaError_t status = Dispatch(cub_backend,
+                                      is_primitive,
+                                      timing_timing_iterations,
+                                      d_temp_storage_bytes,
+                                      d_cdp_error,
+                                      d_temp_storage,
+                                      temp_storage_bytes,
+                                      d_keys_in,
+                                      d_values_in,
+                                      d_values_out,
+                                      scan_op,
+                                      initial_value,
+                                      num_items,
+                                      equality_op);
+
+  // Publish both results for the host to copy back
+  *d_cdp_error          = status;
+  *d_temp_storage_bytes = temp_storage_bytes;
+}
+
+/**
+ * Dispatch to CDP kernel
+ *
+ * Launches CDPDispatchKernel (instantiated for the CUB backend) through
+ * thrust's triple_chevron launcher with arguments (1, 1, 0, 0), then copies
+ * the temp storage size and the device-side status back to the host.
+ * Returns the device-side status; launch or copy failures exit through
+ * CubDebugExit.
+ */
+template <typename IsPrimitiveT,
+          typename KeysInputIteratorT,
+          typename ValuesInputIteratorT,
+          typename ValuesOutputIteratorT,
+          typename ScanOpT,
+          typename InitialValueT,
+          typename OffsetT,
+          typename EqualityOpT>
+cudaError_t Dispatch(Int2Type<CDP> /*dispatch_to*/,
+                     IsPrimitiveT is_primitive,
+                     int          timing_timing_iterations,
+                     size_t      *d_temp_storage_bytes,
+                     cudaError_t *d_cdp_error,
+
+                     void                 *d_temp_storage,
+                     size_t               &temp_storage_bytes,
+                     KeysInputIteratorT    d_keys_in,
+                     ValuesInputIteratorT  d_values_in,
+                     ValuesOutputIteratorT d_values_out,
+                     ScanOpT               scan_op,
+                     InitialValueT         initial_value,
+                     OffsetT               num_items,
+                     EqualityOpT           equality_op)
+{
+  // Kernel instantiation that performs the device-side dispatch
+  auto kernel = CDPDispatchKernel<CUB,
+                                  IsPrimitiveT,
+                                  KeysInputIteratorT,
+                                  ValuesInputIteratorT,
+                                  ValuesOutputIteratorT,
+                                  ScanOpT,
+                                  InitialValueT,
+                                  OffsetT,
+                                  EqualityOpT>;
+
+  // Invoke kernel to invoke device-side dispatch
+  cudaError_t status =
+    thrust::cuda_cub::launcher::triple_chevron(1, 1, 0, 0)
+      .doit(kernel,
+            Int2Type<CUB>{},
+            is_primitive,
+            timing_timing_iterations,
+            d_temp_storage_bytes,
+            d_cdp_error,
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys_in,
+            d_values_in,
+            d_values_out,
+            scan_op,
+            initial_value,
+            num_items,
+            equality_op);
+  CubDebugExit(status);
+
+  // Copy out temp_storage_bytes
+  CubDebugExit(cudaMemcpy(&temp_storage_bytes,
+                          d_temp_storage_bytes,
+                          sizeof(size_t),
+                          cudaMemcpyDeviceToHost));
+
+  // Copy out error; it replaces the launch status as the return value
+  CubDebugExit(cudaMemcpy(&status,
+                          d_cdp_error,
+                          sizeof(cudaError_t),
+                          cudaMemcpyDeviceToHost));
+
+  return status;
+}
+
+#endif // TEST_CDP
+
+//---------------------------------------------------------------------
+// Test generation
+//---------------------------------------------------------------------
+
+
+/**
+ * Initialize problem: fill h_in[0 .. num_items) through InitValue() using the
+ * given generation mode, and print the result when g_verbose is set.
+ */
+template <typename T>
+void Initialize(GenMode gen_mode, T *h_in, int num_items)
+{
+    int idx = 0;
+    while (idx < num_items)
+    {
+        InitValue(gen_mode, h_in[idx], idx);
+        ++idx;
+    }
+
+    // Nothing more to do unless verbose output was requested
+    if (!g_verbose)
+    {
+        return;
+    }
+
+    printf("Input:\n");
+    DisplayResults(h_in, num_items);
+    printf("\n\n");
+}
+
+/**
+ * Solve exclusive-scan problem
+ *
+ * Host reference for exclusive scan-by-key. A segment is a maximal run of
+ * consecutive items for which equality_op(previous key, current key) holds.
+ * The first output of every segment is initial_value; each later output is
+ * the running combination of initial_value and all earlier values of that
+ * segment.
+ */
+template <typename KeysInputIteratorT,
+          typename ValuesInputIteratorT,
+          typename OutputT,
+          typename ScanOpT,
+          typename InitialValueT,
+          typename EqualityOpT>
+void Solve(KeysInputIteratorT    h_keys_in,
+           ValuesInputIteratorT  h_values_in,
+           OutputT               *h_reference,
+           int                   num_items,
+           ScanOpT               scan_op,
+           InitialValueT         initial_value,
+           EqualityOpT           equality_op)
+{
+    using ValueT = cub::detail::value_t<ValuesInputIteratorT>;
+    using AccumT = cub::detail::accumulator_t<ScanOpT, InitialValueT, ValueT>;
+
+    int pos = 0;
+    while (pos < num_items)
+    {
+        // Head of a new segment: output restarts from the initial value
+        AccumT current   = static_cast<AccumT>(h_values_in[pos]);
+        h_reference[pos] = initial_value;
+        AccumT running   = static_cast<AccumT>(scan_op(initial_value, current));
+
+        // Remaining items of the segment: emit the running total, then extend it
+        for (++pos;
+             pos < num_items && equality_op(h_keys_in[pos - 1], h_keys_in[pos]);
+             ++pos)
+        {
+            current          = static_cast<AccumT>(h_values_in[pos]);
+            h_reference[pos] = static_cast<OutputT>(running);
+            running          = static_cast<AccumT>(scan_op(running, current));
+        }
+    }
+}
+
+
+/**
+ * Solve inclusive-scan problem
+ *
+ * Host reference for inclusive scan-by-key (selected by a NullType initial
+ * value). A segment is a maximal run of consecutive items for which
+ * equality_op(previous key, current key) holds. The first output of every
+ * segment is that segment's first value; each later output also includes the
+ * current value.
+ */
+template <typename KeysInputIteratorT,
+          typename ValuesInputIteratorT,
+          typename OutputT,
+          typename ScanOpT,
+          typename EqualityOpT>
+void Solve(KeysInputIteratorT    h_keys_in,
+           ValuesInputIteratorT  h_values_in,
+           OutputT               *h_reference,
+           int                   num_items,
+           ScanOpT               scan_op,
+           NullType              /*initial_value*/,
+           EqualityOpT           equality_op)
+{
+    using ValueT = cub::detail::value_t<ValuesInputIteratorT>;
+    using AccumT = cub::detail::accumulator_t<ScanOpT, ValueT, ValueT>;
+
+    int pos = 0;
+    while (pos < num_items)
+    {
+        // Head of a new segment: the running total restarts from this value
+        AccumT running   = h_values_in[pos];
+        h_reference[pos] = static_cast<OutputT>(running);
+
+        // Remaining items of the segment: extend the running total, then emit it
+        for (++pos;
+             pos < num_items && equality_op(h_keys_in[pos - 1], h_keys_in[pos]);
+             ++pos)
+        {
+            AccumT current   = h_values_in[pos];
+            running          = static_cast<AccumT>(scan_op(running, current));
+            h_reference[pos] = static_cast<OutputT>(running);
+        }
+    }
+}
+
+/**
+ * Output-buffer provider, general case: allocates num_items elements of
+ * OutputT from g_allocator and ignores the input iterator. This primary
+ * template is also the one chosen when InPlace is true but the input iterator
+ * type is not OutputT*.
+ */
+template <typename OutputT, typename DeviceInputIteratorT, bool InPlace>
+struct AllocateOutput
+{
+    static void run(OutputT *&d_out, DeviceInputIteratorT /* d_in */, int num_items)
+    {
+        const size_t num_bytes = sizeof(OutputT) * num_items;
+        CubDebugExit(g_allocator.DeviceAllocate(reinterpret_cast<void **>(&d_out), num_bytes));
+    }
+};
+
+/**
+ * Output-buffer provider, in-place case: when the input is already an
+ * OutputT* and InPlace is true, the output simply aliases the input and
+ * nothing is allocated.
+ */
+template <typename OutputT>
+struct AllocateOutput<OutputT, OutputT *, true>
+{
+    static void run(OutputT *&d_out, OutputT *d_in, int /* num_items */)
+    {
+        // Alias: the scan will overwrite its own input
+        d_out = d_in;
+    }
+};
+
+/**
+ * Test DeviceScan for a given problem input
+ *
+ * Steps: obtain the output buffer (allocated, or aliased to the keys/values
+ * input depending on Mode), query and allocate temp storage, run one
+ * correctness iteration, compare against h_reference, optionally time
+ * g_timing_iterations further iterations, free what was allocated here, and
+ * finally assert that the comparison succeeded.
+ *
+ * The overload of Dispatch() is chosen by BACKEND, by whether OutputT is a
+ * primitive type (Traits<OutputT>::PRIMITIVE), and by the types of scan_op and
+ * initial_value.
+ */
+template <
+    Backend             BACKEND,
+    typename            KeysInputIteratorT,
+    typename            ValuesInputIteratorT,
+    typename            OutputT,
+    typename            ScanOpT,
+    typename            InitialValueT,
+    typename            EqualityOpT,
+    AliasMode           Mode=AliasNone>
+void Test(
+    KeysInputIteratorT      d_keys_in,
+    ValuesInputIteratorT    d_values_in,
+    OutputT                 *h_reference,
+    int                     num_items,
+    ScanOpT                 scan_op,
+    InitialValueT           initial_value,
+    EqualityOpT             equality_op)
+{
+    // NOTE(review): KeyT is not referenced anywhere else in this function
+    using KeyT = cub::detail::value_t<KeysInputIteratorT>;
+    using InputT = cub::detail::value_t<ValuesInputIteratorT>;
+
+    // Allocate device output array
+    OutputT *d_values_out = NULL;
+
+    // Both branches are compiled for every Mode (plain `if`, not a
+    // compile-time branch); AllocateOutput either allocates a fresh buffer or
+    // aliases the given input pointer
+    if (Mode == AliasKeys)
+    {
+      AllocateOutput<OutputT, KeysInputIteratorT, Mode == AliasKeys>::run(
+        d_values_out,
+        d_keys_in,
+        num_items);
+    }
+    else
+    {
+      AllocateOutput<OutputT, ValuesInputIteratorT, Mode == AliasValues>::run(
+        d_values_out,
+        d_values_in,
+        num_items);
+    }
+
+    // Allocate CDP device arrays
+    // (allocated for every backend; the CUB Dispatch() overloads ignore them)
+    size_t          *d_temp_storage_bytes = NULL;
+    cudaError_t     *d_cdp_error = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes,  sizeof(size_t) * 1));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error,   sizeof(cudaError_t) * 1));
+
+    // Allocate temporary storage
+    // d_temp_storage is still NULL for this first Dispatch(); the call is used
+    // to obtain temp_storage_bytes, which sizes the allocation that follows
+    void            *d_temp_storage = NULL;
+    size_t          temp_storage_bytes = 0;
+    CubDebugExit(Dispatch(
+        Int2Type<BACKEND>(),
+        Int2Type<Traits<OutputT>::PRIMITIVE>(),
+        1,
+        d_temp_storage_bytes,
+        d_cdp_error,
+        d_temp_storage,
+        temp_storage_bytes,
+        d_keys_in,
+        d_values_in,
+        d_values_out,
+        scan_op,
+        initial_value,
+        num_items,
+        equality_op));
+    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
+
+    // Clear device output array
+    // (skipped when the output aliases an input, which must not be wiped)
+    if (Mode == AliasNone)
+    {
+      CubDebugExit(cudaMemset(d_values_out, 0, sizeof(OutputT) * num_items));
+    }
+
+    // Run warmup/correctness iteration
+    CubDebugExit(Dispatch(
+        Int2Type<BACKEND>(),
+        Int2Type<Traits<OutputT>::PRIMITIVE>(),
+        1,
+        d_temp_storage_bytes,
+        d_cdp_error,
+        d_temp_storage,
+        temp_storage_bytes,
+        d_keys_in,
+        d_values_in,
+        d_values_out,
+        scan_op,
+        initial_value,
+        num_items,
+        equality_op));
+
+    // Check for correctness (and display results, if specified)
+    // A non-zero `compare` is reported as FAIL below and trips the final assert
+    const int compare = CompareDeviceResults(h_reference,
+                                             d_values_out,
+                                             num_items,
+                                             true,
+                                             g_verbose);
+
+    printf("\t%s", compare ? "FAIL" : "PASS");
+
+    // Flush any stdout/stderr
+    fflush(stdout);
+    fflush(stderr);
+
+    // Display performance
+    if (g_timing_iterations > 0)
+    {
+      // Performance
+      // One Dispatch() call runs all g_timing_iterations iterations between
+      // Start() and Stop()
+      GpuTimer gpu_timer;
+      gpu_timer.Start();
+      CubDebugExit(Dispatch(Int2Type<BACKEND>(),
+          Int2Type<Traits<OutputT>::PRIMITIVE>(),
+          g_timing_iterations,
+          d_temp_storage_bytes,
+          d_cdp_error,
+          d_temp_storage,
+          temp_storage_bytes,
+          d_keys_in,
+          d_values_in,
+          d_values_out,
+          scan_op,
+          initial_value,
+          num_items,
+          equality_op));
+
+      gpu_timer.Stop();
+      float elapsed_millis = gpu_timer.ElapsedMillis();
+        float avg_millis = elapsed_millis / g_timing_iterations;
+        // items per millisecond / 1e6 = billions of items per second
+        float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f;
+        // Counts one value read plus one output write per item (keys are not included)
+        float giga_bandwidth = giga_rate * (sizeof(InputT) + sizeof(OutputT));
+        printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s, %.1f%% peak",
+            avg_millis, giga_rate, giga_bandwidth, giga_bandwidth / g_device_giga_bandwidth * 100.0);
+    }
+
+    printf("\n\n");
+
+    // Cleanup
+    // The output buffer is freed only if this function allocated it; an
+    // aliased buffer is left untouched here
+    if (Mode == AliasNone)
+    {
+      if (d_values_out) 
+      {
+        CubDebugExit(g_allocator.DeviceFree(d_values_out));
+      }
+    }
+
+    if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
+    if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
+    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
+
+    // Correctness asserts
+    // (placed after cleanup, so the buffers above are released first)
+    AssertEquals(0, compare);
+}
+
+/**
+ * Runs Test in AliasValues mode on a raw values pointer (the "in/out values
+ * aliasing" case). Overload resolution picks this version when d_values_in
+ * and h_reference are pointers to the same OutputT element type.
+ */
+template <Backend BACKEND,
+          typename KeysInputIteratorT,
+          typename OutputT,
+          typename ScanOpT,
+          typename InitialValueT,
+          typename EqualityOpT>
+void TestInplaceValues(KeysInputIteratorT d_keys_in,
+                       OutputT *d_values_in,
+                       OutputT *h_reference,
+                       int num_items,
+                       ScanOpT scan_op,
+                       InitialValueT initial_value,
+                       EqualityOpT equality_op)
+{
+  // Every template argument is spelled out so the aliasing mode can be given.
+  using ValuesPtrT = OutputT *;
+
+  Test<BACKEND, KeysInputIteratorT, ValuesPtrT, OutputT, ScanOpT, InitialValueT, EqualityOpT, AliasValues>(
+    d_keys_in,
+    d_values_in,
+    h_reference,
+    num_items,
+    scan_op, initial_value, equality_op);
+}
+
+// Catch-all overload for argument types the more specialized OutputT-pointer
+// overload does not accept. The body is empty, so nothing is tested for them.
+template <Backend BACKEND,
+          typename KeysInputIteratorT,
+          typename ValuesInputIteratorT,
+          typename OutputT,
+          typename ScanOpT,
+          typename InitialValueT,
+          typename EqualityOpT>
+void TestInplaceValues(KeysInputIteratorT /* d_keys_in */,
+                       ValuesInputIteratorT /* d_values_in */,
+                       OutputT * /* h_reference */,
+                       int /* num_items */, ScanOpT /* scan_op */,
+                       InitialValueT /* initial_value */, EqualityOpT /* equality_op */)
+{}
+
+/**
+ * Runs Test in AliasKeys mode (the "keys/values aliasing" case). Overload
+ * resolution picks this version when d_keys_in and h_reference are pointers
+ * to the same element type T, which is also passed on as the output type.
+ */
+template <Backend BACKEND,
+          typename T,
+          typename ValuesInputIteratorT,
+          typename ScanOpT,
+          typename InitialValueT,
+          typename EqualityOpT>
+void TestInplaceKeys(T *d_keys_in,
+                     ValuesInputIteratorT d_values_in,
+                     T *h_reference,
+                     int num_items,
+                     ScanOpT scan_op,
+                     InitialValueT initial_value,
+                     EqualityOpT equality_op)
+{
+  // Every template argument is spelled out so the aliasing mode can be given.
+  using KeysPtrT = T *;
+
+  Test<BACKEND, KeysPtrT, ValuesInputIteratorT, T, ScanOpT, InitialValueT, EqualityOpT, AliasKeys>(
+    d_keys_in,
+    d_values_in,
+    h_reference,
+    num_items,
+    scan_op, initial_value, equality_op);
+}
+
+// Catch-all overload for argument types the more specialized T-pointer
+// overload does not accept. The body is empty, so nothing is tested for them.
+template <Backend BACKEND,
+          typename KeysInputIteratorT,
+          typename ValuesInputIteratorT,
+          typename OutputT,
+          typename ScanOpT,
+          typename InitialValueT,
+          typename EqualityOpT>
+void TestInplaceKeys(KeysInputIteratorT /* d_keys_in */,
+                     ValuesInputIteratorT /* d_values_in */,
+                     OutputT * /* h_reference */,
+                     int /* num_items */, ScanOpT /* scan_op */,
+                     InitialValueT /* initial_value */, EqualityOpT /* equality_op */)
+{}
+
+/**
+ * Run the scan tests on device pointer inputs generated with gen_mode:
+ * out-of-place first, then with values aliasing, then with keys aliasing.
+ */
+template <
+    Backend         BACKEND,
+    typename        KeyT,
+    typename        InputT,
+    typename        OutputT,
+    typename        ScanOpT,
+    typename        InitialValueT,
+    typename        EqualityOpT>
+void TestPointer(
+    int             num_items,
+    GenMode         gen_mode,
+    ScanOpT         scan_op,
+    InitialValueT   initial_value,
+    EqualityOpT     equality_op)
+{
+    // Banner labels
+    const char *backend_name = (BACKEND == CDP) ? "CDP CUB" : "CUB";
+    const char *scan_kind    = (std::is_same<InitialValueT, NullType>::value) ? "Inclusive" : "Exclusive";
+    const char *op_name      = (std::is_same<ScanOpT, Sum>::value) ? "Sum" : "Scan";
+    const char *mode_name    = (gen_mode == RANDOM) ? "RANDOM" : (gen_mode == INTEGER_SEED) ? "SEQUENTIAL" : "HOMOGENOUS";
+
+    printf("\nPointer %s %s cub::DeviceScan::%s %d items, %s->%s (%d->%d bytes) , gen-mode %s\n",
+        backend_name, scan_kind, op_name, num_items,
+        typeid(InputT).name(), typeid(OutputT).name(), (int) sizeof(InputT), (int) sizeof(OutputT),
+        mode_name);
+    fflush(stdout);
+
+    // Byte sizes shared by the device allocations and copies below
+    const size_t keys_bytes   = sizeof(KeyT) * num_items;
+    const size_t values_bytes = sizeof(InputT) * num_items;
+
+    // Host inputs and expected output
+    KeyT    *h_keys_in   = new KeyT[num_items];
+    InputT  *h_values_in = new InputT[num_items];
+    OutputT *h_reference = new OutputT[num_items];
+
+    Initialize(gen_mode, h_keys_in, num_items);
+    Initialize(gen_mode, h_values_in, num_items);
+
+    // For a primitive output type combined with cub::Sum, the test dispatcher
+    // drops scan_op and initial_value for exclusive scans. Without an
+    // initial_value argument the accumulator becomes the input value type,
+    // so the host reference is computed the same way.
+    const bool dispatcher_drops_initial_value =
+        Traits<OutputT>::PRIMITIVE &&
+        std::is_same<ScanOpT, cub::Sum>::value &&
+        !std::is_same<InitialValueT, NullType>::value;
+
+    if (dispatcher_drops_initial_value)
+    {
+      Solve(h_keys_in, h_values_in, h_reference, num_items, cub::Sum{}, InputT{}, equality_op);
+    }
+    else
+    {
+      Solve(h_keys_in, h_values_in, h_reference, num_items, scan_op, initial_value, equality_op);
+    }
+
+    // Device copies of the inputs
+    KeyT   *d_keys_in   = NULL;
+    InputT *d_values_in = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys_in, keys_bytes));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values_in, values_bytes));
+    CubDebugExit(cudaMemcpy(d_keys_in, h_keys_in, keys_bytes, cudaMemcpyHostToDevice));
+    CubDebugExit(cudaMemcpy(d_values_in, h_values_in, values_bytes, cudaMemcpyHostToDevice));
+
+    // Out-of-place run
+    Test<BACKEND>(
+        d_keys_in, d_values_in, h_reference, num_items, scan_op, initial_value, equality_op);
+
+    // In/out values aliasing run
+    TestInplaceValues<BACKEND>(
+        d_keys_in, d_values_in, h_reference, num_items, scan_op, initial_value, equality_op);
+
+    // Upload the original values again before the next run (the aliasing run
+    // presumably overwrote them)
+    CubDebugExit(cudaMemcpy(d_values_in, h_values_in, values_bytes, cudaMemcpyHostToDevice));
+
+    // Keys/values aliasing run; kept last because it changes the keys
+    TestInplaceKeys<BACKEND>(
+        d_keys_in, d_values_in, h_reference, num_items, scan_op, initial_value, equality_op);
+
+    // Release host and device buffers
+    delete[] h_keys_in;
+    delete[] h_values_in;
+    delete[] h_reference;
+    if (d_keys_in)
+    {
+        CubDebugExit(g_allocator.DeviceFree(d_keys_in));
+    }
+    if (d_values_in)
+    {
+        CubDebugExit(g_allocator.DeviceFree(d_values_in));
+    }
+}
+
+
+/**
+ * Run the scan test with iterator inputs: keys come from a counting iterator
+ * passed through DivideByFiveFunctor, values from a constant iterator.
+ */
+template <
+    Backend         BACKEND,
+    typename        KeyT,
+    typename        InputT,
+    typename        OutputT,
+    typename        ScanOpT,
+    typename        InitialValueT,
+    typename        EqualityOpT>
+void TestIterator(
+    int             num_items,
+    ScanOpT         scan_op,
+    InitialValueT   initial_value,
+    EqualityOpT     equality_op)
+{
+    const char *backend_name = (BACKEND == CDP) ? "CDP CUB" : "CUB";
+    const char *scan_kind    = (std::is_same<InitialValueT, NullType>::value) ? "Inclusive" : "Exclusive";
+    const char *op_name      = (std::is_same<ScanOpT, Sum>::value) ? "Sum" : "Scan";
+
+    printf("\nIterator %s %s cub::DeviceScan::%s %d items, %s->%s (%d->%d bytes)\n",
+        backend_name, scan_kind, op_name, num_items,
+        typeid(InputT).name(), typeid(OutputT).name(), (int) sizeof(InputT), (int) sizeof(OutputT));
+    fflush(stdout);
+
+    // Keys: counting iterator starting at 0, transformed by DivideByFiveFunctor
+    using CountingIterT = CountingInputIterator<int, int>;
+    using KeysIterT     = TransformInputIterator<KeyT, DivideByFiveFunctor<KeyT>, CountingIterT>;
+    CountingIterT counting_from_zero(0);
+    KeysIterT     h_keys_in(counting_from_zero, DivideByFiveFunctor<KeyT>());
+
+    // Values: every element is a value-initialized InputT
+    InputT constant_value = InputT();
+    ConstantInputIterator<InputT, int> h_values_in(constant_value);
+
+    // Expected output, computed on the host
+    OutputT *h_reference = new OutputT[num_items];
+    Solve(h_keys_in, h_values_in, h_reference, num_items, scan_op, initial_value, equality_op);
+
+    // Device run on the same iterators and reference
+    Test<BACKEND>(h_keys_in, h_values_in, h_reference, num_items, scan_op, initial_value, equality_op);
+
+    delete[] h_reference;
+}
+
+
+// Test different gen modes: pointer tests for UNIFORM then RANDOM inputs,
+// followed by the iterator test.
+template <Backend  BACKEND,
+          typename KeyT,
+          typename InputT,
+          typename OutputT,
+          typename ScanOpT,
+          typename InitialValueT,
+          typename EqualityOpT>
+void Test(int           num_items,
+          ScanOpT       scan_op,
+          InitialValueT initial_value,
+          EqualityOpT   equality_op)
+{
+    const GenMode pointer_gen_modes[] = {UNIFORM, RANDOM};
+    for (GenMode gen_mode : pointer_gen_modes)
+    {
+        TestPointer<BACKEND, KeyT, InputT, OutputT>(num_items, gen_mode, scan_op, initial_value, equality_op);
+    }
+    TestIterator<BACKEND, KeyT, InputT, OutputT>(num_items, scan_op, initial_value, equality_op);
+}
+
+
+/**
+ * Test different dispatch: forward to the backend chosen at compile time by
+ * TEST_CDP (1 selects CDP, 0 selects CUB). For any other value neither
+ * preprocessor branch is taken and the function body is empty.
+ */
+template <typename KeyT,
+          typename InputT,
+          typename OutputT,
+          typename ScanOpT,
+          typename InitialValueT,
+          typename EqualityOpT>
+void Test(int           num_items,
+          ScanOpT       scan_op,
+          InitialValueT initial_value,
+          EqualityOpT   equality_op)
+{
+#if TEST_CDP == 1
+    Test<CDP, KeyT, InputT, OutputT>(num_items, scan_op, initial_value, equality_op);
+#elif TEST_CDP == 0
+    Test<CUB, KeyT, InputT, OutputT>(num_items, scan_op, initial_value, equality_op);
+#endif // TEST_CDP
+}
+
+
+/**
+ * Test different operators (cub::Sum and cub::Max, exclusive and inclusive)
+ */
+template <typename KeyT, typename InputT, typename OutputT, typename EqualityOpT>
+void TestOp(int         num_items,
+            OutputT     identity,
+            OutputT     initial_value,
+            EqualityOpT equality_op)
+{
+    cub::Sum sum_op{};
+    cub::Max max_op{};
+
+    // Exclusive; identity is the seed because these dispatch to *Sum variants that don't take initial values
+    Test<KeyT, InputT, OutputT>(num_items, sum_op, identity, equality_op);
+    Test<KeyT, InputT, OutputT>(num_items, max_op, identity, equality_op);
+    // Exclusive through WrapperFunctor (non-specialized), so initial_value is exercised
+    Test<KeyT, InputT, OutputT>(num_items, WrapperFunctor<cub::Sum>(sum_op), initial_value, equality_op);
+    Test<KeyT, InputT, OutputT>(num_items, WrapperFunctor<cub::Max>(max_op), initial_value, equality_op);
+    // Inclusive: NullType is passed in place of an initial value
+    Test<KeyT, InputT, OutputT>(num_items, sum_op, NullType(), equality_op);
+    Test<KeyT, InputT, OutputT>(num_items, max_op, NullType(), equality_op);
+}
+
+/**
+ * Test different key type and equality operator: bool keys compared with
+ * Equality, then unsigned int keys compared with Mod2Equality.
+ */
+template <typename InputT, typename OutputT>
+void TestKeyTAndEqualityOp(int num_items, OutputT identity, OutputT initial_value)
+{
+    using BoolKeyT = bool;
+    using UIntKeyT = unsigned int;
+    TestOp<BoolKeyT, InputT>(num_items, identity, initial_value, Equality());
+    TestOp<UIntKeyT, InputT>(num_items, identity, initial_value, Mod2Equality());
+}
+
+/**
+ * Test different input sizes: a negative num_items runs a fixed sweep of
+ * sizes (0, 1, 100, 10000, 1000000); otherwise only num_items is tested.
+ */
+template <typename InputT, typename OutputT>
+void TestSize(
+    int         num_items,
+    OutputT     identity,
+    OutputT     initial_value)
+{
+    if (num_items >= 0)
+    {
+        // Explicit size requested by the caller
+        TestKeyTAndEqualityOp<InputT>(num_items, identity, initial_value);
+        return;
+    }
+
+    // Default sweep, smallest size first
+    const int default_sizes[] = {0, 1, 100, 10000, 1000000};
+    for (int size : default_sizes)
+    {
+        TestKeyTAndEqualityOp<InputT>(size, identity, initial_value);
+    }
+}
+
+
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+
+/**
+ * Main: parse arguments, initialize the device, run the TEST_VALUE_TYPES set
+ */
+int main(int argc, char** argv)
+{
+    int num_items = -1;
+
+    // Parse command line; a negative num_items makes TestSize run its size sweep
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    args.GetCmdLineArgument("n", num_items);
+    args.GetCmdLineArgument("i", g_timing_iterations);
+
+    // Print usage and exit (every optional argument is closed by its own bracket)
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--n=<input items>] "
+            "[--i=<timing iterations>] "
+            "[--device=<device-id>] "
+            "[--v] "
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+    g_device_giga_bandwidth = args.device_giga_bandwidth;
+    printf("\n");
+
+    // %PARAM% TEST_CDP cdp 0:1
+    // %PARAM% TEST_VALUE_TYPES types 0:1:2:3:4:5
+
+#if TEST_VALUE_TYPES == 0
+
+    // Test different input+output data types
+    TestSize<unsigned char>(num_items, (int)0, (int)99);
+
+    // Test same input+output data types
+    TestSize<unsigned char>(num_items, (unsigned char)0, (unsigned char)99);
+    TestSize<signed char>(num_items, (char)0, (char)99);
+
+#elif TEST_VALUE_TYPES == 1
+
+    TestSize<unsigned short>(num_items, (unsigned short)0, (unsigned short)99);
+    TestSize<unsigned int>(num_items, (unsigned int)0, (unsigned int)99);
+    TestSize<unsigned long long>(num_items,
+                                 (unsigned long long)0,
+                                 (unsigned long long)99);
+#elif TEST_VALUE_TYPES == 2
+
+    TestSize<uchar2>(num_items, make_uchar2(0, 0), make_uchar2(17, 21));
+    TestSize<char2>(num_items, make_char2(0, 0), make_char2(17, 21));
+    TestSize<ushort2>(num_items, make_ushort2(0, 0), make_ushort2(17, 21));
+
+#elif TEST_VALUE_TYPES == 3
+
+    TestSize<uint2>(num_items, make_uint2(0, 0), make_uint2(17, 21));
+    TestSize<ulonglong2>(num_items,
+                         make_ulonglong2(0, 0),
+                         make_ulonglong2(17, 21));
+    TestSize<uchar4>(num_items,
+                     make_uchar4(0, 0, 0, 0),
+                     make_uchar4(17, 21, 32, 85));
+
+#elif TEST_VALUE_TYPES == 4
+
+    TestSize<char4>(num_items,
+                    make_char4(0, 0, 0, 0),
+                    make_char4(17, 21, 32, 85));
+
+    TestSize<ushort4>(num_items,
+                      make_ushort4(0, 0, 0, 0),
+                      make_ushort4(17, 21, 32, 85));
+    TestSize<uint4>(num_items,
+                    make_uint4(0, 0, 0, 0),
+                    make_uint4(17, 21, 32, 85));
+
+#elif TEST_VALUE_TYPES == 5
+
+    TestSize<ulonglong4>(num_items,
+                         make_ulonglong4(0, 0, 0, 0),
+                         make_ulonglong4(17, 21, 32, 85));
+
+    TestSize<TestFoo>(num_items,
+                      TestFoo::MakeTestFoo(0, 0, 0, 0),
+                      TestFoo::MakeTestFoo(std::numeric_limits<TestFoo::x_t>::max(),
+                                           std::numeric_limits<TestFoo::y_t>::max(),
+                                           std::numeric_limits<TestFoo::z_t>::max(),
+                                           std::numeric_limits<TestFoo::w_t>::max()));
+
+    TestSize<TestBar>(num_items,
+                      TestBar(0, 0),
+                      TestBar(std::numeric_limits<long long>::max(),
+                              std::numeric_limits<int>::max()));
+
+#endif
+
+    return 0;
+}
diff --git a/include/cub/test/test_device_segmented_sort.cu b/include/cub/test/test_device_segmented_sort.cu
new file mode 100644
index 0000000..a2882ea
--- /dev/null
+++ b/include/cub/test/test_device_segmented_sort.cu
@@ -0,0 +1,1946 @@
+/******************************************************************************
+ * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <cub/device/device_segmented_sort.cuh>
+
+#include <nv/target>
+
+#include <test_util.h>
+
+#include <thrust/count.h>
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include <thrust/random.h>
+#include <thrust/reduce.h>
+#include <thrust/sequence.h>
+#include <thrust/shuffle.h>
+#include <thrust/sort.h>
+
+#include <fstream>
+
+#define TEST_HALF_T !_NVHPC_CUDA
+
+#define TEST_BF_T !_NVHPC_CUDA
+
+#if TEST_HALF_T
+#include <cuda_fp16.h>
+#endif
+
+#if TEST_BF_T
+#include <cuda_bf16.h>
+#endif
+
+using namespace cub;
+
+// Maps a test key type to the key type that Sort (below) passes to
+// cub::DeviceSegmentedSort. The primary template is the identity mapping;
+// the specializations replace half_t and bfloat16_t by the types __half
+// and __nv_bfloat16.
+template <typename T>
+struct UnwrapHalfAndBfloat16
+{
+  typedef T Type;
+};
+
+#if TEST_HALF_T
+// half_t buffers are sorted as __half
+template <>
+struct UnwrapHalfAndBfloat16<half_t> { typedef __half Type; };
+#endif
+
+#if TEST_BF_T
+// bfloat16_t buffers are sorted as __nv_bfloat16
+template <>
+struct UnwrapHalfAndBfloat16<bfloat16_t> { typedef __nv_bfloat16 Type; };
+#endif
+
+// NOTE(review): presumably the repetition count per test case - confirm at the use sites
+constexpr static int MAX_ITERATIONS = 2;
+
+// Describes a group of `segments` segments that each hold `segment_size` items.
+class SizeGroupDescription
+{
+public:
+  int segments {};
+  int segment_size {};
+
+  SizeGroupDescription(const int segments_count, const int items_per_segment)
+      : segments(segments_count)
+      , segment_size(items_per_segment)
+  {}
+};
+
+// Verifies that a segment holds 0, 1, 2, ... (converted to KeyT) in order.
+template <typename KeyT>
+struct SegmentChecker
+{
+  const KeyT *sorted_keys {};
+  const int *offsets {};
+
+  SegmentChecker(const KeyT *sorted_keys, const int *offsets)
+    : sorted_keys(sorted_keys)
+    , offsets(offsets)
+  {}
+
+  // Segment segment_id spans [offsets[segment_id], offsets[segment_id + 1]).
+  bool operator()(int segment_id)
+  {
+    const int first     = offsets[segment_id];
+    const int past_last = offsets[segment_id + 1];
+
+    for (int i = first, expected = 0; i < past_last; ++i, ++expected)
+    {
+      if (sorted_keys[i] != static_cast<KeyT>(expected))
+      {
+        return false;
+      }
+    }
+
+    return true;
+  }
+};
+
+// Verifies that a segment holds ..., 2, 1, 0 (converted to KeyT), i.e. the
+// counter sequence when the segment is read from its last element backwards.
+template <typename KeyT>
+struct DescendingSegmentChecker
+{
+  const KeyT *sorted_keys{};
+  const int *offsets{};
+
+  DescendingSegmentChecker(const KeyT *sorted_keys, const int *offsets)
+      : sorted_keys(sorted_keys)
+      , offsets(offsets)
+  {}
+
+  bool operator()(int segment_id)
+  {
+    const int first     = offsets[segment_id];
+    const int past_last = offsets[segment_id + 1];
+
+    for (int i = past_last - 1, expected = 0; i >= first; --i, ++expected)
+    {
+      if (sorted_keys[i] != static_cast<KeyT>(expected))
+      {
+        return false;
+      }
+    }
+
+    return true;
+  }
+};
+
+// Fills a segment of size n with n-1, n-2, ..., 0 (converted to KeyT).
+template <typename KeyT>
+struct ReversedIota
+{
+  KeyT *data {};
+  const int *offsets {};
+
+  ReversedIota(KeyT *data, const int *offsets)
+    : data(data)
+    , offsets(offsets)
+  {}
+
+  void operator()(int segment_id) const
+  {
+    const int first     = offsets[segment_id];
+    const int past_last = offsets[segment_id + 1];
+
+    // The largest value goes first, counting down to zero
+    int next_value = past_last - first - 1;
+    for (int i = first; i < past_last; ++i)
+    {
+      data[i] = static_cast<KeyT>(next_value--);
+    }
+  }
+};
+
+
+// Fills a segment with 0, 1, 2, ... (converted to KeyT).
+template <typename KeyT>
+struct Iota
+{
+  KeyT *data{};
+  const int *offsets{};
+
+  Iota(KeyT *data, const int *offsets)
+      : data(data)
+      , offsets(offsets)
+  {}
+
+  void operator()(int segment_id) const
+  {
+    const int first     = offsets[segment_id];
+    const int past_last = offsets[segment_id + 1];
+
+    for (int i = first, value = 0; i < past_last; ++i, ++value)
+    {
+      data[i] = static_cast<KeyT>(value);
+    }
+  }
+};
+
+
+// Test input for the segmented sort tests: holds segment sizes and offsets,
+// the device key/value buffers and host staging buffers for verification.
+// Keys are generated per segment as a 0-based counter (see gen_keys), so a
+// sorted segment can be checked against the same counter (see check_output).
+template <typename KeyT,
+          typename ValueT = cub::NullType>
+class Input
+{
+  thrust::default_random_engine random_engine;
+  thrust::device_vector<int> d_segment_sizes;
+  thrust::device_vector<int> d_offsets;
+  thrust::host_vector<int> h_offsets;
+
+  // Element type of the value buffers: KeyT stands in when ValueT is NullType.
+  using MaskedValueT = cub::detail::conditional_t<
+    std::is_same<ValueT, cub::NullType>::value, KeyT, ValueT>;
+
+  bool reverse {};
+  int num_items {};
+  thrust::device_vector<KeyT> d_keys;
+  thrust::device_vector<MaskedValueT> d_values;
+  thrust::host_vector<KeyT> h_keys;
+  thrust::host_vector<MaskedValueT> h_values;
+
+public:
+  // Builds the input from per-segment sizes and generates keys and values.
+  // `reverse` selects ascending generated keys and the descending checker.
+  Input(bool reverse, const thrust::host_vector<int> &h_segment_sizes)
+      : d_segment_sizes(h_segment_sizes)
+      , d_offsets(d_segment_sizes.size() + 1)
+      , h_offsets(d_segment_sizes.size() + 1)
+      , reverse(reverse)
+      , num_items(static_cast<int>(
+          thrust::reduce(d_segment_sizes.begin(), d_segment_sizes.end())))
+      , d_keys(num_items)
+      , d_values(num_items)
+      , h_keys(num_items)
+      , h_values(num_items)
+  {
+    update();
+  }
+
+  // Builds the input from ready-made offsets; no keys are generated here.
+  // Host staging buffers are sized too, so check_output() has room to copy.
+  // NOTE(review): d_segment_sizes stays empty on this path, so
+  // get_num_segments() returns 0 - confirm callers do not rely on it.
+  Input(thrust::host_vector<int> &h_offsets)
+    : d_offsets(h_offsets)
+    , h_offsets(h_offsets)
+    , reverse(false)
+    , num_items(h_offsets.back())
+    , d_keys(num_items)
+    , d_values(num_items)
+    , h_keys(num_items)
+    , h_values(num_items)
+  {}
+
+  // Shuffles the segment sizes, then rebuilds the offsets and the keys.
+  void shuffle()
+  {
+    thrust::shuffle(d_segment_sizes.begin(), d_segment_sizes.end(), random_engine);
+    update();
+  }
+
+  int get_num_items() const
+  {
+    return num_items;
+  }
+
+  int get_num_segments() const
+  {
+    return static_cast<int>(d_segment_sizes.size());
+  }
+
+  const KeyT *get_d_keys() const
+  {
+    return thrust::raw_pointer_cast(d_keys.data());
+  }
+
+  thrust::device_vector<KeyT> &get_d_keys_vec()
+  {
+    return d_keys;
+  }
+
+  thrust::device_vector<MaskedValueT> &get_d_values_vec()
+  {
+    return d_values;
+  }
+
+  KeyT *get_d_keys()
+  {
+    return thrust::raw_pointer_cast(d_keys.data());
+  }
+
+  const thrust::host_vector<int>& get_h_offsets()
+  {
+    return h_offsets;
+  }
+
+  MaskedValueT *get_d_values()
+  {
+    return thrust::raw_pointer_cast(d_values.data());
+  }
+
+  const int *get_d_offsets() const
+  {
+    return thrust::raw_pointer_cast(d_offsets.data());
+  }
+
+  // Checks every segment of a host array against the expected counter
+  // sequence; the descending checker is used when `reverse` is set.
+  template <typename T>
+  bool check_output_implementation(const T *keys_output)
+  {
+    const int *offsets = thrust::raw_pointer_cast(h_offsets.data());
+    SegmentChecker<T> ascending_checker{keys_output, offsets};
+    DescendingSegmentChecker<T> descending_checker{keys_output, offsets};
+
+    for (int i = 0; i < get_num_segments(); i++)
+    {
+      const bool segment_ok = reverse ? descending_checker(i) : ascending_checker(i);
+      if (!segment_ok)
+      {
+        return false;
+      }
+    }
+
+    return true;
+  }
+
+  // Copies the device results to the host and verifies them. Values are
+  // only checked when ValueT is not NullType and d_values_output is given.
+  bool check_output(const KeyT *d_keys_output,
+                    const MaskedValueT *d_values_output = nullptr)
+  {
+    KeyT *keys_output = thrust::raw_pointer_cast(h_keys.data());
+    MaskedValueT *values_output = thrust::raw_pointer_cast(h_values.data());
+
+    // A failed copy must not be reported as a sort failure (or as a pass).
+    CubDebugExit(cudaMemcpy(keys_output,
+                            d_keys_output,
+                            sizeof(KeyT) * num_items,
+                            cudaMemcpyDeviceToHost));
+
+    const bool keys_ok = check_output_implementation(keys_output);
+
+    if (std::is_same<ValueT, cub::NullType>::value || d_values_output == nullptr)
+    {
+      return keys_ok;
+    }
+
+    CubDebugExit(cudaMemcpy(values_output,
+                            d_values_output,
+                            sizeof(MaskedValueT) * num_items,
+                            cudaMemcpyDeviceToHost));
+
+    const bool values_ok = check_output_implementation(values_output);
+
+    return keys_ok && values_ok;
+  }
+
+private:
+  void update()
+  {
+    fill_offsets();
+    gen_keys();
+  }
+
+  // Offsets are the exclusive prefix sum of the segment sizes; d_offsets has
+  // one extra trailing element, which ends up holding the total item count.
+  void fill_offsets()
+  {
+    thrust::copy(d_segment_sizes.begin(), d_segment_sizes.end(), d_offsets.begin());
+    thrust::exclusive_scan(d_offsets.begin(), d_offsets.end(), d_offsets.begin(), 0u);
+    thrust::copy(d_offsets.begin(), d_offsets.end(), h_offsets.begin());
+  }
+
+  // Fills each segment on the host, then uploads; values are copied from keys.
+  void gen_keys()
+  {
+    KeyT *keys_output = thrust::raw_pointer_cast(h_keys.data());
+    const int *offsets = thrust::raw_pointer_cast(h_offsets.data());
+    const Iota<KeyT> ascending_generator{keys_output, offsets};
+    const ReversedIota<KeyT> descending_generator{keys_output, offsets};
+    for (int i = 0; i < get_num_segments(); i++)
+    {
+      if (reverse)
+      {
+        ascending_generator(i);
+      }
+      else
+      {
+        descending_generator(i);
+      }
+    }
+
+    d_keys = h_keys;
+    d_values = d_keys;
+  }
+};
+
+// Collects segment sizes for integral key types. A size group is kept only
+// if its segment_size is below numeric_limits<KeyT>::max(); others are dropped.
+template <typename KeyT, bool IsIntegralType = std::is_integral<KeyT>::value>
+class InputDescription
+{
+  thrust::host_vector<int> segment_sizes;
+
+public:
+  InputDescription& add(const SizeGroupDescription &group)
+  {
+    const auto requested_size = static_cast<std::size_t>(group.segment_size);
+    const auto key_type_limit = static_cast<std::size_t>((std::numeric_limits<KeyT>::max)());
+    // A group whose size does not fit contributes no segments at all
+    const int accepted_segments = (requested_size < key_type_limit) ? group.segments : 0;
+    for (int i = 0; i < accepted_segments; i++)
+    {
+      segment_sizes.push_back(group.segment_size);
+    }
+    return *this;
+  }
+
+  template <typename ValueT = cub::NullType>
+  Input<KeyT, ValueT> gen(bool reverse)
+  {
+    return Input<KeyT, ValueT>(reverse, segment_sizes);
+  }
+};
+
+// Specialization for non-integral key types: every size group is accepted.
+template <typename KeyT>
+class InputDescription<KeyT, false>
+{
+  thrust::host_vector<int> segment_sizes;
+
+public:
+  InputDescription& add(const SizeGroupDescription &group)
+  {
+    for (int remaining = group.segments; remaining > 0; --remaining)
+    {
+      segment_sizes.push_back(group.segment_size);
+    }
+    return *this;
+  }
+
+  template <typename ValueT = cub::NullType>
+  Input<KeyT, ValueT> gen(bool reverse)
+  {
+    return Input<KeyT, ValueT>(reverse, segment_sizes);
+  }
+};
+
+
+template <typename WrappedKeyT,
+          typename ValueT>
+void Sort(bool pairs,
+          bool descending,
+          bool double_buffer,
+          bool stable_sort,
+
+          void *tmp_storage,
+          std::size_t &temp_storage_bytes,
+
+          WrappedKeyT *wrapped_input_keys,
+          WrappedKeyT *wrapped_output_keys,
+
+          ValueT *input_values,
+          ValueT *output_values,
+
+          int num_items,
+          int num_segments,
+          const int *d_offsets,
+
+          int *keys_selector = nullptr,
+          int *values_selector = nullptr)
+{
+  using KeyT = typename UnwrapHalfAndBfloat16<WrappedKeyT>::Type;
+
+  auto input_keys = reinterpret_cast<KeyT*>(wrapped_input_keys);
+  auto output_keys = reinterpret_cast<KeyT*>(wrapped_output_keys);
+
+  if (stable_sort)
+  {
+    if (pairs)
+    {
+      if (descending)
+      {
+        if (double_buffer)
+        {
+          cub::DoubleBuffer<KeyT> keys_buffer(input_keys, output_keys);
+          keys_buffer.selector = *keys_selector;
+
+          cub::DoubleBuffer<ValueT> values_buffer(input_values, output_values);
+          values_buffer.selector = *values_selector;
+
+          CubDebugExit(cub::DeviceSegmentedSort::StableSortPairsDescending(
+            tmp_storage,
+            temp_storage_bytes,
+            keys_buffer,
+            values_buffer,
+            num_items,
+            num_segments,
+            d_offsets,
+            d_offsets + 1));
+
+          *keys_selector   = keys_buffer.selector;
+          *values_selector = values_buffer.selector;
+        }
+        else
+        {
+          CubDebugExit(cub::DeviceSegmentedSort::StableSortPairsDescending(
+            tmp_storage,
+            temp_storage_bytes,
+            input_keys,
+            output_keys,
+            input_values,
+            output_values,
+            num_items,
+            num_segments,
+            d_offsets,
+            d_offsets + 1));
+        }
+      }
+      else
+      {
+        if (double_buffer)
+        {
+          cub::DoubleBuffer<KeyT> keys_buffer(input_keys, output_keys);
+          keys_buffer.selector = *keys_selector;
+
+          cub::DoubleBuffer<ValueT> values_buffer(input_values, output_values);
+          values_buffer.selector = *values_selector;
+
+          CubDebugExit(
+            cub::DeviceSegmentedSort::StableSortPairs(tmp_storage,
+                                                      temp_storage_bytes,
+                                                      keys_buffer,
+                                                      values_buffer,
+                                                      num_items,
+                                                      num_segments,
+                                                      d_offsets,
+                                                      d_offsets + 1));
+
+          *keys_selector   = keys_buffer.selector;
+          *values_selector = values_buffer.selector;
+        }
+        else
+        {
+          CubDebugExit(
+            cub::DeviceSegmentedSort::StableSortPairs(tmp_storage,
+                                                      temp_storage_bytes,
+                                                      input_keys,
+                                                      output_keys,
+                                                      input_values,
+                                                      output_values,
+                                                      num_items,
+                                                      num_segments,
+                                                      d_offsets,
+                                                      d_offsets + 1));
+        }
+      }
+    }
+    else
+    {
+      if (descending)
+      {
+        if (double_buffer)
+        {
+          cub::DoubleBuffer<KeyT> keys_buffer(input_keys, output_keys);
+          keys_buffer.selector = *keys_selector;
+
+          CubDebugExit(cub::DeviceSegmentedSort::StableSortKeysDescending(
+            tmp_storage,
+            temp_storage_bytes,
+            keys_buffer,
+            num_items,
+            num_segments,
+            d_offsets,
+            d_offsets + 1));
+
+          *keys_selector = keys_buffer.selector;
+        }
+        else
+        {
+          CubDebugExit(cub::DeviceSegmentedSort::StableSortKeysDescending(
+            tmp_storage,
+            temp_storage_bytes,
+            input_keys,
+            output_keys,
+            num_items,
+            num_segments,
+            d_offsets,
+            d_offsets + 1));
+        }
+      }
+      else
+      {
+        if (double_buffer)
+        {
+          cub::DoubleBuffer<KeyT> keys_buffer(input_keys, output_keys);
+          keys_buffer.selector = *keys_selector;
+
+          CubDebugExit(
+            cub::DeviceSegmentedSort::StableSortKeys(tmp_storage,
+                                                     temp_storage_bytes,
+                                                     keys_buffer,
+                                                     num_items,
+                                                     num_segments,
+                                                     d_offsets,
+                                                     d_offsets + 1));
+
+          *keys_selector = keys_buffer.selector;
+        }
+        else
+        {
+          CubDebugExit(
+            cub::DeviceSegmentedSort::StableSortKeys(tmp_storage,
+                                                     temp_storage_bytes,
+                                                     input_keys,
+                                                     output_keys,
+                                                     num_items,
+                                                     num_segments,
+                                                     d_offsets,
+                                                     d_offsets + 1));
+        }
+      }
+    }
+  }
+  else
+  {
+    if (pairs)
+    {
+      if (descending)
+      {
+        if (double_buffer)
+        {
+          cub::DoubleBuffer<KeyT> keys_buffer(input_keys, output_keys);
+          keys_buffer.selector = *keys_selector;
+
+          cub::DoubleBuffer<ValueT> values_buffer(input_values, output_values);
+          values_buffer.selector = *values_selector;
+
+          CubDebugExit(
+            cub::DeviceSegmentedSort::SortPairsDescending(tmp_storage,
+                                                          temp_storage_bytes,
+                                                          keys_buffer,
+                                                          values_buffer,
+                                                          num_items,
+                                                          num_segments,
+                                                          d_offsets,
+                                                          d_offsets + 1));
+
+          *keys_selector   = keys_buffer.selector;
+          *values_selector = values_buffer.selector;
+        }
+        else
+        {
+          CubDebugExit(
+            cub::DeviceSegmentedSort::SortPairsDescending(tmp_storage,
+                                                          temp_storage_bytes,
+                                                          input_keys,
+                                                          output_keys,
+                                                          input_values,
+                                                          output_values,
+                                                          num_items,
+                                                          num_segments,
+                                                          d_offsets,
+                                                          d_offsets + 1));
+        }
+      }
+      else
+      {
+        if (double_buffer)
+        {
+          cub::DoubleBuffer<KeyT> keys_buffer(input_keys, output_keys);
+          keys_buffer.selector = *keys_selector;
+
+          cub::DoubleBuffer<ValueT> values_buffer(input_values, output_values);
+          values_buffer.selector = *values_selector;
+
+          CubDebugExit(cub::DeviceSegmentedSort::SortPairs(tmp_storage,
+                                                           temp_storage_bytes,
+                                                           keys_buffer,
+                                                           values_buffer,
+                                                           num_items,
+                                                           num_segments,
+                                                           d_offsets,
+                                                           d_offsets + 1));
+
+          *keys_selector   = keys_buffer.selector;
+          *values_selector = values_buffer.selector;
+        }
+        else
+        {
+          CubDebugExit(cub::DeviceSegmentedSort::SortPairs(tmp_storage,
+                                                           temp_storage_bytes,
+                                                           input_keys,
+                                                           output_keys,
+                                                           input_values,
+                                                           output_values,
+                                                           num_items,
+                                                           num_segments,
+                                                           d_offsets,
+                                                           d_offsets + 1));
+        }
+      }
+    }
+    else
+    {
+      if (descending)
+      {
+        if (double_buffer)
+        {
+          cub::DoubleBuffer<KeyT> keys_buffer(input_keys, output_keys);
+          keys_buffer.selector = *keys_selector;
+
+          CubDebugExit(
+            cub::DeviceSegmentedSort::SortKeysDescending(tmp_storage,
+                                                         temp_storage_bytes,
+                                                         keys_buffer,
+                                                         num_items,
+                                                         num_segments,
+                                                         d_offsets,
+                                                         d_offsets + 1));
+
+          *keys_selector = keys_buffer.selector;
+        }
+        else
+        {
+          CubDebugExit(
+            cub::DeviceSegmentedSort::SortKeysDescending(tmp_storage,
+                                                         temp_storage_bytes,
+                                                         input_keys,
+                                                         output_keys,
+                                                         num_items,
+                                                         num_segments,
+                                                         d_offsets,
+                                                         d_offsets + 1));
+        }
+      }
+      else
+      {
+        if (double_buffer)
+        {
+          cub::DoubleBuffer<KeyT> keys_buffer(input_keys, output_keys);
+          keys_buffer.selector = *keys_selector;
+
+          CubDebugExit(cub::DeviceSegmentedSort::SortKeys(tmp_storage,
+                                                          temp_storage_bytes,
+                                                          keys_buffer,
+                                                          num_items,
+                                                          num_segments,
+                                                          d_offsets,
+                                                          d_offsets + 1));
+
+          *keys_selector = keys_buffer.selector;
+        }
+        else
+        {
+          CubDebugExit(cub::DeviceSegmentedSort::SortKeys(tmp_storage,
+                                                          temp_storage_bytes,
+                                                          input_keys,
+                                                          output_keys,
+                                                          num_items,
+                                                          num_segments,
+                                                          d_offsets,
+                                                          d_offsets + 1));
+        }
+      }
+    }
+  }
+}
+
+template <typename KeyT,
+          typename ValueT>
+std::size_t Sort(bool pairs, // true: sort key-value pairs, false: sort keys only
+                 bool descending, // true: use the *Descending sort variants
+                 bool double_buffer, // true: use the cub::DoubleBuffer overloads
+                 bool stable_sort, // stable/unstable switch, forwarded as-is
+                 // Key arrays, forwarded as-is to the lower-level Sort overload.
+                 KeyT *input_keys,
+                 KeyT *output_keys,
+                 // Value arrays, forwarded as-is.
+                 ValueT *input_values,
+                 ValueT *output_values,
+                 // Total item count, segment count and segment offsets, forwarded as-is.
+                 int num_items,
+                 int num_segments,
+                 const int *d_offsets,
+                 // The lower-level overload dereferences these in double-buffer mode, so they must be non-null there.
+                 int *keys_selector   = nullptr,
+                 int *values_selector = nullptr)
+{ // Calls the lower-level Sort twice (null storage, then allocated storage) and returns the temp storage size.
+  std::size_t temp_storage_bytes = 42ul; // arbitrary start value; presumably overwritten by the first call below
+  // First call: null temp storage. NOTE(review): relies on the CUB convention that such a call only reports the required size.
+  Sort<KeyT, ValueT>(pairs,
+                     descending,
+                     double_buffer,
+                     stable_sort,
+                     nullptr,
+                     temp_storage_bytes,
+                     input_keys,
+                     output_keys,
+                     input_values,
+                     output_values,
+                     num_items,
+                     num_segments,
+                     d_offsets,
+                     keys_selector,
+                     values_selector);
+  // Allocate temp_storage_bytes bytes in a device vector and take its raw pointer.
+  thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
+  std::uint8_t *d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
+  // Second call: identical arguments, now with the allocated temp storage.
+  Sort<KeyT, ValueT>(pairs,
+                     descending,
+                     double_buffer,
+                     stable_sort,
+                     d_temp_storage,
+                     temp_storage_bytes,
+                     input_keys,
+                     output_keys,
+                     input_values,
+                     output_values,
+                     num_items,
+                     num_segments,
+                     d_offsets,
+                     keys_selector,
+                     values_selector);
+  // The size is returned so that callers can compare storage requirements.
+  return temp_storage_bytes;
+}
+
+
+constexpr bool keys_only = false; // named values for the `pairs` flag of Sort
+constexpr bool pairs = true;
+
+constexpr bool ascending = false; // named values for the `descending` flag of Sort
+constexpr bool descending = true;
+
+constexpr bool pointers = false; // named values for the `double_buffer` flag of Sort
+constexpr bool double_buffer = true;
+
+constexpr bool unstable = false; // named values for the `stable_sort` flag of Sort
+constexpr bool stable = true;
+
+
+void TestZeroSegments()
+{ // Runs every flag combination with zero items and zero segments, then checks the selectors.
+  // Type doesn't affect the escape logic, so it should be fine
+  // to test only one set of types here.
+
+  using KeyT = std::uint8_t;
+  using ValueT = std::uint64_t;
+
+  for (bool stable_sort: { unstable, stable })
+  {
+    for (bool sort_pairs: { keys_only, pairs })
+    {
+      for (bool sort_descending: { ascending, descending })
+      {
+        for (bool sort_buffer: { pointers, double_buffer })
+        {
+          cub::DoubleBuffer<KeyT> keys_buffer(nullptr, nullptr); // only the selector members are used below
+          cub::DoubleBuffer<ValueT> values_buffer(nullptr, nullptr);
+          values_buffer.selector = 1; // explicitly set to 1 so it can be told apart from the keys selector
+          // All data pointers and the offsets pointer are null; item and segment counts are zero.
+          Sort<KeyT, ValueT>(sort_pairs,
+                             sort_descending,
+                             sort_buffer,
+                             stable_sort,
+                             nullptr,
+                             nullptr,
+                             nullptr,
+                             nullptr,
+                             int{},
+                             int{},
+                             nullptr,
+                             &keys_buffer.selector,
+                             &values_buffer.selector);
+          // Expect the keys selector to be 0 and the values selector to still be the 1 assigned above.
+          AssertEquals(keys_buffer.selector, 0);
+          AssertEquals(values_buffer.selector, 1);
+        }
+      }
+    }
+  }
+}
+
+
+void TestEmptySegments(int segments)
+{ // Runs every flag combination on `segments` segments that are all empty, then checks the selectors.
+  // Type doesn't affect the escape logic, so it should be fine
+  // to test only one set of types here.
+
+  using KeyT = std::uint8_t;
+  using ValueT = std::uint64_t;
+
+  thrust::device_vector<int> offsets(segments + 1, int{}); // all offsets are zero, so every segment has size zero
+  const int *d_offsets = thrust::raw_pointer_cast(offsets.data());
+
+  for (bool sort_stable: { unstable, stable })
+  {
+    for (bool sort_pairs: { keys_only, pairs })
+    {
+      for (bool sort_descending: { ascending, descending })
+      {
+        for (bool sort_buffer: { pointers, double_buffer })
+        {
+          cub::DoubleBuffer<KeyT> keys_buffer(nullptr, nullptr); // only the selector members are used below
+          cub::DoubleBuffer<ValueT> values_buffer(nullptr, nullptr);
+          values_buffer.selector = 1; // explicitly set to 1 so it can be told apart from the keys selector
+          // Data pointers are null and the item count is zero; only the offsets array is real.
+          Sort<KeyT, ValueT>(sort_pairs,
+                             sort_descending,
+                             sort_buffer,
+                             sort_stable,
+                             nullptr,
+                             nullptr,
+                             nullptr,
+                             nullptr,
+                             int{},
+                             segments,
+                             d_offsets,
+                             &keys_buffer.selector,
+                             &values_buffer.selector);
+          // Expect the keys selector to be 0 and the values selector to still be the 1 assigned above.
+          AssertEquals(keys_buffer.selector, 0);
+          AssertEquals(values_buffer.selector, 1);
+        }
+      }
+    }
+  }
+}
+
+
+template <typename KeyT,
+          typename ValueT>
+void TestSameSizeSegments(int segment_size,
+                          int segments,
+                          bool skip_values = false) // true: skip the key-value (pairs) combinations
+{ // Sorts `segments` segments of `segment_size` identical keys under every flag combination and counts the results.
+  const int num_items = segment_size * segments;
+
+  thrust::device_vector<int> offsets(segments + 1);
+  thrust::sequence(offsets.begin(),
+                   offsets.end(),
+                   int{},
+                   segment_size); // offsets are 0, segment_size, 2 * segment_size, ...
+
+  const int *d_offsets = thrust::raw_pointer_cast(offsets.data());
+
+  const KeyT target_key {1};
+  const ValueT target_value {42};
+
+  thrust::device_vector<KeyT> keys_input(num_items);
+  thrust::device_vector<KeyT> keys_output(num_items);
+
+  KeyT *d_keys_input  = thrust::raw_pointer_cast(keys_input.data());
+  KeyT *d_keys_output = thrust::raw_pointer_cast(keys_output.data());
+
+  thrust::device_vector<ValueT> values_input(num_items);
+  thrust::device_vector<ValueT> values_output(num_items);
+
+  thrust::host_vector<KeyT> host_keys(num_items); // host copies used for counting after each sort
+  thrust::host_vector<ValueT> host_values(num_items);
+
+  ValueT *d_values_input  = thrust::raw_pointer_cast(values_input.data());
+  ValueT *d_values_output = thrust::raw_pointer_cast(values_output.data());
+
+  for (bool stable_sort: { unstable, stable })
+  {
+    for (bool sort_pairs: { keys_only, pairs })
+    {
+      if (sort_pairs)
+      {
+        if (skip_values)
+        {
+          continue;
+        }
+      }
+
+      for (bool sort_descending: { ascending, descending })
+      {
+        for (bool sort_buffers: { pointers, double_buffer })
+        {
+          cub::DoubleBuffer<KeyT> keys_buffer(nullptr, nullptr); // only the selector members are used
+          cub::DoubleBuffer<ValueT> values_buffer(nullptr, nullptr);
+          values_buffer.selector = 1; // in double-buffer mode Sort copies this selector into its own values buffer
+          // Every key equals target_key, so any correct sort leaves num_items copies of it in the result.
+          thrust::fill(keys_input.begin(), keys_input.end(), target_key);
+          thrust::fill(keys_output.begin(), keys_output.end(), KeyT{});
+
+          if (sort_pairs)
+          {
+            if (sort_buffers)
+            { // values selector is 1, so the target values are placed in the second (output) array
+              thrust::fill(values_input.begin(), values_input.end(), ValueT{});
+              thrust::fill(values_output.begin(), values_output.end(), target_value);
+            }
+            else
+            { // pointer mode: the target values are placed in the input array
+              thrust::fill(values_input.begin(), values_input.end(), target_value);
+              thrust::fill(values_output.begin(), values_output.end(), ValueT{});
+            }
+          }
+
+          const std::size_t temp_storage_bytes =
+            Sort<KeyT, ValueT>(sort_pairs,
+                               sort_descending,
+                               sort_buffers,
+                               stable_sort,
+                               d_keys_input,
+                               d_keys_output,
+                               d_values_input,
+                               d_values_output,
+                               num_items,
+                               segments,
+                               d_offsets,
+                               &keys_buffer.selector,
+                               &values_buffer.selector);
+
+          // If temporary storage size is defined by extra keys storage
+          if (sort_buffers)
+          {
+            if (2 * segments * sizeof(unsigned int) < num_items * sizeof(KeyT))
+            {
+              std::size_t extra_temp_storage_bytes{};
+              // Size query only (null temp storage) for the same problem in pointer mode.
+              Sort(sort_pairs,
+                   sort_descending,
+                   pointers,
+                   stable_sort,
+                   nullptr,
+                   extra_temp_storage_bytes,
+                   d_keys_input,
+                   d_keys_output,
+                   d_values_input,
+                   d_values_output,
+                   num_items,
+                   segments,
+                   d_offsets,
+                   &keys_buffer.selector,
+                   &values_buffer.selector);
+              // Pointer mode is expected to need strictly more temp storage than double-buffer mode.
+              AssertTrue(extra_temp_storage_bytes > temp_storage_bytes);
+            }
+          }
+
+          { // parsed as (selector || !sort_buffers) ? output : input
+            host_keys = keys_buffer.selector || !sort_buffers ? keys_output
+                                                              : keys_input;
+            const std::size_t items_selected =
+              thrust::count(host_keys.begin(), host_keys.end(), target_key);
+            AssertEquals(static_cast<int>(items_selected), num_items);
+          }
+
+          if (sort_pairs)
+          { // same buffer selection rule as for the keys above
+            host_values = values_buffer.selector || !sort_buffers
+                            ? values_output
+                            : values_input;
+            const std::size_t items_selected =
+              thrust::count(host_values.begin(),
+                            host_values.end(),
+                            target_value);
+
+            AssertEquals(static_cast<int>(items_selected), num_items);
+          }
+        }
+      }
+    }
+  }
+}
+
+
+template <typename KeyT,
+          typename ValueT>
+void InputTest(bool sort_descending,
+               Input<KeyT, ValueT> &input)
+{ // Sorts `input` for every stable/pairs/buffer combination and validates each result with input.check_output.
+  thrust::device_vector<KeyT> keys_output(input.get_num_items());
+  KeyT *d_keys_output = thrust::raw_pointer_cast(keys_output.data());
+
+  thrust::device_vector<ValueT> values_output(input.get_num_items());
+  ValueT *d_values_output = thrust::raw_pointer_cast(values_output.data());
+
+  for (bool stable_sort: { unstable, stable })
+  {
+    for (bool sort_pairs : { keys_only, pairs })
+    {
+      for (bool sort_buffers : {pointers, double_buffer})
+      {
+        for (int iteration = 0; iteration < MAX_ITERATIONS; iteration++)
+        {
+          thrust::fill(keys_output.begin(), keys_output.end(), KeyT{}); // clear the previous iteration's output
+          thrust::fill(values_output.begin(), values_output.end(), ValueT{});
+          // Local buffers mirror the arrays passed to Sort; their selectors receive the result selectors.
+          cub::DoubleBuffer<KeyT> keys_buffer(input.get_d_keys(),
+                                              d_keys_output);
+          cub::DoubleBuffer<ValueT> values_buffer(input.get_d_values(),
+                                                  d_values_output);
+
+          Sort<KeyT, ValueT>(sort_pairs,
+                             sort_descending,
+                             sort_buffers,
+                             stable_sort,
+                             input.get_d_keys(),
+                             d_keys_output,
+                             input.get_d_values(),
+                             d_values_output,
+                             input.get_num_items(),
+                             input.get_num_segments(),
+                             input.get_d_offsets(),
+                             &keys_buffer.selector,
+                             &values_buffer.selector);
+
+          if (sort_buffers)
+          { // double-buffer mode: the result is in whichever array the selector marks as current
+            if (sort_pairs)
+            {
+              AssertTrue(input.check_output(keys_buffer.Current(),
+                                            values_buffer.Current()));
+            }
+            else
+            {
+              AssertTrue(input.check_output(keys_buffer.Current()));
+            }
+          }
+          else
+          { // pointer mode: the output arrays are checked directly
+            if (sort_pairs)
+            {
+              AssertTrue(input.check_output(d_keys_output, d_values_output));
+            }
+            else
+            {
+              AssertTrue(input.check_output(d_keys_output));
+            }
+          }
+
+          input.shuffle(); // NOTE(review): presumably re-randomizes the input for the next iteration; defined outside this view
+        }
+      }
+    }
+  }
+}
+
+struct ComparisonPredicate
+{ // Equality predicate passed to thrust::mismatch in compare_two_outputs.
+  template <typename T>
+  __host__ __device__ bool operator()(const T &lhs, const T &rhs) const
+  {
+    return lhs == rhs; // generic case: plain operator==
+  }
+
+  __host__ __device__ bool operator()(const half_t &lhs, const half_t &rhs) const
+  {
+    return lhs.raw() == rhs.raw(); // half_t: compare the values returned by raw() instead of using operator==
+  }
+};
+
+template <typename T>
+bool compare_two_outputs(const thrust::host_vector<int> &offsets, // segment i spans [offsets[i], offsets[i + 1])
+                         const thrust::host_vector<T> &lhs,
+                         const thrust::host_vector<T> &rhs)
+{ // Returns true when lhs and rhs match in every segment; otherwise reports the first mismatch on std::cerr and returns false.
+  const std::size_t num_segments = offsets.empty() ? 0 : offsets.size() - 1; // empty offsets means no segments (avoids size_t wrap-around)
+  // Compare segment by segment so that a mismatch is reported relative to its segment.
+  for (std::size_t segment_id = 0; segment_id < num_segments; segment_id++)
+  {
+    auto lhs_begin = lhs.cbegin() + offsets[segment_id];
+    auto lhs_end = lhs.cbegin() + offsets[segment_id + 1];
+    auto rhs_begin = rhs.cbegin() + offsets[segment_id];
+    // err.first is the first lhs position that differs, or lhs_end when the segment matches.
+    auto err = thrust::mismatch(lhs_begin, lhs_end, rhs_begin, ComparisonPredicate{});
+
+    if (err.first != lhs_end)
+    {
+      const auto idx = thrust::distance(lhs_begin, err.first); // position of the mismatch inside the segment
+      const auto segment_size = std::distance(lhs_begin, lhs_end);
+      // Both elements are printed after conversion to std::uint64_t, followed by the element type name.
+      std::cerr << "Mismatch in segment " << segment_id
+                << " at position " << idx << " / " << segment_size
+                << ": "
+                << static_cast<std::uint64_t>(lhs_begin[idx]) << " vs "
+                << static_cast<std::uint64_t>(rhs_begin[idx]) << " ("
+                << typeid(lhs_begin[idx]).name() << ")" << std::endl;
+
+      return false; // stop at the first mismatching segment
+    }
+  }
+
+  return true;
+}
+
+template <typename ValueT>
+void RandomizeInput(thrust::host_vector<bool> &h_keys,
+                    thrust::host_vector<ValueT> &h_values)
+{ // Overload for bool keys: fills h_keys and h_values element by element from RandomValue (defined outside this view).
+  for (std::size_t i = 0; i < h_keys.size(); i++)
+  {
+    h_keys[i] = RandomValue((std::numeric_limits<std::uint8_t>::max)()) > 128; // key is true when the drawn value exceeds 128
+    h_values[i] = RandomValue((std::numeric_limits<ValueT>::max)()); // indexes h_values up to h_keys.size(); caller must size it accordingly
+  }
+}
+
+template <typename KeyT,
+          typename ValueT>
+void RandomizeInput(thrust::host_vector<KeyT> &h_keys,
+                    thrust::host_vector<ValueT> &h_values)
+{ // Generic overload: fills h_keys and h_values element by element from RandomValue (defined outside this view).
+  for (std::size_t i = 0; i < h_keys.size(); i++)
+  {
+    h_keys[i] = RandomValue((std::numeric_limits<KeyT>::max)()); // argument is the maximum of the key type
+    h_values[i] = RandomValue((std::numeric_limits<ValueT>::max)()); // indexes h_values up to h_keys.size(); caller must size it accordingly
+  }
+}
+
+#if TEST_HALF_T
+void RandomizeInput(thrust::host_vector<half_t> &h_keys,
+                    thrust::host_vector<std::uint32_t> &h_values)
+{ // Overload for half_t keys with std::uint32_t values; only compiled when TEST_HALF_T is set.
+  for (std::size_t i = 0; i < h_keys.size(); i++)
+  {
+    h_keys[i] = RandomValue((std::numeric_limits<int>::max)()); // drawn with the int maximum as argument, then assigned to a half_t
+    h_values[i] = RandomValue((std::numeric_limits<std::uint32_t>::max)());
+  }
+}
+#endif // TEST_HALF_T
+
+#if TEST_BF_T
+void RandomizeInput(thrust::host_vector<bfloat16_t> &h_keys,
+                    thrust::host_vector<std::uint32_t> &h_values)
+{ // Overload for bfloat16_t keys with std::uint32_t values; only compiled when TEST_BF_T is set.
+  for (std::size_t i = 0; i < h_keys.size(); i++)
+  {
+    h_keys[i] = RandomValue((std::numeric_limits<int>::max)()); // drawn with the int maximum as argument, then assigned to a bfloat16_t
+    h_values[i] = RandomValue((std::numeric_limits<std::uint32_t>::max)());
+  }
+}
+#endif // TEST_BF_T
+
+
+
+template <typename KeyT,
+          typename ValueT>
+void HostReferenceSort(bool sort_pairs,
+                       bool sort_descending,
+                       unsigned int num_segments,
+                       const thrust::host_vector<int> &h_offsets, // segment i spans [h_offsets[i], h_offsets[i + 1])
+                       thrust::host_vector<KeyT> &h_keys, // sorted in place
+                       thrust::host_vector<ValueT> &h_values) // reordered in place, only when sort_pairs is true
+{ // Host-side reference: stable-sorts each segment independently with Thrust.
+  for (unsigned int segment_i = 0;
+       segment_i < num_segments;
+       segment_i++)
+  {
+    const int segment_begin = h_offsets[segment_i];
+    const int segment_end   = h_offsets[segment_i + 1];
+
+    if (sort_pairs)
+    { // values are permuted together with their keys
+      if (sort_descending)
+      {
+        thrust::stable_sort_by_key(h_keys.begin() + segment_begin,
+                                   h_keys.begin() + segment_end,
+                                   h_values.begin() + segment_begin,
+                                   thrust::greater<KeyT>{}); // greater-than comparator gives descending order
+      }
+      else
+      {
+        thrust::stable_sort_by_key(h_keys.begin() + segment_begin,
+                                   h_keys.begin() + segment_end,
+                                   h_values.begin() + segment_begin);
+      }
+    }
+    else
+    { // keys only: h_values is left untouched
+      if (sort_descending)
+      {
+        thrust::stable_sort(h_keys.begin() + segment_begin,
+                            h_keys.begin() + segment_end,
+                            thrust::greater<KeyT>{});
+      }
+      else
+      {
+        thrust::stable_sort(h_keys.begin() + segment_begin,
+                            h_keys.begin() + segment_end);
+      }
+    }
+  }
+}
+
+
+#if STORE_ON_FAILURE
+template <typename KeyT,
+          typename ValueT>
+void DumpInput(bool sort_pairs,
+               bool sort_descending,
+               bool sort_buffers,
+               Input<KeyT, ValueT> &input,
+               thrust::host_vector<KeyT> &h_keys,
+               thrust::host_vector<ValueT> &h_values)
+{ // Prints the test configuration to std::cout and writes offsets, keys and values as raw bytes to three files.
+  const thrust::host_vector<int> &h_offsets = input.get_h_offsets();
+  // Textual summary of the configuration; the stable/unstable flag is not a parameter and is not printed.
+  std::cout << "sort pairs: " << sort_pairs << "\n";
+  std::cout << "sort descending: " << sort_descending << "\n";
+  std::cout << "sort buffers: " << sort_buffers << "\n";
+  std::cout << "num_items: " << input.get_num_items() << "\n";
+  std::cout << "num_segments: " << input.get_num_segments() << "\n";
+  std::cout << "key type: " << typeid(h_keys[0]).name() << "\n";
+  std::cout << "value type: " << typeid(h_values[0]).name() << "\n";
+  std::cout << "offset type: " << typeid(h_offsets[0]).name() << "\n";
+  // Binary dumps use fixed relative file names; the streams are not checked for open or write errors.
+  std::ofstream offsets_dump("offsets", std::ios::binary);
+  offsets_dump.write(reinterpret_cast<const char *>(
+                       thrust::raw_pointer_cast(h_offsets.data())),
+                     sizeof(int) * h_offsets.size());
+
+  std::ofstream keys_dump("keys", std::ios::binary);
+  keys_dump.write(reinterpret_cast<const char *>(
+                    thrust::raw_pointer_cast(h_keys.data())),
+                  sizeof(KeyT) * h_keys.size());
+
+  std::ofstream values_dump("values", std::ios::binary);
+  values_dump.write(reinterpret_cast<const char *>(
+                      thrust::raw_pointer_cast(h_values.data())),
+                    sizeof(ValueT) * h_values.size());
+}
+#endif // STORE_ON_FAILURE
+
+
+template <typename KeyT,
+          typename ValueT>
+void InputTestRandom(Input<KeyT, ValueT> &input)
+{ // Sorts random keys and values on the segments of `input` for every flag combination and compares with HostReferenceSort.
+  thrust::host_vector<KeyT> h_keys_output(input.get_num_items());
+  thrust::device_vector<KeyT> keys_output(input.get_num_items());
+
+  thrust::host_vector<ValueT> h_values_output(input.get_num_items());
+  thrust::device_vector<ValueT> values_output(input.get_num_items());
+
+  KeyT *d_keys_output = thrust::raw_pointer_cast(keys_output.data());
+  ValueT *d_values_output = thrust::raw_pointer_cast(values_output.data());
+
+  thrust::host_vector<KeyT> h_keys(input.get_num_items()); // random input; becomes the host reference after HostReferenceSort
+  thrust::host_vector<ValueT> h_values(input.get_num_items());
+
+  const thrust::host_vector<int> &h_offsets = input.get_h_offsets();
+
+  for (bool stable_sort: { unstable, stable })
+  {
+    for (bool sort_pairs: { keys_only, pairs })
+    {
+      for (bool sort_descending: { ascending, descending })
+      {
+        for (bool sort_buffers: { pointers, double_buffer })
+        {
+          for (int iteration = 0; iteration < MAX_ITERATIONS; iteration++)
+          {
+            RandomizeInput(h_keys, h_values);
+
+#if STORE_ON_FAILURE
+            auto h_keys_backup = h_keys;
+            auto h_values_backup = h_values;
+#endif
+            // Copy the fresh random data into the device vectors owned by `input`.
+            input.get_d_keys_vec()   = h_keys;
+            input.get_d_values_vec() = h_values;
+
+            cub::DoubleBuffer<KeyT> keys_buffer(input.get_d_keys(), d_keys_output);
+            cub::DoubleBuffer<ValueT> values_buffer(input.get_d_values(), d_values_output);
+
+            Sort<KeyT, ValueT>(sort_pairs,
+                               sort_descending,
+                               sort_buffers,
+                               stable_sort,
+                               input.get_d_keys(),
+                               d_keys_output,
+                               input.get_d_values(),
+                               d_values_output,
+                               input.get_num_items(),
+                               input.get_num_segments(),
+                               input.get_d_offsets(),
+                               &keys_buffer.selector,
+                               &values_buffer.selector);
+            // Sort the host copies in place to obtain the expected result.
+            HostReferenceSort(sort_pairs,
+                              sort_descending,
+                              input.get_num_segments(),
+                              h_offsets,
+                              h_keys,
+                              h_values);
+
+            if (sort_buffers)
+            { // double-buffer mode: a non-zero selector means the result is in the output vector, otherwise in the input vector
+              if (keys_buffer.selector)
+              {
+                h_keys_output = keys_output;
+              }
+              else
+              {
+                h_keys_output = input.get_d_keys_vec();
+              }
+
+              if (values_buffer.selector)
+              {
+                h_values_output = values_output;
+              }
+              else
+              {
+                h_values_output = input.get_d_values_vec();
+              }
+            }
+            else
+            { // pointer mode: the result is always read from the output vectors
+              h_keys_output = keys_output;
+              h_values_output = values_output;
+            }
+
+            const bool keys_ok =
+              compare_two_outputs(h_offsets, h_keys, h_keys_output);
+            // Values are compared only when pairs were sorted.
+            const bool values_ok =
+              sort_pairs
+                ? compare_two_outputs(h_offsets, h_values, h_values_output)
+                : true;
+
+#if STORE_ON_FAILURE
+            if (!keys_ok || !values_ok)
+            {
+              DumpInput<KeyT, ValueT>(sort_pairs,
+                                      sort_descending,
+                                      sort_buffers,
+                                      input,
+                                      h_keys_backup,
+                                      h_values_backup);
+            }
+#endif
+
+            AssertTrue(keys_ok);
+            AssertTrue(values_ok);
+
+            input.shuffle(); // NOTE(review): presumably changes the segment layout for the next iteration; defined outside this view
+          }
+        }
+      }
+    }
+  }
+}
+
+template <typename KeyT,
+          typename ValueT,
+          bool IsSupportedType = std::is_integral<KeyT>::value> // defaults to true for integral key types
+struct EdgeTestDispatch
+{
+  // Edge cases that need to be tested
+  const int empty_short_circuit_segment_size = 0;
+  const int copy_short_circuit_segment_size = 1;
+  const int swap_short_circuit_segment_size = 2;
+  // Multipliers used as the first element of each add() pair below.
+  const int a_few = 2;
+  const int a_bunch_of = 42;
+  const int a_lot_of = 420;
+  // Builds one input from sizes around the policy thresholds and runs InputTest on it for both sort directions.
+  template <typename ActivePolicyT>
+  CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Invoke()
+  {
+    NV_IF_TARGET(NV_IS_HOST,
+      (using SmallAndMediumPolicyT =
+         typename ActivePolicyT::SmallAndMediumSegmentedSortPolicyT;
+       using LargeSegmentPolicyT = typename ActivePolicyT::LargeSegmentPolicy;
+
+       const int small_segment_max_segment_size =
+         SmallAndMediumPolicyT::SmallPolicyT::ITEMS_PER_TILE;
+
+       const int items_per_small_segment =
+         SmallAndMediumPolicyT::SmallPolicyT::ITEMS_PER_THREAD;
+
+       const int medium_segment_max_segment_size =
+         SmallAndMediumPolicyT::MediumPolicyT::ITEMS_PER_TILE;
+
+       const int single_thread_segment_size = items_per_small_segment;
+
+       const int large_cached_segment_max_segment_size =
+         LargeSegmentPolicyT::BLOCK_THREADS *
+         LargeSegmentPolicyT::ITEMS_PER_THREAD;
+
+       for (bool sort_descending : {ascending, descending}) {
+         Input<KeyT, ValueT> edge_cases =
+           InputDescription<KeyT>() // NOTE(review): add() presumably takes {segment count, segment size}; defined outside this view
+             .add({a_lot_of, empty_short_circuit_segment_size})
+             .add({a_lot_of, copy_short_circuit_segment_size})
+             .add({a_lot_of, swap_short_circuit_segment_size})
+             .add({a_lot_of, swap_short_circuit_segment_size + 1})
+             .add({a_lot_of, swap_short_circuit_segment_size + 1}) // NOTE(review): identical to the previous entry; confirm whether the repeat is intended
+             .add({a_lot_of, single_thread_segment_size - 1})
+             .add({a_lot_of, single_thread_segment_size})
+             .add({a_lot_of, single_thread_segment_size + 1})
+             .add({a_lot_of, single_thread_segment_size * 2 - 1})
+             .add({a_lot_of, single_thread_segment_size * 2})
+             .add({a_lot_of, single_thread_segment_size * 2 + 1})
+             .add({a_bunch_of, small_segment_max_segment_size - 1})
+             .add({a_bunch_of, small_segment_max_segment_size})
+             .add({a_bunch_of, small_segment_max_segment_size + 1})
+             .add({a_bunch_of, medium_segment_max_segment_size - 1})
+             .add({a_bunch_of, medium_segment_max_segment_size})
+             .add({a_bunch_of, medium_segment_max_segment_size + 1})
+             .add({a_bunch_of, large_cached_segment_max_segment_size - 1})
+             .add({a_bunch_of, large_cached_segment_max_segment_size})
+             .add({a_bunch_of, large_cached_segment_max_segment_size + 1})
+             .add({a_few, large_cached_segment_max_segment_size * 2})
+             .add({a_few, large_cached_segment_max_segment_size * 3})
+             .add({a_few, large_cached_segment_max_segment_size * 5})
+             .template gen<ValueT>(sort_descending);
+
+         InputTest<KeyT, ValueT>(sort_descending, edge_cases);
+       }));
+
+    return cudaSuccess; // always reports success; failures surface through the assertions inside InputTest
+  }
+};
+
+// Specialization chosen when IsSupportedType is false: Invoke is a no-op.
+template <typename KeyT, typename ValueT>
+struct EdgeTestDispatch<KeyT, ValueT, false>
+{
+  template <typename ActivePolicyT>
+  CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Invoke()
+  {
+    // The edge-case test relies on an optimized testing approach that is
+    // incompatible with duplicates; other types are covered by RandomTest.
+    return cudaSuccess;
+  }
+};
+
+// Runs the edge-case segment patterns: queries the PTX version and lets
+// MaxPolicyT::Invoke call EdgeTestDispatch with the policy it selects.
+template <typename KeyT, typename ValueT>
+void EdgePatternsTest()
+{
+  int ptx_version = 0;
+  if (CubDebug(PtxVersion(ptx_version)))
+  {
+    return; // No policy can be selected without the PTX version
+  }
+
+  using MaxPolicyT =
+    typename cub::DeviceSegmentedSortPolicy<KeyT, ValueT>::MaxPolicy;
+  EdgeTestDispatch<KeyT, ValueT> dispatch;
+
+  // Report any error returned by the dispatch instead of silently dropping it.
+  CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch));
+}
+
+// Builds a random segmented input: up to RandomValue(max_segments) + min_segments
+// segment sizes are drawn, stopping early once max_items would be exceeded.
+template <typename KeyT, typename ValueT>
+Input<KeyT, ValueT> GenRandomInput(int max_items,
+                                   int min_segments,
+                                   int max_segments,
+                                   bool descending)
+{
+  const int max_segment_size = 6000;
+  const int segments_num     = RandomValue(max_segments) + min_segments;
+
+  thrust::host_vector<int> segment_sizes;
+  segment_sizes.reserve(segments_num);
+
+  int total_items = 0;
+  for (int remaining = segments_num; remaining > 0; --remaining)
+  {
+    const int drawn        = RandomValue(max_segment_size);
+    const int segment_size = drawn < 0 ? 0 : drawn; // negative draws become empty segments
+
+    if (total_items + segment_size > max_items)
+    {
+      break;
+    }
+
+    total_items += segment_size;
+    segment_sizes.push_back(segment_size);
+  }
+
+  return Input<KeyT, ValueT>{descending, segment_sizes};
+}
+
+// Runs InputTestRandom on MAX_ITERATIONS freshly generated random inputs,
+// each limited to 10000000 items in total.
+template <typename KeyT, typename ValueT>
+void RandomTest(int min_segments, int max_segments)
+{
+  const int max_items = 10000000;
+
+  for (int round = 0; round < MAX_ITERATIONS; ++round)
+  {
+    Input<KeyT, ValueT> random_input =
+      GenRandomInput<KeyT, ValueT>(max_items,
+                                   min_segments,
+                                   max_segments,
+                                   descending);
+    InputTestRandom(random_input);
+  }
+}
+
+
+// Full suite for one <KeyT, ValueT> pair: same-size segments, random inputs, edge patterns.
+template <typename KeyT, typename ValueT>
+void Test()
+{
+  for (const int segment_size : {1, 1024, 24 * 1024})
+  {
+    for (const int num_segments : {1, 1024})
+    {
+      TestSameSizeSegments<KeyT, ValueT>(segment_size, num_segments);
+    }
+  }
+
+  RandomTest<KeyT, ValueT>(1 << 2, 1 << 8);  // min_segments = 4,   max_segments = 256
+  RandomTest<KeyT, ValueT>(1 << 9, 1 << 19); // min_segments = 512, max_segments = 524288
+  EdgePatternsTest<KeyT, ValueT>();
+}
+
+
+#if TEST_CDP == 1
+// Kernel that calls cub::DeviceSegmentedSort::SortKeys from device code and
+// reports a failure through CubDebug. Segment begin offsets are `offsets`,
+// segment end offsets are `offsets + 1`.
+template <typename KeyT>
+__global__ void LauncherKernel(void *tmp_storage,
+                               std::size_t temp_storage_bytes,
+                               const KeyT *in_keys,
+                               KeyT *out_keys,
+                               int num_items,
+                               int num_segments,
+                               const int *offsets)
+{
+  const int *segment_begin = offsets;
+  const int *segment_end   = offsets + 1;
+
+  CubDebug(cub::DeviceSegmentedSort::SortKeys(
+    tmp_storage, temp_storage_bytes, in_keys, out_keys,
+    num_items, num_segments, segment_begin, segment_end));
+}
+
+// Checks DeviceSegmentedSort::SortKeys when it is launched from device code
+// (through LauncherKernel) against HostReferenceSort, for MAX_ITERATIONS
+// re-randomized key sets that use the segment offsets of `input`.
+template <typename KeyT, typename ValueT>
+void TestDeviceSideLaunch(Input<KeyT, ValueT> &input)
+{
+  thrust::host_vector<KeyT> h_keys_output(input.get_num_items());
+  thrust::device_vector<KeyT> keys_output(input.get_num_items());
+
+  KeyT *d_keys_output = thrust::raw_pointer_cast(keys_output.data());
+
+  thrust::host_vector<KeyT> h_keys(input.get_num_items());
+  thrust::host_vector<ValueT> h_values(input.get_num_items());
+
+  const thrust::host_vector<int> &h_offsets = input.get_h_offsets();
+
+  for (int iteration = 0; iteration < MAX_ITERATIONS; iteration++)
+  {
+    RandomizeInput(h_keys, h_values);
+
+    input.get_d_keys_vec()   = h_keys;
+    input.get_d_values_vec() = h_values;
+
+    const KeyT *d_input = input.get_d_keys();
+
+    // Size query (null storage pointer); a failure here must not go unnoticed.
+    std::size_t temp_storage_bytes{};
+    CubDebugExit(cub::DeviceSegmentedSort::SortKeys(nullptr,
+                                                    temp_storage_bytes,
+                                                    d_input,
+                                                    d_keys_output,
+                                                    input.get_num_items(),
+                                                    input.get_num_segments(),
+                                                    input.get_d_offsets(),
+                                                    input.get_d_offsets() + 1));
+
+    thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
+    std::uint8_t *d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
+
+    LauncherKernel<KeyT><<<1, 1>>>(
+      d_temp_storage,
+      temp_storage_bytes,
+      d_input,
+      d_keys_output,
+      input.get_num_items(),
+      input.get_num_segments(),
+      input.get_d_offsets());
+    CubDebugExit(cudaDeviceSynchronize());
+    CubDebugExit(cudaPeekAtLastError());
+
+    HostReferenceSort(false,
+                      false,
+                      input.get_num_segments(),
+                      h_offsets,
+                      h_keys,
+                      h_values);
+
+    h_keys_output = keys_output;
+
+    const bool keys_ok =
+      compare_two_outputs(h_offsets, h_keys, h_keys_output);
+
+    AssertTrue(keys_ok);
+
+    input.shuffle();
+  }
+}
+
+// Device-side launch test over MAX_ITERATIONS random inputs (ValueT is KeyT).
+template <typename KeyT>
+void TestDeviceSideLaunch(int min_segments, int max_segments)
+{
+  const int max_items = 10000000;
+
+  for (int round = 0; round < MAX_ITERATIONS; ++round)
+  {
+    Input<KeyT, KeyT> random_input = GenRandomInput<KeyT, KeyT>(max_items,
+                                                                min_segments,
+                                                                max_segments,
+                                                                descending);
+
+    TestDeviceSideLaunch(random_input);
+  }
+}
+
+template <typename KeyT> // Entry point: runs the test for two segment-count settings
+void TestDeviceSideLaunch()
+{
+  TestDeviceSideLaunch<KeyT>(4, 256);      // min_segments = 1 << 2, max_segments = 1 << 8
+  TestDeviceSideLaunch<KeyT>(512, 524288); // min_segments = 1 << 9, max_segments = 1 << 19
+}
+
+#endif // TEST_CDP
+
+void TestUnspecifiedRanges()
+{
+  const std::size_t num_items = 1024 * 1024;
+  const std::size_t max_segments = 42;
+  const std::size_t avg_segment_size = num_items / max_segments;
+  // Goal: SortPairs must sort only the listed segments and leave every other item unchanged
+  for (int iteration = 0; iteration < MAX_ITERATIONS; iteration++)
+  {
+    thrust::host_vector<int> h_offsets_begin;
+    thrust::host_vector<int> h_offsets_end;
+
+    h_offsets_begin.reserve(max_segments + 1);
+    h_offsets_end.reserve(max_segments + 1);
+    // Lay out consecutive ranges and keep only a random subset of them as segments
+    {
+      int offset = 0;
+
+      for (std::size_t sid = 0; sid < max_segments; sid++)
+      {
+        const int segment_size =
+          RandomValue(static_cast<int>(avg_segment_size));
+        const bool segment_is_utilized = RandomValue(100) > 60;
+
+        if (segment_is_utilized)
+        {
+          h_offsets_begin.push_back(offset);
+          h_offsets_end.push_back(offset + segment_size);
+        }
+
+        offset += segment_size;
+      }
+      // Fall back to a single segment when none was kept
+      if (h_offsets_begin.empty())
+      {
+        h_offsets_begin.push_back(avg_segment_size);
+        h_offsets_end.push_back(num_items);
+      }
+    }
+
+    thrust::device_vector<int> keys(num_items);
+    thrust::device_vector<int> values(num_items);
+    // Sequences are written through reverse iterators, so the data is descending
+    thrust::sequence(keys.rbegin(), keys.rend());
+    thrust::sequence(values.rbegin(), values.rend());
+
+    thrust::device_vector<int> d_offsets_begin = h_offsets_begin;
+    thrust::device_vector<int> d_offsets_end = h_offsets_end;
+    // The expected result starts as a copy of the input
+    thrust::device_vector<int> expected_keys = keys;
+    thrust::device_vector<int> expected_values = values;
+
+    const int num_segments = static_cast<int>(h_offsets_begin.size());
+    // Reference: sort every listed range on its own
+    for (int sid = 0; sid < num_segments; sid++)
+    {
+      const int segment_begin = h_offsets_begin[sid];
+      const int segment_end = h_offsets_end[sid];
+
+      thrust::sort_by_key(expected_keys.begin() + segment_begin,
+                          expected_keys.begin() + segment_end,
+                          expected_values.begin() + segment_begin);
+    }
+
+    thrust::device_vector<int> result_keys = keys;
+    thrust::device_vector<int> result_values = values;
+    // Variant 1: DoubleBuffer overload of SortPairs
+    {
+      cub::DoubleBuffer<int> keys_buffer(
+          thrust::raw_pointer_cast(keys.data()), 
+          thrust::raw_pointer_cast(result_keys.data()));
+
+      cub::DoubleBuffer<int> values_buffer(
+          thrust::raw_pointer_cast(values.data()), 
+          thrust::raw_pointer_cast(result_values.data()));
+
+      std::size_t temp_storage_bytes{};
+      std::uint8_t *d_temp_storage{};
+      // First call (null d_temp_storage) obtains temp_storage_bytes; the second call sorts
+      CubDebugExit(cub::DeviceSegmentedSort::SortPairs(
+          d_temp_storage, temp_storage_bytes, 
+          keys_buffer, values_buffer, 
+          num_items, num_segments, 
+          thrust::raw_pointer_cast(d_offsets_begin.data()),
+          thrust::raw_pointer_cast(d_offsets_end.data())));
+
+      thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
+      d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
+
+      CubDebugExit(cub::DeviceSegmentedSort::SortPairs(
+          d_temp_storage, temp_storage_bytes, 
+          keys_buffer, values_buffer, 
+          num_items, num_segments, 
+          thrust::raw_pointer_cast(d_offsets_begin.data()),
+          thrust::raw_pointer_cast(d_offsets_end.data())));
+      // selector == 0: copy the listed segments from the first buffer (keys/values) into result_*
+      for (int sid = 0; sid < num_segments; sid++)
+      {
+        const int segment_begin = h_offsets_begin[sid];
+        const int segment_end = h_offsets_end[sid];
+
+        if (keys_buffer.selector == 0)
+        {
+          thrust::copy(
+              keys.begin() + segment_begin,
+              keys.begin() + segment_end,
+              result_keys.begin() + segment_begin);
+        }
+                       
+        if (values_buffer.selector == 0)
+        {
+          thrust::copy(
+              values.begin() + segment_begin,
+              values.begin() + segment_end,
+              result_values.begin() + segment_begin);
+        }
+      }
+    }
+
+    AssertEquals(result_values, expected_values);
+    AssertEquals(result_keys, expected_keys);
+    // Regenerate the inputs and reset the result arrays before the second variant
+    thrust::sequence(keys.rbegin(), keys.rend());
+    thrust::sequence(values.rbegin(), values.rend());
+
+    result_keys = keys;
+    result_values = values;
+    // Variant 2: SortPairs overload with separate input and output arrays
+    {
+      std::size_t temp_storage_bytes{};
+      std::uint8_t *d_temp_storage{};
+
+      CubDebugExit(cub::DeviceSegmentedSort::SortPairs(
+          d_temp_storage, temp_storage_bytes, 
+          thrust::raw_pointer_cast(keys.data()), 
+          thrust::raw_pointer_cast(result_keys.data()), 
+          thrust::raw_pointer_cast(values.data()), 
+          thrust::raw_pointer_cast(result_values.data()), 
+          num_items, num_segments, 
+          thrust::raw_pointer_cast(d_offsets_begin.data()),
+          thrust::raw_pointer_cast(d_offsets_end.data())));
+
+      thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
+      d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
+
+      CubDebugExit(cub::DeviceSegmentedSort::SortPairs(
+          d_temp_storage, temp_storage_bytes, 
+          thrust::raw_pointer_cast(keys.data()), 
+          thrust::raw_pointer_cast(result_keys.data()), 
+          thrust::raw_pointer_cast(values.data()), 
+          thrust::raw_pointer_cast(result_values.data()), 
+          num_items, num_segments, 
+          thrust::raw_pointer_cast(d_offsets_begin.data()),
+          thrust::raw_pointer_cast(d_offsets_end.data())));
+    }
+
+    AssertEquals(result_values, expected_values);
+    AssertEquals(result_keys, expected_keys);
+  }
+}
+
+int main(int argc, char** argv)
+{
+  CommandLineArgs args(argc, argv);
+
+  // Initialize device
+  CubDebugExit(args.DeviceInit());
+
+  // %PARAM% TEST_CDP cdp 0:1
+
+#if TEST_CDP == 0
+  TestZeroSegments();
+  TestEmptySegments(1 << 2);
+  TestEmptySegments(1 << 22);
+  // Key types compiled in only when the corresponding macro is set
+#if TEST_HALF_T
+  Test<half_t, std::uint32_t>();
+#endif
+
+#if TEST_BF_T
+  Test<bfloat16_t, std::uint32_t>();
+#endif
+  // <KeyT, ValueT> combinations that are always tested when TEST_CDP == 0
+  Test<bool, std::uint64_t>();
+  Test<std::uint8_t, std::uint64_t>();
+  Test<std::int64_t, std::uint32_t>();
+
+#elif TEST_CDP == 1
+  TestDeviceSideLaunch<int>();
+#endif // TEST_CDP
+  // Not guarded by TEST_CDP: runs in both configurations
+  TestUnspecifiedRanges();
+
+  return 0;
+}
diff --git a/include/cub/test/test_device_select_if.cu b/include/cub/test/test_device_select_if.cu
new file mode 100644
index 0000000..80ca0cd
--- /dev/null
+++ b/include/cub/test/test_device_select_if.cu
@@ -0,0 +1,1118 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Test of DeviceSelect::If and DevicePartition::If utilities
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <cub/device/device_select.cuh>
+#include <cub/device/device_partition.cuh>
+#include <cub/iterator/counting_input_iterator.cuh>
+#include <cub/util_allocator.cuh>
+
+#include <thrust/count.h>
+#include <thrust/device_vector.h>
+#include <thrust/execution_policy.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/discard_iterator.h>
+#include <thrust/iterator/transform_output_iterator.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/random.h>
+#include <thrust/sequence.h>
+#include <thrust/shuffle.h>
+
+#include "test_util.h"
+
+#include <cstdio>
+#include <typeinfo>
+
+using namespace cub;
+
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+bool                    g_verbose               = false;   // When true, Initialize() prints the generated input
+int                     g_timing_iterations     = 0;       // Iteration count of the timed Dispatch call in Test()
+float                   g_device_giga_bandwidth;           // Denominator of the "% peak" figure printed by Test()
+CachingDeviceAllocator  g_allocator(true);                 // Allocator used for the device buffers in Test()
+
+// Dispatch types (used as Int2Type tags to select a Dispatch overload)
+enum Backend
+{
+    CUB,        // CUB method
+    CDP,        // GPU-based (dynamic parallelism) dispatch to CUB method
+};
+
+
+// Selection predicate: accepts items strictly less than a stored bound
+template <typename T>
+struct LessThan
+{
+    T compare;  // exclusive upper bound
+
+    __host__ __device__ __forceinline__
+    LessThan(T threshold) : compare(threshold) {}
+
+    __host__ __device__ __forceinline__
+    bool operator()(const T &item) const {
+        return item < compare;
+    }
+};
+
+//---------------------------------------------------------------------
+// Dispatch to different CUB DeviceSelect entrypoints
+//---------------------------------------------------------------------
+
+
+/**
+ * CUB backend, DeviceSelect::If: one call per timing iteration; returns the last status
+ */
+template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CUB>               /*dispatch_to*/,
+    Int2Type<false>             /*is_flagged*/,
+    Int2Type<false>             /*is_partition*/,
+    int                         timing_timing_iterations,
+    size_t*                     /*d_temp_storage_bytes*/,
+    cudaError_t*                /*d_cdp_error*/,
+    // CUB entry-point arguments (commented-out names are unused by this overload)
+    void*                       d_temp_storage,
+    size_t&                     temp_storage_bytes,
+    InputIteratorT              d_in,
+    FlagIteratorT               /*d_flags*/,
+    OutputIteratorT             d_out,
+    NumSelectedIteratorT        d_num_selected_out,
+    OffsetT                     num_items,
+    SelectOpT                   select_op)
+{
+    cudaError_t status = cudaSuccess;
+    for (int remaining = timing_timing_iterations; remaining > 0; --remaining)
+    {
+        status = DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op);
+    }
+    return status;
+}
+
+
+/**
+ * CUB backend, DevicePartition::If: one call per timing iteration; returns the last status
+ */
+template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CUB>               /*dispatch_to*/,
+    Int2Type<false>             /*is_flagged*/,
+    Int2Type<true>              /*is_partition*/,
+    int                         timing_timing_iterations,
+    size_t*                     /*d_temp_storage_bytes*/,
+    cudaError_t*                /*d_cdp_error*/,
+    // CUB entry-point arguments (commented-out names are unused by this overload)
+    void*                       d_temp_storage,
+    size_t&                     temp_storage_bytes,
+    InputIteratorT              d_in,
+    FlagIteratorT               /*d_flags*/,
+    OutputIteratorT             d_out,
+    NumSelectedIteratorT        d_num_selected_out,
+    OffsetT                     num_items,
+    SelectOpT                   select_op)
+{
+    cudaError_t status = cudaSuccess;
+    for (int remaining = timing_timing_iterations; remaining > 0; --remaining)
+    {
+        status = DevicePartition::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op);
+    }
+    return status;
+}
+
+
+/**
+ * CUB backend, DeviceSelect::Flagged: one call per timing iteration; returns the last status
+ */
+template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CUB>               /*dispatch_to*/,
+    Int2Type<true>              /*is_flagged*/,
+    Int2Type<false>             /*is_partition*/,
+    int                         timing_timing_iterations,
+    size_t*                     /*d_temp_storage_bytes*/,
+    cudaError_t*                /*d_cdp_error*/,
+    // CUB entry-point arguments (commented-out names are unused by this overload)
+    void*                       d_temp_storage,
+    size_t&                     temp_storage_bytes,
+    InputIteratorT              d_in,
+    FlagIteratorT               d_flags,
+    OutputIteratorT             d_out,
+    NumSelectedIteratorT        d_num_selected_out,
+    OffsetT                     num_items,
+    SelectOpT                   /*select_op*/)
+{
+    cudaError_t status = cudaSuccess;
+    for (int remaining = timing_timing_iterations; remaining > 0; --remaining)
+    {
+        status = DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items);
+    }
+    return status;
+}
+
+
+/**
+ * CUB backend, DevicePartition::Flagged: one call per timing iteration; returns the last status
+ */
+template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CUB>               /*dispatch_to*/,
+    Int2Type<true>              /*is_flagged*/,
+    Int2Type<true>              /*is_partition*/,
+    int                         timing_timing_iterations,
+    size_t*                     /*d_temp_storage_bytes*/,
+    cudaError_t*                /*d_cdp_error*/,
+    // CUB entry-point arguments (commented-out names are unused by this overload)
+    void*                       d_temp_storage,
+    size_t&                     temp_storage_bytes,
+    InputIteratorT              d_in,
+    FlagIteratorT               d_flags,
+    OutputIteratorT             d_out,
+    NumSelectedIteratorT        d_num_selected_out,
+    OffsetT                     num_items,
+    SelectOpT                   /*select_op*/)
+{
+    cudaError_t status = cudaSuccess;
+    for (int remaining = timing_timing_iterations; remaining > 0; --remaining)
+    {
+        status = DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items);
+    }
+    return status;
+}
+
+//---------------------------------------------------------------------
+// CUDA Nested Parallelism Test Kernel
+//---------------------------------------------------------------------
+
+#if TEST_CDP == 1
+
+/**
+ * Wrapper kernel that calls Dispatch from device code.
+ *
+ * The tag arguments (cub_backend, is_flagged, is_partition) select the
+ * Dispatch overload. The returned status is stored in *d_cdp_error and the
+ * value of temp_storage_bytes after the call is stored in
+ * *d_temp_storage_bytes.
+ */
+template <int CubBackend,
+          typename IsFlaggedTag,
+          typename IsPartitionTag,
+          typename InputIteratorT,
+          typename FlagIteratorT,
+          typename SelectOpT,
+          typename OutputIteratorT,
+          typename NumSelectedIteratorT,
+          typename OffsetT>
+__global__ void CDPDispatchKernel(Int2Type<CubBackend> cub_backend,
+                                  IsFlaggedTag         is_flagged,
+                                  IsPartitionTag       is_partition,
+                                  int                  timing_timing_iterations,
+                                  size_t              *d_temp_storage_bytes,
+                                  cudaError_t         *d_cdp_error,
+
+                                  void                *d_temp_storage,
+                                  size_t               temp_storage_bytes,
+                                  InputIteratorT       d_in,
+                                  FlagIteratorT        d_flags,
+                                  OutputIteratorT      d_out,
+                                  NumSelectedIteratorT d_num_selected_out,
+                                  OffsetT              num_items,
+                                  SelectOpT            select_op)
+{
+  // Every kernel argument is forwarded unchanged.
+  const cudaError_t status = Dispatch(cub_backend, is_flagged, is_partition,
+                                      timing_timing_iterations,
+                                      d_temp_storage_bytes, d_cdp_error,
+                                      d_temp_storage, temp_storage_bytes,
+                                      d_in, d_flags, d_out,
+                                      d_num_selected_out, num_items, select_op);
+
+  // Hand both results back through the output pointers.
+  *d_cdp_error          = status;
+  *d_temp_storage_bytes = temp_storage_bytes;
+}
+
+/**
+ * Dispatch to CDP kernel: runs the CUB backend from device code, then reads back size and status
+ */
+template <typename IsFlaggedTag,
+          typename IsPartitionTag,
+          typename InputIteratorT,
+          typename FlagIteratorT,
+          typename SelectOpT,
+          typename OutputIteratorT,
+          typename NumSelectedIteratorT,
+          typename OffsetT>
+cudaError_t Dispatch(Int2Type<CDP> /*dispatch_to*/,
+                     IsFlaggedTag   is_flagged,
+                     IsPartitionTag is_partition,
+                     int            timing_timing_iterations,
+                     size_t        *d_temp_storage_bytes,
+                     cudaError_t   *d_cdp_error,
+
+                     void                *d_temp_storage,
+                     size_t              &temp_storage_bytes,
+                     InputIteratorT       d_in,
+                     FlagIteratorT        d_flags,
+                     OutputIteratorT      d_out,
+                     NumSelectedIteratorT d_num_selected_out,
+                     OffsetT              num_items,
+                     SelectOpT            select_op)
+{
+  // Launch the wrapper kernel, which performs the CUB-backend dispatch from device code
+  cudaError_t retval =
+    thrust::cuda_cub::launcher::triple_chevron(1, 1, 0, 0)
+      .doit(CDPDispatchKernel<CUB,
+                              IsFlaggedTag,
+                              IsPartitionTag,
+                              InputIteratorT,
+                              FlagIteratorT,
+                              SelectOpT,
+                              OutputIteratorT,
+                              NumSelectedIteratorT,
+                              OffsetT>,
+            Int2Type<CUB>{},
+            is_flagged,
+            is_partition,
+            timing_timing_iterations,
+            d_temp_storage_bytes,
+            d_cdp_error,
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_flags,
+            d_out,
+            d_num_selected_out,
+            num_items,
+            select_op);
+  CubDebugExit(retval);
+
+  // Copy out temp_storage_bytes (the kernel stores it after the dispatch)
+  CubDebugExit(cudaMemcpy(&temp_storage_bytes,
+                          d_temp_storage_bytes,
+                          sizeof(size_t) * 1,
+                          cudaMemcpyDeviceToHost));
+
+  // Copy out the device-side dispatch status; it becomes the return value
+  CubDebugExit(cudaMemcpy(&retval,
+                          d_cdp_error,
+                          sizeof(cudaError_t) * 1,
+                          cudaMemcpyDeviceToHost));
+  return retval;
+}
+
+#endif // TEST_CDP
+
+//---------------------------------------------------------------------
+// Test generation
+//---------------------------------------------------------------------
+
+
+/**
+ * Fill h_in with num_items random items; print them when g_verbose is set
+ */
+template <typename T>
+void Initialize(T *h_in, int num_items)
+{
+    for (int idx = 0; idx < num_items; ++idx)
+    {
+        // Each item is seeded from a randomly selected value in [0..126]:
+        // a draw of 127 is replaced by 126
+        unsigned int drawn;
+        RandomBits(drawn, 0, 0, 7);
+        drawn = (drawn == 127) ? 126 : drawn;
+        InitValue(INTEGER_SEED, h_in[idx], drawn);
+    }
+
+    if (!g_verbose)
+    {
+        return;
+    }
+
+    printf("Input:\n");
+    DisplayResults(h_in, num_items);
+    printf("\n\n");
+}
+
+
+/**
+ * Host reference solution: stores select_op(h_in[i]) in h_flags[i] and partitions
+ * h_in into h_reference (selected items from the front in input order, rejected
+ * items from the back). Returns the number of selected items.
+ */
+template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename T>
+int Solve(
+    InputIteratorT  h_in,
+    SelectOpT       select_op,
+    T*              h_reference,
+    FlagIteratorT   h_flags,
+    int             num_items)
+{
+    int num_selected = 0;
+    int num_rejected = 0;
+    for (int idx = 0; idx < num_items; ++idx)
+    {
+        if ((h_flags[idx] = select_op(h_in[idx])))
+        {
+            h_reference[num_selected++] = h_in[idx];
+        }
+        else
+        {
+            // Rejected items are written downward from the last slot
+            h_reference[num_items - num_rejected - 1] = h_in[idx];
+            ++num_rejected;
+        }
+    }
+
+    return num_selected;
+}
+
+
+
+/**
+ * Test DeviceSelect for a given problem input
+ */
+template <
+    Backend             BACKEND,
+    bool                IS_FLAGGED,
+    bool                IS_PARTITION,
+    typename            DeviceInputIteratorT,
+    typename            FlagT,
+    typename            SelectOpT,
+    typename            T>
+void Test(
+    DeviceInputIteratorT    d_in,
+    FlagT*                  h_flags,
+    SelectOpT               select_op,
+    T*                      h_reference,
+    int                     num_selected,
+    int                     num_items)
+{
+    // Allocate device flags, output, and num-selected
+    FlagT*      d_flags = NULL;
+    T*          d_out = NULL;
+    int*        d_num_selected_out = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_flags, sizeof(FlagT) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(T) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int)));
+
+    // Allocate CDP device arrays
+    size_t*         d_temp_storage_bytes = NULL;
+    cudaError_t*    d_cdp_error = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes,  sizeof(size_t) * 1));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error,           sizeof(cudaError_t) * 1));
+
+    // Allocate temporary storage
+    void            *d_temp_storage = NULL;
+    size_t          temp_storage_bytes = 0;
+    CubDebugExit(Dispatch(Int2Type<BACKEND>(), Int2Type<IS_FLAGGED>(), Int2Type<IS_PARTITION>(), 1, d_temp_storage_bytes, d_cdp_error,
+    d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, select_op));
+    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
+
+    // Copy flags and clear device output array
+    CubDebugExit(cudaMemcpy(d_flags, h_flags, sizeof(FlagT) * num_items, cudaMemcpyHostToDevice));
+    CubDebugExit(cudaMemset(d_out, 0, sizeof(T) * num_items));
+    CubDebugExit(cudaMemset(d_num_selected_out, 0, sizeof(int)));
+
+    // Run warmup/correctness iteration
+    CubDebugExit(Dispatch(Int2Type<BACKEND>(), Int2Type<IS_FLAGGED>(), Int2Type<IS_PARTITION>(), 1, d_temp_storage_bytes, d_cdp_error,
+        d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, select_op));
+
+    // Check for correctness (and display results, if specified)
+    int compare1 = (IS_PARTITION) ?
+        CompareDeviceResults(h_reference, d_out, num_items, true, g_verbose) :
+        CompareDeviceResults(h_reference, d_out, num_selected, true, g_verbose);
+    printf("\t Data %s\n", compare1 ? "FAIL" : "PASS");
+
+    int compare2 = CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose);
+    printf("\t Count %s\n", compare2 ? "FAIL" : "PASS");
+
+    // Flush any stdout/stderr
+    fflush(stdout);
+    fflush(stderr);
+
+    // Performance
+    GpuTimer gpu_timer;
+    gpu_timer.Start();
+    CubDebugExit(Dispatch(Int2Type<BACKEND>(), Int2Type<IS_FLAGGED>(), Int2Type<IS_PARTITION>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error,
+        d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, select_op));
+    gpu_timer.Stop();
+    float elapsed_millis = gpu_timer.ElapsedMillis();
+
+    // Display performance
+    if (g_timing_iterations > 0)
+    {
+        float   avg_millis          = elapsed_millis / g_timing_iterations;
+        float   giga_rate           = float(num_items) / avg_millis / 1000.0f / 1000.0f;
+        int     num_output_items    = (IS_PARTITION) ? num_items : num_selected;
+        int     num_flag_items      = (IS_FLAGGED) ? num_items : 0;
+        size_t  num_bytes           = sizeof(T) * (num_items + num_output_items) + sizeof(FlagT) * num_flag_items;
+        float   giga_bandwidth      = float(num_bytes) / avg_millis / 1000.0f / 1000.0f;
+
+        printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s, %.1f%% peak", avg_millis, giga_rate, giga_bandwidth, giga_bandwidth / g_device_giga_bandwidth * 100.0);
+    }
+    printf("\n\n");
+
+    // Flush any stdout/stderr
+    fflush(stdout);
+    fflush(stderr);
+
+    // Cleanup
+    if (d_flags) CubDebugExit(g_allocator.DeviceFree(d_flags));
+    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
+    if (d_num_selected_out) CubDebugExit(g_allocator.DeviceFree(d_num_selected_out));
+    if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
+    if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
+    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
+
+    // Correctness asserts
+    AssertEquals(0, compare1 | compare2);
+}
+
+
+/**
+ * Test on pointer type: builds the host problem with Initialize() and Solve(),
+ * copies the input to the device and runs Test(). num_items may be 0;
+ * select_ratio places the LessThan threshold within [0,127].
+ */
+template <
+    Backend         BACKEND,
+    bool            IS_FLAGGED,
+    bool            IS_PARTITION,
+    typename        T>
+void TestPointer(
+    int             num_items,
+    float           select_ratio)
+{
+    typedef char FlagT;
+
+    // Allocate host arrays
+    T*      h_in        = new T[num_items];
+    FlagT*  h_flags     = new FlagT[num_items];
+    T*      h_reference = new T[num_items];
+
+    // Initialize input
+    Initialize(h_in, num_items);
+
+    // Select a comparison value that is select_ratio through the space of [0,127]
+    T compare;
+    if (select_ratio <= 0.0)
+        InitValue(INTEGER_SEED, compare, 0);        // select none
+    else if (select_ratio >= 1.0)
+        InitValue(INTEGER_SEED, compare, 127);      // select all
+    else
+        InitValue(INTEGER_SEED, compare, int(double(double(127) * select_ratio)));
+
+    LessThan<T> select_op(compare);
+    int num_selected = Solve(h_in, select_op, h_reference, h_flags, num_items);
+    float actual_ratio = (num_items > 0) ? float(num_selected) / num_items : 0.0f;  // no 0/0 for empty input
+
+    // Arguments follow the format string: backend label, then cub::<class>::<method>
+    if (g_verbose) std::cout << "\nComparison item: " << compare << "\n";
+    printf("\nPointer %s cub::%s::%s %d items, %d selected (select ratio %.3f), %s %d-byte elements\n",
+        (BACKEND == CDP) ? "CDP CUB" : "CUB",
+        (IS_PARTITION) ? "DevicePartition" : "DeviceSelect",
+        (IS_FLAGGED) ? "Flagged" : "If",
+        num_items, num_selected, actual_ratio, typeid(T).name(), (int) sizeof(T));
+    fflush(stdout);
+
+    // Allocate the device input array and upload the host input
+    T *d_in = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * num_items));
+    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * num_items, cudaMemcpyHostToDevice));
+
+    // Run Test
+    Test<BACKEND, IS_FLAGGED, IS_PARTITION>(d_in, h_flags, select_op, h_reference, num_selected, num_items);
+
+    // Cleanup (delete[] accepts a null pointer, so the host arrays need no guard)
+    delete[] h_in;
+    delete[] h_reference;
+    delete[] h_flags;
+    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
+}
+
+
+/**
+ * Test on iterator type (the input is a CountingInputIterator starting at 0)
+ */
+template <
+    Backend         BACKEND,
+    bool            IS_FLAGGED,
+    bool            IS_PARTITION,
+    typename        T>
+void TestIterator(
+    int             num_items,
+    float           select_ratio)
+{
+    typedef char FlagT;
+
+    // Allocate host arrays
+    T*      h_reference = new T[num_items];
+    FlagT*  h_flags = new FlagT[num_items];
+
+    // Use counting iterator as the input
+    CountingInputIterator<T, int> h_in(0);
+
+    // Select a comparison value that is select_ratio through the space of [0,127]
+    T compare;
+    if (select_ratio <= 0.0)
+        InitValue(INTEGER_SEED, compare, 0);        // select none
+    else if (select_ratio >= 1.0)
+        InitValue(INTEGER_SEED, compare, 127);      // select all
+    else
+        InitValue(INTEGER_SEED, compare, int(double(double(127) * select_ratio)));
+
+    LessThan<T> select_op(compare);
+    int num_selected = Solve(h_in, select_op, h_reference, h_flags, num_items);
+    float actual_ratio = (num_items > 0) ? float(num_selected) / num_items : 0.0f;  // no 0/0 for empty input
+    if (g_verbose) std::cout << "\nComparison item: " << compare << "\n";
+    printf("\nIterator %s cub::%s::%s %d items, %d selected (select ratio %.3f), %s %d-byte elements\n",
+        (BACKEND == CDP) ? "CDP CUB" : "CUB",
+        (IS_PARTITION) ? "DevicePartition" : "DeviceSelect",
+        (IS_FLAGGED) ? "Flagged" : "If",
+        num_items, num_selected, actual_ratio, typeid(T).name(), (int) sizeof(T));
+    fflush(stdout);
+
+    // Run Test
+    Test<BACKEND, IS_FLAGGED, IS_PARTITION>(h_in, h_flags, select_op, h_reference, num_selected, num_items);
+
+    // Cleanup (delete[] accepts a null pointer, so no guard is needed)
+    delete[] h_reference;
+    delete[] h_flags;
+}
+
+
+/**
+ * Test different selection ratios: starts at 0.0 and advances in steps of 0.2
+ * for as long as the ratio is <= 1.0, running the pointer-input test each time
+ */
+template <Backend BACKEND, bool IS_FLAGGED, bool IS_PARTITION, typename T>
+void Test(int num_items)
+{
+    const float ratio_step  = 0.2f;
+    const float ratio_limit = 1.0f;
+    float select_ratio = 0.0f;
+    while (select_ratio <= ratio_limit)
+    {
+        TestPointer<BACKEND, IS_FLAGGED, IS_PARTITION, T>(num_items, select_ratio);
+        select_ratio += ratio_step;
+    }
+}
+
+
+/**
+ * Test (select vs. partition) and (flagged vs. functor): runs all four
+ * combinations for one backend and element type
+ */
+template <Backend BACKEND, typename T>
+void TestMethod(int num_items)
+{
+    // Readable names for the IS_FLAGGED and IS_PARTITION template arguments
+    const bool USE_FUNCTOR = false, USE_FLAGS = true;
+    const bool DO_SELECT = false, DO_PARTITION = true;
+
+    Test<BACKEND, USE_FUNCTOR, DO_SELECT, T>(num_items);
+    Test<BACKEND, USE_FUNCTOR, DO_PARTITION, T>(num_items);
+
+    Test<BACKEND, USE_FLAGS, DO_SELECT, T>(num_items);
+    Test<BACKEND, USE_FLAGS, DO_PARTITION, T>(num_items);
+}
+
+
+/**
+ * Test different dispatch: the TEST_CDP build parameter selects which single
+ * backend (CUB for 0, CDP for 1) is exercised
+ */
+template <typename T>
+void TestOp(int num_items)
+{
+    // Any other TEST_CDP value compiles to an empty body
+#if TEST_CDP == 1
+    TestMethod<CDP, T>(num_items);
+#elif TEST_CDP == 0
+    TestMethod<CUB, T>(num_items);
+#endif // TEST_CDP
+}
+
+
+/**
+ * Test different input sizes: a non-negative num_items is tested as given;
+ * a negative value runs the built-in sweep of sizes
+ */
+template <typename T>
+void Test(int num_items)
+{
+    if (num_items >= 0)
+    {
+        TestOp<T>(num_items);
+        return;
+    }
+
+    // Default sweep, including the empty and single-element problems
+    static const int sweep_sizes[] = {0, 1, 100, 10000, 1000000};
+    for (unsigned int idx = 0; idx < sizeof(sweep_sizes) / sizeof(sweep_sizes[0]); ++idx)
+    {
+        TestOp<T>(sweep_sizes[idx]);
+    }
+}
+
+// Output transform: keeps only the first column (T0) of a (T0, T1) row
+template <class T0, class T1>
+struct pair_to_col_t
+{
+    typedef thrust::tuple<T0, T1> row_t;
+
+    __host__ __device__ T0 operator()(const row_t &in) { return thrust::get<0>(in); }
+};
+
+template <class T0, class T1>
+struct select_t { // predicate over a (T0, T1) row
+    typedef thrust::tuple<T0, T1> row_t;
+    // True when the first column, converted to T1, is greater than the second
+    __host__ __device__ bool operator()(const row_t &in) { return static_cast<T1>(thrust::get<0>(in)) > thrust::get<1>(in); }
+};
+
+// Mixed-type DeviceSelect::If over zipped columns; expects all rows kept and checks CUB status codes.
+template <typename T0, typename T1>
+void TestMixedOp(int num_items)
+{
+    const T0 target_value = static_cast<T0>(42);
+    thrust::device_vector<T0> col_a(num_items, target_value);
+    thrust::device_vector<T1> col_b(num_items, static_cast<T1>(4.2));
+    thrust::device_vector<T0> result(num_items);
+
+    auto in = thrust::make_zip_iterator(col_a.begin(), col_b.begin());
+    auto out = thrust::make_transform_output_iterator(result.begin(), pair_to_col_t<T0, T1>{});
+
+    void *d_tmp_storage {};
+    std::size_t tmp_storage_size{};
+    CubDebugExit(cub::DeviceSelect::If(
+            d_tmp_storage, tmp_storage_size,
+            in, out, thrust::make_discard_iterator(),
+            num_items, select_t<T0, T1>{}));
+
+    thrust::device_vector<char> tmp_storage(tmp_storage_size);
+    d_tmp_storage = thrust::raw_pointer_cast(tmp_storage.data());
+
+    CubDebugExit(cub::DeviceSelect::If(
+            d_tmp_storage, tmp_storage_size,
+            in, out, thrust::make_discard_iterator(),
+            num_items, select_t<T0, T1>{}));
+
+    AssertEquals(num_items, thrust::count(result.begin(), result.end(), target_value));
+}
+
+/**
+ * Test different input sizes for the mixed-type case: a negative num_items
+ * runs the built-in sweep, otherwise only the requested size is tested
+ */
+template <typename T0, typename T1>
+void TestMixed(int num_items)
+{
+    if (num_items >= 0)
+    {
+        TestMixedOp<T0, T1>(num_items);
+        return;
+    }
+
+    static const int sweep_sizes[] = {0, 1, 100, 10000, 1000000};
+    for (unsigned int idx = 0; idx < sizeof(sweep_sizes) / sizeof(sweep_sizes[0]); ++idx)
+    {
+        TestMixedOp<T0, T1>(sweep_sizes[idx]);
+    }
+}
+
+// Uses flag values 1, 2, 3, ... (all non-zero, almost none equal to 1) and expects
+// every item of the counting input 0, 1, 2, ... to be written to the output.
+void TestFlagsNormalization()
+{
+  const int num_items = 1024 * 1024;
+  thrust::device_vector<int> result(num_items);
+  int *d_result = thrust::raw_pointer_cast(result.data());
+
+  cub::CountingInputIterator<int> items_in(0);
+  cub::CountingInputIterator<int> nonzero_flags(1);
+  auto num_out_sink = thrust::make_discard_iterator();
+
+  void *d_tmp_storage{};
+  std::size_t tmp_storage_size{};
+  CubDebugExit(cub::DeviceSelect::Flagged(d_tmp_storage,
+                                          tmp_storage_size,
+                                          items_in,
+                                          nonzero_flags,
+                                          d_result,
+                                          num_out_sink,
+                                          num_items));
+
+  thrust::device_vector<char> tmp_storage(tmp_storage_size);
+  d_tmp_storage = thrust::raw_pointer_cast(tmp_storage.data());
+
+  CubDebugExit(cub::DeviceSelect::Flagged(d_tmp_storage, tmp_storage_size, items_in,
+                                          nonzero_flags, d_result, num_out_sink, num_items));
+
+  AssertTrue(thrust::equal(result.begin(),
+                           result.end(),
+                           thrust::make_counting_iterator(0)));
+}
+
+// DevicePartition::Flagged with the flag pointer aliasing the input: the test expects the
+// non-zero items 1..5 (in order) in the first half of the output and zeros in the second.
+void TestFlagsAliasingInPartition()
+{
+  int h_items[]{0, 1, 0, 2, 0, 3, 0, 4, 0, 5};
+  constexpr int num_items = sizeof(h_items) / sizeof(h_items[0]);
+  constexpr int num_nonzero = num_items / 2;
+  const std::size_t num_bytes = sizeof(h_items);
+
+  int *d_in{};
+  int *d_out{};
+  CubDebugExit(g_allocator.DeviceAllocate((void **)&d_in, num_bytes));
+  CubDebugExit(g_allocator.DeviceAllocate((void **)&d_out, num_bytes));
+  CubDebugExit(cudaMemcpy(d_in, h_items, num_bytes, cudaMemcpyHostToDevice));
+
+  // alias flags and keys
+  int *d_flags = d_in;
+  auto num_out_sink = thrust::make_discard_iterator();
+
+  void *d_tmp_storage{};
+  std::size_t tmp_storage_size{};
+  CubDebugExit(cub::DevicePartition::Flagged(d_tmp_storage,
+                                             tmp_storage_size,
+                                             d_in,
+                                             d_flags,
+                                             d_out,
+                                             num_out_sink,
+                                             num_items));
+
+  thrust::device_vector<char> tmp_storage(tmp_storage_size);
+  d_tmp_storage = thrust::raw_pointer_cast(tmp_storage.data());
+
+  CubDebugExit(cub::DevicePartition::Flagged(d_tmp_storage,
+                                             tmp_storage_size,
+                                             d_in,
+                                             d_flags,
+                                             d_out,
+                                             num_out_sink,
+                                             num_items));
+
+  // The output splits at num_nonzero: selected items first, the rest after
+  int *d_rejected_begin = d_out + num_nonzero;
+  AssertTrue(thrust::equal(thrust::device,
+                           d_out,
+                           d_rejected_begin,
+                           thrust::make_counting_iterator(1)));
+  AssertEquals(
+    thrust::count(thrust::device, d_rejected_begin, d_out + num_items, 0),
+    num_nonzero);
+
+  CubDebugExit(g_allocator.DeviceFree(d_out));
+  CubDebugExit(g_allocator.DeviceFree(d_in));
+}
+
+struct Odd // predicate: true when v is not divisible by 2 (negative odd values included)
+{
+  __host__ __device__ bool operator()(int v) const { return (v % 2) != 0; }
+};
+
+// Checks the in-place DeviceSelect::If overload (no separate output pointer) against
+// the out-of-place overload on freshly shuffled data: same count, same selected items.
+void TestIfInPlace()
+{
+  const int num_items = 4 * 1024 * 1024;
+  const int num_iters = 42;
+
+  thrust::device_vector<int> num_out(1);
+  thrust::device_vector<int> data(num_items);
+  thrust::device_vector<int> reference(num_items);
+  thrust::device_vector<int> reference_out(1);
+
+  thrust::sequence(data.begin(), data.end());
+
+  int *d_num_out = thrust::raw_pointer_cast(num_out.data());
+  int *d_data = thrust::raw_pointer_cast(data.data());
+  int *d_reference = thrust::raw_pointer_cast(reference.data());
+  int *d_reference_out = thrust::raw_pointer_cast(reference_out.data());
+
+  Odd op{};
+
+  // Temporary storage is sized with the in-place overload and reused for both calls
+  void *d_tmp_storage{};
+  std::size_t tmp_storage_size{};
+  CubDebugExit(cub::DeviceSelect::If(d_tmp_storage,
+                                     tmp_storage_size,
+                                     d_data,
+                                     d_num_out,
+                                     num_items,
+                                     op));
+
+  thrust::device_vector<char> tmp_storage(tmp_storage_size);
+  d_tmp_storage = thrust::raw_pointer_cast(tmp_storage.data());
+
+  thrust::default_random_engine g{};
+
+  for (int rounds_left = num_iters; rounds_left > 0; --rounds_left)
+  {
+    thrust::shuffle(data.begin(), data.end(), g);
+
+    // Reference: out-of-place selection from data into reference
+    CubDebugExit(cub::DeviceSelect::If(d_tmp_storage,
+                                       tmp_storage_size,
+                                       d_data,
+                                       d_reference,
+                                       d_reference_out,
+                                       num_items,
+                                       op));
+
+    // Under test: the same selection performed in place on data
+    CubDebugExit(cub::DeviceSelect::If(d_tmp_storage,
+                                       tmp_storage_size,
+                                       d_data,
+                                       d_num_out,
+                                       num_items,
+                                       op));
+
+    AssertEquals(num_out, reference_out);
+
+    const int num_selected = num_out[0];
+    const bool match_reference =
+      thrust::equal(reference.begin(), reference.begin() + num_selected, data.begin());
+    AssertTrue(match_reference);
+  }
+}
+
+// Checks the in-place DeviceSelect::Flagged overload with randomly placed flags: the
+// reported count must match, and every item kept must be one whose flag was set.
+void TestFlaggedInPlace()
+{
+  const int num_items = 4 * 1024 * 1024;
+  const int num_iters = 42;
+
+  thrust::device_vector<int> num_out(1);
+  thrust::device_vector<int> data(num_items);
+  thrust::device_vector<bool> flags(num_items);
+
+  int h_num_out{};
+  int *d_num_out = thrust::raw_pointer_cast(num_out.data());
+  int *d_data = thrust::raw_pointer_cast(data.data());
+  bool *d_flags = thrust::raw_pointer_cast(flags.data());
+
+  void *d_tmp_storage{};
+  std::size_t tmp_storage_size{};
+  CubDebugExit(
+    cub::DeviceSelect::Flagged(d_tmp_storage,
+                               tmp_storage_size,
+                               d_data,
+                               d_flags,
+                               d_num_out,
+                               num_items));
+
+  thrust::device_vector<char> tmp_storage(tmp_storage_size);
+  d_tmp_storage = thrust::raw_pointer_cast(tmp_storage.data());
+
+  thrust::default_random_engine g{};
+  for (int iter = 0; iter < num_iters; iter++)
+  {
+    const int num_selected = RandomValue(num_items);
+
+    thrust::sequence(data.begin(), data.end());
+    thrust::fill(flags.begin(), flags.begin() + num_selected, true);
+    thrust::fill(flags.begin() + num_selected, flags.end(), false);
+    thrust::shuffle(flags.begin(), flags.end(), g);
+
+    CubDebugExit(
+      cub::DeviceSelect::Flagged(d_tmp_storage,
+                                 tmp_storage_size,
+                                 d_data,
+                                 d_flags,
+                                 d_num_out,
+                                 num_items));
+
+    // Read back the count; check the copy like every other CUDA call in this file
+    CubDebugExit(cudaMemcpy(&h_num_out, d_num_out, sizeof(int), cudaMemcpyDeviceToHost));
+    AssertEquals(num_selected, h_num_out);
+
+    // data[i] was its own index before selection, so each kept value indexes its flag
+    auto selection_perm_begin = thrust::make_permutation_iterator(flags.begin(),
+                                                                  data.begin());
+    auto selection_perm_end = selection_perm_begin + num_selected;
+    AssertEquals(num_selected,
+                 thrust::count(selection_perm_begin, selection_perm_end, true));
+  }
+}
+
+// Checks in-place DeviceSelect::Flagged when the flag pointer aliases the data itself
+// (non-zero items flag themselves). The reference run is out-of-place and reads its
+// flags from a separate copy of the data, so it does not depend on aliasing.
+void TestFlaggedInPlaceWithAliasedFlags()
+{
+  const int num_items = 1024 * 1024;
+  const int num_iters = 42;
+
+  thrust::device_vector<int> num_out(1);
+  thrust::device_vector<int> data(num_items);
+  thrust::device_vector<int> reference(num_items);
+  thrust::device_vector<int> flags(num_items);
+
+  int h_num_out{};
+  int *d_num_out = thrust::raw_pointer_cast(num_out.data());
+  int *d_data = thrust::raw_pointer_cast(data.data());
+  int *d_flags = d_data; // alias
+  int *d_allocated_flags = thrust::raw_pointer_cast(flags.data()); // separate storage
+  int *d_reference = thrust::raw_pointer_cast(reference.data());
+
+  void *d_tmp_storage{};
+  std::size_t tmp_storage_size{};
+  CubDebugExit(
+    cub::DeviceSelect::Flagged(d_tmp_storage,
+                               tmp_storage_size,
+                               d_data,
+                               d_flags,
+                               d_num_out,
+                               num_items));
+
+  thrust::device_vector<char> tmp_storage(tmp_storage_size);
+  d_tmp_storage = thrust::raw_pointer_cast(tmp_storage.data());
+
+  thrust::default_random_engine g{};
+  for (int iter = 0; iter < num_iters; iter++)
+  {
+    const int num_selected = RandomValue(num_items);
+    thrust::sequence(data.begin(), data.begin() + num_selected, 1);
+    thrust::fill(data.begin() + num_selected, data.end(), 0);
+    thrust::shuffle(data.begin(), data.end(), g);
+
+    // Snapshot the shuffled data into the separately allocated flag array
+    CubDebugExit(cudaMemcpy(d_allocated_flags, d_data, sizeof(int) * num_items, cudaMemcpyDeviceToDevice));
+    CubDebugExit(
+      cub::DeviceSelect::Flagged(d_tmp_storage,
+                                 tmp_storage_size,
+                                 d_data,      // in
+                                 d_allocated_flags,
+                                 d_reference, // out
+                                 d_num_out,
+                                 num_items));
+
+    CubDebugExit(
+      cub::DeviceSelect::Flagged(d_tmp_storage,
+                                 tmp_storage_size,
+                                 d_data,
+                                 d_flags,
+                                 d_num_out,
+                                 num_items));
+
+    CubDebugExit(cudaMemcpy(&h_num_out, d_num_out, sizeof(int), cudaMemcpyDeviceToHost));
+    AssertEquals(num_selected, h_num_out);
+
+    const bool match_reference =
+      thrust::equal(reference.begin(), reference.begin() + num_selected, data.begin());
+    AssertTrue(match_reference);
+  }
+}
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+
+/**
+ * Main: parses the command line, initializes the device and runs every test
+ */
+int main(int argc, char** argv)
+{
+    int num_items           = -1;       // negative: each Test<T>() runs its built-in size sweep
+    float select_ratio      = 0.5;
+
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    args.GetCmdLineArgument("n", num_items);
+    args.GetCmdLineArgument("i", g_timing_iterations);
+    args.GetCmdLineArgument("ratio", select_ratio);     // parsed, but not passed to any test below
+
+    // Print usage
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--n=<input items>] "
+            "[--i=<timing iterations>] "
+            "[--device=<device-id>] "
+            "[--ratio=<selection ratio, default 0.5>] "
+            "[--v] "
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+    g_device_giga_bandwidth = args.device_giga_bandwidth;
+    printf("\n");
+
+    // %PARAM% TEST_CDP cdp 0:1
+
+    TestFlagsAliasingInPartition();
+
+    TestFlaggedInPlace();
+    TestFlaggedInPlaceWithAliasedFlags();
+    TestIfInPlace();
+
+    Test<unsigned char>(num_items);
+    Test<unsigned short>(num_items);
+    Test<unsigned int>(num_items);
+    Test<unsigned long long>(num_items);
+
+    Test<uchar2>(num_items);
+    Test<ushort2>(num_items);
+    Test<uint2>(num_items);
+    Test<ulonglong2>(num_items);
+
+    Test<uchar4>(num_items);
+    Test<ushort4>(num_items);
+    Test<uint4>(num_items);
+    Test<ulonglong4>(num_items);
+
+    Test<TestFoo>(num_items);
+    Test<TestBar>(num_items);
+
+    TestMixed<int, double>(num_items);
+    TestFlagsNormalization();
+
+    return 0;
+}
+
+
+
diff --git a/include/cub/test/test_device_select_unique.cu b/include/cub/test/test_device_select_unique.cu
new file mode 100644
index 0000000..fc4563b
--- /dev/null
+++ b/include/cub/test/test_device_select_unique.cu
@@ -0,0 +1,616 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Test of DeviceSelect::Unique utilities
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <cub/device/device_select.cuh>
+#include <cub/iterator/counting_input_iterator.cuh>
+#include <cub/util_allocator.cuh>
+
+#include <thrust/device_vector.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/discard_iterator.h>
+#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
+
+#include "test_util.h"
+
+#include <cstdio>
+#include <typeinfo>
+
+using namespace cub;
+
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+bool                    g_verbose               = false;   // when true, Initialize() also prints the generated input
+int                     g_timing_iterations     = 0;       // iteration count handed to Dispatch() for the timed run
+float                   g_device_giga_bandwidth;           // NOTE(review): no assignment visible here; presumably set in main() - confirm
+CachingDeviceAllocator  g_allocator(true);                 // allocator used for the tests' device buffers
+
+// Dispatch types: wrapped in Int2Type<...> to pick one of the Dispatch() overloads
+enum Backend
+{
+    CUB,        // CUB method
+    CDP,        // GPU-based (dynamic parallelism) dispatch to CUB method
+};
+
+
+//---------------------------------------------------------------------
+// Dispatch to different CUB DeviceSelect entrypoints
+//---------------------------------------------------------------------
+
+
+/**
+ * Dispatch to unique entrypoint; returns the last DeviceSelect::Unique status (cudaSuccess if none ran)
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CUB>               /*dispatch_to*/,
+    int                         timing_timing_iterations,
+    size_t                      */*d_temp_storage_bytes*/,
+    cudaError_t                 */*d_cdp_error*/,
+
+    void*                       d_temp_storage,
+    size_t                      &temp_storage_bytes,
+    InputIteratorT              d_in,
+    OutputIteratorT             d_out,
+    NumSelectedIteratorT        d_num_selected_out,
+    OffsetT                     num_items)
+{
+    cudaError_t status = cudaSuccess;
+    for (int remaining = timing_timing_iterations; remaining > 0; --remaining)
+    {
+        status = DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items);
+    }
+    return status;
+}
+
+//---------------------------------------------------------------------
+// CUDA Nested Parallelism Test Kernel
+//---------------------------------------------------------------------
+
+#if TEST_CDP == 1
+
+/**
+ * Simple wrapper kernel: forwards its arguments to Dispatch() and stores the returned status in *d_cdp_error
+ */
+template <int CubBackend,
+          typename InputIteratorT,
+          typename OutputIteratorT,
+          typename NumSelectedIteratorT,
+          typename OffsetT>
+__global__ void CDPDispatchKernel(Int2Type<CubBackend> cub_backend,
+                                  int                  timing_timing_iterations,
+                                  size_t              *d_temp_storage_bytes,
+                                  cudaError_t         *d_cdp_error,
+
+                                  void                *d_temp_storage,
+                                  size_t               temp_storage_bytes,
+                                  InputIteratorT       d_in,
+                                  OutputIteratorT      d_out,
+                                  NumSelectedIteratorT d_num_selected_out,
+                                  OffsetT              num_items)
+{
+  *d_cdp_error = Dispatch(cub_backend,
+                          timing_timing_iterations,
+                          d_temp_storage_bytes,
+                          d_cdp_error,
+                          d_temp_storage,
+                          temp_storage_bytes,
+                          d_in,
+                          d_out,
+                          d_num_selected_out,
+                          num_items);
+  // temp_storage_bytes is a by-value parameter here; publish its value through the pointer
+  *d_temp_storage_bytes = temp_storage_bytes;
+}
+
+/**
+ * Dispatch to CDP kernel: launches CDPDispatchKernel and copies its results back
+ */
+template <typename InputIteratorT,
+          typename OutputIteratorT,
+          typename NumSelectedIteratorT,
+          typename OffsetT>
+cudaError_t Dispatch(Int2Type<CDP> /*dispatch_to*/,
+                     int          timing_timing_iterations,
+                     size_t      *d_temp_storage_bytes,
+                     cudaError_t *d_cdp_error,
+
+                     void                *d_temp_storage,
+                     size_t              &temp_storage_bytes,
+                     InputIteratorT       d_in,
+                     OutputIteratorT      d_out,
+                     NumSelectedIteratorT d_num_selected_out,
+                     OffsetT              num_items)
+{
+  // The kernel instantiation that forwards to the CUB overload of Dispatch
+  auto kernel = CDPDispatchKernel<CUB,
+                                  InputIteratorT,
+                                  OutputIteratorT,
+                                  NumSelectedIteratorT,
+                                  OffsetT>;
+
+  // Invoke the kernel, which performs the dispatch on the device
+  const cudaError_t launch_status =
+    thrust::cuda_cub::launcher::triple_chevron(1, 1, 0, 0)
+      .doit(kernel,
+            Int2Type<CUB>{},
+            timing_timing_iterations,
+            d_temp_storage_bytes,
+            d_cdp_error,
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_out,
+            d_num_selected_out,
+            num_items);
+  CubDebugExit(launch_status);
+
+  // Copy out temp_storage_bytes
+  CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes,
+                          sizeof(size_t) * 1, cudaMemcpyDeviceToHost));
+
+  // Copy out the status recorded by the kernel and hand it to the caller
+  cudaError_t cdp_status = cudaSuccess;
+  CubDebugExit(cudaMemcpy(&cdp_status, d_cdp_error,
+                          sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
+  return cdp_status;
+}
+
+#endif // TEST_CDP
+
+//---------------------------------------------------------------------
+// Test generation
+//---------------------------------------------------------------------
+
+
+/**
+ * Initialize problem: fills h_in with runs of equal values, using key 0 for the
+ * first run and the next integer key for each following run
+ */
+template <typename T>
+void Initialize(
+    int         entropy_reduction,
+    T           *h_in,
+    int         num_items,
+    int         max_segment)
+{
+    const unsigned int max_int = (unsigned int) -1;
+
+    int key = 0;
+    for (int run_begin = 0; run_begin < num_items; ++key)
+    {
+        // Run length: all items (negative max_segment), exactly one (max_segment 0 or 1),
+        // or a length derived from RandomBits() and max_segment, clamped to at least one
+        int repeat;
+        if (max_segment < 0)
+        {
+            repeat = num_items;
+        }
+        else if (max_segment < 2)
+        {
+            repeat = 1;
+        }
+        else
+        {
+            RandomBits(repeat, entropy_reduction);
+            const double scaled = (double(repeat) * double(max_segment)) / double(max_int);
+            repeat = CUB_MAX(1, (int) scaled);
+        }
+
+        // Never write past the end of the array
+        const int run_end = CUB_MIN(run_begin + repeat, num_items);
+        int j = run_begin;
+        for (; j < run_end; ++j)
+        {
+            InitValue(INTEGER_SEED, h_in[j], key);
+        }
+        run_begin = j;
+    }
+
+    if (g_verbose)
+    {
+        printf("Input:\n");
+        DisplayResults(h_in, num_items);
+        printf("\n\n");
+    }
+}
+
+
+/**
+ * Solve unique problem: copies to h_reference the first item of every run of
+ * consecutive equal items in h_in and returns how many items were copied
+ */
+template <
+    typename        InputIteratorT,
+    typename        T>
+int Solve(
+    InputIteratorT  h_in,
+    T               *h_reference,
+    int             num_items)
+{
+    int num_selected = 0;
+    for (int i = 0; i < num_items; ++i)
+    {
+        // Item 0 always opens a run; any later item opens one only when it
+        // differs from its predecessor
+        const bool starts_run = (i == 0) || (h_in[i] != h_in[i - 1]);
+        if (!starts_run)
+        {
+            continue;
+        }
+
+        h_reference[num_selected] = h_in[i];
+        num_selected++;
+    }
+
+    return num_selected;
+}
+
+
+
+/**
+ * Run the BACKEND dispatch on d_in, compare the device output with the host reference, then time it
+ */
+template <
+    Backend             BACKEND,
+    typename            DeviceInputIteratorT,
+    typename            T>
+void Test(
+    DeviceInputIteratorT d_in,
+    T                   *h_reference,
+    int                 num_selected,
+    int                 num_items)
+{
+    // Device destinations: the compacted items and the count of selected items
+    T       *d_unique_out = NULL;
+    int     *d_count_out = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_unique_out, sizeof(T) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_count_out, sizeof(int)));
+
+    // Device scalars handed to every Dispatch call (named for the CDP path)
+    size_t          *d_scratch_bytes = NULL;
+    cudaError_t     *d_cdp_status = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_scratch_bytes,   sizeof(size_t)));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_status,      sizeof(cudaError_t)));
+
+    // First Dispatch runs with a NULL scratch pointer; scratch_bytes is then used as the allocation size
+    void            *d_scratch = NULL;
+    size_t          scratch_bytes = 0;
+    CubDebugExit(Dispatch(Int2Type<BACKEND>(), 1, d_scratch_bytes, d_cdp_status, d_scratch, scratch_bytes, d_in, d_unique_out, d_count_out, num_items));
+    CubDebugExit(g_allocator.DeviceAllocate(&d_scratch, scratch_bytes));
+
+    // Zero both outputs before the checked run
+    CubDebugExit(cudaMemset(d_unique_out, 0, sizeof(T) * num_items));
+    CubDebugExit(cudaMemset(d_count_out, 0, sizeof(int)));
+
+    // Single iteration whose results are verified below
+    CubDebugExit(Dispatch(Int2Type<BACKEND>(), 1, d_scratch_bytes, d_cdp_status, d_scratch, scratch_bytes, d_in, d_unique_out, d_count_out, num_items));
+
+    // Compare against the host reference; a nonzero result is reported as FAIL
+    int data_mismatch = CompareDeviceResults(h_reference, d_unique_out, num_selected, true, g_verbose);
+    printf("\t Data %s ", (data_mismatch == 0) ? "PASS" : "FAIL");
+
+    int count_mismatch = CompareDeviceResults(&num_selected, d_count_out, 1, true, g_verbose);
+    printf("\t Count %s ", (count_mismatch == 0) ? "PASS" : "FAIL");
+
+    // Push out anything buffered so far
+    fflush(stdout);
+    fflush(stderr);
+
+    // Timed run of g_timing_iterations iterations
+    GpuTimer timer;
+    timer.Start();
+    CubDebugExit(Dispatch(Int2Type<BACKEND>(), g_timing_iterations, d_scratch_bytes, d_cdp_status, d_scratch, scratch_bytes, d_in, d_unique_out, d_count_out, num_items));
+    timer.Stop();
+    float elapsed_millis = timer.ElapsedMillis();
+
+    // Report throughput only when timing was requested
+    if (g_timing_iterations > 0)
+    {
+        float ms_per_iter   = elapsed_millis / g_timing_iterations;
+        float items_rate    = float(num_items) / ms_per_iter / 1000.0f / 1000.0f;
+        float bytes_rate    = float((num_items + num_selected) * sizeof(T)) / ms_per_iter / 1000.0f / 1000.0f;
+        printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s, %.1f%% peak", ms_per_iter, items_rate, bytes_rate, bytes_rate / g_device_giga_bandwidth * 100.0);
+    }
+    printf("\n\n");
+
+    // Push out the report
+    fflush(stdout);
+    fflush(stderr);
+
+    // Return every device buffer to the allocator
+    if (d_unique_out) CubDebugExit(g_allocator.DeviceFree(d_unique_out));
+    if (d_count_out) CubDebugExit(g_allocator.DeviceFree(d_count_out));
+    if (d_scratch_bytes) CubDebugExit(g_allocator.DeviceFree(d_scratch_bytes));
+    if (d_cdp_status) CubDebugExit(g_allocator.DeviceFree(d_cdp_status));
+    if (d_scratch) CubDebugExit(g_allocator.DeviceFree(d_scratch));
+
+    // Both comparisons must have reported zero
+    AssertEquals(0, data_mismatch | count_mismatch);
+}
+
+
+/**
+ * Generate a host input of num_items T values, solve it on the host, upload it, and run Test on the device pointer
+ */
+template <
+    Backend         BACKEND,
+    typename        T>
+void TestPointer(
+    int             num_items,
+    int             entropy_reduction,
+    int             max_segment)
+{
+    // Host buffers: generated input and expected output
+    T*  h_items    = new T[num_items];
+    T*  h_expected = new T[num_items];
+
+    // Generate the input, then compute the expected selection on the host
+    Initialize(entropy_reduction, h_items, num_items, max_segment);
+    int expected_count = Solve(h_items, h_expected, num_items);
+
+    printf("\nPointer %s cub::DeviceSelect::Unique %d items, %d selected (avg run length %.3f), %s %d-byte elements, entropy_reduction %d\n",
+        (BACKEND != CDP) ? "CUB" : "CDP CUB",
+        num_items, expected_count, float(num_items) / expected_count,
+        typeid(T).name(),
+        (int) sizeof(T),
+        entropy_reduction);
+    fflush(stdout);
+
+    // Device copy of the input
+    T *d_items = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_items, sizeof(T) * num_items));
+
+    // Upload the generated input
+    CubDebugExit(cudaMemcpy(d_items, h_items, sizeof(T) * num_items, cudaMemcpyHostToDevice));
+
+    // Verify and time the device implementation
+    Test<BACKEND>(d_items, h_expected, expected_count, num_items);
+
+    // Release host and device buffers (delete[] accepts null, so no guard is needed)
+    delete[] h_items;
+    delete[] h_expected;
+    if (d_items) CubDebugExit(g_allocator.DeviceFree(d_items));
+}
+
+
+/**
+ * Run Test with a CountingInputIterator starting at 0 as the input instead of a device array
+ */
+template <
+    Backend         BACKEND,
+    typename        T>
+void TestIterator(
+    int             num_items)
+{
+    // Input sequence comes from the iterator; no input buffer is allocated
+    CountingInputIterator<T, int> counting_in(0);
+
+    // Host buffer for the expected output
+    T*  h_expected = new T[num_items];
+
+    // Compute the expected selection on the host
+    int expected_count = Solve(counting_in, h_expected, num_items);
+
+    printf("\nIterator %s cub::DeviceSelect::Unique %d items, %d selected (avg run length %.3f), %s %d-byte elements\n",
+        (BACKEND != CDP) ? "CUB" : "CDP CUB",
+        num_items, expected_count, float(num_items) / expected_count,
+        typeid(T).name(),
+        (int) sizeof(T));
+    fflush(stdout);
+
+    // Verify and time the device implementation
+    Test<BACKEND>(counting_in, h_expected, expected_count, num_items);
+
+    // Release the host buffer (delete[] accepts null, so no guard is needed)
+    delete[] h_expected;
+}
+
+
+/**
+ * Test different gen modes: max segment lengths 1, 11, 121, ... (below num_items) at entropy reductions 0, 2 and 7
+ */
+template <
+    Backend         BACKEND,
+    typename        T>
+void Test(
+    int             num_items)
+{
+    for (long long max_segment = 1; max_segment < num_items; max_segment *= 11)    // 64-bit counter: an int overflowed (UB) on `*= 11` once num_items exceeded 11^8
+    {
+        TestPointer<BACKEND, T>(num_items, 0, static_cast<int>(max_segment));       // always < num_items, so the narrowing is lossless
+        TestPointer<BACKEND, T>(num_items, 2, static_cast<int>(max_segment));
+        TestPointer<BACKEND, T>(num_items, 7, static_cast<int>(max_segment));
+    }
+}
+
+
+/**
+ * Forward to Test with the backend chosen at compile time by TEST_CDP (1: CDP, 0 or undefined: CUB)
+ */
+template <
+    typename        T>
+void TestOp(
+    int             num_items)
+{
+#if TEST_CDP == 1
+    Test<CDP, T>(num_items);
+#elif TEST_CDP == 0
+    Test<CUB, T>(num_items);
+#endif // TEST_CDP
+}
+
+
+/**
+ * Run TestOp<T> on num_items, or on a fixed sweep of sizes when num_items is negative
+ */
+template <typename T>
+void Test(
+    int             num_items)
+{
+    // A non-negative size was requested: test exactly that size
+    if (num_items >= 0)
+    {
+        TestOp<T>(num_items);
+        return;
+    }
+    // Otherwise walk the built-in sizes in increasing order
+    const int sweep_sizes[] = {0, 1, 100, 10000, 1000000};
+    for (int s = 0; s < 5; ++s)
+    {
+        TestOp<T>(sweep_sizes[s]);
+    }
+}
+
+template <typename T>
+void TestIteratorOp(int num_items)
+{
+  void *d_scratch{};
+  std::size_t scratch_bytes{};
+
+  thrust::device_vector<int> d_count(1);
+  // Input counts up from T(0); the selected items themselves are discarded
+  auto counting_in = thrust::make_counting_iterator(static_cast<T>(0));
+  auto discard_out = thrust::make_discard_iterator();
+  // First call: d_scratch is still null; scratch_bytes is used below as the buffer size
+  CubDebugExit(cub::DeviceSelect::Unique(d_scratch,
+                                         scratch_bytes,
+                                         counting_in,
+                                         discard_out,
+                                         d_count.begin(),
+                                         num_items));
+
+  thrust::device_vector<char> scratch(scratch_bytes);
+  d_scratch = thrust::raw_pointer_cast(scratch.data());
+  // Second call: same arguments, now with the scratch buffer in place
+  CubDebugExit(cub::DeviceSelect::Unique(d_scratch,
+                                         scratch_bytes,
+                                         counting_in,
+                                         discard_out,
+                                         d_count.begin(),
+                                         num_items));
+  // The reported count must equal the input length
+  AssertEquals(d_count[0], num_items);
+}
+
+// Run TestIteratorOp<T> on num_items, or on a fixed sweep of sizes when num_items is negative
+template <typename T>
+void TestIterator(int num_items)
+{
+  if (num_items >= 0)
+  {
+    TestIteratorOp<T>(num_items);
+    return;
+  }
+
+  const int sweep_sizes[] = {0, 1, 100, 10000, 1000000};
+  for (int s = 0; s < 5; ++s)
+  {
+    TestIteratorOp<T>(sweep_sizes[s]);
+  }
+}
+
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+
+/**
+ * Main: parse the command line, initialize the device, then run the tests for every element type
+ */
+int main(int argc, char** argv)
+{
+    int num_items           = -1;       // negative: Test() sweeps its built-in list of sizes
+    int entropy_reduction   = 0;        // parsed from --entropy, but not passed to any test below
+    int maxseg              = 1000;     // parsed from --maxseg, but not passed to any test below
+
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    args.GetCmdLineArgument("n", num_items);
+    args.GetCmdLineArgument("i", g_timing_iterations);
+    args.GetCmdLineArgument("maxseg", maxseg);
+    args.GetCmdLineArgument("entropy", entropy_reduction);
+
+    // Print usage (every optional argument is bracketed and space-separated)
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--n=<input items>] "
+            "[--i=<timing iterations>] "
+            "[--device=<device-id>] "
+            "[--maxseg=<max segment length>] "
+            "[--entropy=<segment length bit entropy reduction rounds>] "
+            "[--v]"
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+    g_device_giga_bandwidth = args.device_giga_bandwidth;
+    printf("\n");
+
+    // %PARAM% TEST_CDP cdp 0:1
+
+    // Test different input types
+    Test<unsigned char>(num_items);
+    Test<unsigned short>(num_items);
+    Test<unsigned int>(num_items);
+    Test<unsigned long long>(num_items);
+
+    Test<uchar2>(num_items);
+    Test<ushort2>(num_items);
+    Test<uint2>(num_items);
+    Test<ulonglong2>(num_items);
+
+    Test<uchar4>(num_items);
+    Test<ushort4>(num_items);
+    Test<uint4>(num_items);
+    Test<ulonglong4>(num_items);
+
+    Test<TestFoo>(num_items);
+    Test<TestBar>(num_items);
+
+    TestIterator<int>(num_items);
+
+    return 0;
+}
+
+
+
diff --git a/include/cub/test/test_device_select_unique_by_key.cu b/include/cub/test/test_device_select_unique_by_key.cu
new file mode 100644
index 0000000..8bc9913
--- /dev/null
+++ b/include/cub/test/test_device_select_unique_by_key.cu
@@ -0,0 +1,631 @@
+/******************************************************************************
+ * Copyright (c) NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Test of DeviceSelect::UniqueByKey utilities
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <cub/device/device_select.cuh>
+#include <cub/iterator/counting_input_iterator.cuh>
+#include <cub/util_allocator.cuh>
+
+#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
+
+#include "test_util.h"
+
+#include <cstdio>
+#include <typeinfo>
+
+using namespace cub;
+
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+bool                    g_verbose               = false;    // set from the "v" flag in main(); when true, Initialize() prints its input
+int                     g_timing_iterations     = 0;        // set from "i" in main(); iteration count of the timed Dispatch in Test(), 0 skips the report
+int                     g_repeat                = 0;        // set from "repeat" in main(); not read anywhere else in this file
+float                   g_device_giga_bandwidth;            // set from args.device_giga_bandwidth in main(); divisor of the "% peak" figure
+CachingDeviceAllocator  g_allocator(true);                  // used for every DeviceAllocate/DeviceFree in this file
+
+// Dispatch types (TestOp picks one at compile time from TEST_CDP)
+enum Backend
+{
+    CUB,        // CUB method
+    CDP,        // GPU-based (dynamic parallelism) dispatch to CUB method
+};
+
+
+//---------------------------------------------------------------------
+// Dispatch to different CUB DeviceSelect entrypoints
+//---------------------------------------------------------------------
+
+
+/**
+ * Direct dispatch: calls DeviceSelect::UniqueByKey once per timing iteration and returns the last status
+ */
+template <typename KeyInputIteratorT, typename ValueInputIteratorT, typename KeyOutputIteratorT, typename ValueOutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CUB>               /*dispatch_to*/,
+    int                         timing_timing_iterations,
+    size_t                      */*d_temp_storage_bytes*/,
+    cudaError_t                 */*d_cdp_error*/,
+
+    void*                       d_temp_storage,
+    size_t                      &temp_storage_bytes,
+    KeyInputIteratorT           d_keys_in,
+    ValueInputIteratorT         d_values_in,
+    KeyOutputIteratorT          d_keys_out,
+    ValueOutputIteratorT        d_values_out,
+    NumSelectedIteratorT        d_num_selected_out,
+    OffsetT                     num_items)
+{
+    cudaError_t status = cudaSuccess;    // returned as-is when no iteration runs
+    for (int remaining = timing_timing_iterations; remaining > 0; --remaining)
+    {
+        status = DeviceSelect::UniqueByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_keys_out, d_values_out, d_num_selected_out, num_items);    // overwrites any earlier status
+    }
+    return status;
+}
+
+//---------------------------------------------------------------------
+// CUDA Nested Parallelism Test Kernel
+//---------------------------------------------------------------------
+
+#if TEST_CDP == 1
+
+/**
+ * Kernel wrapper: runs Dispatch on the device and writes its status and temp_storage_bytes to device memory
+ */
+template <int CubBackend,
+          typename KeyInputIteratorT,
+          typename ValueInputIteratorT,
+          typename KeyOutputIteratorT,
+          typename ValueOutputIteratorT,
+          typename NumSelectedIteratorT,
+          typename OffsetT>
+__global__ void CDPDispatchKernel(Int2Type<CubBackend> cub_backend,
+                                  int                  timing_timing_iterations,
+                                  size_t              *d_temp_storage_bytes,
+                                  cudaError_t         *d_cdp_error,
+
+                                  void                *d_temp_storage,
+                                  size_t               temp_storage_bytes,
+                                  KeyInputIteratorT    d_keys_in,
+                                  ValueInputIteratorT  d_values_in,
+                                  KeyOutputIteratorT   d_keys_out,
+                                  ValueOutputIteratorT d_values_out,
+                                  NumSelectedIteratorT d_num_selected_out,
+                                  OffsetT              num_items)
+{
+  const cudaError_t status = Dispatch(cub_backend,
+                                      timing_timing_iterations,
+                                      d_temp_storage_bytes,
+                                      d_cdp_error,
+                                      d_temp_storage,
+                                      temp_storage_bytes,
+                                      d_keys_in,
+                                      d_values_in,
+                                      d_keys_out,
+                                      d_values_out,
+                                      d_num_selected_out,
+                                      num_items);
+  *d_cdp_error          = status;              // published for the host to copy back
+  *d_temp_storage_bytes = temp_storage_bytes;  // Dispatch receives this by reference
+}
+
+/**
+ * CDP dispatch: launches CDPDispatchKernel, then copies the temp-storage size and the status back to the host
+ */
+template <typename KeyInputIteratorT,
+          typename ValueInputIteratorT,
+          typename KeyOutputIteratorT,
+          typename ValueOutputIteratorT,
+          typename NumSelectedIteratorT,
+          typename OffsetT>
+cudaError_t Dispatch(Int2Type<CDP> /*dispatch_to*/,
+                     int          timing_timing_iterations,
+                     size_t      *d_temp_storage_bytes,
+                     cudaError_t *d_cdp_error,
+
+                     void                *d_temp_storage,
+                     size_t              &temp_storage_bytes,
+                     KeyInputIteratorT    d_keys_in,
+                     ValueInputIteratorT  d_values_in,
+                     KeyOutputIteratorT   d_keys_out,
+                     ValueOutputIteratorT d_values_out,
+                     NumSelectedIteratorT d_num_selected_out,
+                     OffsetT              num_items)
+{
+  // Launch the wrapper kernel (launch configuration 1, 1, 0, 0) so the CUB dispatch runs on the device
+  cudaError_t status =
+    thrust::cuda_cub::launcher::triple_chevron(1, 1, 0, 0)
+      .doit(CDPDispatchKernel<CUB,
+                              KeyInputIteratorT,
+                              ValueInputIteratorT,
+                              KeyOutputIteratorT,
+                              ValueOutputIteratorT,
+                              NumSelectedIteratorT,
+                              OffsetT>,
+            Int2Type<CUB>{},
+            timing_timing_iterations,
+            d_temp_storage_bytes,
+            d_cdp_error,
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys_in,
+            d_values_in,
+            d_keys_out,
+            d_values_out,
+            d_num_selected_out,
+            num_items);
+  CubDebugExit(status);
+
+  // Fetch the temp-storage size the kernel recorded
+  CubDebugExit(cudaMemcpy(&temp_storage_bytes,
+                          d_temp_storage_bytes,
+                          sizeof(size_t),
+                          cudaMemcpyDeviceToHost));
+
+  // Fetch the status the kernel recorded; it replaces the launch status as the return value
+  CubDebugExit(cudaMemcpy(&status,
+                          d_cdp_error,
+                          sizeof(cudaError_t),
+                          cudaMemcpyDeviceToHost));
+  return status;
+}
+
+#endif // TEST_CDP
+
+//---------------------------------------------------------------------
+// Test generation
+//---------------------------------------------------------------------
+
+
+/**
+ * Fill h_in run by run: every item of a run is set via InitValue with the same key, and the key grows by one per run
+ */
+template <typename T>
+void Initialize(
+    int         entropy_reduction,
+    T           *h_in,
+    int         num_items,
+    int         max_segment)
+{
+    const unsigned int max_int = (unsigned int) -1;
+
+    // `pos` is the next unfilled slot; `key` is bumped once per run
+    for (int key = 0, pos = 0; pos < num_items; ++key)
+    {
+        // Choose the length of the current run
+        int run_length;
+        if (max_segment < 0)
+        {
+            run_length = num_items;     // one run long enough to cover the whole input
+        }
+        else if (max_segment < 2)
+        {
+            run_length = 1;             // every item is its own run
+        }
+        else
+        {
+            // Scale the random bits by max_segment / max_int, then force a length of at least 1
+            RandomBits(run_length, entropy_reduction);
+            run_length = (int) ((double(run_length) * double(max_segment)) / double(max_int));
+            run_length = CUB_MAX(1, run_length);
+        }
+
+        // Write the run, clamped to the end of the array
+        const int run_end = CUB_MIN(pos + run_length, num_items);
+        for (; pos < run_end; ++pos)
+        {
+            InitValue(INTEGER_SEED, h_in[pos], key);
+        }
+    }
+
+    // Dump the generated input only in verbose mode
+    if (!g_verbose)
+    {
+        return;
+    }
+
+    printf("Input:\n");
+    DisplayResults(h_in, num_items);
+    printf("\n\n");
+}
+
+
+/**
+ * Host reference for unique-by-key: copies the first key/value pair of each run of equal neighboring keys
+ */
+template <
+    typename        KeyInputIteratorT,
+    typename        ValueInputIteratorT,
+    typename        KeyT,
+    typename        ValueT>
+int Solve(
+    KeyInputIteratorT    h_keys_in,
+    ValueInputIteratorT  h_values_in,
+    KeyT                 *h_keys_reference,
+    ValueT               *h_values_reference,
+    int                  num_items)
+{
+    // Nothing is selected from an empty input
+    if (num_items <= 0)
+    {
+        return 0;
+    }
+
+    // The first pair always opens a run
+    h_keys_reference[0]   = h_keys_in[0];
+    h_values_reference[0] = h_values_in[0];
+    int out_count = 1;
+
+    for (int idx = 1; idx < num_items; ++idx)
+    {
+        if (!(h_keys_in[idx] != h_keys_in[idx - 1])) continue;    // same run as its predecessor
+        h_keys_reference[out_count]   = h_keys_in[idx];
+        h_values_reference[out_count] = h_values_in[idx];
+        ++out_count;
+    }
+    return out_count;
+}
+
+
+
+/**
+ * Test DeviceSelect::UniqueByKey for a given problem input: verify keys, values and count, then time it
+ */
+template <
+    Backend             BACKEND,
+    typename            KeyInputIteratorT,
+    typename            ValueInputIteratorT,
+    typename            KeyT,
+    typename            ValueT>
+void Test(
+    KeyInputIteratorT    d_keys_in,
+    ValueInputIteratorT  d_values_in,
+    KeyT                 *h_keys_reference,
+    ValueT               *h_values_reference,
+    int                  num_selected,
+    int                  num_items)
+{
+    // Allocate device output arrays (keys, values) and num selected
+    KeyT    *d_keys_out = NULL;
+    ValueT  *d_values_out = NULL;
+    int     *d_num_selected_out = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys_out, sizeof(KeyT) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values_out, sizeof(ValueT) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int)));
+
+    // Allocate CDP device arrays (only the CDP Dispatch overload reads them)
+    size_t          *d_temp_storage_bytes = NULL;
+    cudaError_t     *d_cdp_error = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes,  sizeof(size_t) * 1));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error,           sizeof(cudaError_t) * 1));
+
+    // Allocate temporary storage: the first Dispatch runs with d_temp_storage == NULL to obtain temp_storage_bytes
+    void            *d_temp_storage = NULL;
+    size_t          temp_storage_bytes = 0;
+    CubDebugExit(Dispatch(Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_keys_out, d_values_out, d_num_selected_out, num_items));
+    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
+
+    // Clear device output arrays
+    CubDebugExit(cudaMemset(d_keys_out, 0, sizeof(KeyT) * num_items));
+    CubDebugExit(cudaMemset(d_values_out, 0, sizeof(ValueT) * num_items));
+    CubDebugExit(cudaMemset(d_num_selected_out, 0, sizeof(int)));
+
+    // Run warmup/correctness iteration
+    CubDebugExit(Dispatch(Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_keys_out, d_values_out, d_num_selected_out, num_items));
+
+    // Check for correctness (and display results, if specified); a nonzero compare result means mismatch
+    int compare11 = CompareDeviceResults(h_keys_reference, d_keys_out, num_selected, true, g_verbose);
+    int compare12 = CompareDeviceResults(h_values_reference, d_values_out, num_selected, true, g_verbose);
+    int compare1 = compare11 || compare12;    // data fails when EITHER keys or values mismatch (was &&, which hid single-array failures)
+    printf("\t Data %s ", compare1 ? "FAIL" : "PASS");
+
+    int compare2 = CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose);
+    printf("\t Count %s ", compare2 ? "FAIL" : "PASS");
+
+    // Flush any stdout/stderr
+    fflush(stdout);
+    fflush(stderr);
+
+    // Performance: g_timing_iterations back-to-back invocations between the timer marks
+    GpuTimer gpu_timer;
+    gpu_timer.Start();
+    CubDebugExit(Dispatch(Int2Type<BACKEND>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_keys_out, d_values_out, d_num_selected_out, num_items));
+    gpu_timer.Stop();
+    float elapsed_millis = gpu_timer.ElapsedMillis();
+
+    // Display performance (skipped when no timing iterations were requested)
+    if (g_timing_iterations > 0)
+    {
+        float avg_millis        = elapsed_millis / g_timing_iterations;
+        float giga_rate         = float(num_items) / avg_millis / 1000.0f / 1000.0f;
+        float giga_bandwidth    = float((num_items + num_selected) * (sizeof(KeyT) + sizeof(ValueT))) / avg_millis / 1000.0f / 1000.0f;
+        printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s, %.1f%% peak", avg_millis, giga_rate, giga_bandwidth, giga_bandwidth / g_device_giga_bandwidth * 100.0);
+    }
+    printf("\n\n");
+
+    // Flush any stdout/stderr
+    fflush(stdout);
+    fflush(stderr);
+
+    // Cleanup
+    if (d_keys_out) CubDebugExit(g_allocator.DeviceFree(d_keys_out));
+    if (d_values_out) CubDebugExit(g_allocator.DeviceFree(d_values_out));
+    if (d_num_selected_out) CubDebugExit(g_allocator.DeviceFree(d_num_selected_out));
+    if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
+    if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
+    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
+
+    // Correctness asserts: both the data check and the count check must be zero
+    AssertEquals(0, compare1 | compare2);
+}
+
+
+/**
+ * Generate host keys and values, solve on the host, upload both arrays, and run Test on the device pointers
+ */
+template <
+    Backend         BACKEND,
+    typename        KeyT,
+    typename        ValueT>
+void TestPointer(
+    int             num_items,
+    int             entropy_reduction,
+    int             max_segment)
+{
+    // Host buffers: generated inputs and expected outputs
+    KeyT*    h_keys       = new KeyT[num_items];
+    ValueT*  h_values     = new ValueT[num_items];
+    KeyT*    h_keys_ref   = new KeyT[num_items];
+    ValueT*  h_values_ref = new ValueT[num_items];
+
+    // Generate keys first, then values, then compute the expected selection on the host
+    Initialize(entropy_reduction, h_keys, num_items, max_segment);
+    Initialize(entropy_reduction, h_values, num_items, max_segment);
+    int expected_count = Solve(h_keys, h_values, h_keys_ref, h_values_ref, num_items);
+
+    printf("\nPointer %s cub::DeviceSelect::Unique %d items, %d selected (avg run length %.3f), %s %d-byte elements, entropy_reduction %d\n",
+        (BACKEND != CDP) ? "CUB" : "CDP CUB",
+        num_items, expected_count, float(num_items) / expected_count,
+        typeid(KeyT).name(),
+        (int) sizeof(KeyT),
+        entropy_reduction);
+    fflush(stdout);
+
+    // Device copies of the inputs
+    KeyT *d_keys = NULL;
+    ValueT *d_values = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys, sizeof(KeyT) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values, sizeof(ValueT) * num_items));
+
+    // Upload the generated inputs
+    CubDebugExit(cudaMemcpy(d_keys, h_keys, sizeof(KeyT) * num_items, cudaMemcpyHostToDevice));
+    CubDebugExit(cudaMemcpy(d_values, h_values, sizeof(ValueT) * num_items, cudaMemcpyHostToDevice));
+
+    // Verify and time the device implementation
+    Test<BACKEND>(d_keys, d_values, h_keys_ref, h_values_ref, expected_count, num_items);
+
+    // Release host and device buffers (delete[] accepts null, so no guard is needed)
+    delete[] h_keys;
+    delete[] h_values;
+    delete[] h_keys_ref;
+    delete[] h_values_ref;
+    if (d_keys) CubDebugExit(g_allocator.DeviceFree(d_keys));
+    if (d_values) CubDebugExit(g_allocator.DeviceFree(d_values));
+}
+
+
+/**
+ * Test DeviceSelect on iterator type: keys and values both come from counting iterators starting at 0
+ */
+template <
+    Backend         BACKEND,
+    typename        KeyT,
+    typename        ValueT>
+void TestIterator(
+    int             num_items)
+{
+    // Use counting iterators as the key and value inputs
+    CountingInputIterator<KeyT, int> h_keys_in(0);
+    CountingInputIterator<ValueT, int> h_values_in(0);
+
+    // Allocate host arrays for the expected output
+    KeyT*    h_keys_reference   = new KeyT[num_items];
+    ValueT*  h_values_reference = new ValueT[num_items];
+
+    // Compute the expected selection on the host
+    int num_selected = Solve(h_keys_in, h_values_in, h_keys_reference, h_values_reference, num_items);
+
+    printf("\nIterator %s cub::DeviceSelect::Unique %d items, %d selected (avg run length %.3f), %s %d-byte elements\n",
+        (BACKEND == CDP) ? "CDP CUB" : "CUB",
+        num_items, num_selected, float(num_items) / num_selected,
+        typeid(KeyT).name(),
+        (int) sizeof(KeyT));    // size of the type named just above (was sizeof(ValueT)); matches TestPointer
+    fflush(stdout);
+
+    // Run Test
+    Test<BACKEND>(h_keys_in, h_values_in, h_keys_reference, h_values_reference, num_selected, num_items);
+
+    // Cleanup
+    if (h_keys_reference) delete[] h_keys_reference;
+    if (h_values_reference) delete[] h_values_reference;
+}
+
+
+/**
+ * Test different gen modes: max segment lengths 1, 11, 121, ... (below num_items) at entropy reductions 0, 2 and 7
+ */
+template <
+    Backend         BACKEND,
+    typename        KeyT,
+    typename        ValueT>
+void Test(
+    int             num_items)
+{
+    for (long long max_segment = 1; max_segment < num_items; max_segment *= 11)    // 64-bit counter: an int overflowed (UB) on `*= 11` once num_items exceeded 11^8
+    {
+        TestPointer<BACKEND, KeyT, ValueT>(num_items, 0, static_cast<int>(max_segment));    // always < num_items, so the narrowing is lossless
+        TestPointer<BACKEND, KeyT, ValueT>(num_items, 2, static_cast<int>(max_segment));
+        TestPointer<BACKEND, KeyT, ValueT>(num_items, 7, static_cast<int>(max_segment));
+    }
+}
+
+
+/**
+ * Forward to Test with the backend chosen at compile time by TEST_CDP (1: CDP, 0 or undefined: CUB)
+ */
+template <
+    typename        KeyT,
+    typename        ValueT>
+void TestOp(
+    int             num_items)
+{
+#if TEST_CDP == 1
+    Test<CDP, KeyT, ValueT>(num_items);
+#elif TEST_CDP == 0
+    Test<CUB, KeyT, ValueT>(num_items);
+#endif // TEST_CDP
+}
+
+
+/**
+ * Run TestOp<KeyT, ValueT> on num_items, or on a fixed sweep of sizes when num_items is negative
+ */
+template <
+    typename        KeyT,
+    typename        ValueT>
+void Test(
+    int             num_items)
+{
+    // A non-negative size was requested: test exactly that size
+    if (num_items >= 0)
+    {
+        TestOp<KeyT, ValueT>(num_items);
+        return;
+    }
+    // Otherwise walk the built-in sizes in increasing order
+    const int sweep_sizes[] = {0, 1, 100, 10000, 1000000};
+    for (int s = 0; s < 5; ++s)
+    {
+        TestOp<KeyT, ValueT>(sweep_sizes[s]);
+    }
+}
+
+
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+
+/**
+ * Main: parse the command line, initialize the device, then run the tests for every key/value type pair
+ */
+int main(int argc, char** argv)
+{
+    int num_items           = -1;       // negative: Test() sweeps its built-in list of sizes
+    int entropy_reduction   = 0;        // parsed from --entropy, but not passed to any test below
+    int maxseg              = 1000;     // parsed from --maxseg, but not passed to any test below
+
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    args.GetCmdLineArgument("n", num_items);
+    args.GetCmdLineArgument("i", g_timing_iterations);
+    args.GetCmdLineArgument("repeat", g_repeat);
+    args.GetCmdLineArgument("maxseg", maxseg);
+    args.GetCmdLineArgument("entropy", entropy_reduction);
+
+    // Print usage (every optional argument is bracketed and space-separated)
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--n=<input items>] "
+            "[--i=<timing iterations>] "
+            "[--device=<device-id>] "
+            "[--maxseg=<max segment length>] "
+            "[--entropy=<segment length bit entropy reduction rounds>] "
+            "[--v]"
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+    g_device_giga_bandwidth = args.device_giga_bandwidth;
+    printf("\n");
+
+    // %PARAM% TEST_CDP cdp 0:1
+
+    // Test different input types
+    Test<unsigned char, int>(num_items);
+    Test<unsigned char, long>(num_items);
+    Test<unsigned short, int>(num_items);
+    Test<unsigned short, long>(num_items);
+    Test<unsigned int, int>(num_items);
+    Test<unsigned int, long>(num_items);
+    Test<unsigned long long, int>(num_items);
+    Test<unsigned long long, long>(num_items);
+
+    Test<uchar2, uint2>(num_items);
+    Test<uchar2, ulonglong2>(num_items);
+    Test<ushort2, uint2>(num_items);
+    Test<ushort2, ulonglong2>(num_items);
+    Test<uint2, uint2>(num_items);
+    Test<uint2, ulonglong2>(num_items);
+    Test<ulonglong2, uint2>(num_items);
+    Test<ulonglong2, ulonglong2>(num_items);
+
+    Test<uchar4, uint4>(num_items);
+    Test<uchar4, ulonglong4>(num_items);
+    Test<ushort4, uint4>(num_items);
+    Test<ushort4, ulonglong4>(num_items);
+    Test<uint4, uint4>(num_items);
+    Test<uint4, ulonglong4>(num_items);
+    Test<ulonglong4, uint4>(num_items);
+    Test<ulonglong4, ulonglong4>(num_items);
+
+    Test<TestFoo, TestFoo>(num_items);
+    Test<TestFoo, TestBar>(num_items);
+    Test<TestFoo, int>(num_items);
+    Test<TestBar, TestFoo>(num_items);
+    Test<TestBar, TestBar>(num_items);
+    Test<TestBar, int>(num_items);
+
+    return 0;
+}
+
+
+
diff --git a/include/cub/test/test_device_spmv.cu b/include/cub/test/test_device_spmv.cu
new file mode 100644
index 0000000..119e9e4
--- /dev/null
+++ b/include/cub/test/test_device_spmv.cu
@@ -0,0 +1,594 @@
+/******************************************************************************
+ * Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <thrust/device_vector.h>
+#include <thrust/distance.h>
+#include <thrust/host_vector.h>
+#include <thrust/mismatch.h>
+#include <thrust/scan.h>
+
+#include <cub/device/device_spmv.cuh>
+#include <cub/util_debug.cuh>
+
+#include <iostream>
+#include <type_traits>
+#include <typeinfo>
+
+#include "test_util.h"
+
+bool g_verbose = false;
+
+//==============================================================================
+// print_cast returns its argument unchanged, except that the three character
+// types are widened to int so that streams print them as numbers.
+template <typename T>
+T print_cast(T val) { return val; }
+
+// Non-template overloads for the character types; each returns the value
+// converted to int.
+int print_cast(char val) { return int{val}; }
+
+int print_cast(signed char val) { return int{val}; }
+
+int print_cast(unsigned char val) { return int{val}; }
+
+//==============================================================================
+// Writes the elements of vec to out as a ", "-separated list. Every element
+// is passed through print_cast before being streamed.
+// An empty vec produces no output.
+template <typename VectorT>
+void print_vector(std::ostream& out, const VectorT& vec)
+{
+  bool need_separator = false;
+  for (const auto& elem : vec)
+  {
+    // Only elements after the first are preceded by a separator.
+    if (need_separator) { out << ", "; }
+    out << print_cast(elem);
+    need_separator = true;
+  }
+}
+
+//==============================================================================
+// Simple CSR (compressed sparse row) matrix implementation.
+// HostStorage controls whether data is stored on the host or device.
+// Use the host_csr_matrix and device_csr_matrix aliases for code clarity.
+template <typename ValueT, bool HostStorage>
+struct csr_matrix
+{
+  csr_matrix(int num_rows, int num_cols)
+      : m_row_offsets(static_cast<size_t>(num_rows + 1), 0) // num_rows + 1 zeros
+      , m_num_rows(num_rows)
+      , m_num_columns(num_cols)
+  {}
+
+  // host/device conversion constructor: copies all arrays and dimensions
+  explicit csr_matrix(const csr_matrix<ValueT, !HostStorage>& other)
+      : m_values(other.m_values)
+      , m_row_offsets(other.m_row_offsets)
+      , m_column_indices(other.m_column_indices)
+      , m_num_rows(other.m_num_rows)
+      , m_num_columns(other.m_num_columns)
+      , m_num_nonzeros(other.m_num_nonzeros)
+  {}
+
+  // Note that this must append to the values array. Finish filling each row
+  // before adding to the next, and each row's columns must be added in order.
+  // Must call `finalize` once all items are added.
+  void append_value(int row, int col, ValueT value)
+  {
+    ++m_num_nonzeros;
+    ++m_row_offsets[row]; // holds a per-row count until finalize()
+    m_column_indices.push_back(col);
+    m_values.push_back(std::move(value));
+  }
+  // Converts the per-row counts into offsets with an exclusive prefix sum.
+  void finalize()
+  {
+    thrust::exclusive_scan(m_row_offsets.cbegin(),
+                           m_row_offsets.cend(),
+                           m_row_offsets.begin());
+    AssertEquals(m_row_offsets.back(), m_num_nonzeros); // last offset == nnz
+  }
+  // Raw pointers into the underlying (host or device) storage.
+  const ValueT* get_values() const
+  {
+    return thrust::raw_pointer_cast(m_values.data());
+  }
+
+  const int* get_row_offsets() const
+  {
+    return thrust::raw_pointer_cast(m_row_offsets.data());
+  }
+  // Offset of the first nonzero of `row` in the arrays (once finalized).
+  int get_row_offset(int row) const { return m_row_offsets[row]; }
+  // Number of nonzeros in `row`: difference of consecutive row offsets.
+  int get_row_num_nonzero(int row) const
+  {
+    return m_row_offsets[row + 1] - m_row_offsets[row];
+  }
+
+  const int* get_column_indices() const
+  {
+    return thrust::raw_pointer_cast(m_column_indices.data());
+  }
+
+  int get_num_rows() const { return m_num_rows; }
+
+  int get_num_columns() const { return m_num_columns; }
+
+  int get_num_nonzeros() const { return m_num_nonzeros; }
+  // Writes the dimensions, counts and the full contents of all three arrays.
+  void print_internals(std::ostream& out) const
+  {
+    out << (HostStorage ? "host" : "device") << "_csr_matrix"
+        << "(" << m_num_rows << ", " << m_num_columns << ")\n"
+        << " - num_elems:   " << (m_num_rows * m_num_columns) << "\n"
+        << " - num_nonzero: " << m_num_nonzeros << "\n"
+        << " - row_offsets:\n     [";
+    print_vector(out, m_row_offsets);
+    out << "]\n"
+        << " - column_indices:\n     [";
+    print_vector(out, m_column_indices);
+    out << "]\n"
+        << " - values:\n     [";
+    print_vector(out, m_values);
+    out << "]\n";
+  }
+  // Writes "<rows>x<cols>, <nnz>/<elems> (<fill ratio>)"; ratio is 0 if empty.
+  void print_summary(std::ostream& out) const
+  {
+    const int num_elems = m_num_rows * m_num_columns;
+    const float fill_ratio =
+      num_elems == 0
+        ? 0.f
+        : (static_cast<float>(m_num_nonzeros) / static_cast<float>(num_elems));
+
+    out << m_num_rows << "x" << m_num_columns << ", " << m_num_nonzeros << "/"
+        << num_elems << " (" << fill_ratio << ")\n";
+  }
+  // Lets the conversion constructor read the other instantiation's members.
+  friend class csr_matrix<ValueT, !HostStorage>;
+
+private:
+  template <typename VecValueT>
+  using vector_t = cub::detail::conditional_t<HostStorage,
+                                              thrust::host_vector<VecValueT>,
+                                              thrust::device_vector<VecValueT>>;
+  // One entry per nonzero in values/column_indices; num_rows + 1 row offsets.
+  vector_t<ValueT> m_values;
+  vector_t<int> m_row_offsets;
+  vector_t<int> m_column_indices;
+
+  int m_num_rows{0};
+  int m_num_columns{0};
+  int m_num_nonzeros{0};
+};
+
+//==============================================================================
+// Convenience aliases that fix csr_matrix's HostStorage flag.
+template <typename ValueT>
+using host_csr_matrix = csr_matrix<ValueT, true>; // HostStorage == true
+
+template <typename ValueT>
+using device_csr_matrix = csr_matrix<ValueT, false>; // HostStorage == false
+
+//==============================================================================
+// Fuzzy equality: true when |v1 - v2| <= 1e-3 * (|v1| + |v2|) + 1e-2.
+// This mimics the approach used by Thrust's ASSERT_ALMOST_EQUAL checks.
+template <typename ValueT>
+struct fp_almost_equal_functor
+{
+  __host__ __device__ bool operator()(ValueT v1, ValueT v2) const
+  {
+    constexpr double abs_tol = 1e-2;
+    constexpr double rel_tol = 1e-3;
+    const double allowed = abs_tol + rel_tol * (std::fabs(v1) + std::fabs(v2));
+    return allowed >= std::fabs(v1 - v2);
+  }
+};
+
+//==============================================================================
+// Compare the reference (host) and cub (device) output vectors using the
+// fuzzy floating point check. Returns true when no element pair differs;
+// otherwise prints the first mismatch to std::cerr and returns false.
+template <typename ValueT>
+bool compare_results(std::true_type /* is_fp */,
+                     const thrust::host_vector<ValueT>& h_vec1,
+                     const thrust::device_vector<ValueT>& d_vec2)
+{
+  thrust::device_vector<ValueT> d_vec1(h_vec1);
+  auto err = thrust::mismatch(d_vec1.cbegin(),
+                              d_vec1.cend(),
+                              d_vec2.cbegin(),
+                              fp_almost_equal_functor<ValueT>{});
+  if (err.first == d_vec1.cend() || err.second == d_vec2.cend())
+  {
+    return true;
+  }
+
+  // Copy the cub output to the host so the offending pair can be printed.
+  thrust::host_vector<ValueT> h_vec2(d_vec2);
+  const auto idx = thrust::distance(d_vec1.cbegin(), err.first);
+  std::cerr << "Mismatch at position " << idx << ": "
+            << print_cast(ValueT{h_vec1[idx]}) << " vs "
+            << print_cast(ValueT{h_vec2[idx]}) << std::endl;
+  return false;
+}
+
+// Exact-comparison overload of compare_results for non-floating-point ValueT.
+// Returns true on a match; otherwise prints the first differing pair and
+// returns false.
+template <typename ValueT>
+bool compare_results(std::false_type /* is_fp */,
+                     const thrust::host_vector<ValueT>& h_vec1,
+                     const thrust::device_vector<ValueT>& d_vec2)
+{
+  thrust::device_vector<ValueT> d_ref(h_vec1);
+  const auto ref_end = d_ref.cend();
+  auto diff = thrust::mismatch(d_ref.cbegin(), ref_end, d_vec2.cbegin());
+  const bool all_equal = diff.first == ref_end || diff.second == d_vec2.cend();
+  if (!all_equal)
+  {
+    thrust::host_vector<ValueT> h_actual(d_vec2);
+    const auto idx = thrust::distance(d_ref.cbegin(), diff.first);
+    std::cerr << "Mismatch at position " << idx << ": "
+              << print_cast(ValueT{h_vec1[idx]}) << " vs "
+              << print_cast(ValueT{h_actual[idx]}) << std::endl;
+  }
+  return all_equal;
+}
+
+//==============================================================================
+// Generate a random host_csr_matrix<ValueT> with the specified dimensions.
+// target_fill_ratio is the target fraction (0 -> 1) of non-zero elements; the
+// actual fill of the output may be more or less.
+template <typename ValueT>
+host_csr_matrix<ValueT> make_random_csr_matrix(int num_rows,
+                                               int num_cols,
+                                               float target_fill_ratio)
+{
+  host_csr_matrix<ValueT> mat{num_rows, num_cols};
+
+  for (int row = 0; row < num_rows; ++row)
+  {
+    for (int col = 0; col < num_cols; ++col)
+    {
+      const bool is_non_zero = RandomValue<float>(1.f) < target_fill_ratio;
+      if (!is_non_zero)
+      {
+        continue;
+      }
+
+      ValueT value{};
+      if (std::is_floating_point<ValueT>::value)
+      {
+        // Keep fp numbers somewhat small, from -50 -> 50; otherwise we run
+        // into issues with nans/infs
+        value = RandomValue(static_cast<ValueT>(100)) - static_cast<ValueT>(50);
+      }
+      else
+      {
+        InitValue(RANDOM, value);
+      }
+      mat.append_value(row, col, value);
+    }
+  }
+
+  mat.finalize();
+
+  const int num_elements = num_rows * num_cols;
+  const float actual_fill_ratio = // 0 for an empty matrix, not 0/0 (NaN)
+    num_elements == 0 ? 0.f
+                      : (static_cast<float>(mat.get_num_nonzeros()) /
+                         static_cast<float>(num_elements));
+
+  if (g_verbose)
+  {
+    printf("Created host_csr_matrix<%s>(%d, %d)\n"
+           " - NumElements: %d\n"
+           " - NumNonZero:  %d\n"
+           " - Target fill: %0.2f%%\n"
+           " - Actual fill: %0.2f%%\n",
+           typeid(ValueT).name(),
+           num_rows,
+           num_cols,
+           num_elements,
+           mat.get_num_nonzeros(),
+           target_fill_ratio * 100.f,  // fraction -> percent for the '%' suffix
+           actual_fill_ratio * 100.f);
+  }
+
+  return mat;
+}
+
+//==============================================================================
+// Create a host vector holding len random values.
+template <typename ValueT>
+thrust::host_vector<ValueT> make_random_vector(int len)
+{
+  constexpr bool is_fp = std::is_floating_point<ValueT>::value;
+  thrust::host_vector<ValueT> result(len);
+  for (int i = 0; i < len; ++i)
+  {
+    ValueT& slot = result[i];
+    if (!is_fp)
+    {
+      InitValue(RANDOM, slot);
+      continue;
+    }
+    // Keep fp numbers somewhat small; otherwise we run into nans/infs
+    slot = RandomValue(static_cast<ValueT>(100)) - static_cast<ValueT>(50);
+  }
+  return result;
+}
+
+//==============================================================================
+// Serial host reference for y = Ax. Writes y[row] for every row of a; y is
+// left untouched when the matrix has no rows or no columns.
+template <typename ValueT>
+void compute_reference_solution(const host_csr_matrix<ValueT>& a,
+                                const thrust::host_vector<ValueT>& x,
+                                thrust::host_vector<ValueT>& y)
+{
+  const int num_rows = a.get_num_rows();
+  if (num_rows == 0 || a.get_num_columns() == 0)
+  {
+    return;
+  }
+
+  const int* const column_indices = a.get_column_indices();
+  const ValueT* const values      = a.get_values();
+  for (int row = 0; row < num_rows; ++row)
+  {
+    const int first = a.get_row_offset(row); // nonzeros of row: [first, last)
+    const int last  = first + a.get_row_num_nonzero(row);
+    ValueT accum{};
+    for (int nz = first; nz < last; ++nz)
+    {
+      accum += values[nz] * x[column_indices[nz]];
+    }
+    y[row] = accum;
+  }
+}
+
+//==============================================================================
+// cub::DeviceSpmv::CsrMV y = Ax computation.
+// CsrMV is invoked twice with identical arguments: first with a null temp
+// storage pointer to obtain temp_storage_bytes, then with a buffer of that
+// size. Both results are checked with CubDebugExit.
+template <typename ValueT>
+void compute_cub_solution(const device_csr_matrix<ValueT>& a,
+                          const thrust::device_vector<ValueT>& x,
+                          thrust::device_vector<ValueT>& y)
+{
+  const ValueT* d_x = thrust::raw_pointer_cast(x.data());
+  ValueT* d_y       = thrust::raw_pointer_cast(y.data());
+
+  std::size_t temp_storage_bytes{};
+  // Shared by both invocations; only the temp storage pointer differs.
+  const auto run_spmv = [&](void* d_temp_storage) {
+    return cub::DeviceSpmv::CsrMV(d_temp_storage,
+                                  temp_storage_bytes,
+                                  a.get_values(),
+                                  a.get_row_offsets(),
+                                  a.get_column_indices(),
+                                  d_x,
+                                  d_y,
+                                  a.get_num_rows(),
+                                  a.get_num_columns(),
+                                  a.get_num_nonzeros());
+  };
+
+  // First call: size query.
+  CubDebugExit(run_spmv(nullptr));
+
+  // Second call: run with the allocated temporary storage.
+  thrust::device_vector<char> temp_storage(temp_storage_bytes);
+  CubDebugExit(run_spmv(thrust::raw_pointer_cast(temp_storage.data())));
+}
+
+//==============================================================================
+// Run y = Ax through both the serial reference and cub::DeviceSpmv for the
+// given host inputs and assert that the two result vectors agree.
+template <typename ValueT>
+void test_spmv(const host_csr_matrix<ValueT>& h_a,
+               const thrust::host_vector<ValueT>& h_x)
+{
+  if (!g_verbose)
+  {
+    h_a.print_summary(std::cout);
+  }
+  else
+  {
+    std::cout << "Testing cub::DeviceSpmv on inputs:\n";
+    h_a.print_internals(std::cout);
+    std::cout << "x vector:\n  [";
+    print_vector(std::cout, h_x);
+    std::cout << "]" << std::endl;
+  }
+
+  // Host-side reference result.
+  thrust::host_vector<ValueT> h_y(h_a.get_num_rows());
+  compute_reference_solution(h_a, h_x, h_y);
+
+  // Device-side result, computed from device copies of the same inputs.
+  const device_csr_matrix<ValueT> d_a(h_a);
+  const thrust::device_vector<ValueT> d_x(h_x);
+  thrust::device_vector<ValueT> d_y(d_a.get_num_rows());
+  compute_cub_solution(d_a, d_x, d_y);
+
+  if (g_verbose)
+  {
+    std::cout << "reference output:\n  [";
+    print_vector(std::cout, h_y);
+    std::cout << "]\n";
+    const thrust::host_vector<ValueT> cub_y(d_y);
+    std::cout << "cub::DeviceSpmv output:\n  [";
+    print_vector(std::cout, cub_y);
+    std::cout << "]" << std::endl;
+  }
+
+  AssertTrue(compare_results(std::is_floating_point<ValueT>{}, h_y, d_y));
+}
+
+//==============================================================================
+// Test example from cub::DeviceSpmv documentation: a 9x9 matrix with 24
+// nonzeros, all equal to 1, multiplied by a vector of nine 1s. The result is
+// checked against the serial reference by test_spmv.
+template <typename ValueT>
+void test_doc_example()
+{
+  std::cout << "\n\ntest_doc_example<" << typeid(ValueT).name() << ">()"
+            << std::endl;
+
+  constexpr int dim = 9;
+
+  // (row, column) of every nonzero, listed row by row with ascending columns
+  // as csr_matrix::append_value requires.
+  static const int coords[][2] = {
+    {0, 1}, {0, 3},
+    {1, 0}, {1, 2}, {1, 4},
+    {2, 1}, {2, 5},
+    {3, 0}, {3, 4}, {3, 6},
+    {4, 1}, {4, 3}, {4, 5}, {4, 7},
+    {5, 2}, {5, 4}, {5, 8},
+    {6, 3}, {6, 7},
+    {7, 4}, {7, 6}, {7, 8},
+    {8, 5}, {8, 7},
+  };
+
+  host_csr_matrix<ValueT> h_a(dim, dim);
+  for (const auto& rc : coords)
+  {
+    h_a.append_value(rc[0], rc[1], ValueT{1});
+  }
+  h_a.finalize();
+
+  // With x all ones and every nonzero equal to one, each y[row] is simply
+  // the number of nonzeros in that row.
+  thrust::host_vector<ValueT> h_x(dim, ValueT{1});
+
+  test_spmv(h_a, h_x);
+}
+
+//==============================================================================
+// Build a random rows x cols matrix and a random x of length cols; test them.
+template <typename ValueT>
+void test_random(int rows, int cols, float target_fill_ratio)
+{
+  std::cout << "\n\ntest_random<" << typeid(ValueT).name() << ">(" << rows
+            << ", " << cols << ", " << target_fill_ratio << ")" << std::endl;
+
+  // The matrix is generated before the vector, as two separate statements.
+  const auto h_matrix =
+    make_random_csr_matrix<ValueT>(rows, cols, target_fill_ratio);
+  const auto h_vector = make_random_vector<ValueT>(cols);
+  test_spmv(h_matrix, h_vector);
+}
+
+//==============================================================================
+// Dispatch many random SpMV tests over a variety of parameters: degenerate
+// shapes first, then power-of-two dimensions (plus nearby non-power-of-two
+// ones) at several fill ratios.
+template <typename ValueT>
+void test_random()
+{
+  // Degenerate shapes: no rows and/or no columns.
+  test_random<ValueT>(0, 0, 1.f);
+  test_random<ValueT>(0, 1, 1.f);
+  test_random<ValueT>(1, 0, 1.f);
+
+  constexpr int dim_min       = 1;
+  constexpr int dim_max       = 10000;
+  constexpr int max_num_elems = 100000;
+
+  constexpr float ratio_min  = 0.f;
+  constexpr float ratio_max  = 1.1f; // a lil over to account for fp errors
+  constexpr float ratio_step = 0.3334f;
+
+  for (int rows = dim_min; rows < dim_max; rows *= 2)
+  {
+    for (int cols = dim_min; cols < dim_max; cols *= 2)
+    {
+      // Shapes whose power-of-two element count hits the budget are skipped.
+      const bool small_enough = rows * cols < max_num_elems;
+      for (float ratio = ratio_min; small_enough && ratio < ratio_max;
+           ratio += ratio_step)
+      {
+        test_random<ValueT>(rows, cols, ratio);
+        // Test nearby non-power-of-two dims:
+        test_random<ValueT>(rows + 97, cols + 83, ratio);
+      }
+    }
+  }
+}
+
+//==============================================================================
+// Dispatch all SpMV tests (documentation example + random sweep) for ValueT.
+template <typename ValueT>
+void test_type()
+{
+  test_doc_example<ValueT>(); // fixed 9x9 example
+  test_random<ValueT>();      // sweep over shapes and fill ratios
+}
+
+//==============================================================================
+// Run test_type for every tested ValueT (floating point and signed integers).
+void test_types()
+{
+  test_type<float>();
+  test_type<double>();
+  test_type<signed char>(); // narrowest integer type exercised
+  test_type<int>();
+  test_type<long long>();
+}
+
+int main(int argc, char** argv)
+{
+  // Parse the command line; the "v" flag switches on verbose output
+  CommandLineArgs args(argc, argv);
+  g_verbose = args.CheckCmdLineFlag("v");
+
+  // Print usage and exit successfully when "help" is given
+  if (args.CheckCmdLineFlag("help"))
+  {
+    printf("%s "
+           "[--device=<device-id>] "
+           "[--v] verbose"
+           "\n",
+           argv[0]);
+    exit(0);
+  }
+  // Device setup from the parsed arguments; the result goes to CubDebugExit
+  CubDebugExit(args.DeviceInit());
+  // Run all tests; falling off the end of main returns 0
+  test_types();
+}
diff --git a/include/cub/test/test_device_three_way_partition.cu b/include/cub/test/test_device_three_way_partition.cu
new file mode 100644
index 0000000..27fdf00
--- /dev/null
+++ b/include/cub/test/test_device_three_way_partition.cu
@@ -0,0 +1,594 @@
+/******************************************************************************
+ * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <cub/device/device_partition.cuh>
+#include <test_util.h>
+
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include <thrust/partition.h>
+#include <thrust/random.h>
+#include <thrust/reduce.h>
+#include <thrust/shuffle.h>
+#include <thrust/tabulate.h>
+
+using namespace cub;
+
+// Unary predicate: true for values strictly less than the stored `compare`.
+template <typename T>
+struct LessThan
+{
+  T compare; // right-hand side of the comparison
+
+  // Constructed on the host; the call operator is device code.
+  explicit __host__ LessThan(T bound) : compare(bound) {}
+
+  __device__ bool operator()(const T &a) const
+  {
+    return a < compare;
+  }
+};
+
+// Unary predicate: true for values equal to the stored `compare`.
+template <typename T>
+struct EqualTo
+{
+  T compare; // value every argument is compared against
+
+  // Constructed on the host; the call operator is device code.
+  explicit __host__ EqualTo(T target) : compare(target) {}
+
+  __device__ bool operator()(const T &a) const
+  {
+    return a == compare;
+  }
+};
+
+// Unary predicate: true for values greater than or equal to `compare`.
+template <typename T>
+struct GreaterOrEqual
+{
+  T compare; // right-hand side of the comparison
+
+  // Constructed on the host; the call operator is device code.
+  explicit __host__ GreaterOrEqual(T bound) : compare(bound) {}
+
+  __device__ bool operator()(const T &a) const
+  {
+    return a >= compare;
+  }
+};
+
+// Smoke test for an empty problem: calls cub::DevicePartition::If twice (with
+// a null temp storage pointer, then with an allocated one) using
+// num_items == 0 and null iterators; both results go through CubDebugExit.
+template <typename T>
+void TestEmpty()
+{
+  const int num_items = 0;
+
+  // Nothing is allocated: every input/output iterator is a null pointer.
+  T *in = nullptr;
+  T *d_first_part_out = nullptr;
+  T *d_second_part_out = nullptr;
+  T *d_unselected_out = nullptr;
+  T *d_num_selected_out = nullptr;
+
+  LessThan<T> le(T{0});
+  GreaterOrEqual<T> ge(T{1});
+
+  std::size_t temp_storage_size {};
+
+  // Both invocations share every argument except the temp storage pointer.
+  const auto partition = [&](void *d_temp_storage) {
+    return cub::DevicePartition::If(d_temp_storage,
+                                    temp_storage_size,
+                                    in,
+                                    d_first_part_out,
+                                    d_second_part_out,
+                                    d_unselected_out,
+                                    d_num_selected_out,
+                                    num_items,
+                                    le,
+                                    ge);
+  };
+
+  // First call: size query (null temp storage).
+  CubDebugExit(partition(nullptr));
+
+  thrust::device_vector<std::uint8_t> temp_storage(temp_storage_size);
+  CubDebugExit(partition(thrust::raw_pointer_cast(temp_storage.data())));
+}
+
+// Outputs of one three-way partition run: the three output buffers (each
+// allocated with room for all num_items) and the number of items that were
+// placed in each of them.
+template <typename T>
+class ThreeWayPartitionResult
+{
+public:
+  ThreeWayPartitionResult() = delete;
+  ThreeWayPartitionResult(int num_items)
+    : first_part(num_items)
+    , second_part(num_items)
+    , unselected(num_items)
+  {}
+
+  thrust::device_vector<T> first_part;
+  thrust::device_vector<T> second_part;
+  thrust::device_vector<T> unselected;
+
+  int num_items_in_first_part {};
+  int num_items_in_second_part {};
+  int num_unselected_items {};
+
+  // True when any count or any buffer differs. Const-qualified so that const
+  // results can be compared; members are compared directly (no std::tie).
+  bool operator!=(const ThreeWayPartitionResult<T> &other) const
+  {
+    return num_items_in_first_part != other.num_items_in_first_part ||
+           num_items_in_second_part != other.num_items_in_second_part ||
+           num_unselected_items != other.num_unselected_items ||
+           first_part != other.first_part ||
+           second_part != other.second_part ||
+           unselected != other.unselected;
+  }
+};
+
+// Partition `in` three ways with cub::DevicePartition::If using the given
+// selectors. Returns the three output buffers together with the item counts:
+// the first two are read back from the device, the unselected count is the
+// remainder.
+template <
+  typename FirstPartSelectionOp,
+  typename SecondPartSelectionOp,
+  typename T>
+ThreeWayPartitionResult<T> CUBPartition(
+  FirstPartSelectionOp first_selector,
+  SecondPartSelectionOp second_selector,
+  thrust::device_vector<T> &in)
+{
+  const int num_items = static_cast<int>(in.size());
+  ThreeWayPartitionResult<T> result(num_items);
+
+  // Two device counters: [0] first part size, [1] second part size.
+  thrust::device_vector<int> num_selected_out(2);
+
+  T *d_in = thrust::raw_pointer_cast(in.data());
+  T *d_first_part_out = thrust::raw_pointer_cast(result.first_part.data());
+  T *d_second_part_out = thrust::raw_pointer_cast(result.second_part.data());
+  T *d_unselected_out = thrust::raw_pointer_cast(result.unselected.data());
+  int *d_num_selected_out = thrust::raw_pointer_cast(num_selected_out.data());
+
+  std::size_t temp_storage_size {};
+  const auto partition = [&](void *d_temp_storage) {
+    return cub::DevicePartition::If(d_temp_storage,
+                                    temp_storage_size,
+                                    d_in,
+                                    d_first_part_out,
+                                    d_second_part_out,
+                                    d_unselected_out,
+                                    d_num_selected_out,
+                                    num_items,
+                                    first_selector,
+                                    second_selector);
+  };
+
+  // First call: size query (null temp storage).
+  CubDebugExit(partition(nullptr));
+
+  // Second call: the actual partition, using the allocated temp storage.
+  thrust::device_vector<std::uint8_t> temp_storage(temp_storage_size);
+  CubDebugExit(partition(thrust::raw_pointer_cast(temp_storage.data())));
+
+  // Copy both counters to the host.
+  const thrust::host_vector<int> h_num_selected_out(num_selected_out);
+  const int num_first = h_num_selected_out[0];
+  const int num_second = h_num_selected_out[1];
+
+  result.num_items_in_first_part = num_first;
+  result.num_items_in_second_part = num_second;
+  result.num_unselected_items = num_items - num_first - num_second;
+
+  return result;
+}
+
+// Reference three-way partition built from two thrust::partition_copy passes:
+// pass 1 splits off the first part, pass 2 splits what pass 1 rejected.
+template <
+  typename FirstPartSelectionOp,
+  typename SecondPartSelectionOp,
+  typename T>
+ThreeWayPartitionResult<T> ThrustPartition(
+  FirstPartSelectionOp first_selector,
+  SecondPartSelectionOp second_selector,
+  thrust::device_vector<T> &in)
+{
+  const int num_items = static_cast<int>(in.size());
+  ThreeWayPartitionResult<T> result(num_items);
+
+  // Pass 1: in -> (first_part, rejected).
+  thrust::device_vector<T> rejected(num_items);
+  const auto pass1 = thrust::partition_copy(in.begin(),
+                                            in.end(),
+                                            result.first_part.begin(),
+                                            rejected.begin(),
+                                            first_selector);
+  result.num_items_in_first_part = static_cast<int>(
+    thrust::distance(result.first_part.begin(), pass1.first));
+
+  // Pass 2: rejected -> (second_part, unselected).
+  const auto rejected_end =
+    rejected.begin() + (num_items - result.num_items_in_first_part);
+  const auto pass2 = thrust::partition_copy(rejected.begin(),
+                                            rejected_end,
+                                            result.second_part.begin(),
+                                            result.unselected.begin(),
+                                            second_selector);
+  result.num_items_in_second_part = static_cast<int>(
+    thrust::distance(result.second_part.begin(), pass2.first));
+  result.num_unselected_items = static_cast<int>(
+    thrust::distance(result.unselected.begin(), pass2.second));
+
+  return result;
+}
+
+// Partitions thrust::sequence input with `x < 0` as the first selector and
+// `x >= num_items / 2` as the second, then checks CUB against the Thrust
+// reference and that the first part came out empty.
+template <typename T>
+void TestEmptyFirstPart(int num_items)
+{
+  thrust::device_vector<T> in(num_items);
+  thrust::sequence(in.begin(), in.end());
+
+  LessThan<T> le(T{0});
+  GreaterOrEqual<T> ge(static_cast<T>(num_items / 2));
+
+  auto cub_out = CUBPartition(le, ge, in);
+  auto thrust_out = ThrustPartition(le, ge, in);
+
+  AssertEquals(cub_out, thrust_out);
+  AssertEquals(cub_out.num_items_in_first_part, 0);
+}
+
+// Partitions thrust::sequence input with `x >= num_items / 2` as the first
+// selector and `x < 0` as the second, then checks CUB against the Thrust
+// reference and that the second part came out empty.
+template <typename T>
+void TestEmptySecondPart(int num_items)
+{
+  thrust::device_vector<T> in(num_items);
+  thrust::sequence(in.begin(), in.end());
+
+  GreaterOrEqual<T> ge(static_cast<T>(num_items / 2));
+  LessThan<T> le(T{0}); // empty set for unsigned types
+
+  auto cub_out = CUBPartition(ge, le, in);
+  auto thrust_out = ThrustPartition(ge, le, in);
+
+  AssertEquals(cub_out, thrust_out);
+  AssertEquals(cub_out.num_items_in_second_part, 0);
+}
+
+// Uses complementary selectors (`x < pivot`, `x >= pivot`) so every item is
+// selected by one of them; checks CUB against Thrust and a zero remainder.
+template <typename T>
+void TestEmptyUnselectedPart(int num_items)
+{
+  thrust::device_vector<T> in(num_items);
+  thrust::sequence(in.begin(), in.end());
+
+  const T pivot = static_cast<T>(num_items / 2);
+  LessThan<T> le(pivot);
+  GreaterOrEqual<T> ge(pivot);
+
+  auto cub_out = CUBPartition(le, ge, in);
+  auto thrust_out = ThrustPartition(le, ge, in);
+  AssertEquals(cub_out, thrust_out);
+  AssertEquals(cub_out.num_unselected_items, 0);
+}
+
+// Uses `x < 0` for both selectors on thrust::sequence input; checks CUB
+// against Thrust and that all num_items items end up unselected.
+template <typename T>
+void TestUnselectedOnly(int num_items)
+{
+  thrust::device_vector<T> in(num_items);
+  thrust::sequence(in.begin(), in.end());
+
+  LessThan<T> le(T{0}); // empty set for unsigned types
+
+  auto cub_out = CUBPartition(le, le, in);
+  auto thrust_out = ThrustPartition(le, le, in);
+  AssertEquals(cub_out, thrust_out);
+  AssertEquals(cub_out.num_unselected_items, num_items);
+  AssertEquals(cub_out.num_items_in_first_part, 0);
+  AssertEquals(cub_out.num_items_in_second_part, 0);
+}
+
+// Key/value record usable on host and device. Its ordering operators look at
+// the key only.
+template <typename Key,
+          typename Value>
+struct Pair
+{
+  Key key;
+  Value value;
+
+  // Value-initializes both members.
+  __host__ __device__ Pair() : Pair(Key{}, Value{}) {}
+
+  // Not explicit: TestStability initializes Pair objects directly from a
+  // key. The value member is value-initialized.
+  __host__ __device__ Pair(Key key) : Pair(key, Value{}) {}
+
+  __host__ __device__ Pair(Key key, Value value)
+    : key(key)
+    , value(value)
+  {}
+
+  // Ordering compares keys only; value is ignored.
+  __host__ __device__ bool operator<(const Pair &b) const
+  {
+    return key < b.key;
+  }
+
+  __host__ __device__ bool operator>=(const Pair &b) const
+  {
+    return key >= b.key;
+  }
+};
+
+// Two pairs are equal only when both the keys and the values match.
+// (The value comparison previously compared lhs.value with itself, which is
+// always true for the integral values used here, so values were never checked.)
+template <typename Key, typename Value>
+__device__ __host__ bool operator==(
+  const Pair<Key, Value> &lhs,
+  const Pair<Key, Value> &rhs)
+{
+  return lhs.key == rhs.key && lhs.value == rhs.value;
+}
+
+// Functor that turns an index into a Pair whose key is the index cast to
+// ValueT and whose value is the index itself (converted to std::uint32_t).
+template <typename ValueT>
+struct CountToPair
+{
+  using PairT = Pair<ValueT, std::uint32_t>;
+
+  template <typename OffsetT>
+  __device__ __host__ PairT operator()(OffsetT id)
+  {
+    const ValueT key = static_cast<ValueT>(id);
+    return PairT(key, id);
+  }
+};
+
+// Partitions key/value pairs and compares the CUB result against the
+// ThrustPartition result. Each pair's value is the element's input index.
+template <typename KeyT>
+void TestStability(int num_items)
+{
+  using PairT = Pair<KeyT, std::uint32_t>;
+
+  // Element i becomes Pair(key = i, value = i) through CountToPair.
+  thrust::device_vector<PairT> in(num_items);
+  thrust::tabulate(in.begin(), in.end(), CountToPair<KeyT>{});
+
+  // Thresholds at one third and two thirds of the input size. Each bare key is
+  // implicitly converted to a Pair by the single-argument constructor.
+  PairT first_threshold  = static_cast<KeyT>(num_items / 3);
+  PairT second_threshold = static_cast<KeyT>(2 * num_items / 3);
+
+  LessThan<PairT> first_selector(first_threshold);
+  GreaterOrEqual<PairT> second_selector(second_threshold);
+
+  auto cub_result    = CUBPartition(first_selector, second_selector, in);
+  auto thrust_result = ThrustPartition(first_selector, second_selector, in);
+
+  AssertEquals(cub_result, thrust_result);
+}
+
+// Runs cub::DevicePartition::If on a shuffled three-valued input. First-part
+// items are written forward from the front of one buffer, unselected items are
+// written through a reverse iterator over that same buffer, and second-part
+// items go to a discard iterator.
+template <typename T>
+void TestReverseIterator(int num_items)
+{
+  const int first_part_count = num_items / 3;
+  const int unselected_count = 2 * num_items / 3;
+
+  T first_val {0};
+  T second_val {1};
+  T unselected_val {2};
+
+  // Every item starts as the second-part value; the leading ranges are then
+  // overwritten with the first-part and unselected values before shuffling.
+  thrust::device_vector<T> in(num_items, second_val);
+  thrust::fill_n(in.begin(), first_part_count, first_val);
+  thrust::fill_n(in.begin() + first_part_count,
+                 unselected_count,
+                 unselected_val);
+
+  thrust::shuffle(in.begin(), in.end(), thrust::default_random_engine{});
+
+  thrust::device_vector<T> first_and_unselected_part(num_items);
+
+  EqualTo<T> first_selector{first_val};
+  EqualTo<T> second_selector{second_val};
+
+  thrust::device_vector<int> num_selected_out(2);
+
+  std::size_t temp_storage_bytes {};
+
+  // The same call is issued twice: once with null storage to obtain the
+  // required temporary size, then again with the allocation to do the work.
+  auto partition = [&](void *storage) {
+    return cub::DevicePartition::If(storage,
+                                    temp_storage_bytes,
+                                    in.cbegin(),
+                                    first_and_unselected_part.begin(),
+                                    thrust::make_discard_iterator(),
+                                    first_and_unselected_part.rbegin(),
+                                    num_selected_out.begin(),
+                                    num_items,
+                                    first_selector,
+                                    second_selector);
+  };
+
+  CubDebugExit(partition(nullptr));
+
+  thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
+  std::uint8_t *d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
+
+  CubDebugExit(partition(d_temp_storage));
+
+  // Copy of the counters (this is itself a device_vector).
+  thrust::device_vector<int> num_selected_copy(num_selected_out);
+
+  AssertEquals(num_selected_copy[0], first_part_count);
+
+  // Unselected items occupy the tail of the buffer, seen from the back.
+  AssertEquals(thrust::count(first_and_unselected_part.rbegin(),
+                             first_and_unselected_part.rbegin() +
+                               unselected_count,
+                             unselected_val),
+               unselected_count);
+
+  // First-part items occupy the head of the buffer.
+  AssertEquals(thrust::count(first_and_unselected_part.begin(),
+                             first_and_unselected_part.begin() +
+                               first_part_count,
+                             first_val),
+               first_part_count);
+}
+
+// Runs cub::DevicePartition::If with all three outputs aimed at one buffer:
+// the first part at the front, the second part right after it, and the
+// unselected items written through a reverse iterator from the back.
+template <typename T>
+void TestSingleOutput(int num_items)
+{
+  const int first_part_count  = num_items / 3;
+  const int unselected_count  = 2 * num_items / 3;
+  const int second_part_count = num_items - first_part_count -
+                                unselected_count;
+
+  T first_val{0};
+  T second_val{1};
+  T unselected_val{2};
+
+  // Every item starts as the second-part value; the leading ranges are then
+  // overwritten with the first-part and unselected values before shuffling.
+  thrust::device_vector<T> in(num_items, second_val);
+  thrust::fill_n(in.begin(), first_part_count, first_val);
+  thrust::fill_n(in.begin() + first_part_count,
+                 unselected_count,
+                 unselected_val);
+
+  thrust::shuffle(in.begin(), in.end(), thrust::default_random_engine{});
+
+  thrust::device_vector<T> output(num_items);
+
+  EqualTo<T> first_selector{first_val};
+  EqualTo<T> second_selector{second_val};
+
+  thrust::device_vector<int> num_selected_out(2);
+
+  std::size_t temp_storage_bytes{};
+
+  // The same call is issued twice: once with null storage to obtain the
+  // required temporary size, then again with the allocation to do the work.
+  auto partition = [&](void *storage) {
+    return cub::DevicePartition::If(storage,
+                                    temp_storage_bytes,
+                                    in.cbegin(),
+                                    output.begin(),
+                                    output.begin() + first_part_count,
+                                    output.rbegin(),
+                                    num_selected_out.begin(),
+                                    num_items,
+                                    first_selector,
+                                    second_selector);
+  };
+
+  CubDebugExit(partition(nullptr));
+
+  thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
+  std::uint8_t *d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
+
+  CubDebugExit(partition(d_temp_storage));
+
+  // Copy of the counters (this is itself a device_vector).
+  thrust::device_vector<int> num_selected_copy(num_selected_out);
+
+  AssertEquals(num_selected_copy[0], first_part_count);
+  AssertEquals(num_selected_copy[1], second_part_count);
+
+  // Unselected items: tail of the buffer, seen from the back.
+  AssertEquals(thrust::count(output.rbegin(),
+                             output.rbegin() + unselected_count,
+                             unselected_val),
+               unselected_count);
+
+  // First part: head of the buffer.
+  AssertEquals(thrust::count(output.begin(),
+                             output.begin() + first_part_count,
+                             first_val),
+               first_part_count);
+
+  // Second part: immediately after the first part.
+  AssertEquals(thrust::count(output.begin() + first_part_count,
+                             output.begin() + first_part_count +
+                               second_part_count,
+                             second_val),
+               second_part_count);
+}
+
+// Runs every test that takes an input size, in a fixed order, for one value
+// of num_items and one element type T.
+template <typename T>
+void TestNumItemsDependent(int num_items)
+{
+  TestStability<T>(num_items);
+  TestEmptyFirstPart<T>(num_items);
+  TestEmptySecondPart<T>(num_items);
+  TestEmptyUnselectedPart<T>(num_items);
+  TestUnselectedOnly<T>(num_items);
+  TestReverseIterator<T>(num_items);
+  TestSingleOutput<T>(num_items);
+}
+
+// Sweeps the size-dependent tests over powers of four below one million
+// (1, 4, 16, ...), running each size both as-is and with 31 added.
+template <typename T>
+void TestNumItemsDependent()
+{
+  constexpr int size_limit = 1000000;
+
+  int num_items = 1;
+  while (num_items < size_limit)
+  {
+    TestNumItemsDependent<T>(num_items);
+    TestNumItemsDependent<T>(num_items + 31);
+
+    num_items *= 4;
+  }
+}
+
+// Runs the full test set for element type T: TestEmpty<T>() first, then the
+// size-dependent tests over the range of sizes chosen by
+// TestNumItemsDependent<T>().
+template <typename T>
+void Test()
+{
+  TestEmpty<T>();
+  TestNumItemsDependent<T>();
+}
+
+// Test driver: initializes the device from the command-line arguments and runs
+// the test set for the enabled element types. Returns 0 when it reaches the end.
+int main(int argc, char **argv)
+{
+  CommandLineArgs args(argc, argv);
+
+  // Initialize device
+  CubDebugExit(args.DeviceInit());
+
+  // The 8-bit and 16-bit instantiations are disabled; see NVBug 4136386.
+  // Test<std::uint8_t>();
+  // Test<std::uint16_t>();
+  Test<std::uint32_t>();
+  Test<std::uint64_t>();
+
+  return 0;
+}
diff --git a/include/cub/test/test_grid_barrier.cu b/include/cub/test/test_grid_barrier.cu
new file mode 100644
index 0000000..711a5f5
--- /dev/null
+++ b/include/cub/test/test_grid_barrier.cu
@@ -0,0 +1,152 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Test evaluation for software global barrier throughput
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <stdio.h>
+
+#include <cub/grid/grid_barrier.cuh>
+
+#include "test_util.h"
+
+using namespace cub;
+
+
+//---------------------------------------------------------------------
+// Test kernels
+//---------------------------------------------------------------------
+
+/**
+ * Kernel that calls GridBarrier::Sync() once per requested iteration.
+ * A non-positive iteration count performs no barriers.
+ */
+__global__ void Kernel(
+    GridBarrier global_barrier,
+    int iterations)
+{
+    for (int remaining = iterations; remaining > 0; --remaining)
+    {
+        global_barrier.Sync();
+    }
+}
+
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+
+/**
+ * Main
+ *
+ * Times a kernel that executes a configurable number of software global
+ * barriers and prints the total and average elapsed time.
+ *
+ * Options: --device, --i (iterations), --grid-size, --block-size, --help.
+ * Returns the cudaError_t reported by the final cudaDeviceSynchronize().
+ */
+int main(int argc, char** argv)
+{
+    cudaError_t retval = cudaSuccess;
+
+    // Defaults
+    int iterations = 10000;
+    int block_size = 128;
+    int grid_size = -1;     // -1 means "derive from occupancy" (see below)
+
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+
+    // Get args
+    args.GetCmdLineArgument("i", iterations);
+    args.GetCmdLineArgument("grid-size", grid_size);
+    args.GetCmdLineArgument("block-size", block_size);
+
+    // Print usage
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--device=<device-id>] "
+            "[--i=<iterations>] "
+            "[--grid-size=<grid-size>] "
+            "[--block-size=<block-size>] "
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // block_size is used as a divisor in the occupancy computation below;
+    // reject values that would cause a division by zero or make no sense.
+    if (block_size <= 0)
+    {
+        fprintf(stderr, "Invalid block size %d (must be positive)\n", block_size);
+        exit(1);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+    // Get device ordinal
+    int device_ordinal;
+    CubDebugExit(cudaGetDevice(&device_ordinal));
+
+    // Get device SM version (queried for its error check; the value is not used below)
+    int sm_version = 0;
+    CubDebugExit(SmVersion(sm_version, device_ordinal));
+
+    // Get SM properties
+    // NOTE(review): occupancy is queried for EmptyKernel<void> at 32 threads,
+    // not for Kernel at block_size -- confirm this is intended.
+    int sm_count, max_block_threads, max_sm_occupancy;
+    CubDebugExit(cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal));
+    CubDebugExit(cudaDeviceGetAttribute(&max_block_threads, cudaDevAttrMaxThreadsPerBlock, device_ordinal));
+    CubDebugExit(MaxSmOccupancy(max_sm_occupancy, EmptyKernel<void>, 32));
+
+    // Compute grid size and occupancy
+    int occupancy = CUB_MIN((max_block_threads / block_size), max_sm_occupancy);
+
+    if (grid_size == -1)
+    {
+        // No grid size requested: use occupancy blocks per SM
+        grid_size = occupancy * sm_count;
+    }
+    else
+    {
+        // Grid size requested: report the occupancy it implies
+        occupancy = grid_size / sm_count;
+    }
+
+    printf("Initializing software global barrier for Kernel<<<%d,%d>>> with %d occupancy\n",
+        grid_size, block_size, occupancy);
+    fflush(stdout);
+
+    // Init global barrier
+    GridBarrierLifetime global_barrier;
+    global_barrier.Setup(grid_size);
+
+    // Time kernel
+    GpuTimer gpu_timer;
+    gpu_timer.Start();
+    Kernel<<<grid_size, block_size>>>(global_barrier, iterations);
+    gpu_timer.Stop();
+
+    retval = CubDebug(cudaDeviceSynchronize());
+
+    // Output timing results
+    float avg_elapsed = gpu_timer.ElapsedMillis() / float(iterations);
+    printf("%d iterations, %f total elapsed millis, %f avg elapsed millis\n",
+        iterations,
+        gpu_timer.ElapsedMillis(),
+        avg_elapsed);
+
+    return retval;
+}
diff --git a/include/cub/test/test_iterator.cu b/include/cub/test/test_iterator.cu
new file mode 100644
index 0000000..32e5b66
--- /dev/null
+++ b/include/cub/test/test_iterator.cu
@@ -0,0 +1,544 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Test of iterator utilities
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <iterator>
+#include <stdio.h>
+#include <typeinfo>
+
+#include <cub/iterator/arg_index_input_iterator.cuh>
+#include <cub/iterator/cache_modified_input_iterator.cuh>
+#include <cub/iterator/cache_modified_output_iterator.cuh>
+#include <cub/iterator/constant_input_iterator.cuh>
+#include <cub/iterator/counting_input_iterator.cuh>
+#include <cub/iterator/tex_obj_input_iterator.cuh>
+#include <cub/iterator/transform_input_iterator.cuh>
+
+#include <cub/util_type.cuh>
+#include <cub/util_allocator.cuh>
+
+#include "test_util.h"
+
+using namespace cub;
+
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+// Verbose flag; main() sets it from the "v" command-line flag and the tests
+// pass it on to CompareDeviceResults.
+bool                    g_verbose = false;
+// Allocator used by the tests below for all device buffers.
+CachingDeviceAllocator  g_allocator(true);
+
+// Dispatch types
+// NOTE(review): no use of Backend is visible in this test file -- confirm
+// whether it is still needed.
+enum Backend
+{
+    CUB,        // CUB method
+    CDP,        // GPU-based (dynamic parallelism) dispatch to CUB method
+};
+
+
+// Unary functor used as the transform in the transform-iterator tests and to
+// compute their host reference values.
+template <typename T>
+struct TransformOp
+{
+    // Returns input plus the value InitValue(INTEGER_SEED, ., 1) produces
+    // (presumably T's representation of one -- confirm in test_util.h)
+    __host__ __device__ __forceinline__ T operator()(T input) const
+    {
+        T increment;
+        InitValue(INTEGER_SEED, increment, 1);
+
+        return input + increment;
+    }
+};
+
+// Selection predicate that accepts every item regardless of its value.
+struct SelectOp
+{
+    template <typename T>
+    __host__ __device__ __forceinline__ bool operator()(T /* input */)
+    {
+        return true;
+    }
+};
+
+
+//---------------------------------------------------------------------
+// Test kernels
+//---------------------------------------------------------------------
+
+/**
+ * Test random access input iterator
+ *
+ * Exercises dereference, indexing, iterator + offset, postfix increment,
+ * += and -= on d_in. Eight values read along the way are written to d_out,
+ * and the iterator state at offsets 21 and 0 is written to d_itrs so the
+ * host can compare both values and iterators against references.
+ */
+template <
+    typename InputIteratorT,
+    typename T>
+__global__ void Kernel(
+    InputIteratorT    d_in,
+    T                 *d_out,
+    InputIteratorT    *d_itrs)
+{
+    d_out[0] = *d_in;               // Value at offset 0
+    d_out[1] = d_in[100];           // Value at offset 100
+    d_out[2] = *(d_in + 1000);      // Value at offset 1000
+    d_out[3] = *(d_in + 10000);     // Value at offset 10000
+
+    d_in++;
+    d_out[4] = d_in[0];             // Value at offset 1
+
+    d_in += 20;
+    d_out[5] = d_in[0];             // Value at offset 21
+    d_itrs[0] = d_in;               // Iterator at offset 21
+
+    d_in -= 10;
+    d_out[6] = d_in[0];             // Value at offset 11
+
+    d_in -= 11;
+    d_out[7] = d_in[0];             // Value at offset 0
+    d_itrs[1] = d_in;               // Iterator at offset 0
+}
+
+
+
+//---------------------------------------------------------------------
+// Host testing subroutines
+//---------------------------------------------------------------------
+
+
+/**
+ * Run the iterator test kernel on the device and verify, against the given
+ * host references, both the values it read and the iterators it stored.
+ */
+template <
+    typename        InputIteratorT,
+    typename        T,
+    int             TEST_VALUES>
+void Test(
+    InputIteratorT  d_in,
+    T               (&h_reference)[TEST_VALUES])
+{
+    // Device buffers for the values and the two iterators the kernel writes
+    T                 *d_values     = NULL;
+    InputIteratorT    *d_iterators  = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values,     sizeof(T) * TEST_VALUES));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_iterators,  sizeof(InputIteratorT) * 2));
+
+    // Launch with a single thread
+    Kernel<<<1, 1>>>(d_in, d_values, d_iterators);
+
+    CubDebugExit(cudaPeekAtLastError());
+    CubDebugExit(cudaDeviceSynchronize());
+
+    // Values read through the iterator
+    const int values_cmp = CompareDeviceResults(h_reference, d_values, TEST_VALUES, g_verbose, g_verbose);
+    printf("\tValues: %s\n", (values_cmp) ? "FAIL" : "PASS");
+    AssertEquals(0, values_cmp);
+
+    // Iterator stored after advancing to offset 21
+    InputIteratorT h_itr = d_in + 21;
+    const int advanced_cmp = CompareDeviceResults(&h_itr, d_iterators, 1, g_verbose, g_verbose);
+    printf("\tIterators: %s\n", (advanced_cmp) ? "FAIL" : "PASS");
+    AssertEquals(0, advanced_cmp);
+
+    // Iterator stored after stepping back to offset 0
+    const int rewound_cmp = CompareDeviceResults(&d_in, d_iterators + 1, 1, g_verbose, g_verbose);
+    printf("\tIterators: %s\n", (rewound_cmp) ? "FAIL" : "PASS");
+    AssertEquals(0, rewound_cmp);
+
+    // Cleanup
+    if (d_values)
+    {
+        CubDebugExit(g_allocator.DeviceFree(d_values));
+    }
+    if (d_iterators)
+    {
+        CubDebugExit(g_allocator.DeviceFree(d_iterators));
+    }
+}
+
+
+/**
+ * Test constant iterator
+ *
+ * A ConstantInputIterator must yield base at every offset the test kernel
+ * reads, so all eight reference values are base.
+ */
+template <typename T>
+void TestConstant(T base)
+{
+    // %lld expects a signed long long, so convert to that type (the previous
+    // cast to unsigned long long did not match the conversion specifier).
+    printf("\nTesting constant iterator on type %s (base: %lld)\n", typeid(T).name(), static_cast<long long>(base)); fflush(stdout);
+
+    //
+    // Test iterator manipulation in kernel
+    //
+
+    T h_reference[8] = {base, base, base, base, base, base, base, base};
+    ConstantInputIterator<T> d_itr(base);
+    Test(d_itr, h_reference);
+}
+
+
+/**
+ * Test counting iterator
+ *
+ * The value expected at offset k is base + k.
+ */
+template <typename T>
+void TestCounting(T base)
+{
+    printf("\nTesting counting iterator on type %s (base: %d) \n", typeid(T).name(), int(base));
+    fflush(stdout);
+
+    // Offsets read by the test kernel, in the order it writes its results
+    const int offsets[8] = {0, 100, 1000, 10000, 1, 21, 11, 0};
+
+    // Initialize reference data
+    T h_reference[8];
+    for (int slot = 0; slot < 8; ++slot)
+    {
+        h_reference[slot] = static_cast<T>(base + offsets[slot]);
+    }
+
+    // Test iterator manipulation in kernel
+    CountingInputIterator<T> d_itr(base);
+    Test(d_itr, h_reference);
+}
+
+
+/**
+ * Test cache-modified input iterators, one per load modifier, over the same
+ * random device data.
+ */
+template <typename T, typename CastT>
+void TestModified()
+{
+    printf("\nTesting cache-modified iterator on type %s\n", typeid(T).name());
+    fflush(stdout);
+
+    constexpr int TEST_VALUES = 11000;
+
+    // Random host input
+    T *h_data = new T[TEST_VALUES];
+    for (int idx = 0; idx < TEST_VALUES; ++idx)
+    {
+        RandomBits(h_data[idx]);
+    }
+
+    // Mirror the input on the device
+    T *d_data = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_data, sizeof(T) * TEST_VALUES));
+    CubDebugExit(cudaMemcpy(d_data, h_data, sizeof(T) * TEST_VALUES, cudaMemcpyHostToDevice));
+
+    // Offsets read by the test kernel, in the order it writes its results
+    const int offsets[8] = {0, 100, 1000, 10000, 1, 21, 11, 0};
+
+    // Initialize reference data
+    T h_reference[8];
+    for (int slot = 0; slot < 8; ++slot)
+    {
+        h_reference[slot] = h_data[offsets[slot]];
+    }
+
+    // Test iterator manipulation in kernel, once per load modifier
+    Test(CacheModifiedInputIterator<LOAD_DEFAULT, T>((CastT*) d_data), h_reference);
+    Test(CacheModifiedInputIterator<LOAD_CA, T>((CastT*) d_data), h_reference);
+    Test(CacheModifiedInputIterator<LOAD_CG, T>((CastT*) d_data), h_reference);
+    Test(CacheModifiedInputIterator<LOAD_CS, T>((CastT*) d_data), h_reference);
+    Test(CacheModifiedInputIterator<LOAD_CV, T>((CastT*) d_data), h_reference);
+    Test(CacheModifiedInputIterator<LOAD_LDG, T>((CastT*) d_data), h_reference);
+    Test(CacheModifiedInputIterator<LOAD_VOLATILE, T>((CastT*) d_data), h_reference);
+
+    // Cleanup
+    delete[] h_data;
+    if (d_data)
+    {
+        CubDebugExit(g_allocator.DeviceFree(d_data));
+    }
+}
+
+
+/**
+ * Test transform iterator over a raw device pointer. Reference values are the
+ * host input passed through the same TransformOp.
+ */
+template <typename T, typename CastT>
+void TestTransform()
+{
+    printf("\nTesting transform iterator on type %s\n", typeid(T).name());
+    fflush(stdout);
+
+    constexpr int TEST_VALUES = 11000;
+
+    // Host input: element idx is initialized by InitValue(INTEGER_SEED, ., idx)
+    T *h_data = new T[TEST_VALUES];
+    for (int idx = 0; idx < TEST_VALUES; ++idx)
+    {
+        InitValue(INTEGER_SEED, h_data[idx], idx);
+    }
+
+    // Mirror the input on the device
+    T *d_data = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_data, sizeof(T) * TEST_VALUES));
+    CubDebugExit(cudaMemcpy(d_data, h_data, sizeof(T) * TEST_VALUES, cudaMemcpyHostToDevice));
+
+    TransformOp<T> op;
+
+    // Offsets read by the test kernel, in the order it writes its results
+    const int offsets[8] = {0, 100, 1000, 10000, 1, 21, 11, 0};
+
+    // Initialize reference data
+    T h_reference[8];
+    for (int slot = 0; slot < 8; ++slot)
+    {
+        h_reference[slot] = op(h_data[offsets[slot]]);
+    }
+
+    // Test iterator manipulation in kernel
+    TransformInputIterator<T, TransformOp<T>, CastT*> d_itr((CastT*) d_data, op);
+    Test(d_itr, h_reference);
+
+    // Cleanup
+    delete[] h_data;
+    if (d_data)
+    {
+        CubDebugExit(g_allocator.DeviceFree(d_data));
+    }
+}
+
+
+/**
+ * Test tex-obj texture iterator
+ *
+ * Binds a TexObjInputIterator to random device data, runs the iterator test
+ * kernel against host reference values, and releases the texture binding
+ * before the underlying device memory is freed.
+ */
+template <typename T, typename CastT>
+void TestTexObj()
+{
+    printf("\nTesting tex-obj iterator on type %s\n", typeid(T).name()); fflush(stdout);
+
+    //
+    // Test iterator manipulation in kernel
+    //
+
+    const unsigned int TEST_VALUES          = 11000;
+    const unsigned int DUMMY_OFFSET         = 500;
+    const unsigned int DUMMY_TEST_VALUES    = TEST_VALUES - DUMMY_OFFSET;
+
+    T *h_data = new T[TEST_VALUES];
+    for (unsigned int i = 0; i < TEST_VALUES; ++i)
+    {
+        RandomBits(h_data[i]);
+    }
+
+    // Allocate device arrays
+    T *d_data   = NULL;
+    T *d_dummy  = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_data, sizeof(T) * TEST_VALUES));
+    CubDebugExit(cudaMemcpy(d_data, h_data, sizeof(T) * TEST_VALUES, cudaMemcpyHostToDevice));
+
+    // d_dummy receives the tail of h_data starting at DUMMY_OFFSET; the
+    // iterator under test is bound to d_data only and never reads d_dummy.
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_dummy, sizeof(T) * DUMMY_TEST_VALUES));
+    CubDebugExit(cudaMemcpy(d_dummy, h_data + DUMMY_OFFSET, sizeof(T) * DUMMY_TEST_VALUES, cudaMemcpyHostToDevice));
+
+    // Initialize reference data
+    T h_reference[8];
+    h_reference[0] = h_data[0];          // Value at offset 0
+    h_reference[1] = h_data[100];        // Value at offset 100
+    h_reference[2] = h_data[1000];       // Value at offset 1000
+    h_reference[3] = h_data[10000];      // Value at offset 10000
+    h_reference[4] = h_data[1];          // Value at offset 1
+    h_reference[5] = h_data[21];         // Value at offset 21
+    h_reference[6] = h_data[11];         // Value at offset 11
+    h_reference[7] = h_data[0];          // Value at offset 0
+
+    // Create and bind obj-based test iterator
+    TexObjInputIterator<T> d_obj_itr;
+    CubDebugExit(d_obj_itr.BindTexture((CastT*) d_data, sizeof(T) * TEST_VALUES));
+
+    Test(d_obj_itr, h_reference);
+
+    // Release the texture binding before freeing the memory it refers to
+    // (matches the cleanup in TestTexTransform).
+    CubDebugExit(d_obj_itr.UnbindTexture());
+
+    if (h_data) delete[] h_data;
+    if (d_data) CubDebugExit(g_allocator.DeviceFree(d_data));
+    if (d_dummy) CubDebugExit(g_allocator.DeviceFree(d_dummy));
+}
+
+/**
+ * Test a transform iterator layered on top of a texture-object iterator.
+ * Reference values are the host input passed through the same TransformOp.
+ */
+template <typename T, typename CastT>
+void TestTexTransform()
+{
+    printf("\nTesting tex-transform iterator on type %s\n", typeid(T).name());
+    fflush(stdout);
+
+    constexpr int TEST_VALUES = 11000;
+
+    // Host input: element idx is initialized by InitValue(INTEGER_SEED, ., idx)
+    T *h_data = new T[TEST_VALUES];
+    for (int idx = 0; idx < TEST_VALUES; ++idx)
+    {
+        InitValue(INTEGER_SEED, h_data[idx], idx);
+    }
+
+    // Mirror the input on the device
+    T *d_data = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_data, sizeof(T) * TEST_VALUES));
+    CubDebugExit(cudaMemcpy(d_data, h_data, sizeof(T) * TEST_VALUES, cudaMemcpyHostToDevice));
+
+    TransformOp<T> op;
+
+    // Offsets read by the test kernel, in the order it writes its results
+    const int offsets[8] = {0, 100, 1000, 10000, 1, 21, 11, 0};
+
+    // Initialize reference data
+    T h_reference[8];
+    for (int slot = 0; slot < 8; ++slot)
+    {
+        h_reference[slot] = op(h_data[offsets[slot]]);
+    }
+
+    // Create and bind texture iterator
+    using TextureIterator = TexObjInputIterator<T>;
+
+    TextureIterator d_tex_itr;
+    CubDebugExit(d_tex_itr.BindTexture((CastT*) d_data, sizeof(T) * TEST_VALUES));
+
+    // Wrap the texture iterator in a transform iterator and test that
+    TransformInputIterator<T, TransformOp<T>, TextureIterator> xform_itr(d_tex_itr, op);
+    Test(xform_itr, h_reference);
+
+    // Cleanup: unbind first, then release host and device memory
+    CubDebugExit(d_tex_itr.UnbindTexture());
+    delete[] h_data;
+    if (d_data)
+    {
+        CubDebugExit(g_allocator.DeviceFree(d_data));
+    }
+}
+
+/**
+ * Run non-integer tests
+ *
+ * Covers the iterator kinds that do not need an integral T: cache-modified,
+ * transform, tex-obj and tex-transform. CastT is the pointee type the device
+ * pointer is cast to before it is handed to each iterator.
+ */
+template <typename T, typename CastT>
+void Test(Int2Type<false> /* is_integer */)
+{
+    TestModified<T, CastT>();
+    TestTransform<T, CastT>();
+    TestTexObj<T, CastT>();
+    TestTexTransform<T, CastT>();
+}
+
+/**
+ * Run integer tests
+ *
+ * Adds the constant and counting iterator tests (with bases 0 and 99), then
+ * runs everything in the non-integer overload as well.
+ */
+template <typename T, typename CastT>
+void Test(Int2Type<true> /* is_integer */)
+{
+    TestConstant<T>(0);
+    TestConstant<T>(99);
+
+    TestCounting<T>(0);
+    TestCounting<T>(99);
+
+    // Run non-integer tests
+    Test<T, CastT>(Int2Type<false>());
+}
+
+/**
+ * Run tests
+ *
+ * Dispatches to the integer or non-integer overload according to
+ * Traits<T>::CATEGORY, once with CastT = T and once with CastT = const T.
+ */
+template <typename T>
+void Test()
+{
+    // True when T is classified as a signed or unsigned integer
+    enum {
+        IS_INTEGER = (Traits<T>::CATEGORY == SIGNED_INTEGER) || (Traits<T>::CATEGORY == UNSIGNED_INTEGER)
+    };
+
+    // Test non-const type
+    Test<T, T>(Int2Type<IS_INTEGER>());
+
+    // Test const type
+    Test<T, const T>(Int2Type<IS_INTEGER>());
+}
+
+
+/**
+ * Main
+ *
+ * Parses the command line (--device, --v, --help), initializes the device and
+ * runs the iterator tests for scalar types, 2/3/4-component vector types and
+ * the TestFoo / TestBar types. Returns 0 when it reaches the end.
+ */
+int main(int argc, char** argv)
+{
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+
+    // Print usage
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--device=<device-id>] "
+            "[--v] "
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+    // Get ptx version (checked for errors; the value is not used below)
+    int ptx_version = 0;
+    CubDebugExit(PtxVersion(ptx_version));
+
+    // Evaluate different data types
+    // Scalar types
+    Test<signed char>();
+    Test<short>();
+    Test<int>();
+    Test<long>();
+    Test<long long>();
+    Test<float>();
+    Test<double>();
+
+    // Two-component vector types
+    Test<char2>();
+    Test<short2>();
+    Test<int2>();
+    Test<long2>();
+    Test<longlong2>();
+    Test<float2>();
+    Test<double2>();
+
+    // Three-component vector types
+    Test<char3>();
+    Test<short3>();
+    Test<int3>();
+    Test<long3>();
+    Test<longlong3>();
+    Test<float3>();
+    Test<double3>();
+
+    // Four-component vector types
+    Test<char4>();
+    Test<short4>();
+    Test<int4>();
+    Test<long4>();
+    Test<longlong4>();
+    Test<float4>();
+    Test<double4>();
+
+    // Test-utility types
+    Test<TestFoo>();
+    Test<TestBar>();
+
+    printf("\nTest complete\n"); fflush(stdout);
+
+    return 0;
+}
+
+
+
diff --git a/include/cub/test/test_iterator_deprecated.cu b/include/cub/test/test_iterator_deprecated.cu
new file mode 100644
index 0000000..862af1d
--- /dev/null
+++ b/include/cub/test/test_iterator_deprecated.cu
@@ -0,0 +1,306 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Test of iterator utilities
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+// This file tests deprecated CUB APIs. Silence deprecation warnings:
+#define CUB_IGNORE_DEPRECATED_API
+
+#include <cub/iterator/tex_ref_input_iterator.cuh>
+#include <cub/util_type.cuh>
+#include <cub/util_allocator.cuh>
+
+#include <iterator>
+#include <cstdio>
+#include <typeinfo>
+
+#include "test_util.h"
+
+using namespace cub;
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+bool                    g_verbose = false;      // set from the "v" command-line flag in main()
+CachingDeviceAllocator  g_allocator(true);      // allocator used for every device buffer in this test
+
+//---------------------------------------------------------------------
+// Test kernels
+//---------------------------------------------------------------------
+
+/**
+ * Test random access input iterator: reads d_in at fixed offsets into d_out[0..7] and saves two iterator copies in d_itrs
+ */
+template <
+    typename InputIteratorT,
+    typename T>
+__global__ void Kernel(
+    InputIteratorT    d_in,
+    T                 *d_out,
+    InputIteratorT    *d_itrs)
+{
+    d_out[0] = *d_in;               // Value at offset 0
+    d_out[1] = d_in[100];           // Value at offset 100
+    d_out[2] = *(d_in + 1000);      // Value at offset 1000
+    d_out[3] = *(d_in + 10000);     // Value at offset 10000
+
+    d_in++;
+    d_out[4] = d_in[0];             // Value at offset 1
+
+    d_in += 20;
+    d_out[5] = d_in[0];             // Value at offset 21
+    d_itrs[0] = d_in;               // Iterator at offset 21
+
+    d_in -= 10;
+    d_out[6] = d_in[0];             // Value at offset 11
+
+    d_in -= 11;
+    d_out[7] = d_in[0];             // Value at offset 0
+    d_itrs[1] = d_in;               // Iterator at offset 0
+}
+
+
+
+//---------------------------------------------------------------------
+// Host testing subroutines
+//---------------------------------------------------------------------
+
+
+/**
+ * Run iterator test on device: launch Kernel with a single thread, then compare its values and stored iterators against host-side expectations
+ */
+template <
+    typename        InputIteratorT,
+    typename        T,
+    int             TEST_VALUES>
+void Test(
+    InputIteratorT  d_in,
+    T               (&h_reference)[TEST_VALUES])
+{
+    // Allocate device arrays (d_itrs receives the two iterator copies written by Kernel)
+    T                 *d_out    = NULL;
+    InputIteratorT    *d_itrs   = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out,     sizeof(T) * TEST_VALUES));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_itrs,    sizeof(InputIteratorT) * 2));
+
+    int compare;
+
+    // Run unguarded kernel (one block, one thread)
+    Kernel<<<1, 1>>>(d_in, d_out, d_itrs);
+
+    CubDebugExit(cudaPeekAtLastError());
+    CubDebugExit(cudaDeviceSynchronize());
+
+    // Check results (a non-zero compare is printed as FAIL)
+    compare = CompareDeviceResults(h_reference, d_out, TEST_VALUES, g_verbose, g_verbose);
+    printf("\tValues: %s\n", (compare) ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+
+    // Check iterator at offset 21
+    InputIteratorT h_itr = d_in + 21;
+    compare = CompareDeviceResults(&h_itr, d_itrs, 1, g_verbose, g_verbose);
+    printf("\tIterators: %s\n", (compare) ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+
+    // Check iterator at offset 0
+    compare = CompareDeviceResults(&d_in, d_itrs + 1, 1, g_verbose, g_verbose);
+    printf("\tIterators: %s\n", (compare) ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+
+    // Cleanup
+    if (d_out)
+    {
+        CubDebugExit(g_allocator.DeviceFree(d_out));
+    }
+    if (d_itrs)
+    {
+        CubDebugExit(g_allocator.DeviceFree(d_itrs));
+    }
+}
+
+/**
+ * Test tex-ref texture iterator on random data of type T, binding the device buffers through CastT pointers
+ */
+template <typename T, typename CastT>
+void TestTexRef()
+{
+    printf("\nTesting tex-ref iterator on type %s\n", typeid(T).name()); fflush(stdout);
+
+    //
+    // Test iterator manipulation in kernel
+    //
+
+    constexpr int TEST_VALUES                   = 11000;
+    constexpr unsigned int DUMMY_OFFSET         = 500;
+    constexpr unsigned int DUMMY_TEST_VALUES    = TEST_VALUES - DUMMY_OFFSET;
+
+    T *h_data = new T[TEST_VALUES];
+    for (int i = 0; i < TEST_VALUES; ++i)
+    {
+        RandomBits(h_data[i]);
+    }
+
+    // Allocate device arrays (d_dummy receives the part of h_data starting at DUMMY_OFFSET)
+    T *d_data   = NULL;
+    T *d_dummy  = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_data, sizeof(T) * TEST_VALUES));
+    CubDebugExit(cudaMemcpy(d_data, h_data, sizeof(T) * TEST_VALUES, cudaMemcpyHostToDevice));
+
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_dummy, sizeof(T) * DUMMY_TEST_VALUES));
+    CubDebugExit(cudaMemcpy(d_dummy, h_data + DUMMY_OFFSET, sizeof(T) * DUMMY_TEST_VALUES, cudaMemcpyHostToDevice));
+
+    // Initialize reference data (same offsets, in the same order, as Kernel writes d_out)
+    T h_reference[8];
+    h_reference[0] = h_data[0];          // Value at offset 0
+    h_reference[1] = h_data[100];        // Value at offset 100
+    h_reference[2] = h_data[1000];       // Value at offset 1000
+    h_reference[3] = h_data[10000];      // Value at offset 10000
+    h_reference[4] = h_data[1];          // Value at offset 1
+    h_reference[5] = h_data[21];         // Value at offset 21
+    h_reference[6] = h_data[11];         // Value at offset 11
+    h_reference[7] = h_data[0];          // Value at offset 0
+
+    // Create and bind ref-based test iterator
+    TexRefInputIterator<T, __LINE__> d_ref_itr;
+    CubDebugExit(d_ref_itr.BindTexture((CastT*) d_data, sizeof(T) * TEST_VALUES));
+
+    // Create and bind a second (dummy) iterator over the same value type to check for interference
+    TexRefInputIterator<T, __LINE__> d_ref_itr2;
+    CubDebugExit(d_ref_itr2.BindTexture((CastT*) d_dummy, sizeof(T) * DUMMY_TEST_VALUES));
+
+    Test(d_ref_itr, h_reference);
+
+    CubDebugExit(d_ref_itr.UnbindTexture());
+    CubDebugExit(d_ref_itr2.UnbindTexture());
+
+    if (h_data)
+    {
+        delete[] h_data;
+    }
+    if (d_data)
+    {
+        CubDebugExit(g_allocator.DeviceFree(d_data));
+    }
+    if (d_dummy)
+    {
+        CubDebugExit(g_allocator.DeviceFree(d_dummy));
+    }
+}
+
+/**
+ * Run the tex-ref iterator test for value type T with the given CastT
+ */
+template <typename T, typename CastT>
+void Test()
+{
+    TestTexRef<T, CastT>();
+}
+
+/**
+ * Run tests for value type T, once with CastT = T and once with CastT = const T
+ */
+template <typename T>
+void Test()
+{
+    // Test non-const type
+    Test<T, T>();
+
+    // Test const type
+    Test<T, const T>();
+}
+
+
+/**
+ * Main: parse the command line, initialize the device, then run Test<T>() for each listed type
+ */
+int main(int argc, char** argv)
+{
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+
+    // Print usage and exit when the "help" flag is set
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--device=<device-id>] "
+            "[--v] "
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+    // Evaluate different data types: scalars, then 2-, 3- and 4-component vector types, then TestFoo and TestBar
+    Test<signed char>();
+    Test<short>();
+    Test<int>();
+    Test<long>();
+    Test<long long>();
+    Test<float>();
+    Test<double>();
+
+    Test<char2>();
+    Test<short2>();
+    Test<int2>();
+    Test<long2>();
+    Test<longlong2>();
+    Test<float2>();
+    Test<double2>();
+
+    Test<char3>();
+    Test<short3>();
+    Test<int3>();
+    Test<long3>();
+    Test<longlong3>();
+    Test<float3>();
+    Test<double3>();
+
+    Test<char4>();
+    Test<short4>();
+    Test<int4>();
+    Test<long4>();
+    Test<longlong4>();
+    Test<float4>();
+    Test<double4>();
+
+    Test<TestFoo>();
+    Test<TestBar>();
+
+    printf("\nTest complete\n");
+    fflush(stdout);
+
+    return 0;
+}
diff --git a/include/cub/test/test_namespace_wrapped.cu b/include/cub/test/test_namespace_wrapped.cu
new file mode 100644
index 0000000..58e1644
--- /dev/null
+++ b/include/cub/test/test_namespace_wrapped.cu
@@ -0,0 +1,76 @@
+// Wrap thrust and cub in different enclosing namespaces
+// (In practice, you probably want these to be the same, in which case just
+// set THRUST_CUB_WRAPPED_NAMESPACE to set both).
+#define THRUST_WRAPPED_NAMESPACE wrap_thrust
+#define CUB_WRAPPED_NAMESPACE    wrap_cub
+
+// Enable error checking:
+#define CUB_STDERR
+
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include <thrust/sort.h>
+
+#include <cub/device/device_radix_sort.cuh>
+#include <cub/util_debug.cuh>
+
+#include "test_util.h"
+
+#include <cstdint>
+#include <cstdlib>
+
+// Test that we can use a few common utilities and algorithms from wrapped
+// Thrust/CUB namespaces at runtime. More extensive testing is performed by the
+// header tests and the check_namespace.cmake test.
+int main(int argc, char **argv)
+{
+  CommandLineArgs args(argc, argv);
+  CubDebugExit(args.DeviceInit());
+
+  const std::size_t n = 2048;
+
+  // Fill a vector with random data:
+  ::wrap_thrust::thrust::host_vector<int> h_input(n);
+  for (auto &val : h_input)
+  {
+    RandomBits(val);
+  }
+
+  // Test the Thrust qualifier macro:
+  THRUST_NS_QUALIFIER::device_vector<int> d_input(h_input);
+  THRUST_NS_QUALIFIER::device_vector<int> d_output(n);
+
+  std::size_t temp_storage_bytes{};
+
+  // First SortKeys call passes a null temp-storage pointer; temp_storage_bytes then sizes the buffer below:
+  auto error = ::wrap_cub::cub::DeviceRadixSort::SortKeys(
+    nullptr,
+    temp_storage_bytes,
+    ::wrap_thrust::thrust::raw_pointer_cast(d_input.data()),
+    ::wrap_thrust::thrust::raw_pointer_cast(d_output.data()),
+    static_cast<std::size_t>(n));
+
+  CubDebugExit(error);
+
+  ::wrap_thrust::thrust::device_vector<std::uint8_t> temp_storage(
+    temp_storage_bytes);
+
+  // Test the CUB qualifier macro (second SortKeys call, now with the allocated temp storage):
+  error = CUB_NS_QUALIFIER::DeviceRadixSort::SortKeys(
+    ::wrap_thrust::thrust::raw_pointer_cast(temp_storage.data()),
+    temp_storage_bytes,
+    ::wrap_thrust::thrust::raw_pointer_cast(d_input.data()),
+    ::wrap_thrust::thrust::raw_pointer_cast(d_output.data()),
+    static_cast<std::size_t>(n));
+
+  CubDebugExit(error);
+
+  // Verify output (exit with failure if d_output is not sorted):
+  if (!::wrap_thrust::thrust::is_sorted(d_output.cbegin(), d_output.cend()))
+  {
+    std::cerr << "Output is not sorted!\n";
+    return EXIT_FAILURE;
+  }
+
+  return EXIT_SUCCESS;
+}
diff --git a/include/cub/test/test_temporary_storage_layout.cu b/include/cub/test/test_temporary_storage_layout.cu
new file mode 100644
index 0000000..2f734fa
--- /dev/null
+++ b/include/cub/test/test_temporary_storage_layout.cu
@@ -0,0 +1,219 @@
+/*******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include "cub/detail/temporary_storage.cuh"
+#include "test_util.h"
+
+#include <memory>
+
+template <int Items>
+std::size_t GetTemporaryStorageSize(std::size_t (&sizes)[Items]) // returns temp_storage_bytes after the cub::AliasTemporaries call
+{
+  void *pointers[Items]{};
+  std::size_t temp_storage_bytes{};
+  CubDebugExit(
+    cub::AliasTemporaries(nullptr, temp_storage_bytes, pointers, sizes)); // null buffer: presumably a size-only query -- confirm
+  return temp_storage_bytes;
+}
+
+std::size_t GetActualZero() // storage size reported for a single zero-byte request
+{
+  std::size_t sizes[1]{};
+
+  return GetTemporaryStorageSize(sizes);
+}
+
+template <int StorageSlots>
+void TestEmptyStorage() // a layout with no aliases must report the same size as GetActualZero()
+{
+  cub::detail::temporary_storage::layout<StorageSlots> temporary_storage;
+  AssertEquals(temporary_storage.get_size(), GetActualZero());
+}
+
+template <int StorageSlots>
+void TestPartiallyFilledStorage() // fills even slots only, then checks the layout size and the mapped pointers
+{
+  using target_type = std::uint64_t;
+
+  constexpr std::size_t target_elements    = 42;
+  constexpr std::size_t full_slot_elements = target_elements *
+                                             sizeof(target_type); // NOTE(review): already scaled by sizeof here and scaled again for sizes[] below
+  constexpr std::size_t empty_slot_elements{};
+
+  cub::detail::temporary_storage::layout<StorageSlots> temporary_storage;
+
+  std::unique_ptr<cub::detail::temporary_storage::alias<target_type>>
+    arrays[StorageSlots];
+  std::size_t sizes[StorageSlots]{};
+
+  for (int slot_id = 0; slot_id < StorageSlots; slot_id++)
+  {
+    auto slot = temporary_storage.get_slot(slot_id);
+
+    const std::size_t elements = slot_id % 2 == 0 ? full_slot_elements
+                                                  : empty_slot_elements; // even slots are full, odd slots are empty
+
+    sizes[slot_id] = elements * sizeof(target_type);
+    arrays[slot_id].reset(
+      new cub::detail::temporary_storage::alias<target_type>(
+        slot->template create_alias<target_type>(elements)));
+  }
+
+  const std::size_t temp_storage_bytes = temporary_storage.get_size();
+
+  std::unique_ptr<std::uint8_t[]> temp_storage(
+    new std::uint8_t[temp_storage_bytes]);
+
+  temporary_storage.map_to_buffer(temp_storage.get(), temp_storage_bytes);
+
+  AssertEquals(temp_storage_bytes, GetTemporaryStorageSize(sizes)); // layout size must equal the size computed from the same byte counts
+
+  for (int slot_id = 0; slot_id < StorageSlots; slot_id++)
+  {
+    if (slot_id % 2 == 0)
+    {
+      AssertTrue(arrays[slot_id]->get() != nullptr); // filled slot must map to a non-null pointer
+    }
+    else
+    {
+      AssertTrue(arrays[slot_id]->get() == nullptr); // empty slot must stay null
+    }
+  }
+}
+
+template <int StorageSlots>
+void TestGrow() // an alias sized at creation and one grown afterwards must produce identical layouts
+{
+  using target_type = std::uint64_t;
+
+  constexpr std::size_t target_elements_number = 42;
+
+  cub::detail::temporary_storage::layout<StorageSlots> preset_layout;
+  std::unique_ptr<cub::detail::temporary_storage::alias<target_type>>
+    preset_arrays[StorageSlots];
+
+  for (int slot_id = 0; slot_id < StorageSlots; slot_id++) // preset: element count given when the alias is created
+  {
+    preset_arrays[slot_id].reset(
+      new cub::detail::temporary_storage::alias<target_type>(
+        preset_layout.get_slot(slot_id)->template create_alias<target_type>(
+          target_elements_number)));
+  }
+
+  cub::detail::temporary_storage::layout<StorageSlots> postset_layout;
+  std::unique_ptr<cub::detail::temporary_storage::alias<target_type>>
+    postset_arrays[StorageSlots];
+
+  for (int slot_id = 0; slot_id < StorageSlots; slot_id++) // postset: alias created without a count, then grown
+  {
+    postset_arrays[slot_id].reset(
+      new cub::detail::temporary_storage::alias<target_type>(
+        postset_layout.get_slot(slot_id)->template create_alias<target_type>()));
+    postset_arrays[slot_id]->grow(target_elements_number);
+  }
+
+  AssertEquals(preset_layout.get_size(), postset_layout.get_size()); // both layouts must report the same size
+
+  const std::size_t tmp_storage_bytes = preset_layout.get_size();
+  std::unique_ptr<std::uint8_t[]> temp_storage(
+    new std::uint8_t[tmp_storage_bytes]);
+
+  preset_layout.map_to_buffer(temp_storage.get(), tmp_storage_bytes);
+  postset_layout.map_to_buffer(temp_storage.get(), tmp_storage_bytes);
+
+  for (int slot_id = 0; slot_id < StorageSlots; slot_id++) // mapped to the same buffer, each slot must yield the same pointer
+  {
+    AssertEquals(postset_arrays[slot_id]->get(), preset_arrays[slot_id]->get());
+  }
+}
+
+template <int StorageSlots>
+void TestDoubleGrow() // growing an already-sized alias to twice its count must match creating it at that count
+{
+  using target_type = std::uint64_t;
+
+  constexpr std::size_t target_elements_number = 42;
+
+  cub::detail::temporary_storage::layout<StorageSlots> preset_layout;
+  std::unique_ptr<cub::detail::temporary_storage::alias<target_type>>
+    preset_arrays[StorageSlots];
+
+  for (int slot_id = 0; slot_id < StorageSlots; slot_id++) // preset: created directly at twice the element count
+  {
+    preset_arrays[slot_id].reset(
+      new cub::detail::temporary_storage::alias<target_type>(
+        preset_layout.get_slot(slot_id)->template create_alias<target_type>(
+          2 * target_elements_number)));
+  }
+
+  cub::detail::temporary_storage::layout<StorageSlots> postset_layout;
+  std::unique_ptr<cub::detail::temporary_storage::alias<target_type>>
+    postset_arrays[StorageSlots];
+
+  for (int slot_id = 0; slot_id < StorageSlots; slot_id++) // postset: created at the element count, then grown to twice that
+  {
+    postset_arrays[slot_id].reset(
+      new cub::detail::temporary_storage::alias<target_type>(
+        postset_layout.get_slot(slot_id)->template create_alias<target_type>(
+          target_elements_number)));
+    postset_arrays[slot_id]->grow(2 * target_elements_number);
+  }
+
+  AssertEquals(preset_layout.get_size(), postset_layout.get_size()); // both layouts must report the same size
+
+  const std::size_t tmp_storage_bytes = preset_layout.get_size();
+  std::unique_ptr<std::uint8_t[]> temp_storage(
+    new std::uint8_t[tmp_storage_bytes]);
+
+  preset_layout.map_to_buffer(temp_storage.get(), tmp_storage_bytes);
+  postset_layout.map_to_buffer(temp_storage.get(), tmp_storage_bytes);
+
+  for (int slot_id = 0; slot_id < StorageSlots; slot_id++) // mapped to the same buffer, each slot must yield the same pointer
+  {
+    AssertEquals(postset_arrays[slot_id]->get(), preset_arrays[slot_id]->get());
+  }
+}
+
+template <int StorageSlots>
+void Test() // runs every layout test for the given slot count
+{
+  TestEmptyStorage<StorageSlots>();
+  TestPartiallyFilledStorage<StorageSlots>();
+  TestGrow<StorageSlots>();
+  TestDoubleGrow<StorageSlots>();
+}
+
+int main() // exercises the layout tests with 1, 4 and 42 storage slots
+{
+  Test<1>();
+  Test<4>();
+  Test<42>();
+}
diff --git a/include/cub/test/test_thread_operators.cu b/include/cub/test/test_thread_operators.cu
new file mode 100644
index 0000000..44cc3a8
--- /dev/null
+++ b/include/cub/test/test_thread_operators.cu
@@ -0,0 +1,259 @@
+/*******************************************************************************
+ * Copyright (c) 2011-2022, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#include "test_util.h"
+
+#include <cub/thread/thread_operators.cuh>
+
+template <class T>
+T Make(int val) // factory: brace-initializes a T from val; declared a friend by the CUSTOM_TYPE_FACTORY classes
+{
+  return T{val};
+}
+
+template <bool>
+class BaseT // primary template: stores an int and offers no conversion to int
+{
+protected:
+  int m_val{};
+
+public:
+  BaseT(int val)
+      : m_val{val}
+  {}
+};
+
+template <>
+class BaseT<true> // specialization that additionally converts to int
+{
+protected:
+  int m_val{};
+
+public:
+  BaseT(int val)
+      : m_val{val}
+  {}
+
+  __host__ __device__ operator int() const { return m_val; } // returns the stored value
+};
+
+#define CUSTOM_TYPE_FACTORY(NAME, RT, OP, CONVERTABLE)                         \
+  class Custom##NAME##T : public BaseT<CONVERTABLE>                            \
+  {                                                                            \
+    explicit Custom##NAME##T(int val)                                          \
+        : BaseT<CONVERTABLE>(val)                                              \
+    {}                                                                         \
+                                                                               \
+    friend Custom##NAME##T Make<Custom##NAME##T>(int);                         \
+                                                                               \
+  public:                                                                      \
+    __host__ __device__ RT operator OP(int val) const                          \
+    {                                                                          \
+      return m_val OP val;                                                     \
+    }                                                                          \
+  }
+
+// Each generated Custom<NAME>T has a private explicit ctor (reachable via Make<T>) and a single operator OP(int).
+CUSTOM_TYPE_FACTORY(Eq,   bool, ==, false);
+CUSTOM_TYPE_FACTORY(Ineq, bool, !=, false);
+CUSTOM_TYPE_FACTORY(Sum,  int,  +,  false);
+CUSTOM_TYPE_FACTORY(Diff, int,  -,  false);
+CUSTOM_TYPE_FACTORY(Div,  int,  /,  false);
+CUSTOM_TYPE_FACTORY(Gt,   bool, >,  true);  // Gt and Lt derive from BaseT<true>, so they also convert to int
+CUSTOM_TYPE_FACTORY(Lt,   bool, <,  true);
+
+void TestEquality() // cub::Equality on ints and on a custom type that defines only operator==(int)
+{
+  cub::Equality op{}; 
+
+  const int const_magic_val = 42;
+  int magic_val = const_magic_val;
+
+  AssertEquals(op(const_magic_val, const_magic_val), true);
+  AssertEquals(op(const_magic_val, magic_val), true);
+  AssertEquals(op(const_magic_val, magic_val + 1), false);
+
+  AssertEquals(op(Make<CustomEqT>(magic_val), magic_val), true);
+  AssertEquals(op(Make<CustomEqT>(magic_val), magic_val + 1), false);
+}
+
+void TestInequality() // cub::Inequality on ints and on a custom type that defines only operator!=(int)
+{
+  cub::Inequality op{}; 
+
+  const int const_magic_val = 42;
+  int magic_val = const_magic_val;
+
+  AssertEquals(op(const_magic_val, const_magic_val), false);
+  AssertEquals(op(const_magic_val, magic_val), false);
+  AssertEquals(op(const_magic_val, magic_val + 1), true);
+
+  AssertEquals(op(Make<CustomIneqT>(magic_val), magic_val), false);
+  AssertEquals(op(Make<CustomIneqT>(magic_val), magic_val + 1), true);
+}
+
+void TestInequalityWrapper() // cub::InequalityWrapper around cub::Equality must return the negated results
+{
+  cub::Equality wrapped_op{}; 
+  cub::InequalityWrapper<cub::Equality> op{wrapped_op};
+
+  const int const_magic_val = 42;
+  int magic_val = const_magic_val;
+
+  AssertEquals(op(const_magic_val, const_magic_val), false);
+  AssertEquals(op(const_magic_val, magic_val), false);
+  AssertEquals(op(const_magic_val, magic_val + 1), true);
+
+  AssertEquals(op(Make<CustomEqT>(magic_val), magic_val), false); // CustomEqT only provides operator==(int)
+  AssertEquals(op(Make<CustomEqT>(magic_val), magic_val + 1), true);
+}
+
+#define CUSTOM_SYNC_T(NAME, RT, OP)                                            \
+  struct Custom ## NAME ## Sink                                                \
+  {                                                                            \
+    template <class T>                                                         \
+    __host__ __device__ RT operator OP (T &&) const                            \
+    {                                                                          \
+      return RT{};                                                             \
+    }                                                                          \
+  }
+
+CUSTOM_SYNC_T(SumInt, int, +); // NOTE(review): macro says SYNC but the generated types are named ...Sink -- likely meant SINK
+CUSTOM_SYNC_T(SumCustomInt, CustomSumIntSink, +); // operator+ returns the sink type declared on the previous line
+
+CUSTOM_SYNC_T(DiffInt, int, -);
+CUSTOM_SYNC_T(DiffCustomInt, CustomDiffIntSink, -);
+
+CUSTOM_SYNC_T(DivInt, int, /);
+CUSTOM_SYNC_T(DivCustomInt, CustomDivIntSink, /);
+
+template <class ExpectedT, class ActualT>
+void StaticSame() // compile-time check only: the body is a single static_assert
+{
+  static_assert(std::is_same<ExpectedT, ActualT>::value, "shall match");
+}
+
+void TestSum() // cub::Sum: result values, then result types (via decltype) for mixed operand types
+{
+  cub::Sum op{};
+
+  const int const_magic_val = 40;
+  int magic_val = const_magic_val;
+
+  AssertEquals(op(const_magic_val, 2), 42);
+  AssertEquals(op(magic_val, 2), 42);
+  AssertEquals(op(Make<CustomSumT>(magic_val), 2), 42);
+
+  StaticSame<decltype(op(42, 42)), int>();
+  StaticSame<decltype(op(1, 1.0)), double>();
+  StaticSame<decltype(op(CustomSumIntSink{}, 1.0)), int>(); // result type must follow the sink's operator+ return type
+  StaticSame<decltype(op(CustomSumCustomIntSink{}, 1.0)), CustomSumIntSink>();
+}
+
+void TestDifference() // cub::Difference: result values, then result types (via decltype) for mixed operand types
+{
+  cub::Difference op{};
+
+  const int const_magic_val = 44;
+  int magic_val = const_magic_val;
+
+  AssertEquals(op(const_magic_val, 2), 42);
+  AssertEquals(op(magic_val, 2), 42);
+
+  AssertEquals(op(Make<CustomDiffT>(magic_val), 2), 42);
+
+  StaticSame<decltype(op(42, 42)), int>();
+  StaticSame<decltype(op(1, 1.0)), double>();
+  StaticSame<decltype(op(CustomDiffIntSink{}, 1.0)), int>(); // result type must follow the sink's operator- return type
+  StaticSame<decltype(op(CustomDiffCustomIntSink{}, 1.0)), CustomDiffIntSink>();
+}
+
+void TestDivision() // cub::Division: result values, then result types (via decltype) for mixed operand types
+{
+  cub::Division op{};
+
+  const int const_magic_val = 44;
+  int magic_val = const_magic_val;
+
+  AssertEquals(op(const_magic_val, 2), 22);
+  AssertEquals(op(magic_val, 2), 22);
+
+  AssertEquals(op(Make<CustomDivT>(magic_val), 2), 22);
+
+  StaticSame<decltype(op(42, 42)), int>();
+  StaticSame<decltype(op(1, 1.0)), double>();
+  StaticSame<decltype(op(CustomDivIntSink{}, 1.0)), int>(); // result type must follow the sink's operator/ return type
+  StaticSame<decltype(op(CustomDivCustomIntSink{}, 1.0)), CustomDivIntSink>();
+}
+
+void TestMax() // cub::Max: result values, then result types (via decltype) for mixed operand types
+{
+  cub::Max op{};
+
+  const int const_magic_val = 42;
+  int magic_val = const_magic_val;
+
+  AssertEquals(op(const_magic_val, 2), 42);
+  AssertEquals(op(magic_val, 2), 42);
+
+  AssertEquals(op(2, Make<CustomGtT>(magic_val)), 42);
+
+  StaticSame<decltype(op(42, 42)), int>();
+  StaticSame<decltype(op(1, 1.0)), double>();
+  StaticSame<decltype(op(1, Make<CustomGtT>(magic_val))), int>(); // mixed int / CustomGtT arguments must yield int
+}
+
+void TestMin() // cub::Min: result values, then result types (via decltype) for mixed operand types
+{
+  cub::Min op{};
+
+  const int const_magic_val = 42;
+  int magic_val = const_magic_val;
+
+  AssertEquals(op(const_magic_val, 2), 2);
+  AssertEquals(op(magic_val, 2), 2);
+
+  AssertEquals(op(2, Make<CustomLtT>(magic_val)), 2);
+
+  StaticSame<decltype(op(42, 42)), int>();
+  StaticSame<decltype(op(1, 1.0)), double>();
+  StaticSame<decltype(op(1, Make<CustomLtT>(magic_val))), int>(); // mixed int / CustomLtT arguments must yield int
+}
+
+int main() // runs every thread-operator test in sequence
+{
+  TestEquality();
+  TestInequality();
+  TestInequalityWrapper();
+  TestSum();
+  TestDifference();
+  TestDivision();
+  TestMax();
+  TestMin();
+
+  return 0;
+}
diff --git a/include/cub/test/test_thread_sort.cu b/include/cub/test/test_thread_sort.cu
new file mode 100644
index 0000000..168e8cf
--- /dev/null
+++ b/include/cub/test/test_thread_sort.cu
@@ -0,0 +1,150 @@
+/*******************************************************************************
+ * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#include "test_util.h"
+#include "cub/thread/thread_sort.cuh"
+
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include <thrust/sequence.h>
+#include <thrust/shuffle.h>
+#include <thrust/sort.h>
+#include <thrust/random.h>
+
+
+/// Comparator shared by the device-side sort and the host-side reference sort.
+struct CustomLess
+{
+  /// Returns true when `lhs` orders strictly before `rhs` according to the operands' own operator<.
+  template <typename DataType>
+  __host__ __device__ bool operator()(DataType &lhs, DataType &rhs)
+  { return lhs < rhs; }
+};
+
+// Per-thread sort: every thread copies ItemsPerThread key/value pairs into local arrays, sorts them with cub::StableOddEvenSort and writes them back.
+template <typename KeyT,
+          typename ValueT,
+          int ItemsPerThread>
+__global__ void kernel(const KeyT *keys_in,
+                       KeyT *keys_out,
+                       const ValueT *values_in,
+                       ValueT *values_out)
+{
+  KeyT thread_keys[ItemsPerThread];
+  ValueT thread_values[ItemsPerThread];  // must be ValueT (was KeyT): a wider ValueT would otherwise be narrowed
+  // Thread t works on elements [t * ItemsPerThread, (t + 1) * ItemsPerThread); only threadIdx.x is consulted.
+  const auto thread_offset = ItemsPerThread * threadIdx.x;
+  keys_in += thread_offset;
+  keys_out += thread_offset;
+  values_in += thread_offset;
+  values_out += thread_offset;
+
+  for (int item = 0; item < ItemsPerThread; item++)
+  {
+    thread_keys[item] = keys_in[item];
+    thread_values[item] = values_in[item];
+  }
+
+  cub::StableOddEvenSort(thread_keys, thread_values, CustomLess{});
+
+  for (int item = 0; item < ItemsPerThread; item++)
+  {
+    keys_out[item] = thread_keys[item];
+    values_out[item] = thread_values[item];
+  }
+}
+
+// Runs 10 shuffled rounds: sort each thread's ItemsPerThread-element slice on the device and compare against a host reference sort.
+template <typename KeyT,
+          typename ValueT,
+          int ItemsPerThread>
+void Test()
+{
+  const unsigned int threads_in_block = 1024;
+  const unsigned int elements = threads_in_block * ItemsPerThread;
+
+  thrust::default_random_engine re;
+  thrust::device_vector<std::uint8_t> data_source(elements);
+  // The source holds std::uint8_t, so the generated sequence wraps and the keys contain duplicates.
+  for (int iteration = 0; iteration < 10; iteration++)
+  {
+    thrust::sequence(data_source.begin(), data_source.end());
+    thrust::shuffle(data_source.begin(), data_source.end(), re);
+    thrust::device_vector<KeyT> in_keys(data_source);
+    thrust::device_vector<KeyT> out_keys(elements);
+
+    thrust::shuffle(data_source.begin(), data_source.end(), re);
+    thrust::device_vector<ValueT> in_values(data_source);
+    thrust::device_vector<ValueT> out_values(elements);
+
+    thrust::host_vector<KeyT> host_keys(in_keys);
+    thrust::host_vector<ValueT> host_values(in_values);
+    // One block of threads_in_block threads; each thread sorts its own slice.
+    kernel<KeyT, ValueT, ItemsPerThread><<<1, threads_in_block>>>(
+      thrust::raw_pointer_cast(in_keys.data()),
+      thrust::raw_pointer_cast(out_keys.data()),
+      thrust::raw_pointer_cast(in_values.data()),
+      thrust::raw_pointer_cast(out_values.data()));
+    // Host reference: the device sort is stable, so the reference must be stable too or equal keys may pair with different values.
+    for (unsigned int tid = 0; tid < threads_in_block; tid++)
+    {
+      const auto thread_begin = tid * ItemsPerThread;
+      const auto thread_end = thread_begin + ItemsPerThread;
+
+      thrust::stable_sort_by_key(host_keys.begin() + thread_begin,
+                                 host_keys.begin() + thread_end,
+                                 host_values.begin() + thread_begin,
+                                 CustomLess{});
+    }
+
+    AssertEquals(host_keys, out_keys);
+    AssertEquals(host_values, out_values);
+  }
+}
+
+// Runs the per-thread sort test for each of the listed ItemsPerThread values (2, 3, 4, 5, 7, 8, 9, 11).
+template <typename KeyT,
+          typename ValueT>
+void Test()
+{
+  Test<KeyT, ValueT, 2>();
+  Test<KeyT, ValueT, 3>();
+  Test<KeyT, ValueT, 4>();
+  Test<KeyT, ValueT, 5>();
+  Test<KeyT, ValueT, 7>();
+  Test<KeyT, ValueT, 8>();
+  Test<KeyT, ValueT, 9>();
+  Test<KeyT, ValueT, 11>();
+}
+
+int main()
+{
+  Test<std::uint32_t, std::uint32_t>();  // values the same width as keys
+  Test<std::uint32_t, std::uint64_t>();  // values wider than keys
+  // A failing AssertEquals calls exit(1), so control only reaches this point when none fired.
+  return 0;
+}
diff --git a/include/cub/test/test_util.h b/include/cub/test/test_util.h
new file mode 100644
index 0000000..c4093f4
--- /dev/null
+++ b/include/cub/test/test_util.h
@@ -0,0 +1,1655 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+
+#pragma once
+
+#if defined(_WIN32) || defined(_WIN64)
+    #include <windows.h>
+    #undef small            // Windows is terrible for polluting macro namespace
+#else
+    #include <sys/resource.h>
+#endif
+
+#include <cstdio>
+#include <cfloat>
+#include <cmath>
+#include <cstddef>
+
+#include <string>
+#include <vector>
+#include <sstream>
+#include <iostream>
+#include <limits>
+
+#include "mersenne.h"
+#include "half.h"
+#include "bfloat16.h"
+
+#include <cub/util_debug.cuh>
+#include <cub/util_device.cuh>
+#include <cub/util_type.cuh>
+#include <cub/util_macro.cuh>
+#include <cub/util_math.cuh>
+#include <cub/util_namespace.cuh>
+#include <cub/util_ptx.cuh>
+#include <cub/iterator/discard_output_iterator.cuh>
+
+#include "test_util_vec.h"
+
+#include <nv/target>
+
+/******************************************************************************
+ * Type conversion utilities
+ ******************************************************************************/
+
+/**
+ * Reinterpret the object representation of `in` as a `T` by copying its bytes.
+ * Compilation fails unless `T` and `U` have identical sizes.
+ */
+template <typename T, typename U>
+T SafeBitCast(const U& in)
+{
+  static_assert(sizeof(T) == sizeof(U), "Types must be same size.");
+  T reinterpreted;
+  memcpy(&reinterpreted, &in, sizeof(reinterpreted));
+  return reinterpreted;
+}
+
+/******************************************************************************
+ * Assertion macros
+ ******************************************************************************/
+
+/**
+ * AssertEquals(a, b) / AssertTrue(a): on failure print file, line and the stringified expression(s) to std::cerr, then exit(1). Both expand to a bare `if` block (no do/while(0) wrapper), so do not use them as the unbraced body of an if/else.
+ */
+#define AssertEquals(a, b)                                                     \
+  if ((a) != (b))                                                              \
+  {                                                                            \
+    std::cerr << "\n"                                                          \
+              << __FILE__ << ": " << __LINE__                                  \
+              << ": AssertEquals(" #a ", " #b ") failed.\n";                   \
+    exit(1);                                                                   \
+  }
+
+#define AssertTrue(a)                                                          \
+  if (!(a))                                                                    \
+  {                                                                            \
+    std::cerr << "\n"                                                          \
+              << __FILE__ << ": " << __LINE__                                  \
+              << ": AssertTrue(" #a ") failed.\n";                             \
+    exit(1);                                                                   \
+  }
+
+/******************************************************************************
+ * Command-line parsing functionality
+ ******************************************************************************/
+
+/**
+ * Utility for parsing command line arguments ("--key", "--key=value", naked arguments) and initializing a CUDA device
+ */
+struct CommandLineArgs
+{
+    // keys[i] / values[i] form one parsed "--key[=value]" pair; args holds every other argument, in order.
+    std::vector<std::string>    keys;
+    std::vector<std::string>    values;
+    std::vector<std::string>    args;
+    cudaDeviceProp              deviceProp;
+    float                       device_giga_bandwidth;
+    std::size_t                 device_free_physmem;
+    std::size_t                 device_total_physmem;
+
+    /**
+     * Constructor: seeds the mersenne generator, then splits argv[1..argc) into flag pairs and naked arguments
+     */
+    CommandLineArgs(int argc, char **argv) :
+        keys(10),       // NOTE: keys/values start with 10 empty placeholder entries (ParsedArgc counts them too)
+        values(10), deviceProp(), device_giga_bandwidth(0.0f), device_free_physmem(0), device_total_physmem(0)
+    {
+        using namespace std;
+
+        // Initialize mersenne generator
+        unsigned int mersenne_init[4]=  {0x123, 0x234, 0x345, 0x456};
+        mersenne::init_by_array(mersenne_init, 4);
+
+        for (int i = 1; i < argc; i++)
+        {
+            string arg = argv[i];
+
+            if ((arg[0] != '-') || (arg[1] != '-'))
+            {
+                args.push_back(arg);
+                continue;
+            }
+
+            string::size_type pos;
+            string key, val;
+            if ((pos = arg.find('=')) == string::npos) {
+                key = arg.substr(2);
+                val = "";
+            } else {
+                key = arg.substr(2, pos - 2);
+                val = arg.substr(pos + 1);
+            }
+
+            keys.push_back(key);
+            values.push_back(val);
+        }
+    }
+
+
+    /**
+     * Checks whether a flag "--<flag>" is present in the commandline
+     */
+    bool CheckCmdLineFlag(const char* arg_name)
+    {
+        using namespace std;
+
+        for (std::size_t i = 0; i < keys.size(); ++i)
+        {
+            if (keys[i] == string(arg_name))
+                return true;
+        }
+        return false;
+    }
+
+
+    /**
+     * Returns number of naked (non-flag and non-key-value) commandline parameters; T is unused and now defaults to int
+     */
+    template <typename T = int>
+    int NumNakedArgs()
+    {
+        return static_cast<int>(args.size());
+    }
+
+
+    /**
+     * Returns the commandline parameter for a given index (not including flags); val is untouched if index is out of range
+     */
+    template <typename T>
+    void GetCmdLineArgument(std::size_t index, T &val)
+    {
+        using namespace std;
+        if (index < args.size()) {
+            istringstream str_stream(args[index]);
+            str_stream >> val;
+        }
+    }
+
+    /**
+     * Returns the value specified for a given commandline parameter --<flag>=<value> (the last occurrence wins)
+     */
+    template <typename T>
+    void GetCmdLineArgument(const char *arg_name, T &val)
+    {
+        using namespace std;
+
+        for (std::size_t i = 0; i < keys.size(); ++i)
+        {
+            if (keys[i] == string(arg_name))
+            {
+                istringstream str_stream(values[i]);
+                str_stream >> val;
+            }
+        }
+    }
+
+
+    /**
+     * Returns the values specified for a given commandline parameter --<flag>=<value>,<value>* (vals is untouched if the flag is absent)
+     */
+    template <typename T>
+    void GetCmdLineArguments(const char *arg_name, std::vector<T> &vals)
+    {
+        using namespace std;
+
+        if (CheckCmdLineFlag(arg_name))
+        {
+            // Clear any default values
+            vals.clear();
+
+            // Recover from multi-value string
+            for (std::size_t i = 0; i < keys.size(); ++i)
+            {
+                if (keys[i] == string(arg_name))
+                {
+                    string val_string(values[i]);
+                    istringstream str_stream(val_string);
+                    string::size_type old_pos = 0;
+                    string::size_type new_pos = 0;
+
+                    // Iterate comma-separated values (value-initialized so a failed extraction never pushes garbage)
+                    T val = T();
+                    while ((new_pos = val_string.find(',', old_pos)) != string::npos)
+                    {
+                        if (new_pos != old_pos)
+                        {
+                            str_stream.width(new_pos - old_pos);
+                            str_stream >> val;
+                            vals.push_back(val);
+                        }
+
+                        // skip over comma
+                        str_stream.ignore(1);
+                        old_pos = new_pos + 1;
+                    }
+
+                    // Read last value
+                    str_stream >> val;
+                    vals.push_back(val);
+                }
+            }
+        }
+    }
+
+
+    /**
+     * The number of key slots: the 10 placeholders created by the constructor plus one per parsed "--" argument
+     */
+    int ParsedArgc()
+    {
+        return (int) keys.size();
+    }
+
+    /**
+     * Initialize device: uses `dev`, else the "--device" argument, else device 0; fills the device_* members and deviceProp
+     */
+    cudaError_t DeviceInit(int dev = -1)
+    {
+        cudaError_t error = cudaSuccess;
+
+        do
+        {
+            int deviceCount;
+            error = CubDebug(cudaGetDeviceCount(&deviceCount));
+            if (error) break;
+
+            if (deviceCount == 0) {
+                fprintf(stderr, "No devices supporting CUDA.\n");
+                exit(1);
+            }
+            if (dev < 0)
+            {
+                GetCmdLineArgument("device", dev);
+            }
+            if ((dev > deviceCount - 1) || (dev < 0))
+            {
+                dev = 0;
+            }
+
+            error = CubDebug(cudaSetDevice(dev));
+            if (error) break;
+
+            CubDebugExit(cudaMemGetInfo(&device_free_physmem, &device_total_physmem));
+
+            int ptx_version = 0;
+            error = CubDebug(CUB_NS_QUALIFIER::PtxVersion(ptx_version));
+            if (error) break;
+
+            error = CubDebug(cudaGetDeviceProperties(&deviceProp, dev));
+            if (error) break;
+
+            if (deviceProp.major < 1) {
+                fprintf(stderr, "Device does not support CUDA.\n");
+                exit(1);
+            }
+
+            device_giga_bandwidth = float(deviceProp.memoryBusWidth) * deviceProp.memoryClockRate * 2 / 8 / 1000 / 1000;
+
+            if (!CheckCmdLineFlag("quiet"))
+            {
+                printf(
+                        "Using device %d: %s (PTX version %d, SM%d, %d SMs, "
+                        "%llu free / %llu total MB physmem, "
+                        "%.3f GB/s @ %d kHz mem clock, ECC %s)\n",
+                    dev,
+                    deviceProp.name,
+                    ptx_version,
+                    deviceProp.major * 100 + deviceProp.minor * 10,
+                    deviceProp.multiProcessorCount,
+                    (unsigned long long) device_free_physmem / 1024 / 1024,
+                    (unsigned long long) device_total_physmem / 1024 / 1024,
+                    device_giga_bandwidth,
+                    deviceProp.memoryClockRate,
+                    (deviceProp.ECCEnabled) ? "on" : "off");
+                fflush(stdout);
+            }
+
+        } while (0);
+
+        return error;
+    }
+};
+
+// Returns the total global memory of the current device as reported by cudaMemGetInfo; `inline` because this header-defined function may be included by several translation units.
+inline std::size_t TotalGlobalMem()
+{
+    int device = 0;
+    CubDebugExit(cudaGetDevice(&device));  // the queried id itself is not used afterwards
+    std::size_t free_mem = 0, total_mem = 0;
+    CubDebugExit(cudaMemGetInfo(&free_mem, &total_mem));
+    return total_mem;
+}
+
+/******************************************************************************
+ * Random bits generator
+ ******************************************************************************/
+
+int g_num_rand_samples = 0;  // incremented by RandomBits once per mersenne::genrand_int32() draw
+
+// Generic fallback: a type without a specialization is never reported as NaN.
+template <typename T>
+bool IsNaN(T /* val */) { return false; }
+
+template<>
+__noinline__ bool IsNaN<float>(float val)
+{
+  return std::isnan(val);  // scalar float: defer to the standard library
+}
+
+template<>
+__noinline__ bool IsNaN<float1>(float1 val)
+{
+    return IsNaN(val.x);    // single component: NaN exactly when .x is
+}
+
+template<>
+__noinline__ bool IsNaN<float2>(float2 val)
+{
+    return IsNaN(val.x) || IsNaN(val.y);    // NaN if any component is
+}
+
+template<>
+__noinline__ bool IsNaN<float3>(float3 val)
+{
+    return IsNaN(val.x) || IsNaN(val.y) || IsNaN(val.z);    // NaN if any component is
+}
+
+template<>
+__noinline__ bool IsNaN<float4>(float4 val)
+{
+    return IsNaN(val.x) || IsNaN(val.y) || IsNaN(val.z) || IsNaN(val.w);    // NaN if any component is
+}
+
+template<>
+__noinline__ bool IsNaN<double>(double val)
+{
+  return std::isnan(val);  // scalar double: defer to the standard library
+}
+
+template<>
+__noinline__ bool IsNaN<double1>(double1 val)
+{
+    return IsNaN(val.x);    // single component: NaN exactly when .x is
+}
+
+template<>
+__noinline__ bool IsNaN<double2>(double2 val)
+{
+    return IsNaN(val.x) || IsNaN(val.y);    // NaN if any component is
+}
+
+template<>
+__noinline__ bool IsNaN<double3>(double3 val)
+{
+    return IsNaN(val.x) || IsNaN(val.y) || IsNaN(val.z);    // NaN if any component is
+}
+
+template<>
+__noinline__ bool IsNaN<double4>(double4 val)
+{
+    return IsNaN(val.x) || IsNaN(val.y) || IsNaN(val.z) || IsNaN(val.w);    // NaN if any component is
+}
+
+
+template<>
+__noinline__ bool IsNaN<half_t>(half_t val)
+{
+    // NaN patterns are 0x7C01..0x7FFF with either sign bit, i.e. all exponent
+    // bits (0x7C00) set plus a non-zero mantissa. Masking the sign bit off
+    // folds the positive and negative ranges into a single comparison.
+    const unsigned short raw = SafeBitCast<unsigned short>(val);
+    return (raw & 0x7FFF) > 0x7C00;
+}
+
+template<>
+__noinline__ bool IsNaN<bfloat16_t>(bfloat16_t val)
+{
+    // NaN patterns are 0x7F81..0x7FFF with either sign bit, i.e. all exponent
+    // bits (0x7F80) set plus a non-zero mantissa. Masking the sign bit off
+    // folds the positive and negative ranges into a single comparison.
+    const unsigned short raw = SafeBitCast<unsigned short>(val);
+    return (raw & 0x7FFF) > 0x7F80;
+}
+
+/**
+ * Generates random keys.
+ *
+ * Random 32-bit words come from mersenne::genrand_int32(). Bits outside
+ * [begin_bit, end_bit) are cleared, and any bit pattern that IsNaN() reports
+ * as NaN is rejected and redrawn. A negative end_bit means "all bits of K".
+ *
+ * We can decrease the entropy level of keys by adopting the technique
+ * of Thearling and Smith in which keys are computed from the bitwise AND of
+ * multiple random samples:
+ *
+ * entropy_reduction    | Effectively-unique bits per key
+ * -----------------------------------------------------
+ * -1                   | 0
+ * 0                    | 32
+ * 1                    | 25.95 (81%)
+ * 2                    | 17.41 (54%)
+ * 3                    | 10.78 (34%)
+ * 4                    | 6.42 (20%)
+ * ...                  | ...
+ *
+ */
+template <typename K>
+void RandomBits(
+    K &key,
+    int entropy_reduction = 0,
+    int begin_bit = 0,
+    int end_bit = sizeof(K) * 8)
+{
+    const int NUM_BYTES = sizeof(K);
+    const int WORD_BYTES = sizeof(unsigned int);
+    const int NUM_WORDS = (NUM_BYTES + WORD_BYTES - 1) / WORD_BYTES;
+
+    unsigned int word_buff[NUM_WORDS];
+
+    if (entropy_reduction == -1)
+    {
+        memset((void *) &key, 0, sizeof(key));
+        return;
+    }
+
+    if (end_bit < 0)
+        end_bit = sizeof(K) * 8;
+
+    while (true)
+    {
+        // Generate random word_buff; a word lying wholly outside [begin_bit, end_bit) gets an all-zero mask
+        for (int j = 0; j < NUM_WORDS; j++)
+        {
+            const int word_bits   = WORD_BYTES * 8;
+            const int current_bit = j * word_bits;
+            const int lo_shift    = CUB_MAX(0, begin_bit - current_bit);
+            const int hi_shift    = CUB_MAX(0, (current_bit + word_bits) - end_bit);
+            const bool outside    = (lo_shift >= word_bits) || (hi_shift >= word_bits);  // shifting by >= the word width is undefined
+            unsigned int word     = outside ? 0u : ((0xffffffffu << lo_shift) & (0xffffffffu >> hi_shift));
+            for (int i = 0; i <= entropy_reduction; i++)
+            {
+                // AND in one more sample per entropy-reduction level (always drawn, even for masked-out words)
+                word &= mersenne::genrand_int32();
+                g_num_rand_samples++;
+            }
+
+            word_buff[j] = word;
+        }
+
+        memcpy(&key, word_buff, sizeof(K));
+
+        K copy = key;
+        if (!IsNaN(copy))
+            break;          // avoids NaNs when generating random floating point numbers
+    }
+}
+
+/// Draws a value in [0:max) by scaling an unsigned int sample; an all-ones sample is redrawn so the fraction stays below 1
+template <typename T>
+T RandomValue(T max)
+{
+    const unsigned int all_ones = (unsigned int) -1;
+    unsigned int sample = all_ones;
+    while (sample == all_ones) {
+        RandomBits(sample);
+    }
+    const double fraction = double(sample) / double(all_ones);
+    return (T) (fraction * double(max));
+}
+
+
+/******************************************************************************
+ * Test value initialization utilities
+ ******************************************************************************/
+
+/**
+ * Test problem generation options (interpreted by the InitValue overloads)
+ */
+enum GenMode
+{
+    UNIFORM,                 // Assign to '2' (true for bool), regardless of integer seed
+    INTEGER_SEED,            // Assign to integer seed
+    RANDOM,                  // Assign to random, regardless of integer seed
+    RANDOM_BIT,              // Assign to randomly chosen 1 or -1 (true or false for bool), regardless of integer seed
+    RANDOM_MINUS_PLUS_ZERO,  // Assign to random, with some values being -0.0 or +0.0 patterns
+};
+
+/**
+ * Initialize `value` according to `gen_mode`; `index` is only used by INTEGER_SEED (and by unrecognized modes)
+ */
+#pragma nv_exec_check_disable
+template <typename T>
+__host__ __device__ __forceinline__
+void InitValue(GenMode gen_mode, T &value, std::size_t index = 0)
+{
+  // RandomBits is host-only, so the random modes exist only in the host branch; the device branch logs and traps.
+  NV_IF_TARGET(
+    NV_IS_HOST,
+    (
+      switch (gen_mode) {
+      case RANDOM:
+        RandomBits(value);
+        break;
+      case RANDOM_BIT: {
+        char c;
+        RandomBits(c, 0, 0, 1);  // bit range [0, 1): only the lowest bit of c can be set
+        value = static_cast<T>((c > 0) ? 1 : -1);
+        break;
+      }
+      case RANDOM_MINUS_PLUS_ZERO: {
+        // Replace roughly 1/128 of values with -0.0 or +0.0, and
+        // generate the rest randomly
+        using UnsignedBits = typename CUB_NS_QUALIFIER::Traits<T>::UnsignedBits;
+        char c;
+        RandomBits(c);
+        if (c == 0)
+        {
+          // Replace 1/256 of values with +0.0 bit pattern
+          value = SafeBitCast<T>(UnsignedBits(0));
+        }
+        else if (c == 1)
+        {
+          // Replace 1/256 of values with -0.0 bit pattern
+          value = SafeBitCast<T>(
+            UnsignedBits(UnsignedBits(1) << (sizeof(UnsignedBits) * 8) - 1));  // '-' binds tighter than '<<': only the top bit is set
+        }
+        else
+        {
+          // 127/128 of values are random
+          RandomBits(value);
+        }
+        break;
+      }
+      case UNIFORM:
+        value = 2;
+        break;
+      case INTEGER_SEED:
+      default:
+        value = static_cast<T>(index);
+        break;
+      }),
+    ( // NV_IS_DEVICE:
+      switch (gen_mode) {
+      case RANDOM:
+      case RANDOM_BIT:
+      case RANDOM_MINUS_PLUS_ZERO:
+        _CubLog("%s\n",
+                "cub::InitValue cannot generate random numbers on device.");
+        CUB_NS_QUALIFIER::ThreadTrap();
+        break;
+      case UNIFORM:
+        value = 2;
+        break;
+      case INTEGER_SEED:
+      default:
+        value = static_cast<T>(index);
+        break;
+      }
+    ));
+}
+
+/**
+ * Initialize value (bool): UNIFORM gives true, INTEGER_SEED gives (index > 0), RANDOM / RANDOM_BIT give one random bit
+ */
+#pragma nv_exec_check_disable
+__host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, bool &value, std::size_t index = 0)
+{
+  // RandomBits is host-only, so the random modes exist only in the host branch; the device branch logs and traps.
+  NV_IF_TARGET(
+    NV_IS_HOST,
+    (
+      switch (gen_mode)
+      {
+      case RANDOM:
+      case RANDOM_BIT:
+          char c;
+          RandomBits(c, 0, 0, 1);  // bit range [0, 1): only the lowest bit of c can be set
+          value = (c > 0);
+          break;
+       case UNIFORM:
+          value = true;
+          break;
+      case INTEGER_SEED:
+      default:  // NOTE(review): on host RANDOM_MINUS_PLUS_ZERO also lands here, while the device branch traps on it; confirm intended
+          value = (index > 0);
+          break;
+      }
+    ),
+  ( // NV_IS_DEVICE,
+    switch (gen_mode)
+    {
+      case RANDOM:
+      case RANDOM_BIT:
+      case RANDOM_MINUS_PLUS_ZERO:
+        _CubLog("%s\n",
+                "cub::InitValue cannot generate random numbers on device.");
+        CUB_NS_QUALIFIER::ThreadTrap();
+        break;
+      case UNIFORM:
+        value = true;
+        break;
+      case INTEGER_SEED:
+      default:
+        value = (index > 0);
+        break;
+    }
+  ));
+}
+
+
+/**
+ * cub::NullType test initialization: there is nothing to initialize, so every argument is ignored
+ */
+__host__ __device__ __forceinline__
+void InitValue(GenMode, CUB_NS_QUALIFIER::NullType &, std::size_t = 0)
+{
+}
+
+
+/**
+ * cub::KeyValuePair<KeyT, ValueT> test initialization: value.value follows gen_mode, value.key is always a random 0/1 flag
+ */
+#pragma nv_exec_check_disable
+template <typename KeyT, typename ValueT>
+__host__ __device__ __forceinline__ void InitValue(
+    GenMode                             gen_mode,
+    CUB_NS_QUALIFIER::KeyValuePair<KeyT, ValueT>&    value,
+    std::size_t                                      index = 0)
+{
+    InitValue(gen_mode, value.value, index);
+
+    // This overload only appears to be used by test_warp_scan.
+    // The key is randomized whatever gen_mode is, so the call to the
+    // host-only RandomBits is guarded; on device this function always traps.
+    // clang-format off
+    NV_IF_TARGET(NV_IS_HOST, (
+        // Assign corresponding flag with a likelihood of the last bit
+        // being set with entropy-reduction level 3
+        RandomBits(value.key, 3);
+        value.key = (value.key & 0x1);
+      ), ( // NV_IS_DEVICE
+        _CubLog("%s\n",
+                "cub::InitValue cannot generate random numbers on device.");
+        CUB_NS_QUALIFIER::ThreadTrap();
+      ));
+    // clang-format on
+}
+
+
+
+/******************************************************************************
+ * Comparison and ostream operators
+ ******************************************************************************/
+
+/** KeyValuePair ostream operator: writes "(key,value)",
+ *  passing each field through CoutCast before streaming it. */
+template <typename Key, typename Value>
+std::ostream& operator<<(std::ostream& os, const CUB_NS_QUALIFIER::KeyValuePair<Key, Value> &val)
+{
+    os << '(' << CoutCast(val.key) << ',';
+    os << CoutCast(val.value) << ')';
+    return os;
+}
+
+#if CUB_IS_INT128_ENABLED
+static std::ostream& operator<<(std::ostream& os, __uint128_t val)
+{
+  // 2^128 - 1 has 39 decimal digits, so 40 slots always suffice.
+  constexpr int max_digits = 40;
+  char buffer[max_digits] = {};
+  int first = max_digits;
+
+  // Fill the buffer back to front, least-significant digit first.
+  do
+  {
+    buffer[--first] = static_cast<char>('0' + static_cast<int>(val % 10));
+    val /= 10;
+  }
+  while (val != 0);
+
+  for (int pos = first; pos < max_digits; ++pos) {
+    os << buffer[pos];
+  }
+
+  return os;
+}
+
+static std::ostream& operator<<(std::ostream& os, __int128_t val)
+{
+  if (val < 0) {
+    // Negate in the unsigned domain: a signed `-val` overflows (undefined behavior) for the minimum value.
+    __uint128_t tmp = -static_cast<__uint128_t>(val);
+    os << '-' << tmp;
+  } else {
+    __uint128_t tmp = val;
+    os << tmp;
+  }
+  return os;
+}
+#endif
+
+
+/******************************************************************************
+ * Comparison and ostream operators for CUDA vector types
+ ******************************************************************************/
+
+/**
+ * Vector1 overloads: for a one-component vector type T over BaseT, defines an InitValue overload and a NumericTraits<T> specialization whose Max()/Lowest() are built from NumericTraits<BaseT>
+ */
+#define CUB_VEC_OVERLOAD_1_OLD(T, BaseT)                    \
+    /* Test initialization: forwards to InitValue on .x */  \
+    __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, T &value, std::size_t index = 0) \
+    {                                                       \
+        InitValue(gen_mode, value.x, index);                \
+    }                                                       \
+    CUB_NAMESPACE_BEGIN                                     \
+    template<>                                              \
+    struct NumericTraits<T>                                 \
+    {                                                       \
+        static const Category CATEGORY = NOT_A_NUMBER;      \
+        enum {                                              \
+            PRIMITIVE       = false,                        \
+            NULL_TYPE       = false,                        \
+        };                                                  \
+        static __host__ __device__ T Max()                  \
+        {                                                   \
+            T retval = {                                    \
+                NumericTraits<BaseT>::Max()};               \
+            return retval;                                  \
+        }                                                   \
+        static __host__ __device__ T Lowest()               \
+        {                                                   \
+            T retval = {                                    \
+                NumericTraits<BaseT>::Lowest()};            \
+            return retval;                                  \
+        }                                                   \
+    };                                                      \
+    CUB_NAMESPACE_END
+
+
+
+/**
+ * Vector2 overloads: for a two-component vector type T over BaseT, defines an InitValue overload and a NumericTraits<T> specialization whose Max()/Lowest() are built from NumericTraits<BaseT>
+ */
+#define CUB_VEC_OVERLOAD_2_OLD(T, BaseT)                    \
+    /* Test initialization: same gen_mode/index for .x, .y */ \
+    __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, T &value, std::size_t index = 0) \
+    {                                                       \
+        InitValue(gen_mode, value.x, index);                \
+        InitValue(gen_mode, value.y, index);                \
+    }                                                       \
+    CUB_NAMESPACE_BEGIN                                         \
+    template<>                                              \
+    struct NumericTraits<T>                                 \
+    {                                                       \
+        static const Category CATEGORY = NOT_A_NUMBER;      \
+        enum {                                              \
+            PRIMITIVE       = false,                        \
+            NULL_TYPE       = false,                        \
+        };                                                  \
+        static __host__ __device__ T Max()                  \
+        {                                                   \
+            T retval = {                                    \
+                NumericTraits<BaseT>::Max(),                \
+                NumericTraits<BaseT>::Max()};               \
+            return retval;                                  \
+        }                                                   \
+        static __host__ __device__  T Lowest()              \
+        {                                                   \
+            T retval = {                                    \
+                NumericTraits<BaseT>::Lowest(),             \
+                NumericTraits<BaseT>::Lowest()};            \
+            return retval;                                  \
+        }                                                   \
+    };                                                      \
+    CUB_NAMESPACE_END
+
+
+
+/**
+ * Vector3 overloads. For a three-component type T (members x, y and z, component type BaseT) this
+ * defines an InitValue() overload and a NumericTraits<T> specialization (NOT_A_NUMBER category). */
+#define CUB_VEC_OVERLOAD_3_OLD(T, BaseT)                    \
+    /* Test initialization: x, y and z are initialized with the same gen_mode and index */ \
+    __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, T &value, std::size_t index = 0) \
+    {                                                       \
+        InitValue(gen_mode, value.x, index);                \
+        InitValue(gen_mode, value.y, index);                \
+        InitValue(gen_mode, value.z, index);                \
+    }                                                       \
+    CUB_NAMESPACE_BEGIN                                     \
+    template<>                                              \
+    struct NumericTraits<T>                                 \
+    {                                                       \
+        static const Category CATEGORY = NOT_A_NUMBER;      \
+        enum {                                              \
+            PRIMITIVE       = false,                        \
+            NULL_TYPE       = false,                        \
+        };                                                  \
+        static __host__ __device__ T Max()  /* all three components are NumericTraits<BaseT>::Max() */ \
+        {                                                   \
+            T retval = {                                    \
+                NumericTraits<BaseT>::Max(),                \
+                NumericTraits<BaseT>::Max(),                \
+                NumericTraits<BaseT>::Max()};               \
+            return retval;                                  \
+        }                                                   \
+        static __host__ __device__ T Lowest()  /* all three components are NumericTraits<BaseT>::Lowest() */ \
+        {                                                   \
+            T retval = {                                    \
+                NumericTraits<BaseT>::Lowest(),             \
+                NumericTraits<BaseT>::Lowest(),             \
+                NumericTraits<BaseT>::Lowest()};            \
+            return retval;                                  \
+        }                                                   \
+    };                                                      \
+    CUB_NAMESPACE_END
+
+
+/**
+ * Vector4 overloads. For a four-component type T (members x, y, z and w, component type BaseT) this
+ * defines an InitValue() overload and a NumericTraits<T> specialization (NOT_A_NUMBER category). */
+#define CUB_VEC_OVERLOAD_4_OLD(T, BaseT)                    \
+    /* Test initialization: x, y, z and w are initialized with the same gen_mode and index */ \
+    __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, T &value, std::size_t index = 0) \
+    {                                                       \
+        InitValue(gen_mode, value.x, index);                \
+        InitValue(gen_mode, value.y, index);                \
+        InitValue(gen_mode, value.z, index);                \
+        InitValue(gen_mode, value.w, index);                \
+    }                                                       \
+    CUB_NAMESPACE_BEGIN                                     \
+    template<>                                              \
+    struct NumericTraits<T>                                 \
+    {                                                       \
+        static const Category CATEGORY = NOT_A_NUMBER;      \
+        enum {                                              \
+            PRIMITIVE       = false,                        \
+            NULL_TYPE       = false,                        \
+        };                                                  \
+        static __host__ __device__ T Max()  /* all four components are NumericTraits<BaseT>::Max() */ \
+        {                                                   \
+            T retval = {                                    \
+                NumericTraits<BaseT>::Max(),                \
+                NumericTraits<BaseT>::Max(),                \
+                NumericTraits<BaseT>::Max(),                \
+                NumericTraits<BaseT>::Max()};               \
+            return retval;                                  \
+        }                                                   \
+        static __host__ __device__ T Lowest()  /* all four components are NumericTraits<BaseT>::Lowest() */ \
+        {                                                   \
+            T retval = {                                    \
+                NumericTraits<BaseT>::Lowest(),             \
+                NumericTraits<BaseT>::Lowest(),             \
+                NumericTraits<BaseT>::Lowest(),             \
+                NumericTraits<BaseT>::Lowest()};            \
+            return retval;                                  \
+        }                                                   \
+    };                                                      \
+    CUB_NAMESPACE_END
+
+/**
+ * All vector overloads: expands the 1-, 2-, 3- and 4-component macros for the types COMPONENT_T##1 .. COMPONENT_T##4
+ */
+#define CUB_VEC_OVERLOAD_OLD(COMPONENT_T, BaseT)                \
+    CUB_VEC_OVERLOAD_1_OLD(COMPONENT_T##1, BaseT)               \
+    CUB_VEC_OVERLOAD_2_OLD(COMPONENT_T##2, BaseT)               \
+    CUB_VEC_OVERLOAD_3_OLD(COMPONENT_T##3, BaseT)               \
+    CUB_VEC_OVERLOAD_4_OLD(COMPONENT_T##4, BaseT)
+
+/**
+ * Define for types: each line expands the overloads for <name>1 .. <name>4 (first argument) with the given component type (second argument)
+ */
+CUB_VEC_OVERLOAD_OLD(char, signed char)
+CUB_VEC_OVERLOAD_OLD(short, short)
+CUB_VEC_OVERLOAD_OLD(int, int)
+CUB_VEC_OVERLOAD_OLD(long, long)
+CUB_VEC_OVERLOAD_OLD(longlong, long long)
+CUB_VEC_OVERLOAD_OLD(uchar, unsigned char)
+CUB_VEC_OVERLOAD_OLD(ushort, unsigned short)
+CUB_VEC_OVERLOAD_OLD(uint, unsigned int)
+CUB_VEC_OVERLOAD_OLD(ulong, unsigned long)
+CUB_VEC_OVERLOAD_OLD(ulonglong, unsigned long long)
+CUB_VEC_OVERLOAD_OLD(float, float)
+CUB_VEC_OVERLOAD_OLD(double, double)
+
+//---------------------------------------------------------------------
+// Complex data type TestFoo
+//---------------------------------------------------------------------
+
+/**
+ * TestFoo: aggregate test type with four integer members of different widths, ordered lexicographically by x, y, z, w
+ */
+struct TestFoo
+{
+    using x_t = long long;
+    using y_t = int;
+    using z_t = short;
+    using w_t = char;
+
+    x_t x;
+    y_t y;
+    z_t z;
+    w_t w;
+
+    // Builds a TestFoo from its four components
+    static __host__ __device__ __forceinline__ TestFoo MakeTestFoo(long long x, int y, short z, char w)
+    {
+        // Brace-initialize the aggregate in member order
+        return TestFoo{x, y, z, w};
+    }
+
+    // Broadcasts a single int into every member (converted to each member's type)
+    __host__ __device__ __forceinline__ TestFoo& operator =(int b)
+    {
+        w = static_cast<w_t>(b);
+        z = static_cast<z_t>(b);
+        y = static_cast<y_t>(b);
+        x = static_cast<x_t>(b);
+        return *this;
+    }
+
+    // Member-wise sum
+    __host__ __device__ __forceinline__ TestFoo operator+(const TestFoo &b) const
+    {
+        return MakeTestFoo(b.x + x, b.y + y, b.z + z, b.w + w);
+    }
+
+    // Unequal as soon as one member differs
+    __host__ __device__ __forceinline__ bool operator !=(const TestFoo &b) const
+    {
+        return !((x == b.x) && (y == b.y) && (z == b.z) && (w == b.w));
+    }
+
+    // Equal only when every member matches
+    __host__ __device__ __forceinline__ bool operator ==(const TestFoo &b) const
+    {
+        return !((x != b.x) || (y != b.y) || (z != b.z) || (w != b.w));
+    }
+
+    // Lexicographic less-than: the first differing member decides
+    __host__ __device__ __forceinline__ bool operator <(const TestFoo &b) const
+    {
+        if (x != b.x) return x < b.x;
+        if (y != b.y) return y < b.y;
+        if (z != b.z) return z < b.z;
+        return w < b.w;
+    }
+
+    // Lexicographic greater-than: the first differing member decides
+    __host__ __device__ __forceinline__ bool operator >(const TestFoo &b) const
+    {
+        if (x != b.x) return x > b.x;
+        if (y != b.y) return y > b.y;
+        if (z != b.z) return z > b.z;
+        return w > b.w;
+    }
+
+};
+
+/**
+ * Streams a TestFoo as "(x,y,z,w)"; w goes through CoutCast. Marked inline (like the HugeDataType operators in this file) so that including the file from several translation units does not yield duplicate definitions.
+ */
+inline std::ostream& operator<<(std::ostream& os, const TestFoo& val)
+{
+    os << '(' << val.x << ',' << val.y << ',' << val.z << ',' << CoutCast(val.w) << ')';
+    return os;
+}
+
+/**
+ * TestFoo test initialization: forwards gen_mode and index to the InitValue overload of each
+ * member, in the order x, y, z, w (every member receives the same index) */
+__host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, TestFoo &value, std::size_t index = 0)
+{
+    InitValue(gen_mode, value.x, index);
+    InitValue(gen_mode, value.y, index);
+    InitValue(gen_mode, value.z, index);
+    InitValue(gen_mode, value.w, index);
+}
+
+
+/// NumericTraits<TestFoo> specialization: Max()/Lowest() are assembled member by member from the member types' traits
+CUB_NAMESPACE_BEGIN
+template<>
+struct NumericTraits<TestFoo>
+{
+    static const Category CATEGORY = NOT_A_NUMBER;
+    enum {
+        PRIMITIVE       = false,
+        NULL_TYPE       = false,
+    };
+    __host__ __device__ static TestFoo Max()
+    {
+        const TestFoo::x_t max_x = NumericTraits<TestFoo::x_t>::Max();
+        const TestFoo::y_t max_y = NumericTraits<TestFoo::y_t>::Max();
+        const TestFoo::z_t max_z = NumericTraits<TestFoo::z_t>::Max();
+        const TestFoo::w_t max_w = NumericTraits<TestFoo::w_t>::Max();
+        return TestFoo::MakeTestFoo(max_x, max_y, max_z, max_w);
+    }
+
+    __host__ __device__ static TestFoo Lowest()
+    {
+        const TestFoo::x_t low_x = NumericTraits<TestFoo::x_t>::Lowest();
+        const TestFoo::y_t low_y = NumericTraits<TestFoo::y_t>::Lowest();
+        const TestFoo::z_t low_z = NumericTraits<TestFoo::z_t>::Lowest();
+        const TestFoo::w_t low_w = NumericTraits<TestFoo::w_t>::Lowest();
+        return TestFoo::MakeTestFoo(low_x, low_y, low_z, low_w);
+    }
+};
+CUB_NAMESPACE_END
+
+
+//---------------------------------------------------------------------
+// Complex data type TestBar (with optimizations for fence-free warp-synchrony)
+//---------------------------------------------------------------------
+
+/**
+ * TestBar: two-member test type (long long x, int y) constructible from an int and ordered lexicographically by x, then y
+ */
+struct TestBar
+{
+    long long       x;
+    int             y;
+
+    // Default construction: both members are zero
+    __host__ __device__ __forceinline__ TestBar() : x(0), y(0)
+    {}
+
+    // Broadcast construction: both members take the value b
+    __host__ __device__ __forceinline__ TestBar(int b) : x(b), y(b)
+    {}
+
+    // Member-wise construction
+    __host__ __device__ __forceinline__ TestBar(long long x, int y) : x(x), y(y)
+    {}
+
+    // Assigning an int overwrites both members with it
+    __host__ __device__ __forceinline__ TestBar& operator =(int b)
+    {
+        y = b;
+        x = b;
+        return *this;
+    }
+
+    // Member-wise sum
+    __host__ __device__ __forceinline__ TestBar operator+(const TestBar &b) const
+    {
+        return TestBar(b.x + x, b.y + y);
+    }
+
+    // Unequal as soon as one member differs
+    __host__ __device__ __forceinline__ bool operator !=(const TestBar &b) const
+    {
+        return !((x == b.x) && (y == b.y));
+    }
+
+    // Equal only when both members match
+    __host__ __device__ __forceinline__ bool operator ==(const TestBar &b) const
+    {
+        return !((x != b.x) || (y != b.y));
+    }
+
+    // Lexicographic less-than: x decides unless it ties, then y
+    __host__ __device__ __forceinline__ bool operator <(const TestBar &b) const
+    {
+        if (x != b.x) return x < b.x;
+        return y < b.y;
+    }
+
+    // Lexicographic greater-than: x decides unless it ties, then y
+    __host__ __device__ __forceinline__ bool operator >(const TestBar &b) const
+    {
+        if (x != b.x) return x > b.x;
+        return y > b.y;
+    }
+
+};
+
+
+/**
+ * Streams a TestBar as "(x,y)". Marked inline (like the HugeDataType operators in this file) so that
+ * including the file from several translation units does not yield duplicate definitions. */
+inline std::ostream& operator<<(std::ostream& os, const TestBar& val)
+{
+    os << '(' << val.x << ',' << val.y << ')';
+    return os;
+}
+
+/**
+ * TestBar test initialization: forwards gen_mode and index to the InitValue overload of
+ * member x and then of member y (both members receive the same index) */
+__host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, TestBar &value, std::size_t index = 0)
+{
+    InitValue(gen_mode, value.x, index);
+    InitValue(gen_mode, value.y, index);
+}
+
+/// NumericTraits<TestBar> specialization: Max()/Lowest() pair the long long and int extremes
+CUB_NAMESPACE_BEGIN
+template<>
+struct NumericTraits<TestBar>
+{
+    static const Category CATEGORY = NOT_A_NUMBER;
+    enum {
+        PRIMITIVE       = false,
+        NULL_TYPE       = false,
+    };
+    __host__ __device__ static TestBar Max()
+    {
+        const long long max_x = NumericTraits<long long>::Max();
+        const int       max_y = NumericTraits<int>::Max();
+        return TestBar(max_x, max_y);
+    }
+
+    __host__ __device__ static TestBar Lowest()
+    {
+        const long long low_x = NumericTraits<long long>::Lowest();
+        const int       low_y = NumericTraits<int>::Lowest();
+        return TestBar(low_x, low_y);
+    }
+};
+CUB_NAMESPACE_END
+
+
+/******************************************************************************
+ * Helper routines for list comparison and display
+ ******************************************************************************/
+
+
+/**
+ * Compares computed[0..len) with reference[0..len) using operator!=; returns 1 at the first mismatch (printing it when verbose), 0 if there is none
+ */
+template <typename S, typename T, typename OffsetT>
+int CompareResults(T* computed, S* reference, OffsetT len, bool verbose = true)
+{
+    for (OffsetT idx = 0; idx < len; ++idx)
+    {
+        if (!(computed[idx] != reference[idx])) continue;
+        // First mismatch found: optionally report it, then stop
+        if (verbose)
+        {
+            std::cout << "INCORRECT: [" << idx << "]: " << CoutCast(computed[idx]) << " != " << CoutCast(reference[idx]);
+        }
+        return 1;
+    }
+    return 0;
+}
+
+
+/**
+ * Compares two float arrays: elements match when equal, both NaN, or within a relative error of 0.00015 of the reference; returns 0 if all match, else 1
+ */
+template <typename OffsetT>
+int CompareResults(float* computed, float* reference, OffsetT len, bool verbose = true)
+{
+    for (OffsetT i = 0; i < len; i++)
+    {
+        if (computed[i] != reference[i])
+        {
+            float difference = std::abs(computed[i]-reference[i]);
+            float fraction = difference / std::abs(reference[i]);
+            bool both_nan = (computed[i] != computed[i]) && (reference[i] != reference[i]);  // x != x holds only for NaN
+            if (!both_nan && !(fraction <= 0.00015))  // negated <= so that a NaN fraction is reported instead of skipped
+            {
+                if (verbose) std::cout << "INCORRECT: [" << i << "]: "
+                    << "(computed) " << CoutCast(computed[i]) << " != "
+                    << CoutCast(reference[i]) << " (difference:" << difference << ", fraction: " << fraction << ")";
+                return 1;
+            }
+        }
+    }
+    return 0;
+}
+
+
+/**
+ * Compares the equivalence of two arrays (NullType overload: nothing to compare, always returns 0; parameter names are commented out because none is used)
+ */
+template <typename OffsetT>
+int CompareResults(CUB_NS_QUALIFIER::NullType* /* computed */, CUB_NS_QUALIFIER::NullType* /* reference */, OffsetT /* len */, bool /* verbose */ = true)
+{
+    return 0;
+}
+
+/**
+ * Compares two double arrays: elements match when equal, both NaN, or within a relative error of 0.00015 of the reference; returns 0 if all match, else 1
+ */
+template <typename OffsetT>
+int CompareResults(double* computed, double* reference, OffsetT len, bool verbose = true)
+{
+    for (OffsetT i = 0; i < len; i++)
+    {
+        if (computed[i] != reference[i])
+        {
+            double difference = std::abs(computed[i]-reference[i]);
+            double fraction = difference / std::abs(reference[i]);
+            bool both_nan = (computed[i] != computed[i]) && (reference[i] != reference[i]);  // x != x holds only for NaN
+            if (!both_nan && !(fraction <= 0.00015))  // negated <= so that a NaN fraction is reported instead of skipped
+            {
+                if (verbose) std::cout << "INCORRECT: [" << i << "]: "
+                    << CoutCast(computed[i]) << " != "
+                    << CoutCast(reference[i]) << " (difference:" << difference << ", fraction: " << fraction << ")";
+                return 1;
+            }
+        }
+    }
+    return 0;
+}
+
+
+/**
+ * Verify the contents of a device array match those of a host array (NullType overload:
+ * nothing to compare, always returns 0). Marked inline because it is a non-template definition.
+ */
+inline int CompareDeviceResults(
+    CUB_NS_QUALIFIER::NullType */* h_reference */,
+    CUB_NS_QUALIFIER::NullType */* d_data */,
+    std::size_t /* num_items */,
+    bool /* verbose */ = true,
+    bool /* display_data */ = false)
+{
+    return 0;
+}
+
+/**
+ * Verify the contents of a device array match those of a host array.
+ * Overload for a DiscardOutputIterator destination: no argument is read and 0 is always returned.
+ */
+template <typename S, typename OffsetT>
+int CompareDeviceResults(
+    S */*h_reference*/,
+    CUB_NS_QUALIFIER::DiscardOutputIterator<OffsetT> /*d_data*/,
+    std::size_t /*num_items*/,
+    bool /*verbose*/ = true,
+    bool /*display_data*/ = false)
+{
+    return 0;
+}
+
+/**
+ * Copies num_items elements of d_data into a temporary host buffer and compares them with h_reference
+ * via CompareResults. Returns 0 on match (or num_items == 0), 1 on mismatch or allocation/copy failure.
+ */
+template <typename S, typename T>
+int CompareDeviceResults(
+    S *h_reference,
+    T *d_data,
+    std::size_t num_items,
+    bool verbose = true,
+    bool display_data = false)
+{
+    if (num_items == 0)
+    {
+        return 0;
+    }
+
+    // Allocate array on host
+    T *h_data = (T*) malloc(num_items * sizeof(T));
+    // Copy data back; copied is false when there is no buffer or the copy reported an error
+    bool copied = h_data &&
+        (cudaMemcpy(h_data, d_data, sizeof(T) * num_items, cudaMemcpyDeviceToHost) == cudaSuccess);
+
+    // Display data (skipped after a failed copy: h_data would be uninitialized)
+    if (display_data && copied)
+    {
+        printf("Reference:\n");
+        for (std::size_t i = 0; i < num_items; i++)
+        {
+            std::cout << CoutCast(h_reference[i]) << ", ";
+        }
+        printf("\n\nComputed:\n");
+        for (std::size_t i = 0; i < num_items; i++)
+        {
+            std::cout << CoutCast(h_data[i]) << ", ";
+        }
+        printf("\n\n");
+    }
+
+    // Check; a failed allocation/copy counts as a mismatch
+    int retval = copied ? CompareResults(h_data, h_reference, num_items, verbose) : 1;
+    if (!copied && verbose) printf("CompareDeviceResults: host allocation or device-to-host copy failed\n");
+    // Cleanup (free(NULL) is a no-op)
+    free(h_data);
+
+    return retval;
+}
+
+
+/**
+ * Copies num_items elements of d_reference and of d_data into temporary host buffers and compares them
+ * via CompareResults. Returns 0 on match (or num_items == 0), 1 on mismatch or allocation/copy failure.
+ */
+template <typename T>
+int CompareDeviceDeviceResults(
+    T *d_reference,
+    T *d_data,
+    std::size_t num_items,
+    bool verbose = true,
+    bool display_data = false)
+{
+    if (num_items == 0) return 0;  // nothing to compare (and malloc(0) may legitimately return NULL)
+    // Allocate arrays on host
+    T *h_reference = (T*) malloc(num_items * sizeof(T));
+    T *h_data = (T*) malloc(num_items * sizeof(T));
+    // Copy data back; copied is false when a buffer is missing or either copy reported an error
+    bool copied = h_reference && h_data &&
+        (cudaMemcpy(h_reference, d_reference, sizeof(T) * num_items, cudaMemcpyDeviceToHost) == cudaSuccess) &&
+        (cudaMemcpy(h_data, d_data, sizeof(T) * num_items, cudaMemcpyDeviceToHost) == cudaSuccess);
+    // Display data (skipped after a failed copy: the host buffers would be uninitialized)
+    if (display_data && copied) {
+        printf("Reference:\n");
+        for (std::size_t i = 0; i < num_items; i++)
+        {
+            std::cout << CoutCast(h_reference[i]) << ", ";
+        }
+        printf("\n\nComputed:\n");
+        for (std::size_t i = 0; i < num_items; i++)
+        {
+            std::cout << CoutCast(h_data[i]) << ", ";
+        }
+        printf("\n\n");
+    }
+
+    // Check; a failed allocation/copy counts as a mismatch
+    int retval = copied ? CompareResults(h_data, h_reference, num_items, verbose) : 1;
+    if (!copied && verbose) printf("CompareDeviceDeviceResults: host allocation or device-to-host copy failed\n");
+    // Cleanup (free(NULL) is a no-op)
+    free(h_reference);
+    free(h_data);
+
+    return retval;
+}
+
+
+/**
+ * Print the contents of a host array (NullType overload: prints nothing). Marked inline because it is a non-template definition.
+ */
+inline void DisplayResults(
+    CUB_NS_QUALIFIER::NullType   */* h_data */,
+    std::size_t      /* num_items */)
+{}
+
+
+/**
+ * Writes the first num_items elements of h_data to std::cout (each through CoutCast and followed by ", "), then a newline via printf
+ */
+template <typename InputIteratorT>
+void DisplayResults(
+    InputIteratorT h_data,
+    std::size_t num_items)
+{
+    std::size_t pos = 0;
+    while (pos < num_items)
+    {
+        std::cout << CoutCast(h_data[pos++]) << ", ";
+    }
+    printf("\n");
+}
+
+
+/**
+ * Copies num_items elements of d_data into a temporary host buffer and prints them with DisplayResults; prints nothing if the buffer or the copy fails
+ */
+template <typename T>
+void DisplayDeviceResults(
+    T *d_data,
+    std::size_t num_items)
+{
+    // Allocate array on host (a NULL result is only acceptable for an empty request)
+    T *h_data = (T*) malloc(num_items * sizeof(T));
+
+    // Copy data back; skip printing when the copy could not be made (h_data would be uninitialized)
+    bool copied = (h_data || num_items == 0) &&
+        (cudaMemcpy(h_data, d_data, sizeof(T) * num_items, cudaMemcpyDeviceToHost) == cudaSuccess);
+    if (copied) DisplayResults(h_data, num_items);
+
+    // Cleanup (free(NULL) is a no-op)
+    free(h_data);
+}
+
+
+/******************************************************************************
+ * Segment descriptor generation
+ ******************************************************************************/
+
+/**
+ * Writes num_segments + 1 offsets: entry i is the running offset, which then advances by a RandomValue() draw and is capped at num_items; the last entry is num_items
+ */
+template <typename OffsetT>
+void InitializeSegments(
+    OffsetT     num_items,
+    int         num_segments,
+    OffsetT     *h_segment_offsets,
+    bool        verbose = false)
+{
+    // No segments requested: leave the output untouched
+    if (num_segments <= 0) return;
+
+    OffsetT mean_length = CUB_NS_QUALIFIER::DivideAndRoundUp(num_items, OffsetT(num_segments));
+    OffsetT running = 0;
+    for (int seg = 0; seg != num_segments; ++seg)
+    {
+        h_segment_offsets[seg] = running;
+
+        OffsetT drawn = RandomValue((mean_length * 2) + 1);
+        running += drawn;
+        running = CUB_MIN(running, num_items);
+    }
+    h_segment_offsets[num_segments] = num_items;
+
+    if (verbose)
+    {
+        printf("Segment offsets: ");
+        DisplayResults(h_segment_offsets, num_segments + 1);
+    }
+}
+
+
+/******************************************************************************
+ * Timing
+ ******************************************************************************/
+
+
+struct CpuTimer  // Host-side timer with Start()/Stop()/ElapsedMillis(); the implementation is selected per platform
+{
+#if defined(_WIN32) || defined(_WIN64)
+    // Windows: QueryPerformanceCounter readings divided by the QueryPerformanceFrequency value
+    LARGE_INTEGER ll_freq;
+    LARGE_INTEGER ll_start;
+    LARGE_INTEGER ll_stop;
+
+    CpuTimer()
+    {
+        QueryPerformanceFrequency(&ll_freq);
+    }
+
+    void Start()
+    {
+        QueryPerformanceCounter(&ll_start);
+    }
+
+    void Stop()
+    {
+        QueryPerformanceCounter(&ll_stop);
+    }
+
+    float ElapsedMillis()
+    {
+        double start = double(ll_start.QuadPart) / double(ll_freq.QuadPart);
+        double stop  = double(ll_stop.QuadPart) / double(ll_freq.QuadPart);
+        // start/stop are counter / frequency; the difference is scaled by 1000 for the result
+        return float((stop - start) * 1000);
+    }
+
+#else
+    // Other platforms: getrusage(RUSAGE_SELF) snapshots; only ru_utime is used, so this is not wall-clock time
+    rusage start;
+    rusage stop;
+
+    void Start()
+    {
+        getrusage(RUSAGE_SELF, &start);
+    }
+
+    void Stop()
+    {
+        getrusage(RUSAGE_SELF, &stop);
+    }
+
+    float ElapsedMillis()
+    {
+        float sec = stop.ru_utime.tv_sec - start.ru_utime.tv_sec;
+        float usec = stop.ru_utime.tv_usec - start.ru_utime.tv_usec;
+        // Combine the seconds difference and the microseconds difference into milliseconds
+        return (sec * 1000) + (usec / 1000);
+    }
+
+#endif
+};
+
+struct GpuTimer  // Timer built on a pair of CUDA events; the return codes of the CUDA calls are not checked
+{
+    cudaEvent_t start;
+    cudaEvent_t stop;
+    // Both events are created here and destroyed in the destructor; no copy control is declared, so a copy would destroy them twice
+    GpuTimer()
+    {
+        cudaEventCreate(&start);
+        cudaEventCreate(&stop);
+    }
+
+    ~GpuTimer()
+    {
+        cudaEventDestroy(start);
+        cudaEventDestroy(stop);
+    }
+    // Start()/Stop() record the respective event with 0 as the stream argument
+    void Start()
+    {
+        cudaEventRecord(start, 0);
+    }
+
+    void Stop()
+    {
+        cudaEventRecord(stop, 0);
+    }
+    // Synchronizes on the stop event, then returns what cudaEventElapsedTime reports for start..stop
+    float ElapsedMillis()
+    {
+        float elapsed;  // stays uninitialized if cudaEventElapsedTime fails
+        cudaEventSynchronize(stop);
+        cudaEventElapsedTime(&elapsed, start, stop);
+        return elapsed;
+    }
+};
+
+struct HugeDataType
+{
+  // Payload of ELEMENTS_PER_OBJECT ints; every constructor writes all of them
+  static constexpr int ELEMENTS_PER_OBJECT = 128;
+  int data[ELEMENTS_PER_OBJECT];
+
+  // Zero-fills the payload
+  __device__ __host__ HugeDataType()
+  {
+    for (int idx = 0; idx != ELEMENTS_PER_OBJECT; ++idx) {
+      data[idx] = 0;
+    }
+  }
+
+  // Element-wise copy of rhs
+  __device__ __host__ HugeDataType(const HugeDataType&rhs)
+  {
+    for (int idx = 0; idx != ELEMENTS_PER_OBJECT; ++idx) {
+      data[idx] = rhs.data[idx];
+    }
+  }
+
+  // Fills the payload with val
+  explicit __device__ __host__ HugeDataType(int val)
+  {
+    for (int idx = 0; idx != ELEMENTS_PER_OBJECT; ++idx) {
+      data[idx] = val;
+    }
+  }
+};
+
+inline __device__ __host__ bool operator==(const HugeDataType &lhs,
+                                           const HugeDataType &rhs)
+{
+  // Scan for the first position where the payloads disagree
+  int idx = 0;
+  while (idx < HugeDataType::ELEMENTS_PER_OBJECT &&
+         !(lhs.data[idx] != rhs.data[idx]))
+  {
+    ++idx;
+  }
+  // Equal only if the scan ran off the end without finding a difference
+  return idx == HugeDataType::ELEMENTS_PER_OBJECT;
+}
+
+inline __device__ __host__ bool operator<(const HugeDataType &lhs,
+                                          const HugeDataType &rhs)
+{
+  // Lexicographic order: the first differing element decides, so a < b and b < a can never both hold
+  for (int i = 0; i < HugeDataType::ELEMENTS_PER_OBJECT; i++)
+  {
+    if (lhs.data[i] != rhs.data[i])
+    {
+      return lhs.data[i] < rhs.data[i];
+    }
+  }
+  return false;  // all elements equal
+}
+
+template <typename DataType>
+__device__ __host__ bool operator!=(const HugeDataType &lhs,
+                                    const DataType &rhs)
+{
+  // Becomes true as soon as one element of lhs compares unequal to rhs; the scan stops there
+  bool differs = false;
+  for (int idx = 0; !differs && idx < HugeDataType::ELEMENTS_PER_OBJECT; ++idx)
+  {
+    if (lhs.data[idx] != rhs) differs = true;
+  }
+
+  // False only when every element compared equal to rhs
+  return differs;
+}
diff --git a/include/cub/test/test_util_vec.h b/include/cub/test/test_util_vec.h
new file mode 100644
index 0000000..bb6bb67
--- /dev/null
+++ b/include/cub/test/test_util_vec.h
@@ -0,0 +1,320 @@
+/******************************************************************************
+ * Copyright (c) 2011-2022, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#pragma once
+
+#include <iostream>
+
+
+/******************************************************************************
+ * Console printing utilities
+ ******************************************************************************/
+
+/**
+ * Helper for casting character types to integers for cout printing
+ */
+template <typename T>
+T CoutCast(T val) { return val; } // generic case: the value is returned unchanged
+// The non-template overloads below return int, so character values stream as numbers.
+inline int CoutCast(char val) { return val; }
+// unsigned char -> int
+inline int CoutCast(unsigned char val) { return val; }
+// signed char -> int
+inline int CoutCast(signed char val) { return val; }
+
+/******************************************************************************
+ * Comparison and ostream operators for CUDA vector types
+ ******************************************************************************/
+
+/**
+ * Vector1 overloads: defines operator<<, !=, ==, >, < and + for a one-component vector type T (BaseT is not referenced by the expansion)
+ */
+#define CUB_VEC_OVERLOAD_1(T, BaseT)                        \
+    /* Ostream output: prints "(x)", passing x through CoutCast */ \
+    inline std::ostream& operator<<(                        \
+        std::ostream& os,                                   \
+        const T& val)                                       \
+    {                                                       \
+        os << '(' << CoutCast(val.x) << ')';                \
+        return os;                                          \
+    }                                                       \
+    /* Inequality */                                        \
+    inline __host__ __device__ bool operator!=(             \
+        const T &a,                                         \
+        const T &b)                                         \
+    {                                                       \
+        return (a.x != b.x);                                \
+    }                                                       \
+    /* Equality */                                          \
+    inline __host__ __device__ bool operator==(             \
+        const T &a,                                         \
+        const T &b)                                         \
+    {                                                       \
+        return (a.x == b.x);                                \
+    }                                                       \
+    /* Greater-than */                                      \
+    inline __host__ __device__ bool operator>(              \
+        const T &a,                                         \
+        const T &b)                                         \
+    {                                                       \
+        return (a.x > b.x);                                 \
+    }                                                       \
+    /* Less-than */                                         \
+    inline __host__ __device__ bool operator<(              \
+        const T &a,                                         \
+        const T &b)                                         \
+    {                                                       \
+        return (a.x < b.x);                                 \
+    }                                                       \
+    /* Summation via make_##T (non-reference addends for VS2003 -O3 warpscan workaround) */ \
+    inline __host__ __device__ T operator+(                 \
+        T a,                                                \
+        T b)                                                \
+    {                                                       \
+        T retval = make_##T(a.x + b.x);                     \
+        return retval;                                      \
+    }                                                       
+
+
+
+/**
+ * Vector2 overloads: defines operator<<, !=, ==, >, < and + for a two-component vector type T (BaseT is not referenced by the expansion)
+ */
+#define CUB_VEC_OVERLOAD_2(T, BaseT)                        \
+    /* Ostream output: prints "(x,y)", passing each component through CoutCast */ \
+    inline std::ostream& operator<<(                        \
+        std::ostream& os,                                   \
+        const T& val)                                       \
+    {                                                       \
+        os << '('                                           \
+            << CoutCast(val.x) << ','                       \
+            << CoutCast(val.y) << ')';                      \
+        return os;                                          \
+    }                                                       \
+    /* Inequality: true if any component differs */         \
+    inline __host__ __device__ bool operator!=(  \
+        const T &a,                                         \
+        const T &b)                                         \
+    {                                                       \
+        return (a.x != b.x) ||                              \
+            (a.y != b.y);                                   \
+    }                                                       \
+    /* Equality: true only if all components match */       \
+    inline __host__ __device__ bool operator==(             \
+        const T &a,                                         \
+        const T &b)                                         \
+    {                                                       \
+        return (a.x == b.x) &&                              \
+            (a.y == b.y);                                   \
+    }                                                       \
+    /* Greater-than: x is compared first, y breaks ties */  \
+    inline __host__ __device__ bool operator>(              \
+        const T &a,                                         \
+        const T &b)                                         \
+    {                                                       \
+        if (a.x > b.x) return true; else if (b.x > a.x) return false;   \
+        return a.y > b.y;                                               \
+    }                                                       \
+    /* Less-than: x is compared first, y breaks ties */     \
+    inline __host__ __device__ bool operator<(              \
+        const T &a,                                         \
+        const T &b)                                         \
+    {                                                       \
+        if (a.x < b.x) return true; else if (b.x < a.x) return false;   \
+        return a.y < b.y;                                               \
+    }                                                       \
+    /* Component-wise summation via make_##T (non-reference addends for VS2003 -O3 warpscan workaround) */ \
+    inline __host__ __device__ T operator+(                 \
+               T a,                                         \
+               T b)                                         \
+    {                                                       \
+        T retval = make_##T(                                \
+            a.x + b.x,                                      \
+            a.y + b.y);                                     \
+        return retval;                                      \
+    }                                                       \
+
+
+
+/**
+ * Vector3 overloads: defines operator<<, !=, ==, >, < and + for a three-component vector type T (BaseT is not referenced by the expansion)
+ */
+#define CUB_VEC_OVERLOAD_3(T, BaseT)                        \
+    /* Ostream output: prints "(x,y,z)", passing each component through CoutCast */ \
+    inline std::ostream& operator<<(                        \
+        std::ostream& os,                                   \
+        const T& val)                                       \
+    {                                                       \
+        os << '('                                           \
+            << CoutCast(val.x) << ','                       \
+            << CoutCast(val.y) << ','                       \
+            << CoutCast(val.z) << ')';                      \
+        return os;                                          \
+    }                                                       \
+    /* Inequality: true if any component differs */         \
+    inline __host__ __device__ bool operator!=(             \
+        const T &a,                                         \
+        const T &b)                                         \
+    {                                                       \
+        return (a.x != b.x) ||                              \
+            (a.y != b.y) ||                                 \
+            (a.z != b.z);                                   \
+    }                                                       \
+    /* Equality: true only if all components match */       \
+    inline __host__ __device__ bool operator==(             \
+        const T &a,                                         \
+        const T &b)                                         \
+    {                                                       \
+        return (a.x == b.x) &&                              \
+            (a.y == b.y) &&                                 \
+            (a.z == b.z);                                   \
+    }                                                       \
+    /* Greater-than: components are compared in x, y, z order; the first difference decides */ \
+    inline __host__ __device__ bool operator>(              \
+        const T &a,                                         \
+        const T &b)                                         \
+    {                                                       \
+        if (a.x > b.x) return true; else if (b.x > a.x) return false;   \
+        if (a.y > b.y) return true; else if (b.y > a.y) return false;   \
+        return a.z > b.z;                                               \
+    }                                                       \
+    /* Less-than: components are compared in x, y, z order; the first difference decides */ \
+    inline __host__ __device__ bool operator<(              \
+        const T &a,                                         \
+        const T &b)                                         \
+    {                                                       \
+        if (a.x < b.x) return true; else if (b.x < a.x) return false;   \
+        if (a.y < b.y) return true; else if (b.y < a.y) return false;   \
+        return a.z < b.z;                                               \
+    }                                                       \
+    /* Component-wise summation via make_##T (non-reference addends for VS2003 -O3 warpscan workaround) */ \
+    inline __host__ __device__ T operator+(                 \
+        T a,                                                \
+        T b)                                                \
+    {                                                       \
+        T retval = make_##T(                                \
+            a.x + b.x,                                      \
+            a.y + b.y,                                      \
+            a.z + b.z);                                     \
+        return retval;                                      \
+    }                                                       
+
+
+/**
+ * Vector4 overloads: defines operator<<, !=, ==, >, < and + for a four-component vector type T (BaseT is not referenced by the expansion)
+ */
+#define CUB_VEC_OVERLOAD_4(T, BaseT)                        \
+    /* Ostream output: prints "(x,y,z,w)", passing each component through CoutCast */ \
+    inline std::ostream& operator<<(                        \
+        std::ostream& os,                                   \
+        const T& val)                                       \
+    {                                                       \
+        os << '('                                           \
+            << CoutCast(val.x) << ','                       \
+            << CoutCast(val.y) << ','                       \
+            << CoutCast(val.z) << ','                       \
+            << CoutCast(val.w) << ')';                      \
+        return os;                                          \
+    }                                                       \
+    /* Inequality: true if any component differs */         \
+    inline __host__ __device__ bool operator!=(             \
+        const T &a,                                         \
+        const T &b)                                         \
+    {                                                       \
+        return (a.x != b.x) ||                              \
+            (a.y != b.y) ||                                 \
+            (a.z != b.z) ||                                 \
+            (a.w != b.w);                                   \
+    }                                                       \
+    /* Equality: true only if all components match */       \
+    inline __host__ __device__ bool operator==(             \
+        const T &a,                                         \
+        const T &b)                                         \
+    {                                                       \
+        return (a.x == b.x) &&                              \
+            (a.y == b.y) &&                                 \
+            (a.z == b.z) &&                                 \
+            (a.w == b.w);                                   \
+    }                                                       \
+    /* Greater-than: components are compared in x, y, z, w order; the first difference decides */ \
+    inline __host__ __device__ bool operator>(              \
+        const T &a,                                         \
+        const T &b)                                         \
+    {                                                       \
+        if (a.x > b.x) return true; else if (b.x > a.x) return false;   \
+        if (a.y > b.y) return true; else if (b.y > a.y) return false;   \
+        if (a.z > b.z) return true; else if (b.z > a.z) return false;   \
+        return a.w > b.w;                                               \
+    }                                                       \
+    /* Less-than: components are compared in x, y, z, w order; the first difference decides */ \
+    inline __host__ __device__ bool operator<(              \
+        const T &a,                                         \
+        const T &b)                                         \
+    {                                                       \
+        if (a.x < b.x) return true; else if (b.x < a.x) return false;   \
+        if (a.y < b.y) return true; else if (b.y < a.y) return false;   \
+        if (a.z < b.z) return true; else if (b.z < a.z) return false;   \
+        return a.w < b.w;                                               \
+    }                                                       \
+    /* Component-wise summation via make_##T (non-reference addends for VS2003 -O3 warpscan workaround) */ \
+    inline __host__ __device__ T operator+(                 \
+        T a,                                                \
+        T b)                                                \
+    {                                                       \
+        T retval = make_##T(                                        \
+            a.x + b.x,                                      \
+            a.y + b.y,                                      \
+            a.z + b.z,                                      \
+            a.w + b.w);                                     \
+        return retval;                                      \
+    }                                                       
+
+/**
+ * All vector overloads: expands the 1-, 2-, 3- and 4-component macros for the types COMPONENT_T##1 through COMPONENT_T##4, forwarding BaseT unchanged
+ */
+#define CUB_VEC_OVERLOAD(COMPONENT_T, BaseT)                    \
+    CUB_VEC_OVERLOAD_1(COMPONENT_T##1, BaseT)                   \
+    CUB_VEC_OVERLOAD_2(COMPONENT_T##2, BaseT)                   \
+    CUB_VEC_OVERLOAD_3(COMPONENT_T##3, BaseT)                   \
+    CUB_VEC_OVERLOAD_4(COMPONENT_T##4, BaseT)
+
+/**
+ * Define the overloads for each vector type family: every line below covers the 1- to 4-component variants (e.g. int1, int2, int3, int4)
+ */
+CUB_VEC_OVERLOAD(char, char)
+CUB_VEC_OVERLOAD(short, short)
+CUB_VEC_OVERLOAD(int, int)
+CUB_VEC_OVERLOAD(long, long)
+CUB_VEC_OVERLOAD(longlong, long long)
+CUB_VEC_OVERLOAD(uchar, unsigned char)
+CUB_VEC_OVERLOAD(ushort, unsigned short)
+CUB_VEC_OVERLOAD(uint, unsigned int)
+CUB_VEC_OVERLOAD(ulong, unsigned long)
+CUB_VEC_OVERLOAD(ulonglong, unsigned long long)
+CUB_VEC_OVERLOAD(float, float)
+CUB_VEC_OVERLOAD(double, double)
+
diff --git a/results/T4/crystal-fls/crystal_fls_q11_sf10.txt b/results/T4/crystal-fls/crystal_fls_q11_sf10.txt
new file mode 100644
index 0000000..7f2319c
--- /dev/null
+++ b/results/T4/crystal-fls/crystal_fls_q11_sf10.txt
@@ -0,0 +1,185 @@
+==PROF== Connected to process 34112 (/home/ubuntu/fff/cmake-build-release-g4dn/gpu/fastlanes/src/fls_q11_bitpacked)
+==PROF== Profiling "QueryKernel" - 0: 0%....50%....100% - 35 passes
+[33m-- lo_orderdate_min: 19920101[39m
+[33m-- lo_orderdate_max: 19980802[39m
+[33m-- lo_discount_min: 0[39m
+[33m-- lo_discount_max: 10[39m
+[33m-- lo_quantity_min: 1[39m
+[33m-- lo_quantity_max: 50[39m
+[33m-- lo_extendedprice_min: 90097[39m
+[33m-- lo_extendedprice_max: 10494900[39m
+[33m-- x: 16[39m
+[33m-- LOADED DATA[39m
+[33m-- LOADED DATA TO GPU[39m
+[33m-- total_time_taken: 7839.23[39m
+[33m-- revenue: 4471898856447[39m
+[32m-- SF_10[39m
+[1m[34m-- 7839.17[39m
+==PROF== Disconnected from process 34112
+[34112] fls_q11_bitpacked@127.0.0.1
+  void QueryKernel<(int)32, (int)32>(const int *, const int *, const int *, const int *, fastlanes::ssb::SSB, unsigned long long *) (58581, 1, 1)x(32, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/nsecond         4.98
+    SM Frequency            cycle/usecond       583.09
+    Elapsed Cycles                  cycle      1121660
+    Memory Throughput                   %        79.65
+    DRAM Throughput                     %        79.65
+    Duration                      msecond         1.92
+    L1/TEX Cache Throughput             %        59.91
+    L2 Cache Throughput                 %        22.46
+    SM Active Cycles                cycle   1118042.30
+    Compute (SM) Throughput             %        54.25
+    ----------------------- ------------- ------------
+
+    OPT   Memory is more heavily utilized than Compute: Look at the Memory Workload Analysis section to identify the    
+          DRAM bottleneck. Check memory replay (coalescing) metrics to make sure you're efficiently utilizing the       
+          bytes transferred. Also consider whether it is possible to do more work per memory access (kernel fusion) or  
+          whether there are values you can (re)compute.                                                                 
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 32:1. The kernel achieved 0% of 
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         1.50
+    Executed Ipc Elapsed  inst/cycle         1.49
+    Issue Slots Busy               %        37.40
+    Issued Ipc Active     inst/cycle         1.50
+    SM Busy                        %        54.43
+    -------------------- ----------- ------------
+
+    INF   ALU is the highest-utilized pipeline (54.4%) based on active cycles, taking into account the rates of its     
+          different instructions. It executes integer and logic operations. It is well-utilized, but should not be a    
+          bottleneck.                                                                                                   
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second       253.84
+    Mem Busy                     %        29.96
+    Max Bandwidth                %        79.65
+    L1/TEX Hit Rate              %            0
+    L2 Hit Rate                  %         0.54
+    Mem Pipes Busy               %        52.75
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 6.386%                                                                                          
+          The memory access pattern for shared stores might not be optimal and causes on average a 1.1 - way bank       
+          conflict across all 7556949 shared store requests.This results in 901567 bank conflicts,  which represent     
+          10.66% of the overall 8458516 wavefronts for shared stores. Check the Source Counters section for             
+          uncoalesced shared stores.                                                                                    
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        37.29
+    Issued Warp Per Scheduler                        0.37
+    No Eligible                            %        62.71
+    Active Warps Per Scheduler          warp         3.67
+    Eligible Warps Per Scheduler        warp         0.55
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 20.35%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 2.7 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 8 warps per scheduler, this kernel allocates an average of    
+          3.67 active warps per scheduler, but only an average of 0.55 warps were eligible per cycle. Eligible warps    
+          are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible   
+          warp results in no instruction being issued and the issue slot remains unused. To increase the number of      
+          eligible warps, avoid possible load imbalances due to highly different execution durations per warp.          
+          Reducing stalls indicated on the Warp State Statistics and Source Counters sections can help, too.            
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle         9.84
+    Warp Cycles Per Executed Instruction           cycle         9.84
+    Avg. Active Threads Per Warp                                31.86
+    Avg. Not Predicated Off Threads Per Warp                    31.78
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 20.35%                                                                                          
+          On average, each warp of this kernel spends 5.2 cycles being stalled waiting for a scoreboard dependency on a 
+          L1TEX (local, global, surface, texture) operation. Find the instruction producing the data being waited upon  
+          to identify the culprit. To reduce the number of cycles waiting on L1TEX data accesses verify the memory      
+          access patterns are optimal for the target architecture, attempt to increase cache hit rates by increasing    
+          data locality (coalescing), or by changing the cache configuration. Consider moving frequently used data to   
+          shared memory. This stall type represents about 53.0% of the total average of 9.8 cycles between issuing two  
+          instructions.                                                                                                 
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst    418125.73
+    Executed Instructions                           inst     66900117
+    Avg. Issued Instructions Per Scheduler          inst    418136.85
+    Issued Instructions                             inst     66901896
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                    32
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                  58581
+    Registers Per Thread             register/thread             109
+    Shared Memory Configuration Size           Kbyte           65.54
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block       Kbyte/block            4.35
+    Threads                                   thread         1874592
+    Waves Per SM                                               97.64
+    -------------------------------- --------------- ---------------
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           16
+    Block Limit Registers                 block           16
+    Block Limit Shared Mem                block           15
+    Block Limit Warps                     block           32
+    Theoretical Active Warps per SM        warp           15
+    Theoretical Occupancy                     %        46.88
+    Achieved Occupancy                        %        46.05
+    Achieved Active Warps Per SM           warp        14.73
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 20.35%                                                                                          
+          The 3.75 theoretical warps per scheduler this kernel can issue according to its occupancy are below the       
+          hardware maximum of 8. This kernel's theoretical occupancy (46.9%) is limited by the required amount of       
+          shared memory.                                                                                                
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.01
+    Branch Instructions              inst       995882
+    Branch Efficiency                   %          100
+    Avg. Divergent Branches                          0
+    ------------------------- ----------- ------------
+
diff --git a/results/T4/crystal-fls/crystal_fls_q21.txt b/results/T4/crystal-fls/crystal_fls_q21.txt
new file mode 100644
index 0000000..d6102da
--- /dev/null
+++ b/results/T4/crystal-fls/crystal_fls_q21.txt
@@ -0,0 +1,1037 @@
+==PROF== Connected to process 22160 (/home/ubuntu/fff/cmake-build-release-g4dn/gpu/fastlanes/src/fls_q21_bitpacked)
+==PROF== Profiling "build_hashtable_s" - 0: 0%....50%....100% - 35 passes
+==PROF== Profiling "build_hashtable_p" - 1: 0%....50%....100% - 35 passes
+==PROF== Profiling "build_hashtable_d" - 2: 0%....50%....100% - 35 passes
+==PROF== Profiling "probe" - 3: 0%....50%....100% - 35 passes
+1992 40 620792484
+1993 40 696134057
+1994 40 646183449
+1995 40 683789210
+1996 40 510952202
+1997 40 601329805
+1998 40 310212681
+1992 41 619072460
+1993 41 641906797
+1994 41 600001881
+1995 41 538237825
+1996 41 626948592
+1997 41 599768514
+1998 41 324844587
+1992 42 490930542
+1993 42 590494981
+1994 42 637973270
+1995 42 563917939
+1996 42 489891798
+1997 42 575414713
+1998 42 382795528
+1992 43 746180561
+1993 43 667809574
+1994 43 582681681
+1995 43 622468901
+1996 43 615832436
+1997 43 665181553
+1998 43 391879175
+1992 44 592643829
+1993 44 547150428
+1994 44 687417394
+1995 44 557813495
+1996 44 539509100
+1997 44 591240347
+1998 44 339362761
+1992 45 666818456
+1993 45 614006733
+1994 45 553771550
+1995 45 718580318
+1996 45 604069580
+1997 45 582504582
+1998 45 418644887
+1992 46 497930218
+1993 46 603429138
+1994 46 570930370
+1995 46 705815966
+1996 46 595735717
+1997 46 652582336
+1998 46 460900222
+1992 47 573950112
+1993 47 581793154
+1994 47 555531717
+1995 47 684622773
+1996 47 655735847
+1997 47 644209268
+1998 47 368241619
+1992 48 629436211
+1993 48 732890335
+1994 48 616859338
+1995 48 687199268
+1996 48 741147488
+1997 48 755567624
+1998 48 436528223
+1992 49 665878691
+1993 49 584135775
+1994 49 589016154
+1995 49 607002298
+1996 49 563497671
+1997 49 545937355
+1998 49 309175941
+1992 50 553577586
+1993 50 555705138
+1994 50 676062441
+1995 50 641710612
+1996 50 575388404
+1997 50 578436711
+1998 50 314285647
+1992 51 656700076
+1993 51 627130013
+1994 51 591728980
+1995 51 657535469
+1996 51 658819802
+1997 51 654743943
+1998 51 391790954
+1992 52 571366086
+1993 52 610589229
+1994 52 544138184
+1995 52 529390478
+1996 52 612212648
+1997 52 520737479
+1998 52 253877981
+1992 53 532838175
+1993 53 556392755
+1994 53 506771410
+1995 53 567384149
+1996 53 471880515
+1997 53 589317682
+1998 53 286129583
+1992 54 575100090
+1993 54 598048745
+1994 54 669399089
+1995 54 673188844
+1996 54 607298942
+1997 54 682448220
+1998 54 396578150
+1992 55 575644119
+1993 55 538626597
+1994 55 641068147
+1995 55 683443283
+1996 55 628222285
+1997 55 578103277
+1998 55 348596079
+1992 56 562595481
+1993 56 572869443
+1994 56 523516106
+1995 56 534863977
+1996 56 536099358
+1997 56 590451889
+1998 56 317284773
+1992 57 514802952
+1993 57 493315679
+1994 57 599287565
+1995 57 596024828
+1996 57 615338121
+1997 57 598601936
+1998 57 308349815
+1992 58 543346708
+1993 58 604020487
+1994 58 515506085
+1995 58 599834564
+1996 58 517842408
+1997 58 608170121
+1998 58 341434816
+1992 59 491786465
+1993 59 655668497
+1994 59 655183200
+1995 59 584917742
+1996 59 559185452
+1997 59 576734822
+1998 59 326633797
+1992 60 602373460
+1993 60 615880897
+1994 60 643804380
+1995 60 713302883
+1996 60 623220244
+1997 60 680711137
+1998 60 354376769
+1992 61 633928973
+1993 61 565901926
+1994 61 647661017
+1995 61 647289672
+1996 61 637768457
+1997 61 593124378
+1998 61 380715354
+1992 62 554701238
+1993 62 565208933
+1994 62 718895078
+1995 62 609303895
+1996 62 691969792
+1997 62 631016696
+1998 62 358310182
+1992 63 546043513
+1993 63 660789968
+1994 63 655833720
+1995 63 702057957
+1996 63 653344348
+1997 63 550179447
+1998 63 419353251
+1992 64 647297520
+1993 64 582390534
+1994 64 529474222
+1995 64 560461020
+1996 64 591003083
+1997 64 564085649
+1998 64 398848738
+1992 65 799379105
+1993 65 576848715
+1994 65 636493983
+1995 65 713329066
+1996 65 633922964
+1997 65 684284629
+1998 65 335073096
+1992 66 795689545
+1993 66 759898311
+1994 66 697404326
+1995 66 693856011
+1996 66 605367841
+1997 66 682817524
+1998 66 372528215
+1992 67 648627112
+1993 67 649305965
+1994 67 543254019
+1995 67 737599852
+1996 67 646443167
+1997 67 703348298
+1998 67 356128002
+1992 68 608555802
+1993 68 573449583
+1994 68 610859739
+1995 68 628687768
+1996 68 689535294
+1997 68 638125635
+1998 68 387752384
+1992 69 616083074
+1993 69 603750123
+1994 69 566272871
+1995 69 693347954
+1996 69 621193535
+1997 69 569915068
+1998 69 371569162
+1992 70 627072231
+1993 70 554942415
+1994 70 736308788
+1995 70 589463137
+1996 70 701770686
+1997 70 561626445
+1998 70 378101727
+1992 71 710702139
+1993 71 699720829
+1994 71 666578569
+1995 71 576221762
+1996 71 535280597
+1997 71 628168690
+1998 71 405952025
+1992 72 595000850
+1993 72 596286095
+1994 72 590361006
+1995 72 665356177
+1996 72 595905720
+1997 72 624894859
+1998 72 373008532
+1992 73 525131548
+1993 73 594835274
+1994 73 586002871
+1995 73 533249668
+1996 73 585914955
+1997 73 478354667
+1998 73 362991667
+1992 74 662752933
+1993 74 632459703
+1994 74 662533721
+1995 74 709515121
+1996 74 661386832
+1997 74 611544878
+1998 74 322587523
+1992 75 518515033
+1993 75 601539100
+1994 75 551806661
+1995 75 601270873
+1996 75 600967332
+1997 75 466864598
+1998 75 274934885
+1992 76 775269032
+1993 76 773058041
+1994 76 730757877
+1995 76 675467554
+1996 76 814445849
+1997 76 780193108
+1998 76 423598818
+1992 77 633158355
+1993 77 713278246
+1994 77 620767558
+1995 77 634874801
+1996 77 704631788
+1997 77 639898919
+1998 77 392782198
+1992 78 512420094
+1993 78 523847906
+1994 78 529135579
+1995 78 578182924
+1996 78 511907354
+1997 78 570804688
+1998 78 370584319
+1992 79 709935019
+1993 79 539722143
+1994 79 596339358
+1995 79 697095930
+1996 79 783593202
+1997 79 652546402
+1998 79 435709816
+Res Count: 280
+Time Taken Total: 25998.2
+{"query":21,"time_query":25998}
+==PROF== Disconnected from process 22160
+[22160] fls_q21_bitpacked@127.0.0.1
+  void build_hashtable_s<(int)32, (int)32>(int *, int *, int, int *, int) (2, 1, 1)x(32, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/nsecond         4.67
+    SM Frequency            cycle/usecond       546.56
+    Elapsed Cycles                  cycle         9533
+    Memory Throughput                   %         0.68
+    DRAM Throughput                     %         0.68
+    Duration                      usecond        17.44
+    L1/TEX Cache Throughput             %         4.45
+    L2 Cache Throughput                 %         0.27
+    SM Active Cycles                cycle       417.95
+    Compute (SM) Throughput             %         0.16
+    ----------------------- ------------- ------------
+
+    OPT   This kernel grid is too small to fill the available resources on this device, resulting in only 0.0 full      
+          waves across all SMs. Look at Launch Statistics for more details.                                             
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 32:1. The kernel achieved 0% of 
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.15
+    Executed Ipc Elapsed  inst/cycle         0.01
+    Issue Slots Busy               %         3.72
+    Issued Ipc Active     inst/cycle         0.15
+    SM Busy                        %         3.72
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 97.31%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second         2.03
+    Mem Busy                     %         0.27
+    Max Bandwidth                %         0.68
+    L1/TEX Hit Rate              %            0
+    L2 Hit Rate                  %        42.23
+    Mem Pipes Busy               %         0.10
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        13.94
+    Issued Warp Per Scheduler                        0.14
+    No Eligible                            %        86.06
+    Active Warps Per Scheduler          warp         0.93
+    Eligible Warps Per Scheduler        warp         0.14
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 86.06%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 7.2 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 8 warps per scheduler, this kernel allocates an average of    
+          0.93 active warps per scheduler, which already limits the scheduler to less than a warp per instruction.      
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle         6.70
+    Warp Cycles Per Executed Instruction           cycle         6.75
+    Avg. Active Threads Per Warp                                14.56
+    Avg. Not Predicated Off Threads Per Warp                    13.73
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 48.75%                                                                                          
+          On average, each warp of this kernel spends 3.3 cycles being stalled waiting on a fixed latency execution     
+          dependency. Typically, this stall reason should be very low and only shows up as a top contributor in         
+          already highly optimized kernels. Try to hide the corresponding instruction latencies by increasing the       
+          number of active warps, restructuring the code or unrolling loops. Furthermore, consider switching to         
+          lower-latency instructions, e.g. by making use of fast math compiler options. This stall type represents      
+          about 48.7% of the total average of 6.7 cycles between issuing two instructions.                              
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 0.09308%                                                                                        
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 14.6 threads being active per cycle. This is further reduced    
+          to 13.7 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst        15.43
+    Executed Instructions                           inst         2469
+    Avg. Issued Instructions Per Scheduler          inst        15.54
+    Issued Instructions                             inst         2486
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                    32
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                      2
+    Registers Per Thread             register/thread              72
+    Shared Memory Configuration Size           Kbyte           32.77
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread              64
+    Waves Per SM                                                0.00
+    -------------------------------- --------------- ---------------
+
+    OPT   Est. Speedup: 95%                                                                                             
+          The grid for this launch is configured to execute only 2 blocks, which is less than the GPU's 40              
+          multiprocessors. This can underutilize some multiprocessors. If you do not intend to execute this kernel      
+          concurrently with other workloads, consider reducing the block size to have at least one block per            
+          multiprocessor or increase the size of the grid to fully utilize the available hardware resources. See the    
+          Hardware Model (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-hw-model)            
+          description for more details on launch configurations.                                                        
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           16
+    Block Limit Registers                 block           28
+    Block Limit Shared Mem                block           16
+    Block Limit Warps                     block           32
+    Theoretical Active Warps per SM        warp           16
+    Theoretical Occupancy                     %           50
+    Achieved Occupancy                        %         3.12
+    Achieved Active Warps Per SM           warp            1
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 86.06%                                                                                          
+          The difference between calculated theoretical (50.0%) and measured achieved occupancy (3.1%) can be the       
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 50%                                                                                             
+          The 4.00 theoretical warps per scheduler this kernel can issue according to its occupancy are below the       
+          hardware maximum of 8. This kernel's theoretical occupancy (50.0%) is limited by the number of blocks that    
+          can fit on the SM. This kernel's theoretical occupancy (50.0%) is limited by the required amount of shared    
+          memory.                                                                                                       
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.08
+    Branch Instructions              inst          206
+    Branch Efficiency                   %        15.07
+    Avg. Divergent Branches                       0.39
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 1.311%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 136 excessive sectors (19% of the total   
+          709 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations. The   
+          CUDA Programming Guide                                                                                        
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void build_hashtable_p<(int)32, (int)32>(int *, int *, int *, int, int *, int) (196, 1, 1)x(32, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/nsecond         4.49
+    SM Frequency            cycle/usecond       527.32
+    Elapsed Cycles                  cycle        13893
+    Memory Throughput                   %        46.26
+    DRAM Throughput                     %        46.26
+    Duration                      usecond        26.34
+    L1/TEX Cache Throughput             %        17.28
+    L2 Cache Throughput                 %        14.14
+    SM Active Cycles                cycle     11938.40
+    Compute (SM) Throughput             %        10.13
+    ----------------------- ------------- ------------
+
+    OPT   This kernel grid is too small to fill the available resources on this device, resulting in only 0.3 full      
+          waves across all SMs. Look at Launch Statistics for more details.                                             
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 32:1. The kernel achieved 0% of 
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.41
+    Executed Ipc Elapsed  inst/cycle         0.35
+    Issue Slots Busy               %        10.28
+    Issued Ipc Active     inst/cycle         0.41
+    SM Busy                        %        10.28
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 93.76%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second       132.97
+    Mem Busy                     %        14.14
+    Max Bandwidth                %        46.26
+    L1/TEX Hit Rate              %            0
+    L2 Hit Rate                  %        10.13
+    Mem Pipes Busy               %        10.13
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 0.8045%                                                                                         
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 1.2 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %         9.53
+    Issued Warp Per Scheduler                        0.10
+    No Eligible                            %        90.47
+    Active Warps Per Scheduler          warp         1.08
+    Eligible Warps Per Scheduler        warp         0.10
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 53.74%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 10.5 cycles. This might leave hardware resources underutilized and may lead to    
+          less optimal performance. Out of the maximum of 8 warps per scheduler, this kernel allocates an average of    
+          1.08 active warps per scheduler, but only an average of 0.10 warps were eligible per cycle. Eligible warps    
+          are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible   
+          warp results in no instruction being issued and the issue slot remains unused. To increase the number of      
+          eligible warps, reduce the time the active warps are stalled by inspecting the top stall reasons on the Warp  
+          State Statistics and Source Counters sections.                                                                
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        11.29
+    Warp Cycles Per Executed Instruction           cycle        11.39
+    Avg. Active Threads Per Warp                                12.54
+    Avg. Not Predicated Off Threads Per Warp                    12.32
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 30.45%                                                                                          
+          On average, each warp of this kernel spends 3.4 cycles being stalled waiting on a fixed latency execution     
+          dependency. Typically, this stall reason should be very low and only shows up as a top contributor in         
+          already highly optimized kernels. Try to hide the corresponding instruction latencies by increasing the       
+          number of active warps, restructuring the code or unrolling loops. Furthermore, consider switching to         
+          lower-latency instructions, e.g. by making use of fast math compiler options. This stall type represents      
+          about 30.4% of the total average of 11.3 cycles between issuing two instructions.                             
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 6.227%                                                                                          
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 12.5 threads being active per cycle. This is further reduced    
+          to 12.3 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst      1216.10
+    Executed Instructions                           inst       194576
+    Avg. Issued Instructions Per Scheduler          inst      1226.67
+    Issued Instructions                             inst       196268
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                    32
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                    196
+    Registers Per Thread             register/thread             108
+    Shared Memory Configuration Size           Kbyte           32.77
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread            6272
+    Waves Per SM                                                0.31
+    -------------------------------- --------------- ---------------
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           16
+    Block Limit Registers                 block           16
+    Block Limit Shared Mem                block           16
+    Block Limit Warps                     block           32
+    Theoretical Active Warps per SM        warp           16
+    Theoretical Occupancy                     %           50
+    Achieved Occupancy                        %        14.36
+    Achieved Active Warps Per SM           warp         4.59
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 53.74%                                                                                          
+          The difference between calculated theoretical (50.0%) and measured achieved occupancy (14.4%) can be the      
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 50%                                                                                             
+          The 4.00 theoretical warps per scheduler this kernel can issue according to its occupancy are below the       
+          hardware maximum of 8. This kernel's theoretical occupancy (50.0%) is limited by the number of blocks that    
+          can fit on the SM. This kernel's theoretical occupancy (50.0%) is limited by the number of required           
+          registers. This kernel's theoretical occupancy (50.0%) is limited by the required amount of shared memory.    
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.10
+    Branch Instructions              inst        18673
+    Branch Efficiency                   %        37.51
+    Avg. Divergent Branches                      27.57
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 5.301%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 5904 excessive sectors (7% of the total   
+          90018 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations.     
+          The CUDA Programming Guide                                                                                    
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void build_hashtable_d<(int)32, (int)32>(int *, int *, int, int *, int, int) (3, 1, 1)x(32, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/nsecond         4.56
+    SM Frequency            cycle/usecond       534.35
+    Elapsed Cycles                  cycle         6242
+    Memory Throughput                   %         2.01
+    DRAM Throughput                     %         2.01
+    Duration                      usecond        11.68
+    L1/TEX Cache Throughput             %        26.06
+    L2 Cache Throughput                 %         1.12
+    SM Active Cycles                cycle       301.85
+    Compute (SM) Throughput             %         0.28
+    ----------------------- ------------- ------------
+
+    OPT   This kernel grid is too small to fill the available resources on this device, resulting in only 0.0 full      
+          waves across all SMs. Look at Launch Statistics for more details.                                             
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 32:1. The kernel achieved 0% of 
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.18
+    Executed Ipc Elapsed  inst/cycle         0.01
+    Issue Slots Busy               %         4.60
+    Issued Ipc Active     inst/cycle         0.18
+    SM Busy                        %         4.60
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 96.59%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second         5.85
+    Mem Busy                     %         0.89
+    Max Bandwidth                %         2.01
+    L1/TEX Hit Rate              %            0
+    L2 Hit Rate                  %        50.40
+    Mem Pipes Busy               %         0.28
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 0.02929%                                                                                        
+          The memory access pattern for global stores in L1TEX might not be optimal. On average, this kernel accesses   
+          4.0 bytes per thread per memory request; but the address pattern, possibly caused by the stride between       
+          threads, results in 9.1 sectors per request, or 9.1*32 = 290.4 bytes of cache data transfers per request.     
+          The optimal thread address pattern for 4.0 byte accesses would result in 4.0*32 = 128.0 bytes of cache data   
+          transfers per request, to maximize L1TEX cache performance. Check the Source Counters section for             
+          uncoalesced global stores.                                                                                    
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 0.1023%                                                                                         
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 2.5 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        16.47
+    Issued Warp Per Scheduler                        0.16
+    No Eligible                            %        83.53
+    Active Warps Per Scheduler          warp         0.89
+    Eligible Warps Per Scheduler        warp         0.16
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 83.53%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 6.1 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 8 warps per scheduler, this kernel allocates an average of    
+          0.89 active warps per scheduler, which already limits the scheduler to less than a warp per instruction.      
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle         5.40
+    Warp Cycles Per Executed Instruction           cycle         5.46
+    Avg. Active Threads Per Warp                                31.93
+    Avg. Not Predicated Off Threads Per Warp                    27.49
+    ---------------------------------------- ----------- ------------
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst        13.71
+    Executed Instructions                           inst         2193
+    Avg. Issued Instructions Per Scheduler          inst        13.88
+    Issued Instructions                             inst         2221
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                    32
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                      3
+    Registers Per Thread             register/thread              80
+    Shared Memory Configuration Size           Kbyte           32.77
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread              96
+    Waves Per SM                                                0.00
+    -------------------------------- --------------- ---------------
+
+    OPT   Est. Speedup: 92.5%                                                                                           
+          The grid for this launch is configured to execute only 3 blocks, which is less than the GPU's 40              
+          multiprocessors. This can underutilize some multiprocessors. If you do not intend to execute this kernel      
+          concurrently with other workloads, consider reducing the block size to have at least one block per            
+          multiprocessor or increase the size of the grid to fully utilize the available hardware resources. See the    
+          Hardware Model (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-hw-model)            
+          description for more details on launch configurations.                                                        
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           16
+    Block Limit Registers                 block           24
+    Block Limit Shared Mem                block           16
+    Block Limit Warps                     block           32
+    Theoretical Active Warps per SM        warp           16
+    Theoretical Occupancy                     %           50
+    Achieved Occupancy                        %         3.12
+    Achieved Active Warps Per SM           warp            1
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 83.53%                                                                                          
+          The difference between calculated theoretical (50.0%) and measured achieved occupancy (3.1%) can be the       
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 50%                                                                                             
+          The 4.00 theoretical warps per scheduler this kernel can issue according to its occupancy are below the       
+          hardware maximum of 8. This kernel's theoretical occupancy (50.0%) is limited by the number of blocks that    
+          can fit on the SM. This kernel's theoretical occupancy (50.0%) is limited by the required amount of shared    
+          memory.                                                                                                       
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.04
+    Branch Instructions              inst           83
+    Branch Efficiency                   %        97.62
+    Avg. Divergent Branches                       0.01
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 6.09%                                                                                           
+          This kernel has uncoalesced global accesses resulting in a total of 812 excessive sectors (39% of the total   
+          2092 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations. The  
+          CUDA Programming Guide                                                                                        
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void probe<(int)32, (int)32>(int *, int *, int *, int *, int, int *, int, int *, int, int *, int, int *) (5861, 1, 1)x(32, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/nsecond         4.96
+    SM Frequency            cycle/usecond       581.03
+    Elapsed Cycles                  cycle       493782
+    Memory Throughput                   %        36.43
+    DRAM Throughput                     %        36.43
+    Duration                      usecond       849.82
+    L1/TEX Cache Throughput             %        42.09
+    L2 Cache Throughput                 %        13.98
+    SM Active Cycles                cycle    470466.08
+    Compute (SM) Throughput             %        21.02
+    ----------------------- ------------- ------------
+
+    OPT   This kernel exhibits low compute throughput and memory bandwidth utilization relative to the peak performance 
+          of this device. Achieved compute throughput and/or memory bandwidth below 60.0% of peak typically indicate    
+          latency issues. Look at Scheduler Statistics and Warp State Statistics for potential reasons.                 
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 32:1. The kernel achieved 0% of 
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.88
+    Executed Ipc Elapsed  inst/cycle         0.84
+    Issue Slots Busy               %        22.07
+    Issued Ipc Active     inst/cycle         0.88
+    SM Busy                        %        22.07
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 83.36%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second       115.69
+    Mem Busy                     %        21.04
+    Max Bandwidth                %        36.43
+    L1/TEX Hit Rate              %        63.95
+    L2 Hit Rate                  %        40.79
+    Mem Pipes Busy               %        17.17
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 1.489%                                                                                          
+          The memory access pattern for global loads in L1TEX might not be optimal. On average, this kernel accesses    
+          4.6 bytes per thread per memory request; but the address pattern, possibly caused by the stride between       
+          threads, results in 10.3 sectors per request, or 10.3*32 = 329.1 bytes of cache data transfers per request.   
+          The optimal thread address pattern for 4.6 byte accesses would result in 4.6*32 = 148.3 bytes of cache data   
+          transfers per request, to maximize L1TEX cache performance. Check the Source Counters section for             
+          uncoalesced global loads.                                                                                     
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 7.757%                                                                                          
+          The memory access pattern for loads from L1TEX to L2 is not optimal. The granularity of an L1TEX request to   
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 1.7 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced loads and try to minimize how many cache lines need to be accessed per memory         
+          request.                                                                                                      
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 0.1732%                                                                                         
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 1.0 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        21.82
+    Issued Warp Per Scheduler                        0.22
+    No Eligible                            %        78.18
+    Active Warps Per Scheduler          warp         2.84
+    Eligible Warps Per Scheduler        warp         0.27
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 63.57%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 4.6 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 8 warps per scheduler, this kernel allocates an average of    
+          2.84 active warps per scheduler, but only an average of 0.27 warps were eligible per cycle. Eligible warps    
+          are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible   
+          warp results in no instruction being issued and the issue slot remains unused. To increase the number of      
+          eligible warps, avoid possible load imbalances due to highly different execution durations per warp.          
+          Reducing stalls indicated on the Warp State Statistics and Source Counters sections can help, too.            
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        13.01
+    Warp Cycles Per Executed Instruction           cycle        13.01
+    Avg. Active Threads Per Warp                                20.15
+    Avg. Not Predicated Off Threads Per Warp                    18.58
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 38.41%                                                                                          
+          On average, each warp of this kernel spends 5.0 cycles being stalled waiting for a scoreboard dependency on a 
+          L1TEX (local, global, surface, texture) operation. Find the instruction producing the data being waited upon  
+          to identify the culprit. To reduce the number of cycles waiting on L1TEX data accesses verify the memory      
+          access patterns are optimal for the target architecture, attempt to increase cache hit rates by increasing    
+          data locality (coalescing), or by changing the cache configuration. Consider moving frequently used data to   
+          shared memory. This stall type represents about 38.4% of the total average of 13.0 cycles between issuing     
+          two instructions.                                                                                             
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 8.818%                                                                                          
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 20.2 threads being active per cycle. This is further reduced    
+          to 18.6 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst    103802.57
+    Executed Instructions                           inst     16608412
+    Avg. Issued Instructions Per Scheduler          inst    103813.16
+    Issued Instructions                             inst     16610106
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                    32
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                   5861
+    Registers Per Thread             register/thread             166
+    Shared Memory Configuration Size           Kbyte           65.54
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block       Kbyte/block            4.10
+    Threads                                   thread          187552
+    Waves Per SM                                               12.21
+    -------------------------------- --------------- ---------------
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           16
+    Block Limit Registers                 block           12
+    Block Limit Shared Mem                block           16
+    Block Limit Warps                     block           32
+    Theoretical Active Warps per SM        warp           12
+    Theoretical Occupancy                     %        37.50
+    Achieved Occupancy                        %        35.39
+    Achieved Active Warps Per SM           warp        11.33
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 62.5%                                                                                           
+          The 3.00 theoretical warps per scheduler this kernel can issue according to its occupancy are below the       
+          hardware maximum of 8. This kernel's theoretical occupancy (37.5%) is limited by the number of required       
+          registers.                                                                                                    
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.09
+    Branch Instructions              inst      1422094
+    Branch Efficiency                   %        54.59
+    Avg. Divergent Branches                    1663.51
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 56.91%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 5692164 excessive sectors (66% of the     
+          total 8574206 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source        
+          locations. The CUDA Programming Guide                                                                         
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
diff --git a/results/T4/crystal-fls/crystal_fls_q21_sf10.txt b/results/T4/crystal-fls/crystal_fls_q21_sf10.txt
new file mode 100644
index 0000000..0c51645
--- /dev/null
+++ b/results/T4/crystal-fls/crystal_fls_q21_sf10.txt
@@ -0,0 +1,1034 @@
+==PROF== Connected to process 3281 (/home/ubuntu/fff/cmake-build-release-g4dn/gpu/fastlanes/src/fls_q21_bitpacked)
+==PROF== Profiling "build_hashtable_s" - 0: 0%....50%....100% - 35 passes
+==PROF== Profiling "build_hashtable_p" - 1: 0%....50%....100% - 35 passes
+==PROF== Profiling "build_hashtable_d" - 2: 0%....50%....100% - 35 passes
+==PROF== Profiling "probe" - 3: 0%....50%....100% - 35 passes
+1992 40 5910703807
+1993 40 6221118002
+1994 40 5930589067
+1995 40 5935176587
+1996 40 5813459646
+1997 40 5932327551
+1998 40 3617033909
+1992 41 6142605437
+1993 41 6513938496
+1994 41 6255096718
+1995 41 6188290908
+1996 41 6088744091
+1997 41 6245316071
+1998 41 3627954097
+1992 42 5899765766
+1993 42 6325451795
+1994 42 6379855056
+1995 42 6253125905
+1996 42 6108666329
+1997 42 6157934476
+1998 42 3558798602
+1992 43 5806898037
+1993 43 5770931893
+1994 43 6096087079
+1995 43 6065752404
+1996 43 6002900479
+1997 43 5860606190
+1998 43 3678545331
+1992 44 5559682659
+1993 44 5813306579
+1994 44 5926068761
+1995 44 5608176605
+1996 44 5735975188
+1997 44 5836274168
+1998 44 3134706225
+1992 45 6474983674
+1993 45 6400588001
+1994 45 6331198167
+1995 45 6394371935
+1996 45 6559249979
+1997 45 6645487151
+1998 45 3850121319
+1992 46 6587090794
+1993 46 6382032832
+1994 46 6775614290
+1995 46 6442574114
+1996 46 6632812978
+1997 46 6814132782
+1998 46 3834827103
+1992 47 6808158717
+1993 47 6351643075
+1994 47 6804633795
+1995 47 6088726153
+1996 47 6623642056
+1997 47 6639575295
+1998 47 3406959500
+1992 48 6911435378
+1993 48 7053764786
+1994 48 6774194398
+1995 48 6814377370
+1996 48 6711754718
+1997 48 6709492543
+1998 48 4023772218
+1992 49 5727394828
+1993 49 5660353157
+1994 49 5642438266
+1995 49 5677870960
+1996 49 5681672438
+1997 49 5832554864
+1998 49 3345738545
+1992 50 6641309502
+1993 50 6681847719
+1994 50 6374542648
+1995 50 6686329221
+1996 50 6841710204
+1997 50 6289013167
+1998 50 3751716318
+1992 51 6804275373
+1993 51 6208468595
+1994 51 6046395349
+1995 51 6352880587
+1996 51 6285475695
+1997 51 6393365859
+1998 51 3535193333
+1992 52 6053138673
+1993 52 6270772376
+1994 52 6156757875
+1995 52 6158310037
+1996 52 6164411328
+1997 52 6113882230
+1998 52 3381521312
+1992 53 6221067837
+1993 53 5932049757
+1994 53 6175099472
+1995 53 6256597213
+1996 53 6265574087
+1997 53 6452419277
+1998 53 3298965819
+1992 54 6085873522
+1993 54 6268707214
+1994 54 6109955822
+1995 54 6011700445
+1996 54 6233626966
+1997 54 5902666460
+1998 54 3464767326
+1992 55 6589852474
+1993 55 6507948375
+1994 55 6707389575
+1995 55 6118847814
+1996 55 6369111228
+1997 55 6161915041
+1998 55 3610193272
+1992 56 6129000282
+1993 56 5790679619
+1994 56 5826402917
+1995 56 5908836912
+1996 56 5616763903
+1997 56 5902947686
+1998 56 3112058250
+1992 57 5824635290
+1993 57 5876999663
+1994 57 5484431421
+1995 57 5880695547
+1996 57 5815600477
+1997 57 5642596426
+1998 57 3179720586
+1992 58 5993521981
+1993 58 5698429434
+1994 58 6045778708
+1995 58 5596770464
+1996 58 5602902570
+1997 58 5827168921
+1998 58 3614692390
+1992 59 5901946523
+1993 59 5848707519
+1994 59 6043292500
+1995 59 5689679375
+1996 59 5658105294
+1997 59 5744356971
+1998 59 3517431277
+1992 60 6253547701
+1993 60 6295488516
+1994 60 6247585910
+1995 60 5946652692
+1996 60 6332958799
+1997 60 6426981826
+1998 60 3538237841
+1992 61 6523908450
+1993 61 6266002951
+1994 61 6229473288
+1995 61 6433574643
+1996 61 6470033667
+1997 61 6160852695
+1998 61 3815286652
+1992 62 6155409813
+1993 62 5944781347
+1994 62 5647531260
+1995 62 6146349885
+1996 62 5874259231
+1997 62 5771092581
+1998 62 3848799359
+1992 63 6015144810
+1993 63 6644358780
+1994 63 6303769219
+1995 63 6487157609
+1996 63 6260201621
+1997 63 5936323834
+1998 63 3746060156
+1992 64 6605730121
+1993 64 6375799970
+1994 64 6362984117
+1995 64 6166610415
+1996 64 6298505754
+1997 64 6795100051
+1998 64 4038091656
+1992 65 7082841759
+1993 65 7045037152
+1994 65 6308495084
+1995 65 6451506098
+1996 65 6985524790
+1997 65 7045234117
+1998 65 4103210270
+1992 66 6587314235
+1993 66 6717880192
+1994 66 6931875539
+1995 66 6823692842
+1996 66 6778966019
+1997 66 6938134368
+1998 66 3996123557
+1992 67 6456838047
+1993 67 6496616174
+1994 67 6208530373
+1995 67 6304469051
+1996 67 5895856332
+1997 67 6512779478
+1998 67 3797955327
+1992 68 6482867338
+1993 68 6522458684
+1994 68 6688539088
+1995 68 6975990289
+1996 68 6734296742
+1997 68 6443099863
+1998 68 3696911793
+1992 69 5850155722
+1993 69 6159169202
+1994 69 6179753713
+1995 69 6341671954
+1996 69 5846097201
+1997 69 6138039450
+1998 69 3430651458
+1992 70 6527884145
+1993 70 6341113918
+1994 70 6423480229
+1995 70 6493732171
+1996 70 6558022303
+1997 70 6502432501
+1998 70 4138272278
+1992 71 5977566231
+1993 71 6246083360
+1994 71 6589979243
+1995 71 6362728981
+1996 71 6331903373
+1997 71 6317891561
+1998 71 3970794721
+1992 72 6162472854
+1993 72 6102788096
+1994 72 5758527506
+1995 72 6073178181
+1996 72 5787199821
+1997 72 5752890900
+1998 72 3650913701
+1992 73 6040081567
+1993 73 5700964701
+1994 73 6185333070
+1995 73 6013769902
+1996 73 5662668717
+1997 73 5735470645
+1998 73 3598396119
+1992 74 6559831216
+1993 74 6527962564
+1994 74 6120118269
+1995 74 6043130080
+1996 74 6314023403
+1997 74 6351296659
+1998 74 3833513192
+1992 75 6063071801
+1993 75 6063519843
+1994 75 5998734727
+1995 75 5943670683
+1996 75 5750034048
+1997 75 6363803744
+1998 75 3522552499
+1992 76 6499618869
+1993 76 6633266405
+1994 76 6753951730
+1995 76 6501112403
+1996 76 6848513046
+1997 76 6758217754
+1998 76 3831187933
+1992 77 6306363056
+1993 77 6140348079
+1994 77 6221542876
+1995 77 6546487305
+1996 77 6274545265
+1997 77 6065634372
+1998 77 3740445348
+1992 78 5440997205
+1993 78 5646255796
+1994 78 5621336499
+1995 78 5559009358
+1996 78 5714095414
+1997 78 5668627674
+1998 78 3164884845
+1992 79 6429617430
+1993 79 6453234687
+1994 79 6437225149
+1995 79 6089389380
+1996 79 6421073409
+1997 79 6218229922
+1998 79 3547510598
+Res Count: 280
+Time Taken Total: 31420.3
+{"query":21,"time_query":31420.6}
+==PROF== Disconnected from process 3281
+[3281] fls_q21_bitpacked@127.0.0.1
+  void build_hashtable_s<(int)32, (int)32>(int *, int *, int, int *, int) (20, 1, 1)x(32, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/nsecond         4.77
+    SM Frequency            cycle/usecond       558.45
+    Elapsed Cycles                  cycle         9797
+    Memory Throughput                   %         5.98
+    DRAM Throughput                     %         5.98
+    Duration                      usecond        17.54
+    L1/TEX Cache Throughput             %         4.32
+    L2 Cache Throughput                 %         1.77
+    SM Active Cycles                cycle      4160.68
+    Compute (SM) Throughput             %         1.44
+    ----------------------- ------------- ------------
+
+    OPT   This kernel grid is too small to fill the available resources on this device, resulting in only 0.0 full      
+          waves across all SMs. Look at Launch Statistics for more details.                                             
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 32:1. The kernel achieved 0% of 
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.13
+    Executed Ipc Elapsed  inst/cycle         0.06
+    Issue Slots Busy               %         3.39
+    Issued Ipc Active     inst/cycle         0.14
+    SM Busy                        %         3.39
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 97.5%                                                                                     
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second        18.26
+    Mem Busy                     %         1.77
+    Max Bandwidth                %         5.98
+    L1/TEX Hit Rate              %            0
+    L2 Hit Rate                  %        14.87
+    Mem Pipes Busy               %         0.98
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        12.87
+    Issued Warp Per Scheduler                        0.13
+    No Eligible                            %        87.13
+    Active Warps Per Scheduler          warp         0.95
+    Eligible Warps Per Scheduler        warp         0.13
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 87.13%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 7.8 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 8 warps per scheduler, this kernel allocates an average of    
+          0.95 active warps per scheduler, which already limits the scheduler to less than a warp per instruction.      
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle         7.35
+    Warp Cycles Per Executed Instruction           cycle         7.40
+    Avg. Active Threads Per Warp                                13.33
+    Avg. Not Predicated Off Threads Per Warp                    12.37
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 47.16%                                                                                          
+          On average, each warp of this kernel spends 3.5 cycles being stalled waiting on a fixed latency execution     
+          dependency. Typically, this stall reason should be very low and only shows up as a top contributor in         
+          already highly optimized kernels. Try to hide the corresponding instruction latencies by increasing the       
+          number of active warps, restructuring the code or unrolling loops. Furthermore, consider switching to         
+          lower-latency instructions, e.g. by making use of fast math compiler options. This stall type represents      
+          about 47.2% of the total average of 7.3 cycles between issuing two instructions.                              
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 0.8848%                                                                                         
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 13.3 threads being active per cycle. This is further reduced    
+          to 12.4 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst       140.24
+    Executed Instructions                           inst        22438
+    Avg. Issued Instructions Per Scheduler          inst       141.24
+    Issued Instructions                             inst        22599
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                    32
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                     20
+    Registers Per Thread             register/thread              72
+    Shared Memory Configuration Size           Kbyte           32.77
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread             640
+    Waves Per SM                                                0.03
+    -------------------------------- --------------- ---------------
+
+    OPT   Est. Speedup: 50%                                                                                             
+          The grid for this launch is configured to execute only 20 blocks, which is less than the GPU's 40             
+          multiprocessors. This can underutilize some multiprocessors. If you do not intend to execute this kernel      
+          concurrently with other workloads, consider reducing the block size to have at least one block per            
+          multiprocessor or increase the size of the grid to fully utilize the available hardware resources. See the    
+          Hardware Model (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-hw-model)            
+          description for more details on launch configurations.                                                        
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           16
+    Block Limit Registers                 block           28
+    Block Limit Shared Mem                block           16
+    Block Limit Warps                     block           32
+    Theoretical Active Warps per SM        warp           16
+    Theoretical Occupancy                     %           50
+    Achieved Occupancy                        %         3.12
+    Achieved Active Warps Per SM           warp            1
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 87.13%                                                                                          
+          The difference between calculated theoretical (50.0%) and measured achieved occupancy (3.1%) can be the       
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 50%                                                                                             
+          The 4.00 theoretical warps per scheduler this kernel can issue according to its occupancy are below the       
+          hardware maximum of 8. This kernel's theoretical occupancy (50.0%) is limited by the number of blocks that    
+          can fit on the SM. This kernel's theoretical occupancy (50.0%) is limited by the required amount of shared    
+          memory.                                                                                                       
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.09
+    Branch Instructions              inst         2027
+    Branch Efficiency                   %        13.94
+    Avg. Divergent Branches                       3.78
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 7.404%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 1441 excessive sectors (20% of the total  
+          7183 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations. The  
+          CUDA Programming Guide                                                                                        
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void build_hashtable_p<(int)32, (int)32>(int *, int *, int *, int, int *, int) (782, 1, 1)x(32, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/nsecond         4.32
+    SM Frequency            cycle/usecond       506.25
+    Elapsed Cycles                  cycle        36809
+    Memory Throughput                   %        72.08
+    DRAM Throughput                     %        72.08
+    Duration                      usecond        72.70
+    L1/TEX Cache Throughput             %        24.80
+    L2 Cache Throughput                 %        21.08
+    SM Active Cycles                cycle     33280.03
+    Compute (SM) Throughput             %        15.25
+    ----------------------- ------------- ------------
+
+    OPT   Memory is more heavily utilized than Compute: Look at the Memory Workload Analysis section to identify the    
+          DRAM bottleneck. Check memory replay (coalescing) metrics to make sure you're efficiently utilizing the       
+          bytes transferred. Also consider whether it is possible to do more work per memory access (kernel fusion) or  
+          whether there are values you can (re)compute.                                                                 
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 32:1. The kernel achieved 0% of 
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.58
+    Executed Ipc Elapsed  inst/cycle         0.53
+    Issue Slots Busy               %        14.64
+    Issued Ipc Active     inst/cycle         0.59
+    SM Busy                        %        14.64
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 91.1%                                                                                     
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second       199.40
+    Mem Busy                     %        21.08
+    Max Bandwidth                %        72.08
+    L1/TEX Hit Rate              %            0
+    L2 Hit Rate                  %         8.90
+    Mem Pipes Busy               %        15.25
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 1.214%                                                                                          
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 1.2 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        13.37
+    Issued Warp Per Scheduler                        0.13
+    No Eligible                            %        86.63
+    Active Warps Per Scheduler          warp         2.98
+    Eligible Warps Per Scheduler        warp         0.17
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 27.92%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 7.5 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 8 warps per scheduler, this kernel allocates an average of    
+          2.98 active warps per scheduler, but only an average of 0.17 warps were eligible per cycle. Eligible warps    
+          are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible   
+          warp results in no instruction being issued and the issue slot remains unused. To increase the number of      
+          eligible warps, reduce the time the active warps are stalled by inspecting the top stall reasons on the Warp  
+          State Statistics and Source Counters sections.                                                                
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        22.25
+    Warp Cycles Per Executed Instruction           cycle        22.34
+    Avg. Active Threads Per Warp                                12.51
+    Avg. Not Predicated Off Threads Per Warp                    12.30
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 27.92%                                                                                          
+          On average, each warp of this kernel spends 8.1 cycles being stalled waiting for the L1 instruction queue for 
+          local and global (LG) memory operations to be not full. Typically, this stall occurs only when executing      
+          local or global memory instructions extremely frequently. Avoid redundant global memory accesses. Try to      
+          avoid using thread-local memory by checking if dynamically indexed arrays are declared in local scope, of if  
+          the kernel has excessive register pressure causing by spills. If applicable, consider combining multiple      
+          lower-width memory operations into fewer wider memory operations and try interleaving memory operations and   
+          math instructions. This stall type represents about 36.5% of the total average of 22.3 cycles between         
+          issuing two instructions.                                                                                     
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 9.387%                                                                                          
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 12.5 threads being active per cycle. This is further reduced    
+          to 12.3 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst      4851.22
+    Executed Instructions                           inst       776195
+    Avg. Issued Instructions Per Scheduler          inst      4871.73
+    Issued Instructions                             inst       779477
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                    32
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                    782
+    Registers Per Thread             register/thread             108
+    Shared Memory Configuration Size           Kbyte           32.77
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread           25024
+    Waves Per SM                                                1.22
+    -------------------------------- --------------- ---------------
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           16
+    Block Limit Registers                 block           16
+    Block Limit Shared Mem                block           16
+    Block Limit Warps                     block           32
+    Theoretical Active Warps per SM        warp           16
+    Theoretical Occupancy                     %           50
+    Achieved Occupancy                        %        40.07
+    Achieved Active Warps Per SM           warp        12.82
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 27.92%                                                                                          
+          The 4.00 theoretical warps per scheduler this kernel can issue according to its occupancy are below the       
+          hardware maximum of 8. This kernel's theoretical occupancy (50.0%) is limited by the number of blocks that    
+          can fit on the SM. This kernel's theoretical occupancy (50.0%) is limited by the number of required           
+          registers. This kernel's theoretical occupancy (50.0%) is limited by the required amount of shared memory.    
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.10
+    Branch Instructions              inst        74518
+    Branch Efficiency                   %        37.41
+    Avg. Divergent Branches                     110.14
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 6.064%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 23924 excessive sectors (7% of the total  
+          360344 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations.    
+          The CUDA Programming Guide                                                                                    
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void build_hashtable_d<(int)32, (int)32>(int *, int *, int, int *, int, int) (3, 1, 1)x(32, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/nsecond         4.56
+    SM Frequency            cycle/usecond       536.34
+    Elapsed Cycles                  cycle         6266
+    Memory Throughput                   %         1.99
+    DRAM Throughput                     %         1.99
+    Duration                      usecond        11.68
+    L1/TEX Cache Throughput             %        25.74
+    L2 Cache Throughput                 %         1.18
+    SM Active Cycles                cycle       305.57
+    Compute (SM) Throughput             %         0.28
+    ----------------------- ------------- ------------
+
+    OPT   This kernel grid is too small to fill the available resources on this device, resulting in only 0.0 full      
+          waves across all SMs. Look at Launch Statistics for more details.                                             
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 32:1. The kernel achieved 0% of 
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.18
+    Executed Ipc Elapsed  inst/cycle         0.01
+    Issue Slots Busy               %         4.54
+    Issued Ipc Active     inst/cycle         0.18
+    SM Busy                        %         4.54
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 96.63%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second         5.81
+    Mem Busy                     %         0.89
+    Max Bandwidth                %         1.99
+    L1/TEX Hit Rate              %            0
+    L2 Hit Rate                  %        55.20
+    Mem Pipes Busy               %         0.28
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 0.02918%                                                                                        
+          The memory access pattern for global stores in L1TEX might not be optimal. On average, this kernel accesses   
+          4.0 bytes per thread per memory request; but the address pattern, possibly caused by the stride between       
+          threads, results in 9.1 sectors per request, or 9.1*32 = 290.4 bytes of cache data transfers per request.     
+          The optimal thread address pattern for 4.0 byte accesses would result in 4.0*32 = 128.0 bytes of cache data   
+          transfers per request, to maximize L1TEX cache performance. Check the Source Counters section for             
+          uncoalesced global stores.                                                                                    
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 0.1019%                                                                                         
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 2.5 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        16.30
+    Issued Warp Per Scheduler                        0.16
+    No Eligible                            %        83.70
+    Active Warps Per Scheduler          warp         0.88
+    Eligible Warps Per Scheduler        warp         0.16
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 83.7%                                                                                     
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 6.1 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 8 warps per scheduler, this kernel allocates an average of    
+          0.88 active warps per scheduler, which already limits the scheduler to less than a warp per instruction.      
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle         5.42
+    Warp Cycles Per Executed Instruction           cycle         5.49
+    Avg. Active Threads Per Warp                                31.93
+    Avg. Not Predicated Off Threads Per Warp                    27.49
+    ---------------------------------------- ----------- ------------
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst        13.71
+    Executed Instructions                           inst         2193
+    Avg. Issued Instructions Per Scheduler          inst        13.88
+    Issued Instructions                             inst         2221
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                    32
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                      3
+    Registers Per Thread             register/thread              80
+    Shared Memory Configuration Size           Kbyte           32.77
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread              96
+    Waves Per SM                                                0.00
+    -------------------------------- --------------- ---------------
+
+    OPT   Est. Speedup: 92.5%                                                                                           
+          The grid for this launch is configured to execute only 3 blocks, which is less than the GPU's 40              
+          multiprocessors. This can underutilize some multiprocessors. If you do not intend to execute this kernel      
+          concurrently with other workloads, consider reducing the block size to have at least one block per            
+          multiprocessor or increase the size of the grid to fully utilize the available hardware resources. See the    
+          Hardware Model (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-hw-model)            
+          description for more details on launch configurations.                                                        
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           16
+    Block Limit Registers                 block           24
+    Block Limit Shared Mem                block           16
+    Block Limit Warps                     block           32
+    Theoretical Active Warps per SM        warp           16
+    Theoretical Occupancy                     %           50
+    Achieved Occupancy                        %         3.12
+    Achieved Active Warps Per SM           warp            1
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 83.7%                                                                                           
+          The difference between calculated theoretical (50.0%) and measured achieved occupancy (3.1%) can be the       
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 50%                                                                                             
+          The 4.00 theoretical warps per scheduler this kernel can issue according to its occupancy are below the       
+          hardware maximum of 8. This kernel's theoretical occupancy (50.0%) is limited by the number of blocks that    
+          can fit on the SM. This kernel's theoretical occupancy (50.0%) is limited by the required amount of shared    
+          memory.                                                                                                       
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.04
+    Branch Instructions              inst           83
+    Branch Efficiency                   %        97.62
+    Avg. Divergent Branches                       0.01
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 6.201%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 812 excessive sectors (39% of the total   
+          2092 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations. The  
+          CUDA Programming Guide                                                                                        
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void probe<(int)32, (int)32>(int *, int *, int *, int *, int, int *, int, int *, int, int *, int, int *) (58581, 1, 1)x(32, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/nsecond         4.98
+    SM Frequency            cycle/usecond       583.22
+    Elapsed Cycles                  cycle      4631746
+    Memory Throughput                   %        43.16
+    DRAM Throughput                     %        43.16
+    Duration                      msecond         7.94
+    L1/TEX Cache Throughput             %        45.08
+    L2 Cache Throughput                 %        14.80
+    SM Active Cycles                cycle   4608067.58
+    Compute (SM) Throughput             %        22.41
+    ----------------------- ------------- ------------
+
+    OPT   This kernel exhibits low compute throughput and memory bandwidth utilization relative to the peak performance 
+          of this device. Achieved compute throughput and/or memory bandwidth below 60.0% of peak typically indicate    
+          latency issues. Look at Scheduler Statistics and Warp State Statistics for potential reasons.                 
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 32:1. The kernel achieved 0% of 
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.90
+    Executed Ipc Elapsed  inst/cycle         0.90
+    Issue Slots Busy               %        22.53
+    Issued Ipc Active     inst/cycle         0.90
+    SM Busy                        %        22.53
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 83.01%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second       137.58
+    Mem Busy                     %        22.54
+    Max Bandwidth                %        43.16
+    L1/TEX Hit Rate              %        63.94
+    L2 Hit Rate                  %        38.12
+    Mem Pipes Busy               %        18.30
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 1.587%                                                                                          
+          The memory access pattern for global loads in L1TEX might not be optimal. On average, this kernel accesses    
+          4.6 bytes per thread per memory request; but the address pattern, possibly caused by the stride between       
+          threads, results in 10.3 sectors per request, or 10.3*32 = 329.2 bytes of cache data transfers per request.   
+          The optimal thread address pattern for 4.6 byte accesses would result in 4.6*32 = 148.3 bytes of cache data   
+          transfers per request, to maximize L1TEX cache performance. Check the Source Counters section for             
+          uncoalesced global loads.                                                                                     
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 8.219%                                                                                          
+          The memory access pattern for loads from L1TEX to L2 is not optimal. The granularity of an L1TEX request to   
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 1.7 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced loads and try to minimize how many cache lines need to be accessed per memory         
+          request.                                                                                                      
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 0.1822%                                                                                         
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 1.0 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        22.50
+    Issued Warp Per Scheduler                        0.22
+    No Eligible                            %        77.50
+    Active Warps Per Scheduler          warp         2.93
+    Eligible Warps Per Scheduler        warp         0.27
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 56.84%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 4.4 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 8 warps per scheduler, this kernel allocates an average of    
+          2.93 active warps per scheduler, but only an average of 0.27 warps were eligible per cycle. Eligible warps    
+          are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible   
+          warp results in no instruction being issued and the issue slot remains unused. To increase the number of      
+          eligible warps, avoid possible load imbalances due to highly different execution durations per warp.          
+          Reducing stalls indicated on the Warp State Statistics and Source Counters sections can help, too.            
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        13.02
+    Warp Cycles Per Executed Instruction           cycle        13.03
+    Avg. Active Threads Per Warp                                20.13
+    Avg. Not Predicated Off Threads Per Warp                    18.56
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 39.33%                                                                                          
+          On average, each warp of this kernel spends 5.1 cycles being stalled waiting for a scoreboard dependency on a 
+          L1TEX (local, global, surface, texture) operation. Find the instruction producing the data being waited upon  
+          to identify the culprit. To reduce the number of cycles waiting on L1TEX data accesses verify the memory      
+          access patterns are optimal for the target architecture, attempt to increase cache hit rates by increasing    
+          data locality (coalescing), or by changing the cache configuration. Consider moving frequently used data to   
+          shared memory. This stall type represents about 39.3% of the total average of 13.0 cycles between issuing     
+          two instructions.                                                                                             
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 9.413%                                                                                          
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 20.1 threads being active per cycle. This is further reduced    
+          to 18.6 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst   1038125.64
+    Executed Instructions                           inst    166100103
+    Avg. Issued Instructions Per Scheduler          inst   1038136.23
+    Issued Instructions                             inst    166101797
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                    32
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                  58581
+    Registers Per Thread             register/thread             166
+    Shared Memory Configuration Size           Kbyte           65.54
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block       Kbyte/block            4.10
+    Threads                                   thread         1874592
+    Waves Per SM                                              122.04
+    -------------------------------- --------------- ---------------
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           16
+    Block Limit Registers                 block           12
+    Block Limit Shared Mem                block           16
+    Block Limit Warps                     block           32
+    Theoretical Active Warps per SM        warp           12
+    Theoretical Occupancy                     %        37.50
+    Achieved Occupancy                        %        36.71
+    Achieved Active Warps Per SM           warp        11.75
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 56.84%                                                                                          
+          The 3.00 theoretical warps per scheduler this kernel can issue according to its occupancy are below the       
+          hardware maximum of 8. This kernel's theoretical occupancy (37.5%) is limited by the number of required       
+          registers.                                                                                                    
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.09
+    Branch Instructions              inst     14217715
+    Branch Efficiency                   %        54.52
+    Avg. Divergent Branches                   16653.46
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 60.9%                                                                                           
+          This kernel has uncoalesced global accesses resulting in a total of 56937633 excessive sectors (66% of the    
+          total 85739880 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source       
+          locations. The CUDA Programming Guide                                                                         
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
diff --git a/results/T4/crystal-fls/crystal_fls_q31_sf10.txt b/results/T4/crystal-fls/crystal_fls_q31_sf10.txt
new file mode 100644
index 0000000..e6bbab2
--- /dev/null
+++ b/results/T4/crystal-fls/crystal_fls_q31_sf10.txt
@@ -0,0 +1,939 @@
+==PROF== Connected to process 34912 (/home/ubuntu/fff/cmake-build-release-g4dn/gpu/fastlanes/src/fls_q31_bitpacked)
+[33m-- lo_custkey_min: 1[39m
+[33m-- lo_custkey_max: 299999[39m
+[33m-- x: 19[39m
+** LOADED DATA **
+** LOADED DATA TO GPU **
+==PROF== Profiling "build_hashtable_s" - 0: 0%....50%....100% - 35 passes
+==PROF== Profiling "build_hashtable_c" - 1: 0%....50%....100% - 35 passes
+==PROF== Profiling "build_hashtable_d" - 2: 0%....50%....100% - 35 passes
+==PROF== Profiling "probe" - 3: 0%....50%....100% - 35 passes
+Result:
+1992 8 8 55810032958
+1993 8 8 55419816844
+1994 8 8 54858143551
+1995 8 8 55052048227
+1996 8 8 55934455910
+1997 8 8 55029634139
+1992 9 8 59392902106
+1993 9 8 59125543423
+1994 9 8 58117083616
+1995 9 8 58449872713
+1996 9 8 59233979627
+1997 9 8 58441279090
+1992 12 8 57922369786
+1993 12 8 57770968742
+1994 12 8 58288999734
+1995 12 8 58083494498
+1996 12 8 58028640530
+1997 12 8 58384439424
+1992 18 8 58754091824
+1993 18 8 60132427921
+1994 18 8 57930388280
+1995 18 8 58580374657
+1996 18 8 60104499132
+1997 18 8 59540501870
+1992 21 8 57482969289
+1993 21 8 57196631632
+1994 21 8 56887893669
+1995 21 8 57359009401
+1996 21 8 57185726246
+1997 21 8 55770192252
+1992 8 9 60289620307
+1993 8 9 59669358608
+1994 8 9 59227963032
+1995 8 9 58083604342
+1996 8 9 59549633256
+1997 8 9 59226423899
+1992 9 9 63766872312
+1993 9 9 63450609364
+1994 9 9 63331300915
+1995 9 9 63061195588
+1996 9 9 63059702679
+1997 9 9 63852776026
+1992 12 9 62344531177
+1993 12 9 62565992442
+1994 12 9 62751696537
+1995 12 9 64005217794
+1996 12 9 63240925150
+1997 12 9 62768522335
+1992 18 9 63579886846
+1993 18 9 64007433384
+1994 18 9 64049090450
+1995 18 9 63593078482
+1996 18 9 64144972749
+1997 18 9 64316632938
+1992 21 9 62489017232
+1993 21 9 61251942124
+1994 21 9 62974158477
+1995 21 9 62194360745
+1996 21 9 63427038266
+1997 21 9 61646243962
+1992 8 12 56621226681
+1993 8 12 57388881032
+1994 8 12 58125827305
+1995 8 12 57289721321
+1996 8 12 58199587350
+1997 8 12 57255842192
+1992 9 12 60979743712
+1993 9 12 60418533333
+1994 9 12 62238770686
+1995 9 12 62168610917
+1996 9 12 61138073328
+1997 9 12 60949716596
+1992 12 12 60850079777
+1993 12 12 61121098249
+1994 12 12 59365844929
+1995 12 12 61306266372
+1996 12 12 60986975496
+1997 12 12 60764732195
+1992 18 12 61723423241
+1993 18 12 60358548529
+1994 18 12 62740786765
+1995 18 12 61356245530
+1996 18 12 61790338663
+1997 18 12 60775371251
+1992 21 12 60325249149
+1993 21 12 58753924379
+1994 21 12 60075724460
+1995 21 12 60195276711
+1996 21 12 59823166994
+1997 21 12 59290322956
+1992 8 18 61203981312
+1993 8 18 60533327021
+1994 8 18 59406890241
+1995 8 18 61032993971
+1996 8 18 60603415703
+1997 8 18 60996773423
+1992 9 18 65966808964
+1993 9 18 64754910872
+1994 9 18 65336592327
+1995 9 18 65462377211
+1996 9 18 65195236357
+1997 9 18 64985199873
+1992 12 18 65058022097
+1993 12 18 64739745546
+1994 12 18 65154462514
+1995 12 18 64552890307
+1996 12 18 65518312188
+1997 12 18 64052371252
+1992 18 18 64425556615
+1993 18 18 65900407107
+1994 18 18 65409639441
+1995 18 18 65334011363
+1996 18 18 65143825109
+1997 18 18 65086542879
+1992 21 18 63711417182
+1993 21 18 64173962924
+1994 21 18 63605939654
+1995 21 18 61974741198
+1996 21 18 63669933488
+1997 21 18 62601944454
+1992 8 21 50971039744
+1993 8 21 52220706010
+1994 8 21 50241741123
+1995 8 21 51395371603
+1996 8 21 52381894286
+1997 8 21 52011083842
+1992 9 21 54987586100
+1993 9 21 54025151851
+1994 9 21 54783712241
+1995 9 21 55184829281
+1996 9 21 54162547083
+1997 9 21 54119451445
+1992 12 21 54411375559
+1993 12 21 53875917105
+1994 12 21 53285064331
+1995 12 21 53705861117
+1996 12 21 54664539611
+1997 12 21 54738097793
+1992 18 21 55470492434
+1993 18 21 55405246901
+1994 18 21 54457040610
+1995 18 21 55091271737
+1996 18 21 55015868166
+1997 18 21 56394270254
+1992 21 21 54096226255
+1993 21 21 53624605015
+1994 21 21 53622935617
+1995 21 21 52905375721
+1996 21 21 54451741041
+1997 21 21 53282782506
+Res Count: 150
+Time Taken Total: 32097.9
+{"query":31,"time_query":32097.9}
+==PROF== Disconnected from process 34912
+[34912] fls_q31_bitpacked@127.0.0.1
+  void build_hashtable_s<(int)32, (int)32>(int *, int *, int *, int, int *, int) (20, 1, 1)x(32, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/nsecond         4.61
+    SM Frequency            cycle/usecond       540.35
+    Elapsed Cycles                  cycle        10274
+    Memory Throughput                   %         8.91
+    DRAM Throughput                     %         8.91
+    Duration                      usecond        19.01
+    L1/TEX Cache Throughput             %         7.12
+    L2 Cache Throughput                 %         3.17
+    SM Active Cycles                cycle      4441.85
+    Compute (SM) Throughput             %         1.55
+    ----------------------- ------------- ------------
+
+    OPT   This kernel grid is too small to fill the available resources on this device, resulting in only 0.0 full      
+          waves across all SMs. Look at Launch Statistics for more details.                                             
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 32:1. The kernel achieved 0% of 
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.14
+    Executed Ipc Elapsed  inst/cycle         0.06
+    Issue Slots Busy               %         3.48
+    Issued Ipc Active     inst/cycle         0.14
+    SM Busy                        %         3.48
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 97.78%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second        26.31
+    Mem Busy                     %         3.17
+    Max Bandwidth                %         8.91
+    L1/TEX Hit Rate              %            0
+    L2 Hit Rate                  %        30.82
+    Mem Pipes Busy               %         1.55
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 0.03441%                                                                                        
+          The memory access pattern for global stores in L1TEX might not be optimal. On average, this kernel accesses   
+          4.0 bytes per thread per memory request; but the address pattern, possibly caused by the stride between       
+          threads, results in 4.8 sectors per request, or 4.8*32 = 153.8 bytes of cache data transfers per request.     
+          The optimal thread address pattern for 4.0 byte accesses would result in 4.0*32 = 128.0 bytes of cache data   
+          transfers per request, to maximize L1TEX cache performance. Check the Source Counters section for             
+          uncoalesced global stores.                                                                                    
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 0.2761%                                                                                         
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 2.3 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        12.77
+    Issued Warp Per Scheduler                        0.13
+    No Eligible                            %        87.23
+    Active Warps Per Scheduler          warp         0.92
+    Eligible Warps Per Scheduler        warp         0.13
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 87.23%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 7.8 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 8 warps per scheduler, this kernel allocates an average of    
+          0.92 active warps per scheduler, which already limits the scheduler to less than a warp per instruction.      
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle         7.20
+    Warp Cycles Per Executed Instruction           cycle         7.25
+    Avg. Active Threads Per Warp                                13.96
+    Avg. Not Predicated Off Threads Per Warp                    13.08
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 48.34%                                                                                          
+          On average, each warp of this kernel spends 3.5 cycles being stalled waiting on a fixed latency execution     
+          dependency. Typically, this stall reason should be very low and only shows up as a top contributor in         
+          already highly optimized kernels. Try to hide the corresponding instruction latencies by increasing the       
+          number of active warps, restructuring the code or unrolling loops. Furthermore, consider switching to         
+          lower-latency instructions, e.g. by making use of fast math compiler options. This stall type represents      
+          about 48.3% of the total average of 7.2 cycles between issuing two instructions.                              
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 0.9173%                                                                                         
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 14.0 threads being active per cycle. This is further reduced    
+          to 13.1 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst       153.29
+    Executed Instructions                           inst        24526
+    Avg. Issued Instructions Per Scheduler          inst       154.41
+    Issued Instructions                             inst        24706
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                    32
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                     20
+    Registers Per Thread             register/thread             108
+    Shared Memory Configuration Size           Kbyte           32.77
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread             640
+    Waves Per SM                                                0.03
+    -------------------------------- --------------- ---------------
+
+    OPT   Est. Speedup: 50%                                                                                             
+          The grid for this launch is configured to execute only 20 blocks, which is less than the GPU's 40             
+          multiprocessors. This can underutilize some multiprocessors. If you do not intend to execute this kernel      
+          concurrently with other workloads, consider reducing the block size to have at least one block per            
+          multiprocessor or increase the size of the grid to fully utilize the available hardware resources. See the    
+          Hardware Model (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-hw-model)            
+          description for more details on launch configurations.                                                        
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           16
+    Block Limit Registers                 block           16
+    Block Limit Shared Mem                block           16
+    Block Limit Warps                     block           32
+    Theoretical Active Warps per SM        warp           16
+    Theoretical Occupancy                     %           50
+    Achieved Occupancy                        %         3.12
+    Achieved Active Warps Per SM           warp            1
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 87.23%                                                                                          
+          The difference between calculated theoretical (50.0%) and measured achieved occupancy (3.1%) can be the       
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 50%                                                                                             
+          The 4.00 theoretical warps per scheduler this kernel can issue according to its occupancy are below the       
+          hardware maximum of 8. This kernel's theoretical occupancy (50.0%) is limited by the number of blocks that    
+          can fit on the SM. This kernel's theoretical occupancy (50.0%) is limited by the number of required           
+          registers. This kernel's theoretical occupancy (50.0%) is limited by the required amount of shared memory.    
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.08
+    Branch Instructions              inst         2069
+    Branch Efficiency                   %        16.30
+    Avg. Divergent Branches                       3.79
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 13.74%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 4534 excessive sectors (34% of the total  
+          13506 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations.     
+          The CUDA Programming Guide                                                                                    
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void build_hashtable_c<(int)32, (int)32>(int *, int *, int *, int, int *, int) (293, 1, 1)x(32, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/nsecond         4.53
+    SM Frequency            cycle/usecond       531.83
+    Elapsed Cycles                  cycle        20019
+    Memory Throughput                   %        70.14
+    DRAM Throughput                     %        70.14
+    Duration                      usecond        37.63
+    L1/TEX Cache Throughput             %        50.52
+    L2 Cache Throughput                 %        22.06
+    SM Active Cycles                cycle     17373.85
+    Compute (SM) Throughput             %        11.78
+    ----------------------- ------------- ------------
+
+    OPT   Memory is more heavily utilized than Compute: Look at the Memory Workload Analysis section to identify the    
+          DRAM bottleneck. Check memory replay (coalescing) metrics to make sure you're efficiently utilizing the       
+          bytes transferred. Also consider whether it is possible to do more work per memory access (kernel fusion) or  
+          whether there are values you can (re)compute.                                                                 
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 32:1. The kernel achieved 0% of 
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.52
+    Executed Ipc Elapsed  inst/cycle         0.45
+    Issue Slots Busy               %        13.06
+    Issued Ipc Active     inst/cycle         0.52
+    SM Busy                        %        13.06
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 91.49%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second       203.39
+    Mem Busy                     %        22.06
+    Max Bandwidth                %        70.14
+    L1/TEX Hit Rate              %            0
+    L2 Hit Rate                  %        24.11
+    Mem Pipes Busy               %        11.78
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 0.2767%                                                                                         
+          The memory access pattern for global stores in L1TEX might not be optimal. On average, this kernel accesses   
+          4.0 bytes per thread per memory request; but the address pattern, possibly caused by the stride between       
+          threads, results in 4.8 sectors per request, or 4.8*32 = 155.0 bytes of cache data transfers per request.     
+          The optimal thread address pattern for 4.0 byte accesses would result in 4.0*32 = 128.0 bytes of cache data   
+          transfers per request, to maximize L1TEX cache performance. Check the Source Counters section for             
+          uncoalesced global stores.                                                                                    
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 2.114%                                                                                          
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 2.3 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        11.87
+    Issued Warp Per Scheduler                        0.12
+    No Eligible                            %        88.13
+    Active Warps Per Scheduler          warp         1.61
+    Eligible Warps Per Scheduler        warp         0.13
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 29.86%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 8.4 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 8 warps per scheduler, this kernel allocates an average of    
+          1.61 active warps per scheduler, but only an average of 0.13 warps were eligible per cycle. Eligible warps    
+          are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible   
+          warp results in no instruction being issued and the issue slot remains unused. To increase the number of      
+          eligible warps, reduce the time the active warps are stalled by inspecting the top stall reasons on the Warp  
+          State Statistics and Source Counters sections.                                                                
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        13.56
+    Warp Cycles Per Executed Instruction           cycle        13.64
+    Avg. Active Threads Per Warp                                13.66
+    Avg. Not Predicated Off Threads Per Warp                    12.82
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 7.061%                                                                                          
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 13.7 threads being active per cycle. This is further reduced    
+          to 12.8 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst      2255.26
+    Executed Instructions                           inst       360842
+    Avg. Issued Instructions Per Scheduler          inst      2269.41
+    Issued Instructions                             inst       363105
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                    32
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                    293
+    Registers Per Thread             register/thread             108
+    Shared Memory Configuration Size           Kbyte           32.77
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread            9376
+    Waves Per SM                                                0.46
+    -------------------------------- --------------- ---------------
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           16
+    Block Limit Registers                 block           16
+    Block Limit Shared Mem                block           16
+    Block Limit Warps                     block           32
+    Theoretical Active Warps per SM        warp           16
+    Theoretical Occupancy                     %           50
+    Achieved Occupancy                        %        22.02
+    Achieved Active Warps Per SM           warp         7.05
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 29.86%                                                                                          
+          The difference between calculated theoretical (50.0%) and measured achieved occupancy (22.0%) can be the      
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 29.86%                                                                                          
+          The 4.00 theoretical warps per scheduler this kernel can issue according to its occupancy are below the       
+          hardware maximum of 8. This kernel's theoretical occupancy (50.0%) is limited by the number of blocks that    
+          can fit on the SM. This kernel's theoretical occupancy (50.0%) is limited by the number of required           
+          registers. This kernel's theoretical occupancy (50.0%) is limited by the required amount of shared memory.    
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.08
+    Branch Instructions              inst        30470
+    Branch Efficiency                   %        13.96
+    Avg. Divergent Branches                      56.74
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 29.55%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 68642 excessive sectors (34% of the total 
+          203290 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations.    
+          The CUDA Programming Guide                                                                                    
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void build_hashtable_d<(int)32, (int)32>(int *, int *, int, int *, int, int) (3, 1, 1)x(32, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/nsecond         4.42
+    SM Frequency            cycle/usecond       518.66
+    Elapsed Cycles                  cycle         9146
+    Memory Throughput                   %         1.42
+    DRAM Throughput                     %         1.42
+    Duration                      usecond        17.63
+    L1/TEX Cache Throughput             %        13.21
+    L2 Cache Throughput                 %         0.74
+    SM Active Cycles                cycle       518.92
+    Compute (SM) Throughput             %         0.22
+    ----------------------- ------------- ------------
+
+    OPT   This kernel grid is too small to fill the available resources on this device, resulting in only 0.0 full      
+          waves across all SMs. Look at Launch Statistics for more details.                                             
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 32:1. The kernel achieved 0% of 
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.16
+    Executed Ipc Elapsed  inst/cycle         0.01
+    Issue Slots Busy               %         3.93
+    Issued Ipc Active     inst/cycle         0.16
+    SM Busy                        %         3.93
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 97.12%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second         4.03
+    Mem Busy                     %         0.69
+    Max Bandwidth                %         1.42
+    L1/TEX Hit Rate              %            0
+    L2 Hit Rate                  %        52.25
+    Mem Pipes Busy               %         0.18
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 0.01799%                                                                                        
+          The memory access pattern for global stores in L1TEX might not be optimal. On average, this kernel accesses   
+          4.0 bytes per thread per memory request; but the address pattern, possibly caused by the stride between       
+          threads, results in 9.0 sectors per request, or 9.0*32 = 289.4 bytes of cache data transfers per request.     
+          The optimal thread address pattern for 4.0 byte accesses would result in 4.0*32 = 128.0 bytes of cache data   
+          transfers per request, to maximize L1TEX cache performance. Check the Source Counters section for             
+          uncoalesced global stores.                                                                                    
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 0.06095%                                                                                        
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 2.4 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        13.86
+    Issued Warp Per Scheduler                        0.14
+    No Eligible                            %        86.14
+    Active Warps Per Scheduler          warp         0.87
+    Eligible Warps Per Scheduler        warp         0.14
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 86.14%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 7.2 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 8 warps per scheduler, this kernel allocates an average of    
+          0.87 active warps per scheduler, which already limits the scheduler to less than a warp per instruction.      
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle         6.29
+    Warp Cycles Per Executed Instruction           cycle         6.34
+    Avg. Active Threads Per Warp                                31.85
+    Avg. Not Predicated Off Threads Per Warp                    27.96
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 40.64%                                                                                          
+          On average, each warp of this kernel spends 2.6 cycles being stalled waiting on a fixed latency execution     
+          dependency. Typically, this stall reason should be very low and only shows up as a top contributor in         
+          already highly optimized kernels. Try to hide the corresponding instruction latencies by increasing the       
+          number of active warps, restructuring the code or unrolling loops. Furthermore, consider switching to         
+          lower-latency instructions, e.g. by making use of fast math compiler options. This stall type represents      
+          about 40.6% of the total average of 6.3 cycles between issuing two instructions.                              
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst        20.21
+    Executed Instructions                           inst         3234
+    Avg. Issued Instructions Per Scheduler          inst        20.38
+    Issued Instructions                             inst         3260
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                    32
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                      3
+    Registers Per Thread             register/thread             112
+    Shared Memory Configuration Size           Kbyte           32.77
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread              96
+    Waves Per SM                                                0.00
+    -------------------------------- --------------- ---------------
+
+    OPT   Est. Speedup: 92.5%                                                                                           
+          The grid for this launch is configured to execute only 3 blocks, which is less than the GPU's 40              
+          multiprocessors. This can underutilize some multiprocessors. If you do not intend to execute this kernel      
+          concurrently with other workloads, consider reducing the block size to have at least one block per            
+          multiprocessor or increase the size of the grid to fully utilize the available hardware resources. See the    
+          Hardware Model (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-hw-model)            
+          description for more details on launch configurations.                                                        
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           16
+    Block Limit Registers                 block           16
+    Block Limit Shared Mem                block           16
+    Block Limit Warps                     block           32
+    Theoretical Active Warps per SM        warp           16
+    Theoretical Occupancy                     %           50
+    Achieved Occupancy                        %         3.12
+    Achieved Active Warps Per SM           warp            1
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 86.14%                                                                                          
+          The difference between calculated theoretical (50.0%) and measured achieved occupancy (3.1%) can be the       
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 50%                                                                                             
+          The 4.00 theoretical warps per scheduler this kernel can issue according to its occupancy are below the       
+          hardware maximum of 8. This kernel's theoretical occupancy (50.0%) is limited by the number of blocks that    
+          can fit on the SM. This kernel's theoretical occupancy (50.0%) is limited by the number of required           
+          registers. This kernel's theoretical occupancy (50.0%) is limited by the required amount of shared memory.    
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.07
+    Branch Instructions              inst          223
+    Branch Efficiency                   %        99.11
+    Avg. Divergent Branches                       0.01
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 4.898%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 700 excessive sectors (37% of the total   
+          1888 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations. The  
+          CUDA Programming Guide                                                                                        
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void probe<(int)32, (int)32>(int *, int *, int *, int *, int, int *, int, int *, int, int *, int, int *) (58581, 1, 1)x(32, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/nsecond         4.99
+    SM Frequency            cycle/usecond       584.38
+    Elapsed Cycles                  cycle      6853895
+    Memory Throughput                   %        21.08
+    DRAM Throughput                     %        17.58
+    Duration                      msecond        11.73
+    L1/TEX Cache Throughput             %        42.16
+    L2 Cache Throughput                 %        14.55
+    SM Active Cycles                cycle   6828514.45
+    Compute (SM) Throughput             %        18.87
+    ----------------------- ------------- ------------
+
+    OPT   This kernel exhibits low compute throughput and memory bandwidth utilization relative to the peak performance 
+          of this device. Achieved compute throughput and/or memory bandwidth below 60.0% of peak typically indicate    
+          latency issues. Look at Scheduler Statistics and Warp State Statistics for potential reasons.                 
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 32:1. The kernel achieved 0% of 
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.76
+    Executed Ipc Elapsed  inst/cycle         0.75
+    Issue Slots Busy               %        18.94
+    Issued Ipc Active     inst/cycle         0.76
+    SM Busy                        %        18.94
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 86.06%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second        56.15
+    Mem Busy                     %        21.08
+    Max Bandwidth                %        17.58
+    L1/TEX Hit Rate              %        66.47
+    L2 Hit Rate                  %        67.67
+    Mem Pipes Busy               %        15.30
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 0.9745%                                                                                         
+          The memory access pattern for global loads in L1TEX might not be optimal. On average, this kernel accesses    
+          5.2 bytes per thread per memory request; but the address pattern, possibly caused by the stride between       
+          threads, results in 9.5 sectors per request, or 9.5*32 = 305.0 bytes of cache data transfers per request.     
+          The optimal thread address pattern for 5.2 byte accesses would result in 5.2*32 = 165.4 bytes of cache data   
+          transfers per request, to maximize L1TEX cache performance. Check the Source Counters section for             
+          uncoalesced global loads.                                                                                     
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 7.17%                                                                                           
+          The memory access pattern for loads from L1TEX to L2 is not optimal. The granularity of an L1TEX request to   
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 1.8 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced loads and try to minimize how many cache lines need to be accessed per memory         
+          request.                                                                                                      
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 0.7099%                                                                                         
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 1.2 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        18.90
+    Issued Warp Per Scheduler                        0.19
+    No Eligible                            %        81.10
+    Active Warps Per Scheduler          warp         1.95
+    Eligible Warps Per Scheduler        warp         0.21
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 78.92%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 5.3 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 8 warps per scheduler, this kernel allocates an average of    
+          1.95 active warps per scheduler, but only an average of 0.21 warps were eligible per cycle. Eligible warps    
+          are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible   
+          warp results in no instruction being issued and the issue slot remains unused. To increase the number of      
+          eligible warps, avoid possible load imbalances due to highly different execution durations per warp.          
+          Reducing stalls indicated on the Warp State Statistics and Source Counters sections can help, too.            
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        10.34
+    Warp Cycles Per Executed Instruction           cycle        10.34
+    Avg. Active Threads Per Warp                                17.49
+    Avg. Not Predicated Off Threads Per Warp                    16.10
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 33%                                                                                             
+          On average, each warp of this kernel spends 3.4 cycles being stalled waiting for a scoreboard dependency on a 
+          L1TEX (local, global, surface, texture) operation. Find the instruction producing the data being waited upon  
+          to identify the culprit. To reduce the number of cycles waiting on L1TEX data accesses verify the memory      
+          access patterns are optimal for the target architecture, attempt to increase cache hit rates by increasing    
+          data locality (coalescing), or by changing the cache configuration. Consider moving frequently used data to   
+          shared memory. This stall type represents about 33.0% of the total average of 10.3 cycles between issuing     
+          two instructions.                                                                                             
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 9.378%                                                                                          
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 17.5 threads being active per cycle. This is further reduced    
+          to 16.1 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst   1293266.36
+    Executed Instructions                           inst    206922618
+    Avg. Issued Instructions Per Scheduler          inst   1293274.32
+    Issued Instructions                             inst    206923891
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                    32
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                  58581
+    Registers Per Thread             register/thread             205
+    Shared Memory Configuration Size           Kbyte           32.77
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block       Kbyte/block            4.10
+    Threads                                   thread         1874592
+    Waves Per SM                                              183.07
+    -------------------------------- --------------- ---------------
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           16
+    Block Limit Registers                 block            8
+    Block Limit Shared Mem                block            8
+    Block Limit Warps                     block           32
+    Theoretical Active Warps per SM        warp            8
+    Theoretical Occupancy                     %           25
+    Achieved Occupancy                        %        24.53
+    Achieved Active Warps Per SM           warp         7.85
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 75%                                                                                             
+          The 2.00 theoretical warps per scheduler this kernel can issue according to its occupancy are below the       
+          hardware maximum of 8. This kernel's theoretical occupancy (25.0%) is limited by the number of required       
+          registers. This kernel's theoretical occupancy (25.0%) is limited by the required amount of shared memory.    
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.08
+    Branch Instructions              inst     15774779
+    Branch Efficiency                   %        28.33
+    Avg. Divergent Branches                   26239.46
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 52.44%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 53131796 excessive sectors (57% of the    
+          total 93609280 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source       
+          locations. The CUDA Programming Guide                                                                         
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
diff --git a/results/T4/crystal-fls/crystal_fls_q41_sf10.txt b/results/T4/crystal-fls/crystal_fls_q41_sf10.txt
new file mode 100644
index 0000000..86a6d0d
--- /dev/null
+++ b/results/T4/crystal-fls/crystal_fls_q41_sf10.txt
@@ -0,0 +1,976 @@
+==PROF== Connected to process 35734 (/home/ubuntu/fff/cmake-build-release-g4dn/gpu/fastlanes/src/fls_q41_bitpacked)
+[33m-- lo_supplycost_min: 54058[39m
+[33m-- lo_supplycost_max: 125939[39m
+[33m-- x: 17[39m
+** LOADED DATA **
+** LOADED DATA TO GPU **
+==PROF== Profiling "build_hashtable_s" - 0: 0%....50%....100% - 35 passes
+==PROF== Profiling "build_hashtable_c" - 1: 0%....50%....100% - 35 passes
+==PROF== Profiling "build_hashtable_p" - 2: 0%....50%....100% - 35 passes
+==PROF== Profiling "build_hashtable_d" - 3: 0%....50%....100% - 35 passes
+==PROF== Profiling "probe" - 4: 0%....50%....100% - 35 passes
+Result:
+1992 1 95648156129
+1993 1 94729332349
+1994 1 95597424703
+1995 1 95805325063
+1996 1 94652122963
+1997 1 94478591046
+1998 1 56309816419
+1992 2 95342660133
+1993 2 95250025749
+1994 2 94195532230
+1995 2 95996690160
+1996 2 96142014806
+1997 2 93876031202
+1998 2 55831743925
+1992 3 96438974413
+1993 3 96408487670
+1994 3 96470291285
+1995 3 95787590173
+1996 3 95344337992
+1997 3 96140765753
+1998 3 57572082481
+1992 17 95490328560
+1993 17 94395796636
+1994 17 94889321105
+1995 17 95105380702
+1996 17 94835998843
+1997 17 93544195524
+1998 17 55465310361
+1992 24 102205824310
+1993 24 99557934518
+1994 24 101333360576
+1995 24 101617327862
+1996 24 101030018687
+1997 24 100380310171
+1998 24 58632896929
+Res Count: 35
+Time Taken Total: 34525.6
+{"query":41,"time_query":34525.6}
+==PROF== Disconnected from process 35734
+[35734] fls_q41_bitpacked@127.0.0.1
+  void build_hashtable_s<(int)32, (int)32>(int *, int *, int, int *, int) (20, 1, 1)x(32, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/nsecond         4.71
+    SM Frequency            cycle/usecond       550.19
+    Elapsed Cycles                  cycle         9758
+    Memory Throughput                   %         6.05
+    DRAM Throughput                     %         6.05
+    Duration                      usecond        17.73
+    L1/TEX Cache Throughput             %         4.32
+    L2 Cache Throughput                 %         1.86
+    SM Active Cycles                cycle      4157.12
+    Compute (SM) Throughput             %         1.45
+    ----------------------- ------------- ------------
+
+    OPT   This kernel grid is too small to fill the available resources on this device, resulting in only 0.0 full      
+          waves across all SMs. Look at Launch Statistics for more details.                                             
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 32:1. The kernel achieved 0% of 
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.13
+    Executed Ipc Elapsed  inst/cycle         0.06
+    Issue Slots Busy               %         3.40
+    Issued Ipc Active     inst/cycle         0.14
+    SM Busy                        %         3.40
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 97.49%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second        18.23
+    Mem Busy                     %         1.86
+    Max Bandwidth                %         6.05
+    L1/TEX Hit Rate              %            0
+    L2 Hit Rate                  %        17.16
+    Mem Pipes Busy               %         0.99
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        12.76
+    Issued Warp Per Scheduler                        0.13
+    No Eligible                            %        87.24
+    Active Warps Per Scheduler          warp         0.94
+    Eligible Warps Per Scheduler        warp         0.13
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 87.24%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 7.8 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 8 warps per scheduler, this kernel allocates an average of    
+          0.94 active warps per scheduler, which already limits the scheduler to less than a warp per instruction.      
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle         7.34
+    Warp Cycles Per Executed Instruction           cycle         7.39
+    Avg. Active Threads Per Warp                                13.33
+    Avg. Not Predicated Off Threads Per Warp                    12.37
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 47.21%                                                                                          
+          On average, each warp of this kernel spends 3.5 cycles being stalled waiting on a fixed latency execution     
+          dependency. Typically, this stall reason should be very low and only shows up as a top contributor in         
+          already highly optimized kernels. Try to hide the corresponding instruction latencies by increasing the       
+          number of active warps, restructuring the code or unrolling loops. Furthermore, consider switching to         
+          lower-latency instructions, e.g. by making use of fast math compiler options. This stall type represents      
+          about 47.2% of the total average of 7.3 cycles between issuing two instructions.                              
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 0.8884%                                                                                         
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 13.3 threads being active per cycle. This is further reduced    
+          to 12.4 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst       140.24
+    Executed Instructions                           inst        22438
+    Avg. Issued Instructions Per Scheduler          inst       141.24
+    Issued Instructions                             inst        22599
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                    32
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                     20
+    Registers Per Thread             register/thread              72
+    Shared Memory Configuration Size           Kbyte           32.77
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread             640
+    Waves Per SM                                                0.03
+    -------------------------------- --------------- ---------------
+
+    OPT   Est. Speedup: 50%                                                                                             
+          The grid for this launch is configured to execute only 20 blocks, which is less than the GPU's 40             
+          multiprocessors. This can underutilize some multiprocessors. If you do not intend to execute this kernel      
+          concurrently with other workloads, consider reducing the block size to have at least one block per            
+          multiprocessor or increase the size of the grid to fully utilize the available hardware resources. See the    
+          Hardware Model (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-hw-model)            
+          description for more details on launch configurations.                                                        
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           16
+    Block Limit Registers                 block           28
+    Block Limit Shared Mem                block           16
+    Block Limit Warps                     block           32
+    Theoretical Active Warps per SM        warp           16
+    Theoretical Occupancy                     %           50
+    Achieved Occupancy                        %         3.12
+    Achieved Active Warps Per SM           warp            1
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 87.24%                                                                                          
+          The difference between calculated theoretical (50.0%) and measured achieved occupancy (3.1%) can be the       
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 50%                                                                                             
+          The 4.00 theoretical warps per scheduler this kernel can issue according to its occupancy are below the       
+          hardware maximum of 8. This kernel's theoretical occupancy (50.0%) is limited by the number of blocks that    
+          can fit on the SM. This kernel's theoretical occupancy (50.0%) is limited by the required amount of shared    
+          memory.                                                                                                       
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.09
+    Branch Instructions              inst         2027
+    Branch Efficiency                   %        13.94
+    Avg. Divergent Branches                       3.78
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 7.609%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 1441 excessive sectors (20% of the total  
+          7183 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations. The  
+          CUDA Programming Guide                                                                                        
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void build_hashtable_c<(int)32, (int)32>(int *, int *, int *, int, int *, int) (293, 1, 1)x(32, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/nsecond         4.46
+    SM Frequency            cycle/usecond       523.31
+    Elapsed Cycles                  cycle        19649
+    Memory Throughput                   %        71.45
+    DRAM Throughput                     %        71.45
+    Duration                      usecond        37.54
+    L1/TEX Cache Throughput             %        51.17
+    L2 Cache Throughput                 %        22.35
+    SM Active Cycles                cycle     17187.80
+    Compute (SM) Throughput             %        12.00
+    ----------------------- ------------- ------------
+
+    OPT   Memory is more heavily utilized than Compute: Look at the Memory Workload Analysis section to identify the    
+          DRAM bottleneck. Check memory replay (coalescing) metrics to make sure you're efficiently utilizing the       
+          bytes transferred. Also consider whether it is possible to do more work per memory access (kernel fusion) or  
+          whether there are values you can (re)compute.                                                                 
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 32:1. The kernel achieved 0% of 
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.52
+    Executed Ipc Elapsed  inst/cycle         0.46
+    Issue Slots Busy               %        13.20
+    Issued Ipc Active     inst/cycle         0.53
+    SM Busy                        %        13.20
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 91.4%                                                                                     
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second       204.13
+    Mem Busy                     %        22.35
+    Max Bandwidth                %        71.45
+    L1/TEX Hit Rate              %            0
+    L2 Hit Rate                  %        24.07
+    Mem Pipes Busy               %        12.00
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 0.2721%                                                                                         
+          The memory access pattern for global stores in L1TEX might not be optimal. On average, this kernel accesses   
+          4.0 bytes per thread per memory request; but the address pattern, possibly caused by the stride between       
+          threads, results in 4.8 sectors per request, or 4.8*32 = 154.0 bytes of cache data transfers per request.     
+          The optimal thread address pattern for 4.0 byte accesses would result in 4.0*32 = 128.0 bytes of cache data   
+          transfers per request, to maximize L1TEX cache performance. Check the Source Counters section for             
+          uncoalesced global stores.                                                                                    
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 2.144%                                                                                          
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 2.3 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        11.83
+    Issued Warp Per Scheduler                        0.12
+    No Eligible                            %        88.17
+    Active Warps Per Scheduler          warp         1.58
+    Eligible Warps Per Scheduler        warp         0.13
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 28.55%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 8.4 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 8 warps per scheduler, this kernel allocates an average of    
+          1.58 active warps per scheduler, but only an average of 0.13 warps were eligible per cycle. Eligible warps    
+          are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible   
+          warp results in no instruction being issued and the issue slot remains unused. To increase the number of      
+          eligible warps, reduce the time the active warps are stalled by inspecting the top stall reasons on the Warp  
+          State Statistics and Source Counters sections.                                                                
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        13.36
+    Warp Cycles Per Executed Instruction           cycle        13.44
+    Avg. Active Threads Per Warp                                13.61
+    Avg. Not Predicated Off Threads Per Warp                    12.78
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 7.208%                                                                                          
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 13.6 threads being active per cycle. This is further reduced    
+          to 12.8 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst      2254.43
+    Executed Instructions                           inst       360709
+    Avg. Issued Instructions Per Scheduler          inst      2268.80
+    Issued Instructions                             inst       363008
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                    32
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                    293
+    Registers Per Thread             register/thread             108
+    Shared Memory Configuration Size           Kbyte           32.77
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread            9376
+    Waves Per SM                                                0.46
+    -------------------------------- --------------- ---------------
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           16
+    Block Limit Registers                 block           16
+    Block Limit Shared Mem                block           16
+    Block Limit Warps                     block           32
+    Theoretical Active Warps per SM        warp           16
+    Theoretical Occupancy                     %           50
+    Achieved Occupancy                        %        22.11
+    Achieved Active Warps Per SM           warp         7.07
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 28.55%                                                                                          
+          The difference between calculated theoretical (50.0%) and measured achieved occupancy (22.1%) can be the      
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 28.55%                                                                                          
+          The 4.00 theoretical warps per scheduler this kernel can issue according to its occupancy are below the       
+          hardware maximum of 8. This kernel's theoretical occupancy (50.0%) is limited by the number of blocks that    
+          can fit on the SM. This kernel's theoretical occupancy (50.0%) is limited by the number of required           
+          registers. This kernel's theoretical occupancy (50.0%) is limited by the required amount of shared memory.    
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.08
+    Branch Instructions              inst        30465
+    Branch Efficiency                   %        14.02
+    Avg. Divergent Branches                      56.71
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 29.97%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 68196 excessive sectors (34% of the total 
+          202634 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations.    
+          The CUDA Programming Guide                                                                                    
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void build_hashtable_p<(int)32, (int)32>(int *, int *, int, int *, int) (782, 1, 1)x(32, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/nsecond         4.51
+    SM Frequency            cycle/usecond       527.85
+    Elapsed Cycles                  cycle        34363
+    Memory Throughput                   %        74.61
+    DRAM Throughput                     %        74.61
+    Duration                      usecond        65.09
+    L1/TEX Cache Throughput             %        47.09
+    L2 Cache Throughput                 %        20.22
+    SM Active Cycles                cycle     32290.62
+    Compute (SM) Throughput             %        17.42
+    ----------------------- ------------- ------------
+
+    OPT   Memory is more heavily utilized than Compute: Look at the Memory Workload Analysis section to identify the    
+          DRAM bottleneck. Check memory replay (coalescing) metrics to make sure you're efficiently utilizing the       
+          bytes transferred. Also consider whether it is possible to do more work per memory access (kernel fusion) or  
+          whether there are values you can (re)compute.                                                                 
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 32:1. The kernel achieved 0% of 
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.74
+    Executed Ipc Elapsed  inst/cycle         0.69
+    Issue Slots Busy               %        18.54
+    Issued Ipc Active     inst/cycle         0.74
+    SM Busy                        %        18.54
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 85.33%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second       215.24
+    Mem Busy                     %        19.31
+    Max Bandwidth                %        74.61
+    L1/TEX Hit Rate              %            0
+    L2 Hit Rate                  %         6.87
+    Mem Pipes Busy               %        11.04
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        17.42
+    Issued Warp Per Scheduler                        0.17
+    No Eligible                            %        82.58
+    Active Warps Per Scheduler          warp         2.93
+    Eligible Warps Per Scheduler        warp         0.21
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 25.39%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 5.7 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 8 warps per scheduler, this kernel allocates an average of    
+          2.93 active warps per scheduler, but only an average of 0.21 warps were eligible per cycle. Eligible warps    
+          are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible   
+          warp results in no instruction being issued and the issue slot remains unused. To increase the number of      
+          eligible warps, reduce the time the active warps are stalled by inspecting the top stall reasons on the Warp  
+          State Statistics and Source Counters sections.                                                                
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        16.83
+    Warp Cycles Per Executed Instruction           cycle        16.89
+    Avg. Active Threads Per Warp                                18.10
+    Avg. Not Predicated Off Threads Per Warp                    16.43
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 8.479%                                                                                          
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 18.1 threads being active per cycle. This is further reduced    
+          to 16.4 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst      5965.29
+    Executed Instructions                           inst       954446
+    Avg. Issued Instructions Per Scheduler          inst      5986.18
+    Issued Instructions                             inst       957788
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                    32
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                    782
+    Registers Per Thread             register/thread              78
+    Shared Memory Configuration Size           Kbyte           32.77
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread           25024
+    Waves Per SM                                                1.22
+    -------------------------------- --------------- ---------------
+
+    OPT   Est. Speedup: 50%                                                                                             
+          A wave of thread blocks is defined as the maximum number of blocks that can be executed in parallel on the    
+          target GPU. The number of blocks in a wave depends on the number of multiprocessors and the theoretical       
+          occupancy of the kernel. This kernel launch results in 1 full waves and a partial wave of 142 thread blocks.  
+          Under the assumption of a uniform execution duration of all thread blocks, the partial wave may account for   
+          up to 50.0% of the total kernel runtime with a lower occupancy of 21.7%. Try launching a grid with no         
+          partial wave. The overall impact of this tail effect also lessens with the number of full waves executed for  
+          a grid. See the Hardware Model                                                                                
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-hw-model) description for more      
+          details on launch configurations.                                                                             
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           16
+    Block Limit Registers                 block           24
+    Block Limit Shared Mem                block           16
+    Block Limit Warps                     block           32
+    Theoretical Active Warps per SM        warp           16
+    Theoretical Occupancy                     %           50
+    Achieved Occupancy                        %        39.17
+    Achieved Active Warps Per SM           warp        12.53
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 21.66%                                                                                          
+          The difference between calculated theoretical (50.0%) and measured achieved occupancy (39.2%) can be the      
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 25.39%                                                                                          
+          The 4.00 theoretical warps per scheduler this kernel can issue according to its occupancy are below the       
+          hardware maximum of 8. This kernel's theoretical occupancy (50.0%) is limited by the number of blocks that    
+          can fit on the SM. This kernel's theoretical occupancy (50.0%) is limited by the required amount of shared    
+          memory.                                                                                                       
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.09
+    Branch Instructions              inst        81308
+    Branch Efficiency                   %        13.98
+    Avg. Divergent Branches                     151.37
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 17.31%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 57145 excessive sectors (19% of the total 
+          308036 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations.    
+          The CUDA Programming Guide                                                                                    
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void build_hashtable_d<(int)32, (int)32>(int *, int *, int, int *, int, int) (3, 1, 1)x(32, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/nsecond         4.34
+    SM Frequency            cycle/usecond       511.05
+    Elapsed Cycles                  cycle         6266
+    Memory Throughput                   %         2.03
+    DRAM Throughput                     %         2.03
+    Duration                      usecond        12.26
+    L1/TEX Cache Throughput             %        26.28
+    L2 Cache Throughput                 %         1.12
+    SM Active Cycles                cycle       299.45
+    Compute (SM) Throughput             %         0.28
+    ----------------------- ------------- ------------
+
+    OPT   This kernel grid is too small to fill the available resources on this device, resulting in only 0.0 full      
+          waves across all SMs. Look at Launch Statistics for more details.                                             
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 32:1. The kernel achieved 0% of 
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.18
+    Executed Ipc Elapsed  inst/cycle         0.01
+    Issue Slots Busy               %         4.64
+    Issued Ipc Active     inst/cycle         0.19
+    SM Busy                        %         4.64
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 96.56%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second         5.64
+    Mem Busy                     %         0.88
+    Max Bandwidth                %         2.03
+    L1/TEX Hit Rate              %            0
+    L2 Hit Rate                  %        57.06
+    Mem Pipes Busy               %         0.28
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 0.02919%                                                                                        
+          The memory access pattern for global stores in L1TEX might not be optimal. On average, this kernel accesses   
+          4.0 bytes per thread per memory request; but the address pattern, possibly caused by the stride between       
+          threads, results in 9.1 sectors per request, or 9.1*32 = 290.4 bytes of cache data transfers per request.     
+          The optimal thread address pattern for 4.0 byte accesses would result in 4.0*32 = 128.0 bytes of cache data   
+          transfers per request, to maximize L1TEX cache performance. Check the Source Counters section for             
+          uncoalesced global stores.                                                                                    
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 0.0978%                                                                                         
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 2.5 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        15.34
+    Issued Warp Per Scheduler                        0.15
+    No Eligible                            %        84.66
+    Active Warps Per Scheduler          warp         0.82
+    Eligible Warps Per Scheduler        warp         0.15
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 84.66%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 6.5 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 8 warps per scheduler, this kernel allocates an average of    
+          0.82 active warps per scheduler, which already limits the scheduler to less than a warp per instruction.      
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle         5.32
+    Warp Cycles Per Executed Instruction           cycle         5.39
+    Avg. Active Threads Per Warp                                31.93
+    Avg. Not Predicated Off Threads Per Warp                    27.49
+    ---------------------------------------- ----------- ------------
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst        13.71
+    Executed Instructions                           inst         2193
+    Avg. Issued Instructions Per Scheduler          inst        13.88
+    Issued Instructions                             inst         2221
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                    32
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                      3
+    Registers Per Thread             register/thread              80
+    Shared Memory Configuration Size           Kbyte           32.77
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread              96
+    Waves Per SM                                                0.00
+    -------------------------------- --------------- ---------------
+
+    OPT   Est. Speedup: 92.5%                                                                                           
+          The grid for this launch is configured to execute only 3 blocks, which is less than the GPU's 40              
+          multiprocessors. This can underutilize some multiprocessors. If you do not intend to execute this kernel      
+          concurrently with other workloads, consider reducing the block size to have at least one block per            
+          multiprocessor or increase the size of the grid to fully utilize the available hardware resources. See the    
+          Hardware Model (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-hw-model)            
+          description for more details on launch configurations.                                                        
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           16
+    Block Limit Registers                 block           24
+    Block Limit Shared Mem                block           16
+    Block Limit Warps                     block           32
+    Theoretical Active Warps per SM        warp           16
+    Theoretical Occupancy                     %           50
+    Achieved Occupancy                        %         3.12
+    Achieved Active Warps Per SM           warp            1
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 84.66%                                                                                          
+          The difference between calculated theoretical (50.0%) and measured achieved occupancy (3.1%) can be the       
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 50%                                                                                             
+          The 4.00 theoretical warps per scheduler this kernel can issue according to its occupancy are below the       
+          hardware maximum of 8. This kernel's theoretical occupancy (50.0%) is limited by the number of blocks that    
+          can fit on the SM. This kernel's theoretical occupancy (50.0%) is limited by the required amount of shared    
+          memory.                                                                                                       
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.04
+    Branch Instructions              inst           83
+    Branch Efficiency                   %        97.62
+    Avg. Divergent Branches                       0.01
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 5.727%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 812 excessive sectors (39% of the total   
+          2092 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations. The  
+          CUDA Programming Guide                                                                                        
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void probe<(int)32, (int)32>(int *, int *, int *, int *, int *, int *, int, int *, int, int *, int, int *, int, int *, int, int *) (58581, 1, 1)x(32, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/nsecond         4.99
+    SM Frequency            cycle/usecond       584.03
+    Elapsed Cycles                  cycle      8907789
+    Memory Throughput                   %        25.29
+    DRAM Throughput                     %        25.29
+    Duration                      msecond        15.25
+    L1/TEX Cache Throughput             %        26.85
+    L2 Cache Throughput                 %         9.44
+    SM Active Cycles                cycle   8877244.35
+    Compute (SM) Throughput             %        16.59
+    ----------------------- ------------- ------------
+
+    OPT   This kernel exhibits low compute throughput and memory bandwidth utilization relative to the peak performance 
+          of this device. Achieved compute throughput and/or memory bandwidth below 60.0% of peak typically indicate    
+          latency issues. Look at Scheduler Statistics and Warp State Statistics for potential reasons.                 
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 32:1. The kernel achieved 0% of 
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.67
+    Executed Ipc Elapsed  inst/cycle         0.66
+    Issue Slots Busy               %        16.64
+    Issued Ipc Active     inst/cycle         0.67
+    SM Busy                        %        16.64
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 88.07%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second        80.72
+    Mem Busy                     %        13.42
+    Max Bandwidth                %        25.29
+    L1/TEX Hit Rate              %        61.29
+    L2 Hit Rate                  %        32.78
+    Mem Pipes Busy               %        13.87
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 0.7068%                                                                                         
+          The memory access pattern for global loads in L1TEX might not be optimal. On average, this kernel accesses    
+          4.5 bytes per thread per memory request; but the address pattern, possibly caused by the stride between       
+          threads, results in 7.9 sectors per request, or 7.9*32 = 254.3 bytes of cache data transfers per request.     
+          The optimal thread address pattern for 4.5 byte accesses would result in 4.5*32 = 144.5 bytes of cache data   
+          transfers per request, to maximize L1TEX cache performance. Check the Source Counters section for             
+          uncoalesced global loads.                                                                                     
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 4.407%                                                                                          
+          The memory access pattern for loads from L1TEX to L2 is not optimal. The granularity of an L1TEX request to   
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 2.0 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced loads and try to minimize how many cache lines need to be accessed per memory         
+          request.                                                                                                      
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 0.1486%                                                                                         
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 1.1 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        16.70
+    Issued Warp Per Scheduler                        0.17
+    No Eligible                            %        83.30
+    Active Warps Per Scheduler          warp         1.99
+    Eligible Warps Per Scheduler        warp         0.19
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 74.71%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 6.0 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 8 warps per scheduler, this kernel allocates an average of    
+          1.99 active warps per scheduler, but only an average of 0.19 warps were eligible per cycle. Eligible warps    
+          are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible   
+          warp results in no instruction being issued and the issue slot remains unused. To increase the number of      
+          eligible warps, avoid possible load imbalances due to highly different execution durations per warp.          
+          Reducing stalls indicated on the Warp State Statistics and Source Counters sections can help, too.            
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        11.89
+    Warp Cycles Per Executed Instruction           cycle        11.89
+    Avg. Active Threads Per Warp                                18.07
+    Avg. Not Predicated Off Threads Per Warp                    16.92
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 38.14%                                                                                          
+          On average, each warp of this kernel spends 4.5 cycles being stalled waiting for a scoreboard dependency on a 
+          L1TEX (local, global, surface, texture) operation. Find the instruction producing the data being waited upon  
+          to identify the culprit. To reduce the number of cycles waiting on L1TEX data accesses verify the memory      
+          access patterns are optimal for the target architecture, attempt to increase cache hit rates by increasing    
+          data locality (coalescing), or by changing the cache configuration. Consider moving frequently used data to   
+          shared memory. This stall type represents about 38.1% of the total average of 11.9 cycles between issuing     
+          two instructions.                                                                                             
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 7.816%                                                                                          
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 18.1 threads being active per cycle. This is further reduced    
+          to 16.9 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst   1477385.36
+    Executed Instructions                           inst    236381658
+    Avg. Issued Instructions Per Scheduler          inst   1477393.64
+    Issued Instructions                             inst    236382983
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                    32
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                  58581
+    Registers Per Thread             register/thread             224
+    Shared Memory Configuration Size           Kbyte           32.77
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block       Kbyte/block            4.10
+    Threads                                   thread         1874592
+    Waves Per SM                                              183.07
+    -------------------------------- --------------- ---------------
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           16
+    Block Limit Registers                 block            8
+    Block Limit Shared Mem                block            8
+    Block Limit Warps                     block           32
+    Theoretical Active Warps per SM        warp            8
+    Theoretical Occupancy                     %           25
+    Achieved Occupancy                        %        24.78
+    Achieved Active Warps Per SM           warp         7.93
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 74.71%                                                                                          
+          The 2.00 theoretical warps per scheduler this kernel can issue according to its occupancy are below the       
+          hardware maximum of 8. This kernel's theoretical occupancy (25.0%) is limited by the number of required       
+          registers. This kernel's theoretical occupancy (25.0%) is limited by the required amount of shared memory.    
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.08
+    Branch Instructions              inst     19769372
+    Branch Efficiency                   %        45.84
+    Avg. Divergent Branches                   26771.21
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 45.13%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 55173969 excessive sectors (59% of the    
+          total 93609534 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source       
+          locations. The CUDA Programming Guide                                                                         
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
diff --git a/results/T4/crystal-opt/crystal_opt_q21.txt b/results/T4/crystal-opt/crystal_opt_q21.txt
new file mode 100644
index 0000000..9fa8351
--- /dev/null
+++ b/results/T4/crystal-opt/crystal_opt_q21.txt
@@ -0,0 +1,984 @@
+==PROF== Connected to process 21469 (/home/ubuntu/fff/cmake-build-release-g4dn/gpu/crystal-opt/src/crystal_opt_q21)
+Using device 0: Tesla T4 (PTX version 750, SM750, 40 SMs, 14802 free / 14929 total MB physmem, 320.064 GB/s @ 5001000 kHz mem clock, ECC on)
+==PROF== Profiling "build_hashtable_s" - 0: 0%....50%....100% - 35 passes
+==PROF== Profiling "build_hashtable_p" - 1: 0%....50%....100% - 35 passes
+==PROF== Profiling "build_hashtable_d" - 2: 0%....50%....100% - 35 passes
+==PROF== Profiling "probe" - 3: 0%....50%....100% - 35 passes
+1992 40 620792484
+1993 40 696134057
+1994 40 646183449
+1995 40 683789210
+1996 40 510952202
+1997 40 601329805
+1998 40 310212681
+1992 41 619072460
+1993 41 641906797
+1994 41 600001881
+1995 41 538237825
+1996 41 626948592
+1997 41 599768514
+1998 41 324844587
+1992 42 490930542
+1993 42 590494981
+1994 42 637973270
+1995 42 563917939
+1996 42 489891798
+1997 42 575414713
+1998 42 382795528
+1992 43 746180561
+1993 43 667809574
+1994 43 582681681
+1995 43 622468901
+1996 43 615832436
+1997 43 665181553
+1998 43 391879175
+1992 44 592643829
+1993 44 547150428
+1994 44 687417394
+1995 44 557813495
+1996 44 539509100
+1997 44 591240347
+1998 44 339362761
+1992 45 666818456
+1993 45 614006733
+1994 45 553771550
+1995 45 718580318
+1996 45 604069580
+1997 45 582504582
+1998 45 418644887
+1992 46 497930218
+1993 46 603429138
+1994 46 570930370
+1995 46 705815966
+1996 46 595735717
+1997 46 652582336
+1998 46 460900222
+1992 47 573950112
+1993 47 581793154
+1994 47 555531717
+1995 47 684622773
+1996 47 655735847
+1997 47 644209268
+1998 47 368241619
+1992 48 629436211
+1993 48 732890335
+1994 48 616859338
+1995 48 687199268
+1996 48 741147488
+1997 48 755567624
+1998 48 436528223
+1992 49 665878691
+1993 49 584135775
+1994 49 589016154
+1995 49 607002298
+1996 49 563497671
+1997 49 545937355
+1998 49 309175941
+1992 50 553577586
+1993 50 555705138
+1994 50 676062441
+1995 50 641710612
+1996 50 575388404
+1997 50 578436711
+1998 50 314285647
+1992 51 656700076
+1993 51 627130013
+1994 51 591728980
+1995 51 657535469
+1996 51 658819802
+1997 51 654743943
+1998 51 391790954
+1992 52 571366086
+1993 52 610589229
+1994 52 544138184
+1995 52 529390478
+1996 52 612212648
+1997 52 520737479
+1998 52 253877981
+1992 53 532838175
+1993 53 556392755
+1994 53 506771410
+1995 53 567384149
+1996 53 471880515
+1997 53 589317682
+1998 53 286129583
+1992 54 575100090
+1993 54 598048745
+1994 54 669399089
+1995 54 673188844
+1996 54 607298942
+1997 54 682448220
+1998 54 396578150
+1992 55 575644119
+1993 55 538626597
+1994 55 641068147
+1995 55 683443283
+1996 55 628222285
+1997 55 578103277
+1998 55 348596079
+1992 56 562595481
+1993 56 572869443
+1994 56 523516106
+1995 56 534863977
+1996 56 536099358
+1997 56 590451889
+1998 56 317284773
+1992 57 514802952
+1993 57 493315679
+1994 57 599287565
+1995 57 596024828
+1996 57 615338121
+1997 57 598601936
+1998 57 308349815
+1992 58 543346708
+1993 58 604020487
+1994 58 515506085
+1995 58 599834564
+1996 58 517842408
+1997 58 608170121
+1998 58 341434816
+1992 59 491786465
+1993 59 655668497
+1994 59 655183200
+1995 59 584917742
+1996 59 559185452
+1997 59 576734822
+1998 59 326633797
+1992 60 602373460
+1993 60 615880897
+1994 60 643804380
+1995 60 713302883
+1996 60 623220244
+1997 60 680711137
+1998 60 354376769
+1992 61 633928973
+1993 61 565901926
+1994 61 647661017
+1995 61 647289672
+1996 61 637768457
+1997 61 593124378
+1998 61 380715354
+1992 62 554701238
+1993 62 565208933
+1994 62 718895078
+1995 62 609303895
+1996 62 691969792
+1997 62 631016696
+1998 62 358310182
+1992 63 546043513
+1993 63 660789968
+1994 63 655833720
+1995 63 702057957
+1996 63 653344348
+1997 63 550179447
+1998 63 419353251
+1992 64 647297520
+1993 64 582390534
+1994 64 529474222
+1995 64 560461020
+1996 64 591003083
+1997 64 564085649
+1998 64 398848738
+1992 65 799379105
+1993 65 576848715
+1994 65 636493983
+1995 65 713329066
+1996 65 633922964
+1997 65 684284629
+1998 65 335073096
+1992 66 795689545
+1993 66 759898311
+1994 66 697404326
+1995 66 693856011
+1996 66 605367841
+1997 66 682817524
+1998 66 372528215
+1992 67 648627112
+1993 67 649305965
+1994 67 543254019
+1995 67 737599852
+1996 67 646443167
+1997 67 703348298
+1998 67 356128002
+1992 68 608555802
+1993 68 573449583
+1994 68 610859739
+1995 68 628687768
+1996 68 689535294
+1997 68 638125635
+1998 68 387752384
+1992 69 616083074
+1993 69 603750123
+1994 69 566272871
+1995 69 693347954
+1996 69 621193535
+1997 69 569915068
+1998 69 371569162
+1992 70 627072231
+1993 70 554942415
+1994 70 736308788
+1995 70 589463137
+1996 70 701770686
+1997 70 561626445
+1998 70 378101727
+1992 71 710702139
+1993 71 699720829
+1994 71 666578569
+1995 71 576221762
+1996 71 535280597
+1997 71 628168690
+1998 71 405952025
+1992 72 595000850
+1993 72 596286095
+1994 72 590361006
+1995 72 665356177
+1996 72 595905720
+1997 72 624894859
+1998 72 373008532
+1992 73 525131548
+1993 73 594835274
+1994 73 586002871
+1995 73 533249668
+1996 73 585914955
+1997 73 478354667
+1998 73 362991667
+1992 74 662752933
+1993 74 632459703
+1994 74 662533721
+1995 74 709515121
+1996 74 661386832
+1997 74 611544878
+1998 74 322587523
+1992 75 518515033
+1993 75 601539100
+1994 75 551806661
+1995 75 601270873
+1996 75 600967332
+1997 75 466864598
+1998 75 274934885
+1992 76 775269032
+1993 76 773058041
+1994 76 730757877
+1995 76 675467554
+1996 76 814445849
+1997 76 780193108
+1998 76 423598818
+1992 77 633158355
+1993 77 713278246
+1994 77 620767558
+1995 77 634874801
+1996 77 704631788
+1997 77 639898919
+1998 77 392782198
+1992 78 512420094
+1993 78 523847906
+1994 78 529135579
+1995 78 578182924
+1996 78 511907354
+1997 78 570804688
+1998 78 370584319
+1992 79 709935019
+1993 79 539722143
+1994 79 596339358
+1995 79 697095930
+1996 79 783593202
+1997 79 652546402
+1998 79 435709816
+Res Count: 280
+Time Taken Total: 25185.4
+{"query":21,"time_query":25185.4}
+==PROF== Disconnected from process 21469
+[21469] crystal_opt_q21@127.0.0.1
+  void build_hashtable_s<(int)128, (int)4>(int *, int *, int, int *, int) (63, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/nsecond         4.79
+    SM Frequency            cycle/usecond       556.02
+    Elapsed Cycles                  cycle         3811
+    Memory Throughput                   %        14.42
+    DRAM Throughput                     %        14.42
+    Duration                      usecond         6.85
+    L1/TEX Cache Throughput             %        12.70
+    L2 Cache Throughput                 %         5.09
+    SM Active Cycles                cycle      1615.80
+    Compute (SM) Throughput             %         3.19
+    ----------------------- ------------- ------------
+
+    OPT   This kernel grid is too small to fill the available resources on this device, resulting in only 0.2 full      
+          waves across all SMs. Look at Launch Statistics for more details.                                             
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 32:1. The kernel achieved 0% of 
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.26
+    Executed Ipc Elapsed  inst/cycle         0.11
+    Issue Slots Busy               %         7.40
+    Issued Ipc Active     inst/cycle         0.30
+    SM Busy                        %         7.51
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 96.02%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second        44.15
+    Mem Busy                     %         5.09
+    Max Bandwidth                %        14.42
+    L1/TEX Hit Rate              %            0
+    L2 Hit Rate                  %        12.48
+    Mem Pipes Busy               %         3.05
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %         6.92
+    Issued Warp Per Scheduler                        0.07
+    No Eligible                            %        93.08
+    Active Warps Per Scheduler          warp         1.47
+    Eligible Warps Per Scheduler        warp         0.08
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 85.58%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 14.4 cycles. This might leave hardware resources underutilized and may lead to    
+          less optimal performance. Out of the maximum of 8 warps per scheduler, this kernel allocates an average of    
+          1.47 active warps per scheduler, but only an average of 0.08 warps were eligible per cycle. Eligible warps    
+          are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible   
+          warp results in no instruction being issued and the issue slot remains unused. To increase the number of      
+          eligible warps, reduce the time the active warps are stalled by inspecting the top stall reasons on the Warp  
+          State Statistics and Source Counters sections.                                                                
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        21.25
+    Warp Cycles Per Executed Instruction           cycle        24.10
+    Avg. Active Threads Per Warp                                29.44
+    Avg. Not Predicated Off Threads Per Warp                    29.28
+    ---------------------------------------- ----------- ------------
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst       105.38
+    Executed Instructions                           inst        16861
+    Avg. Issued Instructions Per Scheduler          inst       119.51
+    Issued Instructions                             inst        19121
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                     63
+    Registers Per Thread             register/thread              20
+    Shared Memory Configuration Size           Kbyte           32.77
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread            8064
+    Waves Per SM                                                0.20
+    -------------------------------- --------------- ---------------
+
+    OPT   If you execute __syncthreads() to synchronize the threads of a block, it is recommended to have more than the 
+          achieved 1 blocks per multiprocessor. This way, blocks that aren't waiting for __syncthreads() can keep the   
+          hardware busy.                                                                                                
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           16
+    Block Limit Registers                 block           21
+    Block Limit Shared Mem                block           16
+    Block Limit Warps                     block            8
+    Theoretical Active Warps per SM        warp           32
+    Theoretical Occupancy                     %          100
+    Achieved Occupancy                        %        19.06
+    Achieved Active Warps Per SM           warp         6.10
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 80.94%                                                                                          
+          The difference between calculated theoretical (100.0%) and measured achieved occupancy (19.1%) can be the     
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.21
+    Branch Instructions              inst         3603
+    Branch Efficiency                   %        97.30
+    Avg. Divergent Branches                       0.30
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 0.4533%                                                                                         
+          This kernel has uncoalesced global accesses resulting in a total of 136 excessive sectors (2% of the total    
+          8209 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations. The  
+          CUDA Programming Guide                                                                                        
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void build_hashtable_p<(int)128, (int)4>(int *, int *, int *, int, int *, int) (1954, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/nsecond         4.48
+    SM Frequency            cycle/usecond       525.07
+    Elapsed Cycles                  cycle        30400
+    Memory Throughput                   %        85.69
+    DRAM Throughput                     %        85.69
+    Duration                      usecond        57.89
+    L1/TEX Cache Throughput             %        33.22
+    L2 Cache Throughput                 %        27.53
+    SM Active Cycles                cycle     28782.75
+    Compute (SM) Throughput             %        18.21
+    ----------------------- ------------- ------------
+
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing DRAM in the Memory Workload Analysis section.                                              
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 32:1. The kernel achieved 0% of 
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.61
+    Executed Ipc Elapsed  inst/cycle         0.58
+    Issue Slots Busy               %        15.40
+    Issued Ipc Active     inst/cycle         0.62
+    SM Busy                        %        15.40
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 91.47%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second       245.55
+    Mem Busy                     %        27.53
+    Max Bandwidth                %        85.69
+    L1/TEX Hit Rate              %         0.00
+    L2 Hit Rate                  %         2.28
+    Mem Pipes Busy               %        18.21
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 0.3676%                                                                                         
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 1.2 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        13.80
+    Issued Warp Per Scheduler                        0.14
+    No Eligible                            %        86.20
+    Active Warps Per Scheduler          warp         6.61
+    Eligible Warps Per Scheduler        warp         0.18
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 14.31%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 7.2 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 8 warps per scheduler, this kernel allocates an average of    
+          6.61 active warps per scheduler, but only an average of 0.18 warps were eligible per cycle. Eligible warps    
+          are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible   
+          warp results in no instruction being issued and the issue slot remains unused. To increase the number of      
+          eligible warps, avoid possible load imbalances due to highly different execution durations per warp.          
+          Reducing stalls indicated on the Warp State Statistics and Source Counters sections can help, too.            
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        47.92
+    Warp Cycles Per Executed Instruction           cycle        48.44
+    Avg. Active Threads Per Warp                                26.34
+    Avg. Not Predicated Off Threads Per Warp                    26.28
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 14.31%                                                                                          
+          On average, each warp of this kernel spends 21.3 cycles being stalled waiting for a scoreboard dependency on  
+          a L1TEX (local, global, surface, texture) operation. Find the instruction producing the data being waited     
+          upon to identify the culprit. To reduce the number of cycles waiting on L1TEX data accesses verify the        
+          memory access patterns are optimal for the target architecture, attempt to increase cache hit rates by        
+          increasing data locality (coalescing), or by changing the cache configuration. Consider moving frequently     
+          used data to shared memory. This stall type represents about 44.5% of the total average of 47.9 cycles        
+          between issuing two instructions.                                                                             
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 14.31%                                                                                          
+          On average, each warp of this kernel spends 19.5 cycles being stalled after EXIT waiting for all outstanding  
+          memory operations to complete so that warp's resources can be freed. A high number of stalls due to draining  
+          warps typically occurs when a lot of data is written to memory towards the end of a kernel. Make sure the     
+          memory access patterns of these store operations are optimal for the target architecture and consider         
+          parallelized data reduction, if applicable. This stall type represents about 40.8% of the total average of    
+          47.9 cycles between issuing two instructions.                                                                 
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst      4385.65
+    Executed Instructions                           inst       701704
+    Avg. Issued Instructions Per Scheduler          inst      4433.79
+    Issued Instructions                             inst       709406
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                   1954
+    Registers Per Thread             register/thread              22
+    Shared Memory Configuration Size           Kbyte           32.77
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread          250112
+    Waves Per SM                                                6.11
+    -------------------------------- --------------- ---------------
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           16
+    Block Limit Registers                 block           21
+    Block Limit Shared Mem                block           16
+    Block Limit Warps                     block            8
+    Theoretical Active Warps per SM        warp           32
+    Theoretical Occupancy                     %          100
+    Achieved Occupancy                        %        91.99
+    Achieved Active Warps Per SM           warp        29.44
+    ------------------------------- ----------- ------------
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.18
+    Branch Instructions              inst       129629
+    Branch Efficiency                   %        94.56
+    Avg. Divergent Branches                      21.28
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 1.409%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 5904 excessive sectors (2% of the total   
+          390018 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations.    
+          The CUDA Programming Guide                                                                                    
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void build_hashtable_d<(int)128, (int)4>(int *, int *, int, int *, int, int) (5, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/nsecond         4.68
+    SM Frequency            cycle/usecond       551.85
+    Elapsed Cycles                  cycle         3216
+    Memory Throughput                   %         3.84
+    DRAM Throughput                     %         3.84
+    Duration                      usecond         5.82
+    L1/TEX Cache Throughput             %        30.39
+    L2 Cache Throughput                 %         2.13
+    SM Active Cycles                cycle       251.72
+    Compute (SM) Throughput             %         0.57
+    ----------------------- ------------- ------------
+
+    OPT   This kernel grid is too small to fill the available resources on this device, resulting in only 0.0 full      
+          waves across all SMs. Look at Launch Statistics for more details.                                             
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 32:1. The kernel achieved 0% of 
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.28
+    Executed Ipc Elapsed  inst/cycle         0.02
+    Issue Slots Busy               %         7.31
+    Issued Ipc Active     inst/cycle         0.29
+    SM Busy                        %         7.31
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 95.01%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second        11.51
+    Mem Busy                     %         1.72
+    Max Bandwidth                %         3.84
+    L1/TEX Hit Rate              %         4.36
+    L2 Hit Rate                  %        49.69
+    Mem Pipes Busy               %         0.53
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 0.05677%                                                                                        
+          The memory access pattern for global stores in L1TEX might not be optimal. On average, this kernel accesses   
+          4.0 bytes per thread per memory request; but the address pattern, possibly caused by the stride between       
+          threads, results in 9.1 sectors per request, or 9.1*32 = 290.4 bytes of cache data transfers per request.     
+          The optimal thread address pattern for 4.0 byte accesses would result in 4.0*32 = 128.0 bytes of cache data   
+          transfers per request, to maximize L1TEX cache performance. Check the Source Counters section for             
+          uncoalesced global stores.                                                                                    
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 0.1415%                                                                                         
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 2.8 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %         7.07
+    Issued Warp Per Scheduler                        0.07
+    No Eligible                            %        92.93
+    Active Warps Per Scheduler          warp         0.94
+    Eligible Warps Per Scheduler        warp         0.07
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 92.93%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 14.1 cycles. This might leave hardware resources underutilized and may lead to    
+          less optimal performance. Out of the maximum of 8 warps per scheduler, this kernel allocates an average of    
+          0.94 active warps per scheduler, which already limits the scheduler to less than a warp per instruction.      
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        13.26
+    Warp Cycles Per Executed Instruction           cycle        14.04
+    Avg. Active Threads Per Warp                                31.92
+    Avg. Not Predicated Off Threads Per Warp                    28.58
+    ---------------------------------------- ----------- ------------
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst        17.39
+    Executed Instructions                           inst         2782
+    Avg. Issued Instructions Per Scheduler          inst        18.41
+    Issued Instructions                             inst         2946
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                      5
+    Registers Per Thread             register/thread              26
+    Shared Memory Configuration Size           Kbyte           32.77
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread             640
+    Waves Per SM                                                0.02
+    -------------------------------- --------------- ---------------
+
+    OPT   Est. Speedup: 87.5%                                                                                           
+          The grid for this launch is configured to execute only 5 blocks, which is less than the GPU's 40              
+          multiprocessors. This can underutilize some multiprocessors. If you do not intend to execute this kernel      
+          concurrently with other workloads, consider reducing the block size to have at least one block per            
+          multiprocessor or increase the size of the grid to fully utilize the available hardware resources. See the    
+          Hardware Model (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-hw-model)            
+          description for more details on launch configurations.                                                        
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           16
+    Block Limit Registers                 block           16
+    Block Limit Shared Mem                block           16
+    Block Limit Warps                     block            8
+    Theoretical Active Warps per SM        warp           32
+    Theoretical Occupancy                     %          100
+    Achieved Occupancy                        %        12.18
+    Achieved Active Warps Per SM           warp         3.90
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 87.82%                                                                                          
+          The difference between calculated theoretical (100.0%) and measured achieved occupancy (12.2%) can be the     
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.06
+    Branch Instructions              inst          166
+    Branch Efficiency                   %        97.73
+    Avg. Divergent Branches                       0.01
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 8.765%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 812 excessive sectors (39% of the total   
+          2092 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations. The  
+          CUDA Programming Guide                                                                                        
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void probe<(int)128, (int)4>(int *, int *, int *, int *, int, int *, int, int *, int, int *, int, int *) (187479, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/nsecond         4.98
+    SM Frequency            cycle/usecond       582.87
+    Elapsed Cycles                  cycle      1015929
+    Memory Throughput                   %        87.95
+    DRAM Throughput                     %        87.95
+    Duration                      msecond         1.74
+    L1/TEX Cache Throughput             %        48.14
+    L2 Cache Throughput                 %        29.43
+    SM Active Cycles                cycle   1013809.50
+    Compute (SM) Throughput             %        56.91
+    ----------------------- ------------- ------------
+
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing DRAM in the Memory Workload Analysis section.                                              
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 32:1. The kernel achieved 0% of 
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         2.28
+    Executed Ipc Elapsed  inst/cycle         2.28
+    Issue Slots Busy               %        57.03
+    Issued Ipc Active     inst/cycle         2.28
+    SM Busy                        %        57.03
+    -------------------- ----------- ------------
+
+    INF   ALU is the highest-utilized pipeline (47.9%) based on active cycles, taking into account the rates of its     
+          different instructions. It executes integer and logic operations. It is well-utilized, but should not be a    
+          bottleneck.                                                                                                   
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second       280.18
+    Mem Busy                     %        29.43
+    Max Bandwidth                %        87.95
+    L1/TEX Hit Rate              %        37.85
+    L2 Hit Rate                  %         8.77
+    Mem Pipes Busy               %        36.99
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 6.416%                                                                                          
+          The memory access pattern for loads from L1TEX to L2 is not optimal. The granularity of an L1TEX request to   
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 3.1 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced loads and try to minimize how many cache lines need to be accessed per memory         
+          request.                                                                                                      
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 0.09378%                                                                                        
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 1.0 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 80.85%                                                                                          
+          The memory access pattern for loads from device memory causes 13,016,122 sectors to be read from DRAM, which  
+          is 1.0x of the 12,757,527 sectors which cause a miss in the L2 cache. The DRAM fetch granularity for read     
+          misses in L2 is 64 bytes, i.e. the lower or upper half of an L2 cache line. Try changing your access pattern  
+          to make use of both sectors returned by a DRAM read request for optimal usage of the DRAM throughput. For     
+          strided memory reads, avoid strides of 64 bytes or larger to avoid moving unused sectors from DRAM to L2.     
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        56.83
+    Issued Warp Per Scheduler                        0.57
+    No Eligible                            %        43.17
+    Active Warps Per Scheduler          warp         6.46
+    Eligible Warps Per Scheduler        warp         1.18
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 12.05%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 1.8 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 8 warps per scheduler, this kernel allocates an average of    
+          6.46 active warps per scheduler, but only an average of 1.18 warps were eligible per cycle. Eligible warps    
+          are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible   
+          warp results in no instruction being issued and the issue slot remains unused. To increase the number of      
+          eligible warps, avoid possible load imbalances due to highly different execution durations per warp.          
+          Reducing stalls indicated on the Warp State Statistics and Source Counters sections can help, too.            
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        11.37
+    Warp Cycles Per Executed Instruction           cycle        11.37
+    Avg. Active Threads Per Warp                                28.80
+    Avg. Not Predicated Off Threads Per Warp                    25.56
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 12.05%                                                                                          
+          On average, each warp of this kernel spends 4.9 cycles being stalled waiting for a scoreboard dependency on a 
+          L1TEX (local, global, surface, texture) operation. Find the instruction producing the data being waited upon  
+          to identify the culprit. To reduce the number of cycles waiting on L1TEX data accesses verify the memory      
+          access patterns are optimal for the target architecture, attempt to increase cache hit rates by increasing    
+          data locality (coalescing), or by changing the cache configuration. Consider moving frequently used data to   
+          shared memory. This stall type represents about 43.5% of the total average of 11.4 cycles between issuing     
+          two instructions.                                                                                             
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst    578107.31
+    Executed Instructions                           inst     92497169
+    Avg. Issued Instructions Per Scheduler          inst    578158.86
+    Issued Instructions                             inst     92505418
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                 187479
+    Registers Per Thread             register/thread              26
+    Shared Memory Configuration Size           Kbyte           32.77
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread        23997312
+    Waves Per SM                                              585.87
+    -------------------------------- --------------- ---------------
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           16
+    Block Limit Registers                 block           16
+    Block Limit Shared Mem                block           16
+    Block Limit Warps                     block            8
+    Theoretical Active Warps per SM        warp           32
+    Theoretical Occupancy                     %          100
+    Achieved Occupancy                        %        82.08
+    Achieved Active Warps Per SM           warp        26.27
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 12.05%                                                                                          
+          The difference between calculated theoretical (100.0%) and measured achieved occupancy (82.1%) can be the     
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.06
+    Branch Instructions              inst      5150651
+    Branch Efficiency                   %        89.24
+    Avg. Divergent Branches                    1391.69
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 27.04%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 6096381 excessive sectors (27% of the     
+          total 22464399 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source       
+          locations. The CUDA Programming Guide                                                                         
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
diff --git a/results/T4/crystal-opt/crystal_opt_q21_sf10.txt b/results/T4/crystal-opt/crystal_opt_q21_sf10.txt
new file mode 100644
index 0000000..bbccc81
--- /dev/null
+++ b/results/T4/crystal-opt/crystal_opt_q21_sf10.txt
@@ -0,0 +1,1008 @@
+==PROF== Connected to process 26756 (/home/ubuntu/fff/cmake-build-release-g4dn/gpu/crystal-opt/src/crystal_opt_q21)
+Using device 0: Tesla T4 (PTX version 750, SM750, 40 SMs, 14802 free / 14929 total MB physmem, 320.064 GB/s @ 5001000 kHz mem clock, ECC on)
+==PROF== Profiling "build_hashtable_s" - 0: 0%....50%....100% - 35 passes
+==PROF== Profiling "build_hashtable_p" - 1: 0%....50%....100% - 35 passes
+==PROF== Profiling "build_hashtable_d" - 2: 0%....50%....100% - 35 passes
+==PROF== Profiling "probe" - 3: 0%....50%....100% - 35 passes
+1992 40 6574868694
+1993 40 6952043914
+1994 40 6525239576
+1995 40 6764559245
+1996 40 6725548424
+1997 40 6596102991
+1998 40 3988851825
+1992 41 7047701749
+1993 41 6909841940
+1994 41 6978800980
+1995 41 7036474627
+1996 41 7233045193
+1997 41 6938053628
+1998 41 4065391978
+1992 42 6450484539
+1993 42 6886094182
+1994 42 6852294265
+1995 42 6749813918
+1996 42 6568551778
+1997 42 6845017761
+1998 42 3773836113
+1992 43 6918393482
+1993 43 6621428714
+1994 43 7068738463
+1995 43 6820930145
+1996 43 6762634261
+1997 43 6849537060
+1998 43 3882704011
+1992 44 6343659176
+1993 44 6094791212
+1994 44 6661136530
+1995 44 6085276694
+1996 44 6176324016
+1997 44 6315911460
+1998 44 3925731952
+1992 45 6499025385
+1993 45 6779833973
+1994 45 6435942251
+1995 45 6738626764
+1996 45 6763207154
+1997 45 6889101910
+1998 45 3879170338
+1992 46 6833102567
+1993 46 7017493760
+1994 46 7015998639
+1995 46 6897957727
+1996 46 6948998143
+1997 46 6510502742
+1998 46 3911656234
+1992 47 6922095842
+1993 47 7061777324
+1994 47 6877252420
+1995 47 6575484550
+1996 47 6517266740
+1997 47 6651228318
+1998 47 3835254989
+1992 48 6818173454
+1993 48 6961952133
+1994 48 7051587760
+1995 48 7329421356
+1996 48 7164243172
+1997 48 7052687209
+1998 48 4132526586
+1992 49 6907633511
+1993 49 6614194460
+1994 49 6773107666
+1995 49 6954065693
+1996 49 6747336514
+1997 49 6947116463
+1998 49 3906763122
+1992 50 7098282117
+1993 50 7263350231
+1994 50 7199754789
+1995 50 7246399314
+1996 50 6860318803
+1997 50 7184653230
+1998 50 4293359981
+1992 51 7474015795
+1993 51 7031859249
+1994 51 6749353264
+1995 51 7395439319
+1996 51 7118371952
+1997 51 7427932834
+1998 51 4080129102
+1992 52 7001985495
+1993 52 6734276751
+1994 52 6965715192
+1995 52 6934765252
+1996 52 6895454124
+1997 52 6802928999
+1998 52 3916065107
+1992 53 6531087764
+1993 53 6258171804
+1994 53 6197787972
+1995 53 6605279401
+1996 53 6722321819
+1997 53 6879971631
+1998 53 3561102555
+1992 54 7041216650
+1993 54 6601732879
+1994 54 6737632272
+1995 54 6483760392
+1996 54 6778740509
+1997 54 6950964366
+1998 54 3960525994
+1992 55 7034325953
+1993 55 7070112383
+1994 55 6835473512
+1995 55 6681873420
+1996 55 6755919599
+1997 55 6883879790
+1998 55 3842444977
+1992 56 6672842875
+1993 56 6362926487
+1994 56 6787572691
+1995 56 6941448166
+1996 56 6349041382
+1997 56 6831022793
+1998 56 3750580610
+1992 57 6762940511
+1993 57 6200194110
+1994 57 6360354225
+1995 57 6799718937
+1996 57 6500504812
+1997 57 6464594869
+1998 57 3690857660
+1992 58 6367358727
+1993 58 6519991362
+1994 58 6228367674
+1995 58 6522760927
+1996 58 6043428578
+1997 58 6386892483
+1998 58 3888948778
+1992 59 6542091138
+1993 59 6669384898
+1994 59 6566921738
+1995 59 6725584633
+1996 59 6678854924
+1997 59 6518974991
+1998 59 3661443815
+1992 60 7397021390
+1993 60 6985315570
+1994 60 7171226221
+1995 60 7409511342
+1996 60 7217054942
+1997 60 7241219598
+1998 60 4134876965
+1992 61 6439487815
+1993 61 6190501096
+1994 61 6658242784
+1995 61 6300444895
+1996 61 6394989839
+1997 61 6372986872
+1998 61 3692782928
+1992 62 7142709582
+1993 62 6575099186
+1994 62 6577906605
+1995 62 6758016505
+1996 62 6713821475
+1997 62 7061699626
+1998 62 3911733232
+1992 63 6684932832
+1993 63 6784872415
+1994 63 6771692541
+1995 63 6832689629
+1996 63 6769695502
+1997 63 6801959247
+1998 63 3916910435
+1992 64 6403427844
+1993 64 6686657397
+1994 64 6560285004
+1995 64 6654877138
+1996 64 6403809726
+1997 64 6364910756
+1998 64 3757788047
+1992 65 6800534485
+1993 65 6932192888
+1994 65 6599703796
+1995 65 6950320978
+1996 65 6745507185
+1997 65 6965554062
+1998 65 3856421228
+1992 66 6608507118
+1993 66 6720022834
+1994 66 7249477139
+1995 66 6982989122
+1996 66 6895681155
+1997 66 7131587724
+1998 66 4050936159
+1992 67 6789994724
+1993 67 7034832635
+1994 67 6533866956
+1995 67 7089400123
+1996 67 6950690822
+1997 67 6872602250
+1998 67 3798832673
+1992 68 6761138392
+1993 68 7117328614
+1994 68 7003067656
+1995 68 6916376148
+1996 68 6810961498
+1997 68 6421432868
+1998 68 4365901362
+1992 69 6333970291
+1993 69 6591672386
+1994 69 6491372066
+1995 69 6759048824
+1996 69 6636341404
+1997 69 6396375726
+1998 69 3755850783
+1992 70 6863351080
+1993 70 7236349480
+1994 70 7065985619
+1995 70 6799040388
+1996 70 7281402064
+1997 70 6735307561
+1998 70 4062655575
+1992 71 6978088606
+1993 71 6615095404
+1994 71 6642491845
+1995 71 7135465638
+1996 71 6904578270
+1997 71 6886861519
+1998 71 3971062487
+1992 72 6077239048
+1993 72 6379459453
+1994 72 6452415472
+1995 72 6170313509
+1996 72 5916688379
+1997 72 5963369350
+1998 72 3683718797
+1992 73 6671048755
+1993 73 6565112476
+1994 73 6641285247
+1995 73 6887663633
+1996 73 6439642020
+1997 73 6675192946
+1998 73 3814007830
+1992 74 6999195521
+1993 74 7007686388
+1994 74 6670519880
+1995 74 6744064671
+1996 74 6614217057
+1997 74 6523268368
+1998 74 4023666133
+1992 75 6627416528
+1993 75 6758016664
+1994 75 6751975322
+1995 75 7047693486
+1996 75 6567430366
+1997 75 6781762704
+1998 75 4063152322
+1992 76 6785625804
+1993 76 6930340135
+1994 76 6382873777
+1995 76 6206415993
+1996 76 6805542040
+1997 76 6422414358
+1998 76 4087738859
+1992 77 6848387744
+1993 77 6623249454
+1994 77 6588036917
+1995 77 6589295276
+1996 77 6603676047
+1997 77 6383121125
+1998 77 4063691471
+1992 78 6240883199
+1993 78 6551226256
+1994 78 6647824791
+1995 78 6494311762
+1996 78 6358269587
+1997 78 6349078074
+1998 78 3890548095
+1992 79 6948601533
+1993 79 7058895576
+1994 79 7280306702
+1995 79 7174749606
+1996 79 7134521672
+1997 79 7009756092
+1998 79 4233289127
+Res Count: 280
+Time Taken Total: 25176.6
+{"query":21,"time_query":25176.7}
+==PROF== Disconnected from process 26756
+[26756] crystal_opt_q21@127.0.0.1
+  void build_hashtable_s<(int)128, (int)4>(int *, int *, int, int *, int) (40, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/nsecond         4.77
+    SM Frequency            cycle/usecond       558.95
+    Elapsed Cycles                  cycle         3729
+    Memory Throughput                   %        15.28
+    DRAM Throughput                     %        15.28
+    Duration                      usecond         6.66
+    L1/TEX Cache Throughput             %         8.21
+    L2 Cache Throughput                 %         4.66
+    SM Active Cycles                cycle      2409.53
+    Compute (SM) Throughput             %         4.62
+    ----------------------- ------------- ------------
+
+    OPT   This kernel grid is too small to fill the available resources on this device, resulting in only 0.1 full      
+          waves across all SMs. Look at Launch Statistics for more details.                                             
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 32:1. The kernel achieved 0% of 
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.27
+    Executed Ipc Elapsed  inst/cycle         0.18
+    Issue Slots Busy               %         7.13
+    Issued Ipc Active     inst/cycle         0.29
+    SM Busy                        %         7.13
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 95.46%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second        46.63
+    Mem Busy                     %         4.66
+    Max Bandwidth                %        15.28
+    L1/TEX Hit Rate              %            0
+    L2 Hit Rate                  %        13.50
+    Mem Pipes Busy               %         2.76
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %         7.02
+    Issued Warp Per Scheduler                        0.07
+    No Eligible                            %        92.98
+    Active Warps Per Scheduler          warp         0.96
+    Eligible Warps Per Scheduler        warp         0.07
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 84.72%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 14.2 cycles. This might leave hardware resources underutilized and may lead to    
+          less optimal performance. Out of the maximum of 8 warps per scheduler, this kernel allocates an average of    
+          0.96 active warps per scheduler, which already limits the scheduler to less than a warp per instruction.      
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        13.70
+    Warp Cycles Per Executed Instruction           cycle        14.45
+    Avg. Active Threads Per Warp                                15.91
+    Avg. Not Predicated Off Threads Per Warp                    15.07
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 2.444%                                                                                          
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 15.9 threads being active per cycle. This is further reduced    
+          to 15.1 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst       162.91
+    Executed Instructions                           inst        26066
+    Avg. Issued Instructions Per Scheduler          inst       171.91
+    Issued Instructions                             inst        27506
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                     40
+    Registers Per Thread             register/thread              20
+    Shared Memory Configuration Size           Kbyte           32.77
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread            5120
+    Waves Per SM                                                0.12
+    -------------------------------- --------------- ---------------
+
+    OPT   If you execute __syncthreads() to synchronize the threads of a block, it is recommended to have more than the 
+          achieved 1 blocks per multiprocessor. This way, blocks that aren't waiting for __syncthreads() can keep the   
+          hardware busy.                                                                                                
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           16
+    Block Limit Registers                 block           21
+    Block Limit Shared Mem                block           16
+    Block Limit Warps                     block            8
+    Theoretical Active Warps per SM        warp           32
+    Theoretical Occupancy                     %          100
+    Achieved Occupancy                        %        12.27
+    Achieved Active Warps Per SM           warp         3.93
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 84.72%                                                                                          
+          The difference between calculated theoretical (100.0%) and measured achieved occupancy (12.3%) can be the     
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.11
+    Branch Instructions              inst         2876
+    Branch Efficiency                   %        58.57
+    Avg. Divergent Branches                       2.93
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 7.955%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 1441 excessive sectors (20% of the total  
+          7183 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations. The  
+          CUDA Programming Guide                                                                                        
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void build_hashtable_p<(int)128, (int)4>(int *, int *, int *, int, int *, int) (1563, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/nsecond         4.63
+    SM Frequency            cycle/usecond       541.76
+    Elapsed Cycles                  cycle        32251
+    Memory Throughput                   %        79.24
+    DRAM Throughput                     %        79.24
+    Duration                      usecond        59.52
+    L1/TEX Cache Throughput             %        27.37
+    L2 Cache Throughput                 %        23.98
+    SM Active Cycles                cycle     30162.60
+    Compute (SM) Throughput             %        19.15
+    ----------------------- ------------- ------------
+
+    OPT   Memory is more heavily utilized than Compute: Look at the Memory Workload Analysis section to identify the    
+          DRAM bottleneck. Check memory replay (coalescing) metrics to make sure you're efficiently utilizing the       
+          bytes transferred. Also consider whether it is possible to do more work per memory access (kernel fusion) or  
+          whether there are values you can (re)compute.                                                                 
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 32:1. The kernel achieved 0% of 
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.81
+    Executed Ipc Elapsed  inst/cycle         0.76
+    Issue Slots Busy               %        20.47
+    Issued Ipc Active     inst/cycle         0.82
+    SM Busy                        %        20.47
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 87.88%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second       234.69
+    Mem Busy                     %        23.98
+    Max Bandwidth                %        79.24
+    L1/TEX Hit Rate              %         0.02
+    L2 Hit Rate                  %         8.77
+    Mem Pipes Busy               %        18.25
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 1.385%                                                                                          
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 1.2 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        18.79
+    Issued Warp Per Scheduler                        0.19
+    No Eligible                            %        81.21
+    Active Warps Per Scheduler          warp         6.74
+    Eligible Warps Per Scheduler        warp         0.25
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 20.76%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 5.3 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 8 warps per scheduler, this kernel allocates an average of    
+          6.74 active warps per scheduler, but only an average of 0.25 warps were eligible per cycle. Eligible warps    
+          are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible   
+          warp results in no instruction being issued and the issue slot remains unused. To increase the number of      
+          eligible warps, avoid possible load imbalances due to highly different execution durations per warp.          
+          Reducing stalls indicated on the Warp State Statistics and Source Counters sections can help, too.            
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        35.89
+    Warp Cycles Per Executed Instruction           cycle        36.17
+    Avg. Active Threads Per Warp                                15.82
+    Avg. Not Predicated Off Threads Per Warp                    15.65
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 20.76%                                                                                          
+          On average, each warp of this kernel spends 19.9 cycles being stalled waiting for a scoreboard dependency on  
+          a L1TEX (local, global, surface, texture) operation. Find the instruction producing the data being waited     
+          upon to identify the culprit. To reduce the number of cycles waiting on L1TEX data accesses verify the        
+          memory access patterns are optimal for the target architecture, attempt to increase cache hit rates by        
+          increasing data locality (coalescing), or by changing the cache configuration. Consider moving frequently     
+          used data to shared memory. This stall type represents about 55.4% of the total average of 35.9 cycles        
+          between issuing two instructions.                                                                             
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 9.781%                                                                                          
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 15.8 threads being active per cycle. This is further reduced    
+          to 15.7 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst      6125.61
+    Executed Instructions                           inst       980097
+    Avg. Issued Instructions Per Scheduler          inst      6173.90
+    Issued Instructions                             inst       987824
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                   1563
+    Registers Per Thread             register/thread              22
+    Shared Memory Configuration Size           Kbyte           32.77
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread          200064
+    Waves Per SM                                                4.88
+    -------------------------------- --------------- ---------------
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           16
+    Block Limit Registers                 block           21
+    Block Limit Shared Mem                block           16
+    Block Limit Warps                     block            8
+    Theoretical Active Warps per SM        warp           32
+    Theoretical Occupancy                     %          100
+    Achieved Occupancy                        %        91.52
+    Achieved Active Warps Per SM           warp        29.29
+    ------------------------------- ----------- ------------
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.12
+    Branch Instructions              inst       118258
+    Branch Efficiency                   %        72.76
+    Avg. Divergent Branches                      85.17
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 6.108%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 23924 excessive sectors (7% of the total  
+          360344 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations.    
+          The CUDA Programming Guide                                                                                    
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void build_hashtable_d<(int)128, (int)4>(int *, int *, int, int *, int, int) (5, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/nsecond         4.78
+    SM Frequency            cycle/usecond       557.47
+    Elapsed Cycles                  cycle         3213
+    Memory Throughput                   %         3.79
+    DRAM Throughput                     %         3.79
+    Duration                      usecond         5.76
+    L1/TEX Cache Throughput             %        30.38
+    L2 Cache Throughput                 %         2.12
+    SM Active Cycles                cycle       251.78
+    Compute (SM) Throughput             %         0.57
+    ----------------------- ------------- ------------
+
+    OPT   This kernel grid is too small to fill the available resources on this device, resulting in only 0.0 full      
+          waves across all SMs. Look at Launch Statistics for more details.                                             
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 32:1. The kernel achieved 0% of 
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.28
+    Executed Ipc Elapsed  inst/cycle         0.02
+    Issue Slots Busy               %         7.31
+    Issued Ipc Active     inst/cycle         0.29
+    SM Busy                        %         7.31
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 95.02%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second        11.59
+    Mem Busy                     %         1.73
+    Max Bandwidth                %         3.79
+    L1/TEX Hit Rate              %         5.18
+    L2 Hit Rate                  %        49.69
+    Mem Pipes Busy               %         0.53
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 0.05726%                                                                                        
+          The memory access pattern for global stores in L1TEX might not be optimal. On average, this kernel accesses   
+          4.0 bytes per thread per memory request; but the address pattern, possibly caused by the stride between       
+          threads, results in 9.1 sectors per request, or 9.1*32 = 290.4 bytes of cache data transfers per request.     
+          The optimal thread address pattern for 4.0 byte accesses would result in 4.0*32 = 128.0 bytes of cache data   
+          transfers per request, to maximize L1TEX cache performance. Check the Source Counters section for             
+          uncoalesced global stores.                                                                                    
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 0.1468%                                                                                         
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 2.7 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %         7.12
+    Issued Warp Per Scheduler                        0.07
+    No Eligible                            %        92.88
+    Active Warps Per Scheduler          warp         0.96
+    Eligible Warps Per Scheduler        warp         0.07
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 92.88%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 14.1 cycles. This might leave hardware resources underutilized and may lead to    
+          less optimal performance. Out of the maximum of 8 warps per scheduler, this kernel allocates an average of    
+          0.96 active warps per scheduler, which already limits the scheduler to less than a warp per instruction.      
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        13.45
+    Warp Cycles Per Executed Instruction           cycle        14.24
+    Avg. Active Threads Per Warp                                31.92
+    Avg. Not Predicated Off Threads Per Warp                    28.58
+    ---------------------------------------- ----------- ------------
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst        17.39
+    Executed Instructions                           inst         2782
+    Avg. Issued Instructions Per Scheduler          inst        18.41
+    Issued Instructions                             inst         2946
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                      5
+    Registers Per Thread             register/thread              26
+    Shared Memory Configuration Size           Kbyte           32.77
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread             640
+    Waves Per SM                                                0.02
+    -------------------------------- --------------- ---------------
+
+    OPT   Est. Speedup: 87.5%                                                                                           
+          The grid for this launch is configured to execute only 5 blocks, which is less than the GPU's 40              
+          multiprocessors. This can underutilize some multiprocessors. If you do not intend to execute this kernel      
+          concurrently with other workloads, consider reducing the block size to have at least one block per            
+          multiprocessor or increase the size of the grid to fully utilize the available hardware resources. See the    
+          Hardware Model (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-hw-model)            
+          description for more details on launch configurations.                                                        
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           16
+    Block Limit Registers                 block           16
+    Block Limit Shared Mem                block           16
+    Block Limit Warps                     block            8
+    Theoretical Active Warps per SM        warp           32
+    Theoretical Occupancy                     %          100
+    Achieved Occupancy                        %        12.14
+    Achieved Active Warps Per SM           warp         3.89
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 87.86%                                                                                          
+          The difference between calculated theoretical (100.0%) and measured achieved occupancy (12.1%) can be the     
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.06
+    Branch Instructions              inst          166
+    Branch Efficiency                   %        97.73
+    Avg. Divergent Branches                       0.01
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 8.954%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 812 excessive sectors (39% of the total   
+          2092 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations. The  
+          CUDA Programming Guide                                                                                        
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void probe<(int)128, (int)4>(int *, int *, int *, int *, int, int *, int, int *, int, int *, int, int *) (117161, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/nsecond         4.99
+    SM Frequency            cycle/usecond       584.33
+    Elapsed Cycles                  cycle      3764836
+    Memory Throughput                   %        88.23
+    DRAM Throughput                     %        88.23
+    Duration                      msecond         6.44
+    L1/TEX Cache Throughput             %        74.44
+    L2 Cache Throughput                 %        39.50
+    SM Active Cycles                cycle   3761787.77
+    Compute (SM) Throughput             %        27.97
+    ----------------------- ------------- ------------
+
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing DRAM in the Memory Workload Analysis section.                                              
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 32:1. The kernel achieved 0% of 
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         1.12
+    Executed Ipc Elapsed  inst/cycle         1.12
+    Issue Slots Busy               %        27.99
+    Issued Ipc Active     inst/cycle         1.12
+    SM Busy                        %        27.99
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 80.46%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second       281.79
+    Mem Busy                     %        39.50
+    Max Bandwidth                %        88.23
+    L1/TEX Hit Rate              %        22.99
+    L2 Hit Rate                  %        65.86
+    Mem Pipes Busy               %        16.12
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 1.632%                                                                                          
+          The memory access pattern for global loads in L1TEX might not be optimal. On average, this kernel accesses    
+          4.9 bytes per thread per memory request; but the address pattern, possibly caused by the stride between       
+          threads, results in 9.3 sectors per request, or 9.3*32 = 298.7 bytes of cache data transfers per request.     
+          The optimal thread address pattern for 4.9 byte accesses would result in 4.9*32 = 158.1 bytes of cache data   
+          transfers per request, to maximize L1TEX cache performance. Check the Source Counters section for             
+          uncoalesced global loads.                                                                                     
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 27.29%                                                                                          
+          The memory access pattern for loads from L1TEX to L2 is not optimal. The granularity of an L1TEX request to   
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 1.2 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced loads and try to minimize how many cache lines need to be accessed per memory         
+          request.                                                                                                      
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 0.2626%                                                                                         
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 1.0 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 30.61%                                                                                          
+          The memory access pattern for loads from device memory causes 34,622,828 sectors to be read from DRAM, which  
+          is 1.5x of the 23,741,640 sectors which cause a miss in the L2 cache. The DRAM fetch granularity for read     
+          misses in L2 is 64 bytes, i.e. the lower or upper half of an L2 cache line. Try changing your access pattern  
+          to make use of both sectors returned by a DRAM read request for optimal usage of the DRAM throughput. For     
+          strided memory reads, avoid strides of 64 bytes or larger to avoid moving unused sectors from DRAM to L2.     
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        27.96
+    Issued Warp Per Scheduler                        0.28
+    No Eligible                            %        72.04
+    Active Warps Per Scheduler          warp         6.86
+    Eligible Warps Per Scheduler        warp         0.39
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 11.77%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 3.6 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 8 warps per scheduler, this kernel allocates an average of    
+          6.86 active warps per scheduler, but only an average of 0.39 warps were eligible per cycle. Eligible warps    
+          are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible   
+          warp results in no instruction being issued and the issue slot remains unused. To increase the number of      
+          eligible warps, avoid possible load imbalances due to highly different execution durations per warp.          
+          Reducing stalls indicated on the Warp State Statistics and Source Counters sections can help, too.            
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        24.52
+    Warp Cycles Per Executed Instruction           cycle        24.52
+    Avg. Active Threads Per Warp                                14.21
+    Avg. Not Predicated Off Threads Per Warp                    12.40
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 11.77%                                                                                          
+          On average, each warp of this kernel spends 15.4 cycles being stalled waiting for a scoreboard dependency on  
+          a L1TEX (local, global, surface, texture) operation. Find the instruction producing the data being waited     
+          upon to identify the culprit. To reduce the number of cycles waiting on L1TEX data accesses verify the        
+          memory access patterns are optimal for the target architecture, attempt to increase cache hit rates by        
+          increasing data locality (coalescing), or by changing the cache configuration. Consider moving frequently     
+          used data to shared memory. This stall type represents about 62.8% of the total average of 24.5 cycles        
+          between issuing two instructions.                                                                             
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 17.13%                                                                                          
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 14.2 threads being active per cycle. This is further reduced    
+          to 12.4 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst   1053004.47
+    Executed Instructions                           inst    168480715
+    Avg. Issued Instructions Per Scheduler          inst   1053060.79
+    Issued Instructions                             inst    168489727
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                 117161
+    Registers Per Thread             register/thread              26
+    Shared Memory Configuration Size           Kbyte           32.77
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread        14996608
+    Waves Per SM                                              366.13
+    -------------------------------- --------------- ---------------
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           16
+    Block Limit Registers                 block           16
+    Block Limit Shared Mem                block           16
+    Block Limit Warps                     block            8
+    Theoretical Active Warps per SM        warp           32
+    Theoretical Occupancy                     %          100
+    Achieved Occupancy                        %        85.95
+    Achieved Active Warps Per SM           warp        27.50
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 11.77%                                                                                          
+          The difference between calculated theoretical (100.0%) and measured achieved occupancy (85.9%) can be the     
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.10
+    Branch Instructions              inst     16750526
+    Branch Efficiency                   %        66.51
+    Avg. Divergent Branches                   14227.90
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 73.26%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 65007931 excessive sectors (73% of the    
+          total 88620164 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source       
+          locations. The CUDA Programming Guide                                                                         
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
diff --git a/results/T4/crystal-opt/crystal_opt_q31_sf10.txt b/results/T4/crystal-opt/crystal_opt_q31_sf10.txt
new file mode 100644
index 0000000..f95fb22
--- /dev/null
+++ b/results/T4/crystal-opt/crystal_opt_q31_sf10.txt
@@ -0,0 +1,913 @@
+==PROF== Connected to process 27264 (/home/ubuntu/fff/cmake-build-release-g4dn/gpu/crystal-opt/src/crystal_opt_q31)
+Using device 0: Tesla T4 (PTX version 750, SM750, 40 SMs, 14802 free / 14929 total MB physmem, 320.064 GB/s @ 5001000 kHz mem clock, ECC on)
+** LOADED DATA **
+** LOADED DATA TO GPU **
+==PROF== Profiling "build_hashtable_s" - 0: 0%....50%....100% - 35 passes
+==PROF== Profiling "build_hashtable_c" - 1: 0%....50%....100% - 35 passes
+==PROF== Profiling "build_hashtable_d" - 2: 0%....50%....100% - 35 passes
+==PROF== Profiling "probe" - 3: 0%....50%....100% - 35 passes
+Result:
+1992 8 8 53664098547
+1993 8 8 53051563726
+1994 8 8 53551966681
+1995 8 8 53338395993
+1996 8 8 53781237952
+1997 8 8 53558132271
+1992 9 8 55867859815
+1993 9 8 55345162638
+1994 9 8 55589883121
+1995 9 8 54871630692
+1996 9 8 55620205618
+1997 9 8 54852742519
+1992 12 8 52867359425
+1993 12 8 53435367523
+1994 12 8 52283824959
+1995 12 8 52956472988
+1996 12 8 52948768521
+1997 12 8 52962165616
+1992 18 8 53592522758
+1993 18 8 52996000810
+1994 18 8 52962120320
+1995 18 8 53924104344
+1996 18 8 53634737856
+1997 18 8 54307983851
+1992 21 8 53816502394
+1993 21 8 54349264842
+1994 21 8 54119359035
+1995 21 8 53961984627
+1996 21 8 54294333705
+1997 21 8 53703384515
+1992 8 9 55444214883
+1993 8 9 55740793389
+1994 8 9 55137400588
+1995 8 9 55784172640
+1996 8 9 56378453713
+1997 8 9 55399009353
+1992 9 9 57271740148
+1993 9 9 58216495642
+1994 9 9 57507217082
+1995 9 9 57860170696
+1996 9 9 58662284841
+1997 9 9 56940173344
+1992 12 9 55432874858
+1993 12 9 55398755151
+1994 12 9 55206960389
+1995 12 9 55581754250
+1996 12 9 55487324569
+1997 12 9 53582297974
+1992 18 9 56370007920
+1993 18 9 56166403334
+1994 18 9 55432079732
+1995 18 9 55973419507
+1996 18 9 56254723722
+1997 18 9 55830709236
+1992 21 9 56359335068
+1993 21 9 56885558074
+1994 21 9 56507097670
+1995 21 9 57465742525
+1996 21 9 56177166557
+1997 21 9 56333135444
+1992 8 12 51295873912
+1993 8 12 52384079867
+1994 8 12 52254716872
+1995 8 12 51669051730
+1996 8 12 52670597733
+1997 8 12 53782563068
+1992 9 12 54255769995
+1993 9 12 53477912258
+1994 9 12 53868848846
+1995 9 12 54310027205
+1996 9 12 55409865859
+1997 9 12 54099065304
+1992 12 12 52584065821
+1993 12 12 52637339531
+1994 12 12 50154194273
+1995 12 12 51904425056
+1996 12 12 52493537142
+1997 12 12 50634790895
+1992 18 12 52896145835
+1993 18 12 53112435531
+1994 18 12 52021625515
+1995 18 12 52031180987
+1996 18 12 53022298730
+1997 18 12 53294469049
+1992 21 12 53284643553
+1993 21 12 53900783410
+1994 21 12 53648011682
+1995 21 12 53376554374
+1996 21 12 52174060166
+1997 21 12 52785883863
+1992 8 18 51873441494
+1993 8 18 51961213538
+1994 8 18 52868608376
+1995 8 18 52738284867
+1996 8 18 51678789303
+1997 8 18 51787339279
+1992 9 18 53893325353
+1993 9 18 54178339670
+1994 9 18 54059232642
+1995 9 18 53920766480
+1996 9 18 54128092218
+1997 9 18 54349079982
+1992 12 18 51449505308
+1993 12 18 51384752707
+1994 12 18 52195482938
+1995 12 18 51205040497
+1996 12 18 51165908280
+1997 12 18 52167794260
+1992 18 18 53246367726
+1993 18 18 52211194809
+1994 18 18 52388807873
+1995 18 18 52459889035
+1996 18 18 53737304610
+1997 18 18 52772297391
+1992 21 18 53752784633
+1993 21 18 53723459056
+1994 21 18 52734575706
+1995 21 18 52810670641
+1996 21 18 53606892262
+1997 21 18 52841307001
+1992 8 21 49589186930
+1993 8 21 50874540178
+1994 8 21 50484052905
+1995 8 21 50476123376
+1996 8 21 51102099810
+1997 8 21 51376581082
+1992 9 21 51183086614
+1993 9 21 51849557513
+1994 9 21 51912335762
+1995 9 21 51737313715
+1996 9 21 52987320706
+1997 9 21 51870436294
+1992 12 21 49502367103
+1993 12 21 49962826767
+1994 12 21 50112754286
+1995 12 21 48732674673
+1996 12 21 50123146827
+1997 12 21 49094088315
+1992 18 21 50957655153
+1993 18 21 50627753769
+1994 18 21 50537890156
+1995 18 21 50265160335
+1996 18 21 50774431442
+1997 18 21 51103107061
+1992 21 21 49934446612
+1993 21 21 51562382531
+1994 21 21 50180119681
+1995 21 21 51221558310
+1996 21 21 50423672514
+1997 21 21 50461561884
+Res Count: 150
+Time Taken Total: 26432.5
+{"query":31,"time_query":26432.4}
+==PROF== Disconnected from process 27264
+[27264] crystal_opt_q31@127.0.0.1
+  void build_hashtable_s<(int)128, (int)4>(int *, int *, int *, int, int *, int) (40, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/nsecond         4.77
+    SM Frequency            cycle/usecond       554.13
+    Elapsed Cycles                  cycle         3977
+    Memory Throughput                   %        21.87
+    DRAM Throughput                     %        21.87
+    Duration                      usecond         7.17
+    L1/TEX Cache Throughput             %        15.75
+    L2 Cache Throughput                 %         7.76
+    SM Active Cycles                cycle      2634.85
+    Compute (SM) Throughput             %         4.91
+    ----------------------- ------------- ------------
+
+    OPT   This kernel grid is too small to fill the available resources on this device, resulting in only 0.1 full      
+          waves across all SMs. Look at Launch Statistics for more details.                                             
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 32:1. The kernel achieved 0% of 
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.28
+    Executed Ipc Elapsed  inst/cycle         0.19
+    Issue Slots Busy               %         7.40
+    Issued Ipc Active     inst/cycle         0.30
+    SM Busy                        %         7.40
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 95.39%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second        66.72
+    Mem Busy                     %         7.76
+    Max Bandwidth                %        21.87
+    L1/TEX Hit Rate              %         1.12
+    L2 Hit Rate                  %        28.09
+    Mem Pipes Busy               %         4.18
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 0.08916%                                                                                        
+          The memory access pattern for global stores in L1TEX might not be optimal. On average, this kernel accesses   
+          4.0 bytes per thread per memory request; but the address pattern, possibly caused by the stride between       
+          threads, results in 4.8 sectors per request, or 4.8*32 = 153.8 bytes of cache data transfers per request.     
+          The optimal thread address pattern for 4.0 byte accesses would result in 4.0*32 = 128.0 bytes of cache data   
+          transfers per request, to maximize L1TEX cache performance. Check the Source Counters section for             
+          uncoalesced global stores.                                                                                    
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 0.6691%                                                                                         
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 2.3 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %         7.17
+    Issued Warp Per Scheduler                        0.07
+    No Eligible                            %        92.83
+    Active Warps Per Scheduler          warp         0.95
+    Eligible Warps Per Scheduler        warp         0.07
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 78.13%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 13.9 cycles. This might leave hardware resources underutilized and may lead to    
+          less optimal performance. Out of the maximum of 8 warps per scheduler, this kernel allocates an average of    
+          0.95 active warps per scheduler, which already limits the scheduler to less than a warp per instruction.      
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        13.22
+    Warp Cycles Per Executed Instruction           cycle        13.86
+    Avg. Active Threads Per Warp                                16.44
+    Avg. Not Predicated Off Threads Per Warp                    15.71
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 2.498%                                                                                          
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 16.4 threads being active per cycle. This is further reduced    
+          to 15.7 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst       185.97
+    Executed Instructions                           inst        29755
+    Avg. Issued Instructions Per Scheduler          inst       194.97
+    Issued Instructions                             inst        31195
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                     40
+    Registers Per Thread             register/thread              22
+    Shared Memory Configuration Size           Kbyte           32.77
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread            5120
+    Waves Per SM                                                0.12
+    -------------------------------- --------------- ---------------
+
+    OPT   If you execute __syncthreads() to synchronize the threads of a block, it is recommended to have more than the 
+          achieved 1 blocks per multiprocessor. This way, blocks that aren't waiting for __syncthreads() can keep the   
+          hardware busy.                                                                                                
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           16
+    Block Limit Registers                 block           21
+    Block Limit Shared Mem                block           16
+    Block Limit Warps                     block            8
+    Theoretical Active Warps per SM        warp           32
+    Theoretical Occupancy                     %          100
+    Achieved Occupancy                        %        12.20
+    Achieved Active Warps Per SM           warp         3.90
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 78.13%                                                                                          
+          The difference between calculated theoretical (100.0%) and measured achieved occupancy (12.2%) can be the     
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.11
+    Branch Instructions              inst         3201
+    Branch Efficiency                   %        63.81
+    Avg. Divergent Branches                       2.93
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 15.09%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 4534 excessive sectors (34% of the total  
+          13506 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations.     
+          The CUDA Programming Guide                                                                                    
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void build_hashtable_c<(int)128, (int)4>(int *, int *, int *, int, int *, int) (586, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/nsecond         4.47
+    SM Frequency            cycle/usecond       524.42
+    Elapsed Cycles                  cycle        18096
+    Memory Throughput                   %        75.16
+    DRAM Throughput                     %        75.16
+    Duration                      usecond        34.50
+    L1/TEX Cache Throughput             %        53.59
+    L2 Cache Throughput                 %        24.16
+    SM Active Cycles                cycle     16058.20
+    Compute (SM) Throughput             %        15.49
+    ----------------------- ------------- ------------
+
+    OPT   Memory is more heavily utilized than Compute: Look at the Memory Workload Analysis section to identify the    
+          DRAM bottleneck. Check memory replay (coalescing) metrics to make sure you're efficiently utilizing the       
+          bytes transferred. Also consider whether it is possible to do more work per memory access (kernel fusion) or  
+          whether there are values you can (re)compute.                                                                 
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 32:1. The kernel achieved 0% of 
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.69
+    Executed Ipc Elapsed  inst/cycle         0.61
+    Issue Slots Busy               %        17.45
+    Issued Ipc Active     inst/cycle         0.70
+    SM Busy                        %        17.45
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 88.69%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second       215.07
+    Mem Busy                     %        24.16
+    Max Bandwidth                %        75.16
+    L1/TEX Hit Rate              %         0.29
+    L2 Hit Rate                  %        23.64
+    Mem Pipes Busy               %        13.60
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 0.3062%                                                                                         
+          The memory access pattern for global stores in L1TEX might not be optimal. On average, this kernel accesses   
+          4.0 bytes per thread per memory request; but the address pattern, possibly caused by the stride between       
+          threads, results in 4.8 sectors per request, or 4.8*32 = 155.0 bytes of cache data transfers per request.     
+          The optimal thread address pattern for 4.0 byte accesses would result in 4.0*32 = 128.0 bytes of cache data   
+          transfers per request, to maximize L1TEX cache performance. Check the Source Counters section for             
+          uncoalesced global stores.                                                                                    
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 2.3%                                                                                            
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 2.3 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        15.75
+    Issued Warp Per Scheduler                        0.16
+    No Eligible                            %        84.25
+    Active Warps Per Scheduler          warp         5.91
+    Eligible Warps Per Scheduler        warp         0.22
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 24.84%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 6.3 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 8 warps per scheduler, this kernel allocates an average of    
+          5.91 active warps per scheduler, but only an average of 0.22 warps were eligible per cycle. Eligible warps    
+          are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible   
+          warp results in no instruction being issued and the issue slot remains unused. To increase the number of      
+          eligible warps, reduce the time the active warps are stalled by inspecting the top stall reasons on the Warp  
+          State Statistics and Source Counters sections.                                                                
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        37.55
+    Warp Cycles Per Executed Instruction           cycle        38.20
+    Avg. Active Threads Per Warp                                16.28
+    Avg. Not Predicated Off Threads Per Warp                    15.59
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 24.84%                                                                                          
+          On average, each warp of this kernel spends 16.0 cycles being stalled waiting for a scoreboard dependency on  
+          a L1TEX (local, global, surface, texture) operation. Find the instruction producing the data being waited     
+          upon to identify the culprit. To reduce the number of cycles waiting on L1TEX data accesses verify the        
+          memory access patterns are optimal for the target architecture, attempt to increase cache hit rates by        
+          increasing data locality (coalescing), or by changing the cache configuration. Consider moving frequently     
+          used data to shared memory. This stall type represents about 42.7% of the total average of 37.5 cycles        
+          between issuing two instructions.                                                                             
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 7.941%                                                                                          
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 16.3 threads being active per cycle. This is further reduced    
+          to 15.6 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst      2754.01
+    Executed Instructions                           inst       440642
+    Avg. Issued Instructions Per Scheduler          inst      2802.07
+    Issued Instructions                             inst       448331
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                    586
+    Registers Per Thread             register/thread              22
+    Shared Memory Configuration Size           Kbyte           32.77
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread           75008
+    Waves Per SM                                                1.83
+    -------------------------------- --------------- ---------------
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           16
+    Block Limit Registers                 block           21
+    Block Limit Shared Mem                block           16
+    Block Limit Warps                     block            8
+    Theoretical Active Warps per SM        warp           32
+    Theoretical Occupancy                     %          100
+    Achieved Occupancy                        %        83.48
+    Achieved Active Warps Per SM           warp        26.71
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 16.52%                                                                                          
+          The difference between calculated theoretical (100.0%) and measured achieved occupancy (83.5%) can be the     
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.11
+    Branch Instructions              inst        46902
+    Branch Efficiency                   %        62.57
+    Avg. Divergent Branches                      43.93
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 29.36%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 68642 excessive sectors (34% of the total 
+          203290 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations.    
+          The CUDA Programming Guide                                                                                    
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void build_hashtable_d<(int)128, (int)4>(int *, int *, int, int *, int, int) (5, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/nsecond         4.84
+    SM Frequency            cycle/usecond       561.34
+    Elapsed Cycles                  cycle         3666
+    Memory Throughput                   %         3.09
+    DRAM Throughput                     %         3.09
+    Duration                      usecond         6.53
+    L1/TEX Cache Throughput             %        23.48
+    L2 Cache Throughput                 %         1.66
+    SM Active Cycles                cycle       288.23
+    Compute (SM) Throughput             %         0.66
+    ----------------------- ------------- ------------
+
+    OPT   This kernel grid is too small to fill the available resources on this device, resulting in only 0.0 full      
+          waves across all SMs. Look at Launch Statistics for more details.                                             
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 32:1. The kernel achieved 0% of 
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.32
+    Executed Ipc Elapsed  inst/cycle         0.03
+    Issue Slots Busy               %         8.43
+    Issued Ipc Active     inst/cycle         0.34
+    SM Busy                        %         8.43
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 94.59%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second         9.57
+    Mem Busy                     %         1.40
+    Max Bandwidth                %         3.09
+    L1/TEX Hit Rate              %         2.07
+    L2 Hit Rate                  %        50.25
+    Mem Pipes Busy               %         0.43
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 0.0448%                                                                                         
+          The memory access pattern for global stores in L1TEX might not be optimal. On average, this kernel accesses   
+          4.0 bytes per thread per memory request; but the address pattern, possibly caused by the stride between       
+          threads, results in 9.0 sectors per request, or 9.0*32 = 289.4 bytes of cache data transfers per request.     
+          The optimal thread address pattern for 4.0 byte accesses would result in 4.0*32 = 128.0 bytes of cache data   
+          transfers per request, to maximize L1TEX cache performance. Check the Source Counters section for             
+          uncoalesced global stores.                                                                                    
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 0.1329%                                                                                         
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 2.5 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %         8.13
+    Issued Warp Per Scheduler                        0.08
+    No Eligible                            %        91.87
+    Active Warps Per Scheduler          warp         0.94
+    Eligible Warps Per Scheduler        warp         0.08
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 91.87%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 12.3 cycles. This might leave hardware resources underutilized and may lead to    
+          less optimal performance. Out of the maximum of 8 warps per scheduler, this kernel allocates an average of    
+          0.94 active warps per scheduler, which already limits the scheduler to less than a warp per instruction.      
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        11.55
+    Warp Cycles Per Executed Instruction           cycle        12.11
+    Avg. Active Threads Per Warp                                31.82
+    Avg. Not Predicated Off Threads Per Warp                    28.46
+    ---------------------------------------- ----------- ------------
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst        23.18
+    Executed Instructions                           inst         3709
+    Avg. Issued Instructions Per Scheduler          inst        24.31
+    Issued Instructions                             inst         3889
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                      5
+    Registers Per Thread             register/thread              20
+    Shared Memory Configuration Size           Kbyte           32.77
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread             640
+    Waves Per SM                                                0.02
+    -------------------------------- --------------- ---------------
+
+    OPT   Est. Speedup: 87.5%                                                                                           
+          The grid for this launch is configured to execute only 5 blocks, which is less than the GPU's 40              
+          multiprocessors. This can underutilize some multiprocessors. If you do not intend to execute this kernel      
+          concurrently with other workloads, consider reducing the block size to have at least one block per            
+          multiprocessor or increase the size of the grid to fully utilize the available hardware resources. See the    
+          Hardware Model (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-hw-model)            
+          description for more details on launch configurations.                                                        
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           16
+    Block Limit Registers                 block           21
+    Block Limit Shared Mem                block           16
+    Block Limit Warps                     block            8
+    Theoretical Active Warps per SM        warp           32
+    Theoretical Occupancy                     %          100
+    Achieved Occupancy                        %        12.20
+    Achieved Active Warps Per SM           warp         3.90
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 87.8%                                                                                           
+          The difference between calculated theoretical (100.0%) and measured achieved occupancy (12.2%) can be the     
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.10
+    Branch Instructions              inst          373
+    Branch Efficiency                   %        97.40
+    Avg. Divergent Branches                       0.03
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 7.268%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 700 excessive sectors (37% of the total   
+          1888 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations. The  
+          CUDA Programming Guide                                                                                        
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void probe<(int)128, (int)4>(int *, int *, int *, int *, int, int *, int, int *, int, int *, int, int *) (117161, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/nsecond         4.99
+    SM Frequency            cycle/usecond       584.68
+    Elapsed Cycles                  cycle      4127440
+    Memory Throughput                   %        61.42
+    DRAM Throughput                     %        61.42
+    Duration                      msecond         7.06
+    L1/TEX Cache Throughput             %        89.48
+    L2 Cache Throughput                 %        43.87
+    SM Active Cycles                cycle   4126563.20
+    Compute (SM) Throughput             %        34.01
+    ----------------------- ------------- ------------
+
+    OPT   Memory is more heavily utilized than Compute: Look at the Memory Workload Analysis section to identify the    
+          DRAM bottleneck. Check memory replay (coalescing) metrics to make sure you're efficiently utilizing the       
+          bytes transferred. Also consider whether it is possible to do more work per memory access (kernel fusion) or  
+          whether there are values you can (re)compute.                                                                 
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 32:1. The kernel achieved 0% of 
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         1.36
+    Executed Ipc Elapsed  inst/cycle         1.36
+    Issue Slots Busy               %        34.02
+    Issued Ipc Active     inst/cycle         1.36
+    SM Busy                        %        34.02
+    -------------------- ----------- ------------
+
+    INF   FMA is the highest-utilized pipeline (24.2%) based on active cycles, taking into account the rates of its     
+          different instructions. It executes 32-bit floating point (FADD, FMUL, FMAD, ...) and integer (IMUL, IMAD)    
+          operations. It is well-utilized, but should not be a bottleneck.                                              
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second       196.26
+    Mem Busy                     %        43.87
+    Max Bandwidth                %        61.42
+    L1/TEX Hit Rate              %        13.58
+    L2 Hit Rate                  %        75.72
+    Mem Pipes Busy               %        20.78
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 0.9427%                                                                                         
+          The memory access pattern for global loads in L1TEX might not be optimal. On average, this kernel accesses    
+          5.4 bytes per thread per memory request; but the address pattern, possibly caused by the stride between       
+          threads, results in 7.5 sectors per request, or 7.5*32 = 240.4 bytes of cache data transfers per request.     
+          The optimal thread address pattern for 5.4 byte accesses would result in 5.4*32 = 174.0 bytes of cache data   
+          transfers per request, to maximize L1TEX cache performance. Check the Source Counters section for             
+          uncoalesced global loads.                                                                                     
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 28.87%                                                                                          
+          The memory access pattern for loads from L1TEX to L2 is not optimal. The granularity of an L1TEX request to   
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 1.2 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced loads and try to minimize how many cache lines need to be accessed per memory         
+          request.                                                                                                      
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 1.372%                                                                                          
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 1.1 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 16.01%                                                                                          
+          The memory access pattern for loads from device memory causes 27,495,148 sectors to be read from DRAM, which  
+          is 1.3x of the 20,561,003 sectors which cause a miss in the L2 cache. The DRAM fetch granularity for read     
+          misses in L2 is 64 bytes, i.e. the lower or upper half of an L2 cache line. Try changing your access pattern  
+          to make use of both sectors returned by a DRAM read request for optimal usage of the DRAM throughput. For     
+          strided memory reads, avoid strides of 64 bytes or larger to avoid moving unused sectors from DRAM to L2.     
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        34.02
+    Issued Warp Per Scheduler                        0.34
+    No Eligible                            %        65.98
+    Active Warps Per Scheduler          warp         7.15
+    Eligible Warps Per Scheduler        warp         0.53
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 38.58%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 2.9 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 8 warps per scheduler, this kernel allocates an average of    
+          7.15 active warps per scheduler, but only an average of 0.53 warps were eligible per cycle. Eligible warps    
+          are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible   
+          warp results in no instruction being issued and the issue slot remains unused. To increase the number of      
+          eligible warps, avoid possible load imbalances due to highly different execution durations per warp.          
+          Reducing stalls indicated on the Warp State Statistics and Source Counters sections can help, too.            
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        21.01
+    Warp Cycles Per Executed Instruction           cycle        21.01
+    Avg. Active Threads Per Warp                                12.66
+    Avg. Not Predicated Off Threads Per Warp                    11.12
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 38.58%                                                                                          
+          On average, each warp of this kernel spends 10.1 cycles being stalled waiting for a scoreboard dependency on  
+          a L1TEX (local, global, surface, texture) operation. Find the instruction producing the data being waited     
+          upon to identify the culprit. To reduce the number of cycles waiting on L1TEX data accesses verify the        
+          memory access patterns are optimal for the target architecture, attempt to increase cache hit rates by        
+          increasing data locality (coalescing), or by changing the cache configuration. Consider moving frequently     
+          used data to shared memory. This stall type represents about 48.0% of the total average of 21.0 cycles        
+          between issuing two instructions.                                                                             
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 22.19%                                                                                          
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 12.7 threads being active per cycle. This is further reduced    
+          to 11.1 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst   1403632.31
+    Executed Instructions                           inst    224581169
+    Avg. Issued Instructions Per Scheduler          inst   1403684.85
+    Issued Instructions                             inst    224589576
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                 117161
+    Registers Per Thread             register/thread              31
+    Shared Memory Configuration Size           Kbyte           32.77
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread        14996608
+    Waves Per SM                                              366.13
+    -------------------------------- --------------- ---------------
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           16
+    Block Limit Registers                 block           16
+    Block Limit Shared Mem                block           16
+    Block Limit Warps                     block            8
+    Theoretical Active Warps per SM        warp           32
+    Theoretical Occupancy                     %          100
+    Achieved Occupancy                        %        89.53
+    Achieved Active Warps Per SM           warp        28.65
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 10.47%                                                                                          
+          The difference between calculated theoretical (100.0%) and measured achieved occupancy (89.5%) can be the     
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.10
+    Branch Instructions              inst     21548967
+    Branch Efficiency                   %        55.37
+    Avg. Divergent Branches                   23206.33
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 61.78%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 58809051 excessive sectors (62% of the    
+          total 95109570 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source       
+          locations. The CUDA Programming Guide                                                                         
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
diff --git a/results/T4/crystal-opt/crystal_opt_q41_sf10.txt b/results/T4/crystal-opt/crystal_opt_q41_sf10.txt
new file mode 100644
index 0000000..7867ea6
--- /dev/null
+++ b/results/T4/crystal-opt/crystal_opt_q41_sf10.txt
@@ -0,0 +1,952 @@
+==PROF== Connected to process 27721 (/home/ubuntu/fff/cmake-build-release-g4dn/gpu/crystal-opt/src/crystal_opt_q41)
+Using device 0: Tesla T4 (PTX version 750, SM750, 40 SMs, 14802 free / 14929 total MB physmem, 320.064 GB/s @ 5001000 kHz mem clock, ECC on)
+** LOADED DATA **
+** LOADED DATA TO GPU **
+==PROF== Profiling "build_hashtable_s" - 0: 0%....50%....100% - 35 passes
+==PROF== Profiling "build_hashtable_c" - 1: 0%....50%....100% - 35 passes
+==PROF== Profiling "build_hashtable_p" - 2: 0%....50%....100% - 35 passes
+==PROF== Profiling "build_hashtable_d" - 3: 0%....50%....100% - 35 passes
+==PROF== Profiling "probe" - 4: 0%....50%....100% - 35 passes
+Result:
+1992 1 103225040658
+1993 1 105193302842
+1994 1 103837804124
+1995 1 103659981621
+1996 1 103616722233
+1997 1 103157005314
+1998 1 61159340206
+1992 2 106678259181
+1993 2 105849020253
+1994 2 106216978529
+1995 2 107035371791
+1996 2 105292362331
+1997 2 105381211263
+1998 2 61616122837
+1992 3 106953585129
+1993 3 106242432020
+1994 3 105405953212
+1995 3 106496045663
+1996 3 106452120723
+1997 3 106618275297
+1998 3 61766210322
+1992 17 103623138817
+1993 17 104974876956
+1994 17 103731557899
+1995 17 103730419480
+1996 17 104874194133
+1997 17 102847514868
+1998 17 61002354487
+1992 24 106223564390
+1993 24 105649036141
+1994 24 106076726307
+1995 24 105177111217
+1996 24 103976579696
+1997 24 104638539353
+1998 24 60962148771
+Res Count: 35
+Time Taken Total: 31535.2
+{"query":41,"time_query":31535.2}
+==PROF== Disconnected from process 27721
+[27721] crystal_opt_q41@127.0.0.1
+  void build_hashtable_s<(int)128, (int)4>(int *, int *, int, int *, int) (40, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/nsecond         4.74
+    SM Frequency            cycle/usecond       553.28
+    Elapsed Cycles                  cycle         3656
+    Memory Throughput                   %        15.58
+    DRAM Throughput                     %        15.58
+    Duration                      usecond         6.59
+    L1/TEX Cache Throughput             %         8.38
+    L2 Cache Throughput                 %         4.73
+    SM Active Cycles                cycle      2399.03
+    Compute (SM) Throughput             %         4.71
+    ----------------------- ------------- ------------
+
+    OPT   This kernel grid is too small to fill the available resources on this device, resulting in only 0.1 full      
+          waves across all SMs. Look at Launch Statistics for more details.                                             
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 32:1. The kernel achieved 0% of 
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.27
+    Executed Ipc Elapsed  inst/cycle         0.18
+    Issue Slots Busy               %         7.17
+    Issued Ipc Active     inst/cycle         0.29
+    SM Busy                        %         7.17
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 95.44%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second        47.23
+    Mem Busy                     %         4.73
+    Max Bandwidth                %        15.58
+    L1/TEX Hit Rate              %            0
+    L2 Hit Rate                  %        13.52
+    Mem Pipes Busy               %         2.82
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %         7.06
+    Issued Warp Per Scheduler                        0.07
+    No Eligible                            %        92.94
+    Active Warps Per Scheduler          warp         0.95
+    Eligible Warps Per Scheduler        warp         0.07
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 84.42%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 14.2 cycles. This might leave hardware resources underutilized and may lead to    
+          less optimal performance. Out of the maximum of 8 warps per scheduler, this kernel allocates an average of    
+          0.95 active warps per scheduler, which already limits the scheduler to less than a warp per instruction.      
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        13.43
+    Warp Cycles Per Executed Instruction           cycle        14.17
+    Avg. Active Threads Per Warp                                15.91
+    Avg. Not Predicated Off Threads Per Warp                    15.07
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 2.493%                                                                                          
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 15.9 threads being active per cycle. This is further reduced    
+          to 15.1 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst       162.91
+    Executed Instructions                           inst        26066
+    Avg. Issued Instructions Per Scheduler          inst       171.91
+    Issued Instructions                             inst        27506
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                     40
+    Registers Per Thread             register/thread              20
+    Shared Memory Configuration Size           Kbyte           32.77
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread            5120
+    Waves Per SM                                                0.12
+    -------------------------------- --------------- ---------------
+
+    OPT   If you execute __syncthreads() to synchronize the threads of a block, it is recommended to have more than the 
+          achieved 1 blocks per multiprocessor. This way, blocks that aren't waiting for __syncthreads() can keep the   
+          hardware busy.                                                                                                
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           16
+    Block Limit Registers                 block           21
+    Block Limit Shared Mem                block           16
+    Block Limit Warps                     block            8
+    Theoretical Active Warps per SM        warp           32
+    Theoretical Occupancy                     %          100
+    Achieved Occupancy                        %        12.25
+    Achieved Active Warps Per SM           warp         3.92
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 84.42%                                                                                          
+          The difference between calculated theoretical (100.0%) and measured achieved occupancy (12.2%) can be the     
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.11
+    Branch Instructions              inst         2876
+    Branch Efficiency                   %        58.57
+    Avg. Divergent Branches                       2.93
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 8.811%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 1441 excessive sectors (20% of the total  
+          7183 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations. The  
+          CUDA Programming Guide                                                                                        
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void build_hashtable_c<(int)128, (int)4>(int *, int *, int *, int, int *, int) (586, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/nsecond         4.51
+    SM Frequency            cycle/usecond       529.43
+    Elapsed Cycles                  cycle        17778
+    Memory Throughput                   %        76.24
+    DRAM Throughput                     %        76.24
+    Duration                      usecond        33.57
+    L1/TEX Cache Throughput             %        54.22
+    L2 Cache Throughput                 %        24.50
+    SM Active Cycles                cycle     16029.67
+    Compute (SM) Throughput             %        15.76
+    ----------------------- ------------- ------------
+
+    OPT   Memory is more heavily utilized than Compute: Look at the Memory Workload Analysis section to identify the    
+          DRAM bottleneck. Check memory replay (coalescing) metrics to make sure you're efficiently utilizing the       
+          bytes transferred. Also consider whether it is possible to do more work per memory access (kernel fusion) or  
+          whether there are values you can (re)compute.                                                                 
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 32:1. The kernel achieved 0% of 
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.69
+    Executed Ipc Elapsed  inst/cycle         0.62
+    Issue Slots Busy               %        17.48
+    Issued Ipc Active     inst/cycle         0.70
+    SM Busy                        %        17.48
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 88.68%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second       220.28
+    Mem Busy                     %        24.50
+    Max Bandwidth                %        76.24
+    L1/TEX Hit Rate              %         0.24
+    L2 Hit Rate                  %        23.57
+    Mem Pipes Busy               %        13.84
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 0.3007%                                                                                         
+          The memory access pattern for global stores in L1TEX might not be optimal. On average, this kernel accesses   
+          4.0 bytes per thread per memory request; but the address pattern, possibly caused by the stride between       
+          threads, results in 4.8 sectors per request, or 4.8*32 = 154.0 bytes of cache data transfers per request.     
+          The optimal thread address pattern for 4.0 byte accesses would result in 4.0*32 = 128.0 bytes of cache data   
+          transfers per request, to maximize L1TEX cache performance. Check the Source Counters section for             
+          uncoalesced global stores.                                                                                    
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 2.324%                                                                                          
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 2.3 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        15.75
+    Issued Warp Per Scheduler                        0.16
+    No Eligible                            %        84.25
+    Active Warps Per Scheduler          warp         6.03
+    Eligible Warps Per Scheduler        warp         0.22
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 23.76%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 6.4 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 8 warps per scheduler, this kernel allocates an average of    
+          6.03 active warps per scheduler, but only an average of 0.22 warps were eligible per cycle. Eligible warps    
+          are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible   
+          warp results in no instruction being issued and the issue slot remains unused. To increase the number of      
+          eligible warps, reduce the time the active warps are stalled by inspecting the top stall reasons on the Warp  
+          State Statistics and Source Counters sections.                                                                
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        38.27
+    Warp Cycles Per Executed Instruction           cycle        38.94
+    Avg. Active Threads Per Warp                                16.24
+    Avg. Not Predicated Off Threads Per Warp                    15.56
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 23.76%                                                                                          
+          On average, each warp of this kernel spends 15.9 cycles being stalled waiting for a scoreboard dependency on  
+          a L1TEX (local, global, surface, texture) operation. Find the instruction producing the data being waited     
+          upon to identify the culprit. To reduce the number of cycles waiting on L1TEX data accesses verify the        
+          memory access patterns are optimal for the target architecture, attempt to increase cache hit rates by        
+          increasing data locality (coalescing), or by changing the cache configuration. Consider moving frequently     
+          used data to shared memory. This stall type represents about 41.5% of the total average of 38.3 cycles        
+          between issuing two instructions.                                                                             
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 8.099%                                                                                          
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 16.2 threads being active per cycle. This is further reduced    
+          to 15.6 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst      2753.09
+    Executed Instructions                           inst       440494
+    Avg. Issued Instructions Per Scheduler          inst      2801.41
+    Issued Instructions                             inst       448225
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                    586
+    Registers Per Thread             register/thread              22
+    Shared Memory Configuration Size           Kbyte           32.77
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread           75008
+    Waves Per SM                                                1.83
+    -------------------------------- --------------- ---------------
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           16
+    Block Limit Registers                 block           21
+    Block Limit Shared Mem                block           16
+    Block Limit Warps                     block            8
+    Theoretical Active Warps per SM        warp           32
+    Theoretical Occupancy                     %          100
+    Achieved Occupancy                        %        83.41
+    Achieved Active Warps Per SM           warp        26.69
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 16.59%                                                                                          
+          The difference between calculated theoretical (100.0%) and measured achieved occupancy (83.4%) can be the     
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.11
+    Branch Instructions              inst        46897
+    Branch Efficiency                   %        62.60
+    Avg. Divergent Branches                      43.90
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 30.36%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 68196 excessive sectors (34% of the total 
+          202634 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations.    
+          The CUDA Programming Guide                                                                                    
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void build_hashtable_p<(int)128, (int)4>(int *, int *, int, int *, int) (1563, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/nsecond         4.60
+    SM Frequency            cycle/usecond       538.89
+    Elapsed Cycles                  cycle        28184
+    Memory Throughput                   %        84.58
+    DRAM Throughput                     %        84.58
+    Duration                      usecond        52.29
+    L1/TEX Cache Throughput             %        53.87
+    L2 Cache Throughput                 %        24.03
+    SM Active Cycles                cycle     26550.40
+    Compute (SM) Throughput             %        25.27
+    ----------------------- ------------- ------------
+
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing DRAM in the Memory Workload Analysis section.                                              
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 32:1. The kernel achieved 0% of 
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         1.07
+    Executed Ipc Elapsed  inst/cycle         1.00
+    Issue Slots Busy               %        26.82
+    Issued Ipc Active     inst/cycle         1.07
+    SM Busy                        %        26.82
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 82.05%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second       249.14
+    Mem Busy                     %        23.48
+    Max Bandwidth                %        84.58
+    L1/TEX Hit Rate              %            0
+    L2 Hit Rate                  %         5.52
+    Mem Pipes Busy               %        14.42
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        24.61
+    Issued Warp Per Scheduler                        0.25
+    No Eligible                            %        75.39
+    Active Warps Per Scheduler          warp         6.67
+    Eligible Warps Per Scheduler        warp         0.35
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 15.42%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 4.1 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 8 warps per scheduler, this kernel allocates an average of    
+          6.67 active warps per scheduler, but only an average of 0.35 warps were eligible per cycle. Eligible warps    
+          are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible   
+          warp results in no instruction being issued and the issue slot remains unused. To increase the number of      
+          eligible warps, avoid possible load imbalances due to highly different execution durations per warp.          
+          Reducing stalls indicated on the Warp State Statistics and Source Counters sections can help, too.            
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        27.12
+    Warp Cycles Per Executed Instruction           cycle        27.30
+    Avg. Active Threads Per Warp                                20.65
+    Avg. Not Predicated Off Threads Per Warp                    19.23
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 15.42%                                                                                          
+          On average, each warp of this kernel spends 12.9 cycles being stalled waiting for a scoreboard dependency on  
+          a L1TEX (local, global, surface, texture) operation. Find the instruction producing the data being waited     
+          upon to identify the culprit. To reduce the number of cycles waiting on L1TEX data accesses verify the        
+          memory access patterns are optimal for the target architecture, attempt to increase cache hit rates by        
+          increasing data locality (coalescing), or by changing the cache configuration. Consider moving frequently     
+          used data to shared memory. This stall type represents about 47.7% of the total average of 27.1 cycles        
+          between issuing two instructions.                                                                             
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 10.08%                                                                                          
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 20.6 threads being active per cycle. This is further reduced    
+          to 19.2 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst      7071.98
+    Executed Instructions                           inst      1131516
+    Avg. Issued Instructions Per Scheduler          inst      7119.83
+    Issued Instructions                             inst      1139173
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                   1563
+    Registers Per Thread             register/thread              20
+    Shared Memory Configuration Size           Kbyte           32.77
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread          200064
+    Waves Per SM                                                4.88
+    -------------------------------- --------------- ---------------
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           16
+    Block Limit Registers                 block           21
+    Block Limit Shared Mem                block           16
+    Block Limit Warps                     block            8
+    Theoretical Active Warps per SM        warp           32
+    Theoretical Occupancy                     %          100
+    Achieved Occupancy                        %        91.76
+    Achieved Active Warps Per SM           warp        29.36
+    ------------------------------- ----------- ------------
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.11
+    Branch Instructions              inst       125048
+    Branch Efficiency                   %        62.52
+    Avg. Divergent Branches                     117.20
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 17.37%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 57145 excessive sectors (19% of the total 
+          308036 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations.    
+          The CUDA Programming Guide                                                                                    
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void build_hashtable_d<(int)128, (int)4>(int *, int *, int, int *, int, int) (5, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/nsecond         4.74
+    SM Frequency            cycle/usecond       552.80
+    Elapsed Cycles                  cycle         3275
+    Memory Throughput                   %         3.72
+    DRAM Throughput                     %         3.72
+    Duration                      usecond         5.92
+    L1/TEX Cache Throughput             %        30.34
+    L2 Cache Throughput                 %         2.11
+    SM Active Cycles                cycle       253.03
+    Compute (SM) Throughput             %         0.56
+    ----------------------- ------------- ------------
+
+    OPT   This kernel grid is too small to fill the available resources on this device, resulting in only 0.0 full      
+          waves across all SMs. Look at Launch Statistics for more details.                                             
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 32:1. The kernel achieved 0% of 
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.27
+    Executed Ipc Elapsed  inst/cycle         0.02
+    Issue Slots Busy               %         7.28
+    Issued Ipc Active     inst/cycle         0.29
+    SM Busy                        %         7.28
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 95.04%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second        11.26
+    Mem Busy                     %         1.69
+    Max Bandwidth                %         3.72
+    L1/TEX Hit Rate              %         4.41
+    L2 Hit Rate                  %        50.27
+    Mem Pipes Busy               %         0.52
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 0.05564%                                                                                        
+          The memory access pattern for global stores in L1TEX might not be optimal. On average, this kernel accesses   
+          4.0 bytes per thread per memory request; but the address pattern, possibly caused by the stride between       
+          threads, results in 9.1 sectors per request, or 9.1*32 = 290.4 bytes of cache data transfers per request.     
+          The optimal thread address pattern for 4.0 byte accesses would result in 4.0*32 = 128.0 bytes of cache data   
+          transfers per request, to maximize L1TEX cache performance. Check the Source Counters section for             
+          uncoalesced global stores.                                                                                    
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 0.1363%                                                                                         
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 2.8 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %         6.91
+    Issued Warp Per Scheduler                        0.07
+    No Eligible                            %        93.09
+    Active Warps Per Scheduler          warp         0.91
+    Eligible Warps Per Scheduler        warp         0.07
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 93.09%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 14.5 cycles. This might leave hardware resources underutilized and may lead to    
+          less optimal performance. Out of the maximum of 8 warps per scheduler, this kernel allocates an average of    
+          0.91 active warps per scheduler, which already limits the scheduler to less than a warp per instruction.      
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        13.12
+    Warp Cycles Per Executed Instruction           cycle        13.89
+    Avg. Active Threads Per Warp                                31.92
+    Avg. Not Predicated Off Threads Per Warp                    28.58
+    ---------------------------------------- ----------- ------------
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst        17.39
+    Executed Instructions                           inst         2782
+    Avg. Issued Instructions Per Scheduler          inst        18.41
+    Issued Instructions                             inst         2946
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                      5
+    Registers Per Thread             register/thread              26
+    Shared Memory Configuration Size           Kbyte           32.77
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread             640
+    Waves Per SM                                                0.02
+    -------------------------------- --------------- ---------------
+
+    OPT   Est. Speedup: 87.5%                                                                                           
+          The grid for this launch is configured to execute only 5 blocks, which is less than the GPU's 40              
+          multiprocessors. This can underutilize some multiprocessors. If you do not intend to execute this kernel      
+          concurrently with other workloads, consider reducing the block size to have at least one block per            
+          multiprocessor or increase the size of the grid to fully utilize the available hardware resources. See the    
+          Hardware Model (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-hw-model)            
+          description for more details on launch configurations.                                                        
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           16
+    Block Limit Registers                 block           16
+    Block Limit Shared Mem                block           16
+    Block Limit Warps                     block            8
+    Theoretical Active Warps per SM        warp           32
+    Theoretical Occupancy                     %          100
+    Achieved Occupancy                        %        12.08
+    Achieved Active Warps Per SM           warp         3.87
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 87.92%                                                                                          
+          The difference between calculated theoretical (100.0%) and measured achieved occupancy (12.1%) can be the     
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.06
+    Branch Instructions              inst          166
+    Branch Efficiency                   %        97.73
+    Avg. Divergent Branches                       0.01
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 8.688%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 812 excessive sectors (39% of the total   
+          2092 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations. The  
+          CUDA Programming Guide                                                                                        
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void probe<(int)128, (int)4>(int *, int *, int *, int *, int *, int *, int, int *, int, int *, int, int *, int, int *, int, int *) (117161, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/nsecond         4.99
+    SM Frequency            cycle/usecond       584.66
+    Elapsed Cycles                  cycle      4467432
+    Memory Throughput                   %        76.73
+    DRAM Throughput                     %        76.73
+    Duration                      msecond         7.64
+    L1/TEX Cache Throughput             %        64.50
+    L2 Cache Throughput                 %        34.53
+    SM Active Cycles                cycle   4464809.08
+    Compute (SM) Throughput             %        34.02
+    ----------------------- ------------- ------------
+
+    OPT   Memory is more heavily utilized than Compute: Look at the Memory Workload Analysis section to identify the    
+          DRAM bottleneck. Check memory replay (coalescing) metrics to make sure you're efficiently utilizing the       
+          bytes transferred. Also consider whether it is possible to do more work per memory access (kernel fusion) or  
+          whether there are values you can (re)compute.                                                                 
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 32:1. The kernel achieved 0% of 
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         1.36
+    Executed Ipc Elapsed  inst/cycle         1.36
+    Issue Slots Busy               %        34.04
+    Issued Ipc Active     inst/cycle         1.36
+    SM Busy                        %        34.04
+    -------------------- ----------- ------------
+
+    INF   FMA is the highest-utilized pipeline (23.1%) based on active cycles, taking into account the rates of its     
+          different instructions. It executes 32-bit floating point (FADD, FMUL, FMAD, ...) and integer (IMUL, IMAD)    
+          operations. It is well-utilized, but should not be a bottleneck.                                              
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second       245.21
+    Mem Busy                     %        34.53
+    Max Bandwidth                %        76.73
+    L1/TEX Hit Rate              %        23.23
+    L2 Hit Rate                  %        65.63
+    Mem Pipes Busy               %        20.69
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 0.899%                                                                                          
+          The memory access pattern for global loads in L1TEX might not be optimal. On average, this kernel accesses    
+          4.7 bytes per thread per memory request; but the address pattern, possibly caused by the stride between       
+          threads, results in 6.7 sectors per request, or 6.7*32 = 214.5 bytes of cache data transfers per request.     
+          The optimal thread address pattern for 4.7 byte accesses would result in 4.7*32 = 151.2 bytes of cache data   
+          transfers per request, to maximize L1TEX cache performance. Check the Source Counters section for             
+          uncoalesced global loads.                                                                                     
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 23.47%                                                                                          
+          The memory access pattern for loads from L1TEX to L2 is not optimal. The granularity of an L1TEX request to   
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 1.2 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced loads and try to minimize how many cache lines need to be accessed per memory         
+          request.                                                                                                      
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 0.3515%                                                                                         
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 1.1 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 27.11%                                                                                          
+          The memory access pattern for loads from device memory causes 35,928,800 sectors to be read from DRAM, which  
+          is 1.4x of the 24,796,646 sectors which cause a miss in the L2 cache. The DRAM fetch granularity for read     
+          misses in L2 is 64 bytes, i.e. the lower or upper half of an L2 cache line. Try changing your access pattern  
+          to make use of both sectors returned by a DRAM read request for optimal usage of the DRAM throughput. For     
+          strided memory reads, avoid strides of 64 bytes or larger to avoid moving unused sectors from DRAM to L2.     
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        34.03
+    Issued Warp Per Scheduler                        0.34
+    No Eligible                            %        65.97
+    Active Warps Per Scheduler          warp         6.93
+    Eligible Warps Per Scheduler        warp         0.50
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 23.27%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 2.9 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 8 warps per scheduler, this kernel allocates an average of    
+          6.93 active warps per scheduler, but only an average of 0.50 warps were eligible per cycle. Eligible warps    
+          are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible   
+          warp results in no instruction being issued and the issue slot remains unused. To increase the number of      
+          eligible warps, avoid possible load imbalances due to highly different execution durations per warp.          
+          Reducing stalls indicated on the Warp State Statistics and Source Counters sections can help, too.            
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        20.37
+    Warp Cycles Per Executed Instruction           cycle        20.37
+    Avg. Active Threads Per Warp                                10.80
+    Avg. Not Predicated Off Threads Per Warp                     9.44
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 23.27%                                                                                          
+          On average, each warp of this kernel spends 11.8 cycles being stalled waiting for a scoreboard dependency on  
+          a L1TEX (local, global, surface, texture) operation. Find the instruction producing the data being waited     
+          upon to identify the culprit. To reduce the number of cycles waiting on L1TEX data accesses verify the        
+          memory access patterns are optimal for the target architecture, attempt to increase cache hit rates by        
+          increasing data locality (coalescing), or by changing the cache configuration. Consider moving frequently     
+          used data to shared memory. This stall type represents about 57.9% of the total average of 20.4 cycles        
+          between issuing two instructions.                                                                             
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 23.99%                                                                                          
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 10.8 threads being active per cycle. This is further reduced    
+          to 9.4 threads per warp due to predication. The compiler may use predication to avoid an actual branch.       
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst   1519846.33
+    Executed Instructions                           inst    243175413
+    Avg. Issued Instructions Per Scheduler          inst   1519905.49
+    Issued Instructions                             inst    243184879
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                 117161
+    Registers Per Thread             register/thread              29
+    Shared Memory Configuration Size           Kbyte           32.77
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread        14996608
+    Waves Per SM                                              366.13
+    -------------------------------- --------------- ---------------
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           16
+    Block Limit Registers                 block           16
+    Block Limit Shared Mem                block           16
+    Block Limit Warps                     block            8
+    Theoretical Active Warps per SM        warp           32
+    Theoretical Occupancy                     %          100
+    Achieved Occupancy                        %        86.90
+    Achieved Active Warps Per SM           warp        27.81
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 13.1%                                                                                           
+          The difference between calculated theoretical (100.0%) and measured achieved occupancy (86.9%) can be the     
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.11
+    Branch Instructions              inst     26599495
+    Branch Efficiency                   %        64.43
+    Avg. Divergent Branches                   24123.38
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 69.31%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 63822318 excessive sectors (69% of the    
+          total 92036223 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source       
+          locations. The CUDA Programming Guide                                                                         
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
diff --git a/results/T4/crystal/crystal_q21.txt b/results/T4/crystal/crystal_q21.txt
new file mode 100644
index 0000000..62fde9f
--- /dev/null
+++ b/results/T4/crystal/crystal_q21.txt
@@ -0,0 +1,1008 @@
+==PROF== Connected to process 24377 (/home/ubuntu/fff/cmake-build-release-g4dn/gpu/crystal/src/crystal_q21)
+Using device 0: Tesla T4 (PTX version 750, SM750, 40 SMs, 14802 free / 14929 total MB physmem, 320.064 GB/s @ 5001000 kHz mem clock, ECC on)
+==PROF== Profiling "build_hashtable_s" - 0: 0%....50%....100% - 35 passes
+==PROF== Profiling "build_hashtable_p" - 1: 0%....50%....100% - 35 passes
+==PROF== Profiling "build_hashtable_d" - 2: 0%....50%....100% - 35 passes
+==PROF== Profiling "probe" - 3: 0%....50%....100% - 35 passes
+1992 40 6574868694
+1993 40 6952043914
+1994 40 6525239576
+1995 40 6764559245
+1996 40 6725548424
+1997 40 6596102991
+1998 40 3988851825
+1992 41 7047701749
+1993 41 6909841940
+1994 41 6978800980
+1995 41 7036474627
+1996 41 7233045193
+1997 41 6938053628
+1998 41 4065391978
+1992 42 6450484539
+1993 42 6886094182
+1994 42 6852294265
+1995 42 6749813918
+1996 42 6568551778
+1997 42 6845017761
+1998 42 3773836113
+1992 43 6918393482
+1993 43 6621428714
+1994 43 7068738463
+1995 43 6820930145
+1996 43 6762634261
+1997 43 6849537060
+1998 43 3882704011
+1992 44 6343659176
+1993 44 6094791212
+1994 44 6661136530
+1995 44 6085276694
+1996 44 6176324016
+1997 44 6315911460
+1998 44 3925731952
+1992 45 6499025385
+1993 45 6779833973
+1994 45 6435942251
+1995 45 6738626764
+1996 45 6763207154
+1997 45 6889101910
+1998 45 3879170338
+1992 46 6833102567
+1993 46 7017493760
+1994 46 7015998639
+1995 46 6897957727
+1996 46 6948998143
+1997 46 6510502742
+1998 46 3911656234
+1992 47 6922095842
+1993 47 7061777324
+1994 47 6877252420
+1995 47 6575484550
+1996 47 6517266740
+1997 47 6651228318
+1998 47 3835254989
+1992 48 6818173454
+1993 48 6961952133
+1994 48 7051587760
+1995 48 7329421356
+1996 48 7164243172
+1997 48 7052687209
+1998 48 4132526586
+1992 49 6907633511
+1993 49 6614194460
+1994 49 6773107666
+1995 49 6954065693
+1996 49 6747336514
+1997 49 6947116463
+1998 49 3906763122
+1992 50 7098282117
+1993 50 7263350231
+1994 50 7199754789
+1995 50 7246399314
+1996 50 6860318803
+1997 50 7184653230
+1998 50 4293359981
+1992 51 7474015795
+1993 51 7031859249
+1994 51 6749353264
+1995 51 7395439319
+1996 51 7118371952
+1997 51 7427932834
+1998 51 4080129102
+1992 52 7001985495
+1993 52 6734276751
+1994 52 6965715192
+1995 52 6934765252
+1996 52 6895454124
+1997 52 6802928999
+1998 52 3916065107
+1992 53 6531087764
+1993 53 6258171804
+1994 53 6197787972
+1995 53 6605279401
+1996 53 6722321819
+1997 53 6879971631
+1998 53 3561102555
+1992 54 7041216650
+1993 54 6601732879
+1994 54 6737632272
+1995 54 6483760392
+1996 54 6778740509
+1997 54 6950964366
+1998 54 3960525994
+1992 55 7034325953
+1993 55 7070112383
+1994 55 6835473512
+1995 55 6681873420
+1996 55 6755919599
+1997 55 6883879790
+1998 55 3842444977
+1992 56 6672842875
+1993 56 6362926487
+1994 56 6787572691
+1995 56 6941448166
+1996 56 6349041382
+1997 56 6831022793
+1998 56 3750580610
+1992 57 6762940511
+1993 57 6200194110
+1994 57 6360354225
+1995 57 6799718937
+1996 57 6500504812
+1997 57 6464594869
+1998 57 3690857660
+1992 58 6367358727
+1993 58 6519991362
+1994 58 6228367674
+1995 58 6522760927
+1996 58 6043428578
+1997 58 6386892483
+1998 58 3888948778
+1992 59 6542091138
+1993 59 6669384898
+1994 59 6566921738
+1995 59 6725584633
+1996 59 6678854924
+1997 59 6518974991
+1998 59 3661443815
+1992 60 7397021390
+1993 60 6985315570
+1994 60 7171226221
+1995 60 7409511342
+1996 60 7217054942
+1997 60 7241219598
+1998 60 4134876965
+1992 61 6439487815
+1993 61 6190501096
+1994 61 6658242784
+1995 61 6300444895
+1996 61 6394989839
+1997 61 6372986872
+1998 61 3692782928
+1992 62 7142709582
+1993 62 6575099186
+1994 62 6577906605
+1995 62 6758016505
+1996 62 6713821475
+1997 62 7061699626
+1998 62 3911733232
+1992 63 6684932832
+1993 63 6784872415
+1994 63 6771692541
+1995 63 6832689629
+1996 63 6769695502
+1997 63 6801959247
+1998 63 3916910435
+1992 64 6403427844
+1993 64 6686657397
+1994 64 6560285004
+1995 64 6654877138
+1996 64 6403809726
+1997 64 6364910756
+1998 64 3757788047
+1992 65 6800534485
+1993 65 6932192888
+1994 65 6599703796
+1995 65 6950320978
+1996 65 6745507185
+1997 65 6965554062
+1998 65 3856421228
+1992 66 6608507118
+1993 66 6720022834
+1994 66 7249477139
+1995 66 6982989122
+1996 66 6895681155
+1997 66 7131587724
+1998 66 4050936159
+1992 67 6789994724
+1993 67 7034832635
+1994 67 6533866956
+1995 67 7089400123
+1996 67 6950690822
+1997 67 6872602250
+1998 67 3798832673
+1992 68 6761138392
+1993 68 7117328614
+1994 68 7003067656
+1995 68 6916376148
+1996 68 6810961498
+1997 68 6421432868
+1998 68 4365901362
+1992 69 6333970291
+1993 69 6591672386
+1994 69 6491372066
+1995 69 6759048824
+1996 69 6636341404
+1997 69 6396375726
+1998 69 3755850783
+1992 70 6863351080
+1993 70 7236349480
+1994 70 7065985619
+1995 70 6799040388
+1996 70 7281402064
+1997 70 6735307561
+1998 70 4062655575
+1992 71 6978088606
+1993 71 6615095404
+1994 71 6642491845
+1995 71 7135465638
+1996 71 6904578270
+1997 71 6886861519
+1998 71 3971062487
+1992 72 6077239048
+1993 72 6379459453
+1994 72 6452415472
+1995 72 6170313509
+1996 72 5916688379
+1997 72 5963369350
+1998 72 3683718797
+1992 73 6671048755
+1993 73 6565112476
+1994 73 6641285247
+1995 73 6887663633
+1996 73 6439642020
+1997 73 6675192946
+1998 73 3814007830
+1992 74 6999195521
+1993 74 7007686388
+1994 74 6670519880
+1995 74 6744064671
+1996 74 6614217057
+1997 74 6523268368
+1998 74 4023666133
+1992 75 6627416528
+1993 75 6758016664
+1994 75 6751975322
+1995 75 7047693486
+1996 75 6567430366
+1997 75 6781762704
+1998 75 4063152322
+1992 76 6785625804
+1993 76 6930340135
+1994 76 6382873777
+1995 76 6206415993
+1996 76 6805542040
+1997 76 6422414358
+1998 76 4087738859
+1992 77 6848387744
+1993 77 6623249454
+1994 77 6588036917
+1995 77 6589295276
+1996 77 6603676047
+1997 77 6383121125
+1998 77 4063691471
+1992 78 6240883199
+1993 78 6551226256
+1994 78 6647824791
+1995 78 6494311762
+1996 78 6358269587
+1997 78 6349078074
+1998 78 3890548095
+1992 79 6948601533
+1993 79 7058895576
+1994 79 7280306702
+1995 79 7174749606
+1996 79 7134521672
+1997 79 7009756092
+1998 79 4233289127
+Res Count: 280
+Time Taken Total: 24889.7
+{"query":21,"time_query":24889.7}
+==PROF== Disconnected from process 24377
+[24377] crystal_q21@127.0.0.1
+  void build_hashtable_s<(int)128, (int)4>(int *, int *, int, int *, int) (40, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/nsecond         4.79
+    SM Frequency            cycle/usecond       557.22
+    Elapsed Cycles                  cycle         3699
+    Memory Throughput                   %        15.18
+    DRAM Throughput                     %        15.18
+    Duration                      usecond         6.62
+    L1/TEX Cache Throughput             %         8.28
+    L2 Cache Throughput                 %         4.68
+    SM Active Cycles                cycle      2378.93
+    Compute (SM) Throughput             %         4.66
+    ----------------------- ------------- ------------
+
+    OPT   This kernel grid is too small to fill the available resources on this device, resulting in only 0.1 full      
+          waves across all SMs. Look at Launch Statistics for more details.                                             
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 32:1. The kernel achieved 0% of 
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.27
+    Executed Ipc Elapsed  inst/cycle         0.18
+    Issue Slots Busy               %         7.23
+    Issued Ipc Active     inst/cycle         0.29
+    SM Busy                        %         7.23
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 95.4%                                                                                     
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second        46.56
+    Mem Busy                     %         4.68
+    Max Bandwidth                %        15.18
+    L1/TEX Hit Rate              %            0
+    L2 Hit Rate                  %        13.69
+    Mem Pipes Busy               %         2.79
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %         7.08
+    Issued Warp Per Scheduler                        0.07
+    No Eligible                            %        92.92
+    Active Warps Per Scheduler          warp         0.95
+    Eligible Warps Per Scheduler        warp         0.07
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 84.82%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 14.1 cycles. This might leave hardware resources underutilized and may lead to    
+          less optimal performance. Out of the maximum of 8 warps per scheduler, this kernel allocates an average of    
+          0.95 active warps per scheduler, which already limits the scheduler to less than a warp per instruction.      
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        13.47
+    Warp Cycles Per Executed Instruction           cycle        14.22
+    Avg. Active Threads Per Warp                                15.91
+    Avg. Not Predicated Off Threads Per Warp                    15.07
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 2.464%                                                                                          
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 15.9 threads being active per cycle. This is further reduced    
+          to 15.1 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst       162.91
+    Executed Instructions                           inst        26066
+    Avg. Issued Instructions Per Scheduler          inst       171.91
+    Issued Instructions                             inst        27506
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                     40
+    Registers Per Thread             register/thread              20
+    Shared Memory Configuration Size           Kbyte           32.77
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread            5120
+    Waves Per SM                                                0.12
+    -------------------------------- --------------- ---------------
+
+    OPT   If you execute __syncthreads() to synchronize the threads of a block, it is recommended to have more than the 
+          achieved 1 blocks per multiprocessor. This way, blocks that aren't waiting for __syncthreads() can keep the   
+          hardware busy.                                                                                                
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           16
+    Block Limit Registers                 block           21
+    Block Limit Shared Mem                block           16
+    Block Limit Warps                     block            8
+    Theoretical Active Warps per SM        warp           32
+    Theoretical Occupancy                     %          100
+    Achieved Occupancy                        %        12.25
+    Achieved Active Warps Per SM           warp         3.92
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 84.82%                                                                                          
+          The difference between calculated theoretical (100.0%) and measured achieved occupancy (12.2%) can be the     
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.11
+    Branch Instructions              inst         2876
+    Branch Efficiency                   %        58.57
+    Avg. Divergent Branches                       2.93
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 7.978%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 1441 excessive sectors (20% of the total  
+          7183 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations. The  
+          CUDA Programming Guide                                                                                        
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void build_hashtable_p<(int)128, (int)4>(int *, int *, int *, int, int *, int) (1563, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/nsecond         4.57
+    SM Frequency            cycle/usecond       534.54
+    Elapsed Cycles                  cycle        32009
+    Memory Throughput                   %        79.90
+    DRAM Throughput                     %        79.90
+    Duration                      usecond        59.87
+    L1/TEX Cache Throughput             %        27.26
+    L2 Cache Throughput                 %        24.16
+    SM Active Cycles                cycle     30276.90
+    Compute (SM) Throughput             %        19.29
+    ----------------------- ------------- ------------
+
+    OPT   Memory is more heavily utilized than Compute: Look at the Memory Workload Analysis section to identify the    
+          DRAM bottleneck. Check memory replay (coalescing) metrics to make sure you're efficiently utilizing the       
+          bytes transferred. Also consider whether it is possible to do more work per memory access (kernel fusion) or  
+          whether there are values you can (re)compute.                                                                 
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 32:1. The kernel achieved 0% of 
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.81
+    Executed Ipc Elapsed  inst/cycle         0.77
+    Issue Slots Busy               %        20.39
+    Issued Ipc Active     inst/cycle         0.82
+    SM Busy                        %        20.39
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 87.92%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second       233.51
+    Mem Busy                     %        24.16
+    Max Bandwidth                %        79.90
+    L1/TEX Hit Rate              %         0.02
+    L2 Hit Rate                  %         8.79
+    Mem Pipes Busy               %        18.39
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 1.395%                                                                                          
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 1.2 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        18.76
+    Issued Warp Per Scheduler                        0.19
+    No Eligible                            %        81.24
+    Active Warps Per Scheduler          warp         6.71
+    Eligible Warps Per Scheduler        warp         0.25
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 20.1%                                                                                     
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 5.3 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 8 warps per scheduler, this kernel allocates an average of    
+          6.71 active warps per scheduler, but only an average of 0.25 warps were eligible per cycle. Eligible warps    
+          are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible   
+          warp results in no instruction being issued and the issue slot remains unused. To increase the number of      
+          eligible warps, avoid possible load imbalances due to highly different execution durations per warp.          
+          Reducing stalls indicated on the Warp State Statistics and Source Counters sections can help, too.            
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        35.74
+    Warp Cycles Per Executed Instruction           cycle        36.02
+    Avg. Active Threads Per Warp                                15.82
+    Avg. Not Predicated Off Threads Per Warp                    15.65
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 20.1%                                                                                           
+          On average, each warp of this kernel spends 20.1 cycles being stalled waiting for a scoreboard dependency on  
+          a L1TEX (local, global, surface, texture) operation. Find the instruction producing the data being waited     
+          upon to identify the culprit. To reduce the number of cycles waiting on L1TEX data accesses verify the        
+          memory access patterns are optimal for the target architecture, attempt to increase cache hit rates by        
+          increasing data locality (coalescing), or by changing the cache configuration. Consider moving frequently     
+          used data to shared memory. This stall type represents about 56.3% of the total average of 35.7 cycles        
+          between issuing two instructions.                                                                             
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 9.855%                                                                                          
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 15.8 threads being active per cycle. This is further reduced    
+          to 15.7 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst      6125.61
+    Executed Instructions                           inst       980097
+    Avg. Issued Instructions Per Scheduler          inst      6173.99
+    Issued Instructions                             inst       987839
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                   1563
+    Registers Per Thread             register/thread              22
+    Shared Memory Configuration Size           Kbyte           32.77
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread          200064
+    Waves Per SM                                                4.88
+    -------------------------------- --------------- ---------------
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           16
+    Block Limit Registers                 block           21
+    Block Limit Shared Mem                block           16
+    Block Limit Warps                     block            8
+    Theoretical Active Warps per SM        warp           32
+    Theoretical Occupancy                     %          100
+    Achieved Occupancy                        %        91.38
+    Achieved Active Warps Per SM           warp        29.24
+    ------------------------------- ----------- ------------
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.12
+    Branch Instructions              inst       118258
+    Branch Efficiency                   %        72.76
+    Avg. Divergent Branches                      85.17
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 6.204%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 23924 excessive sectors (7% of the total  
+          360344 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations.    
+          The CUDA Programming Guide                                                                                    
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void build_hashtable_d<(int)128, (int)4>(int *, int *, int, int *, int, int) (5, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/nsecond         4.75
+    SM Frequency            cycle/usecond       557.35
+    Elapsed Cycles                  cycle         3247
+    Memory Throughput                   %         3.80
+    DRAM Throughput                     %         3.80
+    Duration                      usecond         5.82
+    L1/TEX Cache Throughput             %        30.56
+    L2 Cache Throughput                 %         2.11
+    SM Active Cycles                cycle       250.85
+    Compute (SM) Throughput             %         0.57
+    ----------------------- ------------- ------------
+
+    OPT   This kernel grid is too small to fill the available resources on this device, resulting in only 0.0 full      
+          waves across all SMs. Look at Launch Statistics for more details.                                             
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 32:1. The kernel achieved 0% of 
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.28
+    Executed Ipc Elapsed  inst/cycle         0.02
+    Issue Slots Busy               %         7.34
+    Issued Ipc Active     inst/cycle         0.29
+    SM Busy                        %         7.34
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 95%                                                                                       
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second        11.55
+    Mem Busy                     %         1.70
+    Max Bandwidth                %         3.80
+    L1/TEX Hit Rate              %         4.61
+    L2 Hit Rate                  %        50.64
+    Mem Pipes Busy               %         0.52
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 0.05669%                                                                                        
+          The memory access pattern for global stores in L1TEX might not be optimal. On average, this kernel accesses   
+          4.0 bytes per thread per memory request; but the address pattern, possibly caused by the stride between       
+          threads, results in 9.1 sectors per request, or 9.1*32 = 290.4 bytes of cache data transfers per request.     
+          The optimal thread address pattern for 4.0 byte accesses would result in 4.0*32 = 128.0 bytes of cache data   
+          transfers per request, to maximize L1TEX cache performance. Check the Source Counters section for             
+          uncoalesced global stores.                                                                                    
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 0.1433%                                                                                         
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 2.8 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %         7.07
+    Issued Warp Per Scheduler                        0.07
+    No Eligible                            %        92.93
+    Active Warps Per Scheduler          warp         0.93
+    Eligible Warps Per Scheduler        warp         0.07
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 92.93%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 14.1 cycles. This might leave hardware resources underutilized and may lead to    
+          less optimal performance. Out of the maximum of 8 warps per scheduler, this kernel allocates an average of    
+          0.93 active warps per scheduler, which already limits the scheduler to less than a warp per instruction.      
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        13.22
+    Warp Cycles Per Executed Instruction           cycle        14.00
+    Avg. Active Threads Per Warp                                31.92
+    Avg. Not Predicated Off Threads Per Warp                    28.58
+    ---------------------------------------- ----------- ------------
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst        17.39
+    Executed Instructions                           inst         2782
+    Avg. Issued Instructions Per Scheduler          inst        18.41
+    Issued Instructions                             inst         2946
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                      5
+    Registers Per Thread             register/thread              26
+    Shared Memory Configuration Size           Kbyte           32.77
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread             640
+    Waves Per SM                                                0.02
+    -------------------------------- --------------- ---------------
+
+    OPT   Est. Speedup: 87.5%                                                                                           
+          The grid for this launch is configured to execute only 5 blocks, which is less than the GPU's 40              
+          multiprocessors. This can underutilize some multiprocessors. If you do not intend to execute this kernel      
+          concurrently with other workloads, consider reducing the block size to have at least one block per            
+          multiprocessor or increase the size of the grid to fully utilize the available hardware resources. See the    
+          Hardware Model (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-hw-model)            
+          description for more details on launch configurations.                                                        
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           16
+    Block Limit Registers                 block           16
+    Block Limit Shared Mem                block           16
+    Block Limit Warps                     block            8
+    Theoretical Active Warps per SM        warp           32
+    Theoretical Occupancy                     %          100
+    Achieved Occupancy                        %        12.18
+    Achieved Active Warps Per SM           warp         3.90
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 87.82%                                                                                          
+          The difference between calculated theoretical (100.0%) and measured achieved occupancy (12.2%) can be the     
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.06
+    Branch Instructions              inst          166
+    Branch Efficiency                   %        97.73
+    Avg. Divergent Branches                       0.01
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 8.801%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 812 excessive sectors (39% of the total   
+          2092 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations. The  
+          CUDA Programming Guide                                                                                        
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void probe<(int)128, (int)4>(int *, int *, int *, int *, int, int *, int, int *, int, int *, int, int *) (117161, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/nsecond         4.99
+    SM Frequency            cycle/usecond       584.58
+    Elapsed Cycles                  cycle      4476549
+    Memory Throughput                   %        96.40
+    DRAM Throughput                     %        96.40
+    Duration                      msecond         7.66
+    L1/TEX Cache Throughput             %        65.76
+    L2 Cache Throughput                 %        40.98
+    SM Active Cycles                cycle   4474138.67
+    Compute (SM) Throughput             %        23.88
+    ----------------------- ------------- ------------
+
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing DRAM in the Memory Workload Analysis section.                                              
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 32:1. The kernel achieved 0% of 
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.96
+    Executed Ipc Elapsed  inst/cycle         0.96
+    Issue Slots Busy               %        23.89
+    Issued Ipc Active     inst/cycle         0.96
+    SM Busy                        %        23.89
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 84.24%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second       308.00
+    Mem Busy                     %        40.98
+    Max Bandwidth                %        96.40
+    L1/TEX Hit Rate              %        18.41
+    L2 Hit Rate                  %        53.82
+    Mem Pipes Busy               %        15.02
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 1.444%                                                                                          
+          The memory access pattern for global loads in L1TEX might not be optimal. On average, this kernel accesses    
+          4.8 bytes per thread per memory request; but the address pattern, possibly caused by the stride between       
+          threads, results in 8.3 sectors per request, or 8.3*32 = 266.7 bytes of cache data transfers per request.     
+          The optimal thread address pattern for 4.8 byte accesses would result in 4.8*32 = 155.0 bytes of cache data   
+          transfers per request, to maximize L1TEX cache performance. Check the Source Counters section for             
+          uncoalesced global loads.                                                                                     
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 26.48%                                                                                          
+          The memory access pattern for loads from L1TEX to L2 is not optimal. The granularity of an L1TEX request to   
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 1.4 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced loads and try to minimize how many cache lines need to be accessed per memory         
+          request.                                                                                                      
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 0.2164%                                                                                         
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 1.0 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 45.09%                                                                                          
+          The memory access pattern for loads from device memory causes 49,240,820 sectors to be read from DRAM, which  
+          is 1.2x of the 39,614,628 sectors which cause a miss in the L2 cache. The DRAM fetch granularity for read     
+          misses in L2 is 64 bytes, i.e. the lower or upper half of an L2 cache line. Try changing your access pattern  
+          to make use of both sectors returned by a DRAM read request for optimal usage of the DRAM throughput. For     
+          strided memory reads, avoid strides of 64 bytes or larger to avoid moving unused sectors from DRAM to L2.     
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        23.88
+    Issued Warp Per Scheduler                        0.24
+    No Eligible                            %        76.12
+    Active Warps Per Scheduler          warp         7.13
+    Eligible Warps Per Scheduler        warp         0.32
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 3.604%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 4.2 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 8 warps per scheduler, this kernel allocates an average of    
+          7.13 active warps per scheduler, but only an average of 0.32 warps were eligible per cycle. Eligible warps    
+          are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible   
+          warp results in no instruction being issued and the issue slot remains unused. To increase the number of      
+          eligible warps, avoid possible load imbalances due to highly different execution durations per warp.          
+          Reducing stalls indicated on the Warp State Statistics and Source Counters sections can help, too.            
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        29.87
+    Warp Cycles Per Executed Instruction           cycle        29.87
+    Avg. Active Threads Per Warp                                20.29
+    Avg. Not Predicated Off Threads Per Warp                    18.72
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 3.604%                                                                                          
+          On average, each warp of this kernel spends 20.7 cycles being stalled waiting for a scoreboard dependency on  
+          a L1TEX (local, global, surface, texture) operation. Find the instruction producing the data being waited     
+          upon to identify the culprit. To reduce the number of cycles waiting on L1TEX data accesses verify the        
+          memory access patterns are optimal for the target architecture, attempt to increase cache hit rates by        
+          increasing data locality (coalescing), or by changing the cache configuration. Consider moving frequently     
+          used data to shared memory. This stall type represents about 69.5% of the total average of 29.9 cycles        
+          between issuing two instructions.                                                                             
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 9.908%                                                                                          
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 20.3 threads being active per cycle. This is further reduced    
+          to 18.7 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst   1068929.84
+    Executed Instructions                           inst    171028774
+    Avg. Issued Instructions Per Scheduler          inst   1068981.37
+    Issued Instructions                             inst    171037019
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                 117161
+    Registers Per Thread             register/thread              26
+    Shared Memory Configuration Size           Kbyte           32.77
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread        14996608
+    Waves Per SM                                              366.13
+    -------------------------------- --------------- ---------------
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           16
+    Block Limit Registers                 block           16
+    Block Limit Shared Mem                block           16
+    Block Limit Warps                     block            8
+    Theoretical Active Warps per SM        warp           32
+    Theoretical Occupancy                     %          100
+    Achieved Occupancy                        %        89.35
+    Achieved Active Warps Per SM           warp        28.59
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 3.604%                                                                                          
+          The difference between calculated theoretical (100.0%) and measured achieved occupancy (89.4%) can be the     
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.11
+    Branch Instructions              inst     19141534
+    Branch Efficiency                   %        68.81
+    Avg. Divergent Branches                   16445.46
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 58.56%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 60857602 excessive sectors (59% of the    
+          total 103860683 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source      
+          locations. The CUDA Programming Guide                                                                         
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
diff --git a/results/T4/crystal/crystal_q31_sf10.txt b/results/T4/crystal/crystal_q31_sf10.txt
new file mode 100644
index 0000000..857d98e
--- /dev/null
+++ b/results/T4/crystal/crystal_q31_sf10.txt
@@ -0,0 +1,906 @@
+==PROF== Connected to process 24967 (/home/ubuntu/fff/cmake-build-release-g4dn/gpu/crystal/src/crystal_q31)
+Using device 0: Tesla T4 (PTX version 750, SM750, 40 SMs, 14802 free / 14929 total MB physmem, 320.064 GB/s @ 5001000 kHz mem clock, ECC on)
+** LOADED DATA **
+** LOADED DATA TO GPU **
+==PROF== Profiling "build_hashtable_s" - 0: 0%....50%....100% - 35 passes
+==PROF== Profiling "build_hashtable_c" - 1: 0%....50%....100% - 35 passes
+==PROF== Profiling "build_hashtable_d" - 2: 0%....50%....100% - 35 passes
+==PROF== Profiling "probe" - 3: 0%....50%....100% - 35 passes
+Result:
+1992 8 8 53664098547
+1993 8 8 53051563726
+1994 8 8 53551966681
+1995 8 8 53338395993
+1996 8 8 53781237952
+1997 8 8 53558132271
+1992 9 8 55867859815
+1993 9 8 55345162638
+1994 9 8 55589883121
+1995 9 8 54871630692
+1996 9 8 55620205618
+1997 9 8 54852742519
+1992 12 8 52867359425
+1993 12 8 53435367523
+1994 12 8 52283824959
+1995 12 8 52956472988
+1996 12 8 52948768521
+1997 12 8 52962165616
+1992 18 8 53592522758
+1993 18 8 52996000810
+1994 18 8 52962120320
+1995 18 8 53924104344
+1996 18 8 53634737856
+1997 18 8 54307983851
+1992 21 8 53816502394
+1993 21 8 54349264842
+1994 21 8 54119359035
+1995 21 8 53961984627
+1996 21 8 54294333705
+1997 21 8 53703384515
+1992 8 9 55444214883
+1993 8 9 55740793389
+1994 8 9 55137400588
+1995 8 9 55784172640
+1996 8 9 56378453713
+1997 8 9 55399009353
+1992 9 9 57271740148
+1993 9 9 58216495642
+1994 9 9 57507217082
+1995 9 9 57860170696
+1996 9 9 58662284841
+1997 9 9 56940173344
+1992 12 9 55432874858
+1993 12 9 55398755151
+1994 12 9 55206960389
+1995 12 9 55581754250
+1996 12 9 55487324569
+1997 12 9 53582297974
+1992 18 9 56370007920
+1993 18 9 56166403334
+1994 18 9 55432079732
+1995 18 9 55973419507
+1996 18 9 56254723722
+1997 18 9 55830709236
+1992 21 9 56359335068
+1993 21 9 56885558074
+1994 21 9 56507097670
+1995 21 9 57465742525
+1996 21 9 56177166557
+1997 21 9 56333135444
+1992 8 12 51295873912
+1993 8 12 52384079867
+1994 8 12 52254716872
+1995 8 12 51669051730
+1996 8 12 52670597733
+1997 8 12 53782563068
+1992 9 12 54255769995
+1993 9 12 53477912258
+1994 9 12 53868848846
+1995 9 12 54310027205
+1996 9 12 55409865859
+1997 9 12 54099065304
+1992 12 12 52584065821
+1993 12 12 52637339531
+1994 12 12 50154194273
+1995 12 12 51904425056
+1996 12 12 52493537142
+1997 12 12 50634790895
+1992 18 12 52896145835
+1993 18 12 53112435531
+1994 18 12 52021625515
+1995 18 12 52031180987
+1996 18 12 53022298730
+1997 18 12 53294469049
+1992 21 12 53284643553
+1993 21 12 53900783410
+1994 21 12 53648011682
+1995 21 12 53376554374
+1996 21 12 52174060166
+1997 21 12 52785883863
+1992 8 18 51873441494
+1993 8 18 51961213538
+1994 8 18 52868608376
+1995 8 18 52738284867
+1996 8 18 51678789303
+1997 8 18 51787339279
+1992 9 18 53893325353
+1993 9 18 54178339670
+1994 9 18 54059232642
+1995 9 18 53920766480
+1996 9 18 54128092218
+1997 9 18 54349079982
+1992 12 18 51449505308
+1993 12 18 51384752707
+1994 12 18 52195482938
+1995 12 18 51205040497
+1996 12 18 51165908280
+1997 12 18 52167794260
+1992 18 18 53246367726
+1993 18 18 52211194809
+1994 18 18 52388807873
+1995 18 18 52459889035
+1996 18 18 53737304610
+1997 18 18 52772297391
+1992 21 18 53752784633
+1993 21 18 53723459056
+1994 21 18 52734575706
+1995 21 18 52810670641
+1996 21 18 53606892262
+1997 21 18 52841307001
+1992 8 21 49589186930
+1993 8 21 50874540178
+1994 8 21 50484052905
+1995 8 21 50476123376
+1996 8 21 51102099810
+1997 8 21 51376581082
+1992 9 21 51183086614
+1993 9 21 51849557513
+1994 9 21 51912335762
+1995 9 21 51737313715
+1996 9 21 52987320706
+1997 9 21 51870436294
+1992 12 21 49502367103
+1993 12 21 49962826767
+1994 12 21 50112754286
+1995 12 21 48732674673
+1996 12 21 50123146827
+1997 12 21 49094088315
+1992 18 21 50957655153
+1993 18 21 50627753769
+1994 18 21 50537890156
+1995 18 21 50265160335
+1996 18 21 50774431442
+1997 18 21 51103107061
+1992 21 21 49934446612
+1993 21 21 51562382531
+1994 21 21 50180119681
+1995 21 21 51221558310
+1996 21 21 50423672514
+1997 21 21 50461561884
+Res Count: 150
+Time Taken Total: 24941.5
+{"query":31,"time_query":24941.5}
+==PROF== Disconnected from process 24967
+[24967] crystal_q31@127.0.0.1
+  void build_hashtable_s<(int)128, (int)4>(int *, int *, int *, int, int *, int) (40, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/nsecond         4.63
+    SM Frequency            cycle/usecond       541.52
+    Elapsed Cycles                  cycle         3938
+    Memory Throughput                   %        22.18
+    DRAM Throughput                     %        22.18
+    Duration                      usecond         7.26
+    L1/TEX Cache Throughput             %        15.93
+    L2 Cache Throughput                 %         7.83
+    SM Active Cycles                cycle      2644.45
+    Compute (SM) Throughput             %         4.96
+    ----------------------- ------------- ------------
+
+    OPT   This kernel grid is too small to fill the available resources on this device, resulting in only 0.1 full      
+          waves across all SMs. Look at Launch Statistics for more details.                                             
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 32:1. The kernel achieved 0% of 
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.28
+    Executed Ipc Elapsed  inst/cycle         0.19
+    Issue Slots Busy               %         7.37
+    Issued Ipc Active     inst/cycle         0.29
+    SM Busy                        %         7.37
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 95.41%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second        65.78
+    Mem Busy                     %         7.83
+    Max Bandwidth                %        22.18
+    L1/TEX Hit Rate              %         1.02
+    L2 Hit Rate                  %        28.19
+    Mem Pipes Busy               %         4.22
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 0.0902%                                                                                         
+          The memory access pattern for global stores in L1TEX might not be optimal. On average, this kernel accesses   
+          4.0 bytes per thread per memory request; but the address pattern, possibly caused by the stride between       
+          threads, results in 4.8 sectors per request, or 4.8*32 = 153.8 bytes of cache data transfers per request.     
+          The optimal thread address pattern for 4.0 byte accesses would result in 4.0*32 = 128.0 bytes of cache data   
+          transfers per request, to maximize L1TEX cache performance. Check the Source Counters section for             
+          uncoalesced global stores.                                                                                    
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 0.6728%                                                                                         
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 2.3 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %         7.15
+    Issued Warp Per Scheduler                        0.07
+    No Eligible                            %        92.85
+    Active Warps Per Scheduler          warp         0.94
+    Eligible Warps Per Scheduler        warp         0.07
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 77.82%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 14.0 cycles. This might leave hardware resources underutilized and may lead to    
+          less optimal performance. Out of the maximum of 8 warps per scheduler, this kernel allocates an average of    
+          0.94 active warps per scheduler, which already limits the scheduler to less than a warp per instruction.      
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        13.21
+    Warp Cycles Per Executed Instruction           cycle        13.85
+    Avg. Active Threads Per Warp                                16.44
+    Avg. Not Predicated Off Threads Per Warp                    15.71
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 2.523%                                                                                          
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 16.4 threads being active per cycle. This is further reduced    
+          to 15.7 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst       185.97
+    Executed Instructions                           inst        29755
+    Avg. Issued Instructions Per Scheduler          inst       194.97
+    Issued Instructions                             inst        31195
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                     40
+    Registers Per Thread             register/thread              22
+    Shared Memory Configuration Size           Kbyte           32.77
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread            5120
+    Waves Per SM                                                0.12
+    -------------------------------- --------------- ---------------
+
+    OPT   If you execute __syncthreads() to synchronize the threads of a block, it is recommended to have more than the 
+          achieved 1 blocks per multiprocessor. This way, blocks that aren't waiting for __syncthreads() can keep the   
+          hardware busy.                                                                                                
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           16
+    Block Limit Registers                 block           21
+    Block Limit Shared Mem                block           16
+    Block Limit Warps                     block            8
+    Theoretical Active Warps per SM        warp           32
+    Theoretical Occupancy                     %          100
+    Achieved Occupancy                        %        12.21
+    Achieved Active Warps Per SM           warp         3.91
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 77.82%                                                                                          
+          The difference between calculated theoretical (100.0%) and measured achieved occupancy (12.2%) can be the     
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.11
+    Branch Instructions              inst         3201
+    Branch Efficiency                   %        63.81
+    Avg. Divergent Branches                       2.93
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 15.44%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 4534 excessive sectors (34% of the total  
+          13506 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations.     
+          The CUDA Programming Guide                                                                                    
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void build_hashtable_c<(int)128, (int)4>(int *, int *, int *, int, int *, int) (586, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/nsecond         4.64
+    SM Frequency            cycle/usecond       543.99
+    Elapsed Cycles                  cycle        18058
+    Memory Throughput                   %        74.90
+    DRAM Throughput                     %        74.90
+    Duration                      usecond        33.18
+    L1/TEX Cache Throughput             %        53.71
+    L2 Cache Throughput                 %        24.22
+    SM Active Cycles                cycle     15998.55
+    Compute (SM) Throughput             %        15.52
+    ----------------------- ------------- ------------
+
+    OPT   Memory is more heavily utilized than Compute: Look at the Memory Workload Analysis section to identify the    
+          DRAM bottleneck. Check memory replay (coalescing) metrics to make sure you're efficiently utilizing the       
+          bytes transferred. Also consider whether it is possible to do more work per memory access (kernel fusion) or  
+          whether there are values you can (re)compute.                                                                 
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 32:1. The kernel achieved 0% of 
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.69
+    Executed Ipc Elapsed  inst/cycle         0.61
+    Issue Slots Busy               %        17.52
+    Issued Ipc Active     inst/cycle         0.70
+    SM Busy                        %        17.52
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 88.65%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second       222.62
+    Mem Busy                     %        24.22
+    Max Bandwidth                %        74.90
+    L1/TEX Hit Rate              %         0.24
+    L2 Hit Rate                  %        23.65
+    Mem Pipes Busy               %        13.63
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 0.3067%                                                                                         
+          The memory access pattern for global stores in L1TEX might not be optimal. On average, this kernel accesses   
+          4.0 bytes per thread per memory request; but the address pattern, possibly caused by the stride between       
+          threads, results in 4.8 sectors per request, or 4.8*32 = 155.0 bytes of cache data transfers per request.     
+          The optimal thread address pattern for 4.0 byte accesses would result in 4.0*32 = 128.0 bytes of cache data   
+          transfers per request, to maximize L1TEX cache performance. Check the Source Counters section for             
+          uncoalesced global stores.                                                                                    
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 2.307%                                                                                          
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 2.3 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        15.98
+    Issued Warp Per Scheduler                        0.16
+    No Eligible                            %        84.02
+    Active Warps Per Scheduler          warp         6.06
+    Eligible Warps Per Scheduler        warp         0.23
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 25.1%                                                                                     
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 6.3 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 8 warps per scheduler, this kernel allocates an average of    
+          6.06 active warps per scheduler, but only an average of 0.23 warps were eligible per cycle. Eligible warps    
+          are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible   
+          warp results in no instruction being issued and the issue slot remains unused. To increase the number of      
+          eligible warps, reduce the time the active warps are stalled by inspecting the top stall reasons on the Warp  
+          State Statistics and Source Counters sections.                                                                
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        37.89
+    Warp Cycles Per Executed Instruction           cycle        38.55
+    Avg. Active Threads Per Warp                                16.28
+    Avg. Not Predicated Off Threads Per Warp                    15.59
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 25.1%                                                                                           
+          On average, each warp of this kernel spends 16.1 cycles being stalled waiting for a scoreboard dependency on  
+          a L1TEX (local, global, surface, texture) operation. Find the instruction producing the data being waited     
+          upon to identify the culprit. To reduce the number of cycles waiting on L1TEX data accesses verify the        
+          memory access patterns are optimal for the target architecture, attempt to increase cache hit rates by        
+          increasing data locality (coalescing), or by changing the cache configuration. Consider moving frequently     
+          used data to shared memory. This stall type represents about 42.4% of the total average of 37.9 cycles        
+          between issuing two instructions.                                                                             
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 7.958%                                                                                          
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 16.3 threads being active per cycle. This is further reduced    
+          to 15.6 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst      2754.01
+    Executed Instructions                           inst       440642
+    Avg. Issued Instructions Per Scheduler          inst      2802.24
+    Issued Instructions                             inst       448359
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                    586
+    Registers Per Thread             register/thread              22
+    Shared Memory Configuration Size           Kbyte           32.77
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread           75008
+    Waves Per SM                                                1.83
+    -------------------------------- --------------- ---------------
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           16
+    Block Limit Registers                 block           21
+    Block Limit Shared Mem                block           16
+    Block Limit Warps                     block            8
+    Theoretical Active Warps per SM        warp           32
+    Theoretical Occupancy                     %          100
+    Achieved Occupancy                        %        82.88
+    Achieved Active Warps Per SM           warp        26.52
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 17.12%                                                                                          
+          The difference between calculated theoretical (100.0%) and measured achieved occupancy (82.9%) can be the     
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.11
+    Branch Instructions              inst        46902
+    Branch Efficiency                   %        62.57
+    Avg. Divergent Branches                      43.93
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 29.42%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 68642 excessive sectors (34% of the total 
+          203290 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations.    
+          The CUDA Programming Guide                                                                                    
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void build_hashtable_d<(int)128, (int)4>(int *, int *, int, int *, int, int) (5, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/nsecond         4.68
+    SM Frequency            cycle/usecond       549.33
+    Elapsed Cycles                  cycle         3604
+    Memory Throughput                   %         3.14
+    DRAM Throughput                     %         3.14
+    Duration                      usecond         6.56
+    L1/TEX Cache Throughput             %        23.36
+    L2 Cache Throughput                 %         1.68
+    SM Active Cycles                cycle       290.40
+    Compute (SM) Throughput             %         0.67
+    ----------------------- ------------- ------------
+
+    OPT   This kernel grid is too small to fill the available resources on this device, resulting in only 0.0 full      
+          waves across all SMs. Look at Launch Statistics for more details.                                             
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 32:1. The kernel achieved 0% of 
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.32
+    Executed Ipc Elapsed  inst/cycle         0.03
+    Issue Slots Busy               %         8.37
+    Issued Ipc Active     inst/cycle         0.33
+    SM Busy                        %         8.37
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 94.63%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second         9.40
+    Mem Busy                     %         1.41
+    Max Bandwidth                %         3.14
+    L1/TEX Hit Rate              %         2.35
+    L2 Hit Rate                  %        61.08
+    Mem Pipes Busy               %         0.44
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 0.04623%                                                                                        
+          The memory access pattern for global stores in L1TEX might not be optimal. On average, this kernel accesses   
+          4.0 bytes per thread per memory request; but the address pattern, possibly caused by the stride between       
+          threads, results in 9.0 sectors per request, or 9.0*32 = 289.4 bytes of cache data transfers per request.     
+          The optimal thread address pattern for 4.0 byte accesses would result in 4.0*32 = 128.0 bytes of cache data   
+          transfers per request, to maximize L1TEX cache performance. Check the Source Counters section for             
+          uncoalesced global stores.                                                                                    
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 0.1381%                                                                                         
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 2.5 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %         8.12
+    Issued Warp Per Scheduler                        0.08
+    No Eligible                            %        91.88
+    Active Warps Per Scheduler          warp         0.94
+    Eligible Warps Per Scheduler        warp         0.08
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 91.88%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 12.3 cycles. This might leave hardware resources underutilized and may lead to    
+          less optimal performance. Out of the maximum of 8 warps per scheduler, this kernel allocates an average of    
+          0.94 active warps per scheduler, which already limits the scheduler to less than a warp per instruction.      
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        11.60
+    Warp Cycles Per Executed Instruction           cycle        12.17
+    Avg. Active Threads Per Warp                                31.82
+    Avg. Not Predicated Off Threads Per Warp                    28.46
+    ---------------------------------------- ----------- ------------
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst        23.18
+    Executed Instructions                           inst         3709
+    Avg. Issued Instructions Per Scheduler          inst        24.31
+    Issued Instructions                             inst         3889
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                      5
+    Registers Per Thread             register/thread              20
+    Shared Memory Configuration Size           Kbyte           32.77
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread             640
+    Waves Per SM                                                0.02
+    -------------------------------- --------------- ---------------
+
+    OPT   Est. Speedup: 87.5%                                                                                           
+          The grid for this launch is configured to execute only 5 blocks, which is less than the GPU's 40              
+          multiprocessors. This can underutilize some multiprocessors. If you do not intend to execute this kernel      
+          concurrently with other workloads, consider reducing the block size to have at least one block per            
+          multiprocessor or increase the size of the grid to fully utilize the available hardware resources. See the    
+          Hardware Model (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-hw-model)            
+          description for more details on launch configurations.                                                        
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           16
+    Block Limit Registers                 block           21
+    Block Limit Shared Mem                block           16
+    Block Limit Warps                     block            8
+    Theoretical Active Warps per SM        warp           32
+    Theoretical Occupancy                     %          100
+    Achieved Occupancy                        %        12.19
+    Achieved Active Warps Per SM           warp         3.90
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 87.81%                                                                                          
+          The difference between calculated theoretical (100.0%) and measured achieved occupancy (12.2%) can be the     
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.10
+    Branch Instructions              inst          373
+    Branch Efficiency                   %        97.40
+    Avg. Divergent Branches                       0.03
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 7.243%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 700 excessive sectors (37% of the total   
+          1888 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations. The  
+          CUDA Programming Guide                                                                                        
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void probe<(int)128, (int)4>(int *, int *, int *, int *, int, int *, int, int *, int, int *, int, int *) (117161, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/nsecond         4.99
+    SM Frequency            cycle/usecond       584.73
+    Elapsed Cycles                  cycle      4296541
+    Memory Throughput                   %        76.90
+    DRAM Throughput                     %        76.90
+    Duration                      msecond         7.35
+    L1/TEX Cache Throughput             %        87.52
+    L2 Cache Throughput                 %        48.70
+    SM Active Cycles                cycle   4295603.38
+    Compute (SM) Throughput             %        30.92
+    ----------------------- ------------- ------------
+
+    OPT   Memory is more heavily utilized than Compute: Look at the Memory Workload Analysis section to identify the    
+          DRAM bottleneck. Check memory replay (coalescing) metrics to make sure you're efficiently utilizing the       
+          bytes transferred. Also consider whether it is possible to do more work per memory access (kernel fusion) or  
+          whether there are values you can (re)compute.                                                                 
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 32:1. The kernel achieved 0% of 
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         1.24
+    Executed Ipc Elapsed  inst/cycle         1.24
+    Issue Slots Busy               %        30.93
+    Issued Ipc Active     inst/cycle         1.24
+    SM Busy                        %        30.93
+    -------------------- ----------- ------------
+
+    INF   FMA is the highest-utilized pipeline (21.6%) based on active cycles, taking into account the rates of its     
+          different instructions. It executes 32-bit floating point (FADD, FMUL, FMAD, ...) and integer (IMUL, IMAD)    
+          operations. It is well-utilized, but should not be a bottleneck.                                              
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second       245.76
+    Mem Busy                     %        48.70
+    Max Bandwidth                %        76.90
+    L1/TEX Hit Rate              %        11.71
+    L2 Hit Rate                  %        65.13
+    Mem Pipes Busy               %        20.07
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 1.092%                                                                                          
+          The memory access pattern for global loads in L1TEX might not be optimal. On average, this kernel accesses    
+          5.4 bytes per thread per memory request; but the address pattern, possibly caused by the stride between       
+          threads, results in 7.7 sectors per request, or 7.7*32 = 245.0 bytes of cache data transfers per request.     
+          The optimal thread address pattern for 5.4 byte accesses would result in 5.4*32 = 173.8 bytes of cache data   
+          transfers per request, to maximize L1TEX cache performance. Check the Source Counters section for             
+          uncoalesced global loads.                                                                                     
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 30.49%                                                                                          
+          The memory access pattern for loads from L1TEX to L2 is not optimal. The granularity of an L1TEX request to   
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 1.3 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced loads and try to minimize how many cache lines need to be accessed per memory         
+          request.                                                                                                      
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 1.332%                                                                                          
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 1.1 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 28.52%                                                                                          
+          The memory access pattern for loads from device memory causes 38,230,266 sectors to be read from DRAM, which  
+          is 1.1x of the 34,106,093 sectors which cause a miss in the L2 cache. The DRAM fetch granularity for read     
+          misses in L2 is 64 bytes, i.e. the lower or upper half of an L2 cache line. Try changing your access pattern  
+          to make use of both sectors returned by a DRAM read request for optimal usage of the DRAM throughput. For     
+          strided memory reads, avoid strides of 64 bytes or larger to avoid moving unused sectors from DRAM to L2.     
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        30.93
+    Issued Warp Per Scheduler                        0.31
+    No Eligible                            %        69.07
+    Active Warps Per Scheduler          warp         7.22
+    Eligible Warps Per Scheduler        warp         0.46
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 23.1%                                                                                     
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 3.2 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 8 warps per scheduler, this kernel allocates an average of    
+          7.22 active warps per scheduler, but only an average of 0.46 warps were eligible per cycle. Eligible warps    
+          are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible   
+          warp results in no instruction being issued and the issue slot remains unused. To increase the number of      
+          eligible warps, avoid possible load imbalances due to highly different execution durations per warp.          
+          Reducing stalls indicated on the Warp State Statistics and Source Counters sections can help, too.            
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        23.35
+    Warp Cycles Per Executed Instruction           cycle        23.35
+    Avg. Active Threads Per Warp                                17.81
+    Avg. Not Predicated Off Threads Per Warp                    16.41
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 23.1%                                                                                           
+          On average, each warp of this kernel spends 12.1 cycles being stalled waiting for a scoreboard dependency on  
+          a L1TEX (local, global, surface, texture) operation. Find the instruction producing the data being waited     
+          upon to identify the culprit. To reduce the number of cycles waiting on L1TEX data accesses verify the        
+          memory access patterns are optimal for the target architecture, attempt to increase cache hit rates by        
+          increasing data locality (coalescing), or by changing the cache configuration. Consider moving frequently     
+          used data to shared memory. This stall type represents about 51.9% of the total average of 23.3 cycles        
+          between issuing two instructions.                                                                             
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 15.06%                                                                                          
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 17.8 threads being active per cycle. This is further reduced    
+          to 16.4 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst   1328610.24
+    Executed Instructions                           inst    212577639
+    Avg. Issued Instructions Per Scheduler          inst   1328659.56
+    Issued Instructions                             inst    212585530
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                 117161
+    Registers Per Thread             register/thread              31
+    Shared Memory Configuration Size           Kbyte           32.77
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread        14996608
+    Waves Per SM                                              366.13
+    -------------------------------- --------------- ---------------
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           16
+    Block Limit Registers                 block           16
+    Block Limit Shared Mem                block           16
+    Block Limit Warps                     block            8
+    Theoretical Active Warps per SM        warp           32
+    Theoretical Occupancy                     %          100
+    Achieved Occupancy                        %        90.44
+    Achieved Active Warps Per SM           warp        28.94
+    ------------------------------- ----------- ------------
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.10
+    Branch Instructions              inst     20529640
+    Branch Efficiency                   %        54.31
+    Avg. Divergent Branches                   24087.53
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 49.71%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 53746246 excessive sectors (50% of the    
+          total 108085518 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source      
+          locations. The CUDA Programming Guide                                                                         
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
diff --git a/results/T4/crystal/crystal_q41_sf10.txt b/results/T4/crystal/crystal_q41_sf10.txt
new file mode 100644
index 0000000..cad4822
--- /dev/null
+++ b/results/T4/crystal/crystal_q41_sf10.txt
@@ -0,0 +1,951 @@
+==PROF== Connected to process 25421 (/home/ubuntu/fff/cmake-build-release-g4dn/gpu/crystal/src/crystal_q41)
+Using device 0: Tesla T4 (PTX version 750, SM750, 40 SMs, 14802 free / 14929 total MB physmem, 320.064 GB/s @ 5001000 kHz mem clock, ECC on)
+** LOADED DATA **
+** LOADED DATA TO GPU **
+==PROF== Profiling "build_hashtable_s" - 0: 0%....50%....100% - 35 passes
+==PROF== Profiling "build_hashtable_c" - 1: 0%....50%....100% - 35 passes
+==PROF== Profiling "build_hashtable_p" - 2: 0%....50%....100% - 35 passes
+==PROF== Profiling "build_hashtable_d" - 3: 0%....50%....100% - 35 passes
+==PROF== Profiling "probe" - 4: 0%....50%....100% - 35 passes
+Result:
+1992 1 103225040658
+1993 1 105193302842
+1994 1 103837804124
+1995 1 103659981621
+1996 1 103616722233
+1997 1 103157005314
+1998 1 61159340206
+1992 2 106678259181
+1993 2 105849020253
+1994 2 106216978529
+1995 2 107035371791
+1996 2 105292362331
+1997 2 105381211263
+1998 2 61616122837
+1992 3 106953585129
+1993 3 106242432020
+1994 3 105405953212
+1995 3 106496045663
+1996 3 106452120723
+1997 3 106618275297
+1998 3 61766210322
+1992 17 103623138817
+1993 17 104974876956
+1994 17 103731557899
+1995 17 103730419480
+1996 17 104874194133
+1997 17 102847514868
+1998 17 61002354487
+1992 24 106223564390
+1993 24 105649036141
+1994 24 106076726307
+1995 24 105177111217
+1996 24 103976579696
+1997 24 104638539353
+1998 24 60962148771
+Res Count: 35
+Time Taken Total: 34747.7
+{"query":41,"time_query":34747.7}
+==PROF== Disconnected from process 25421
+[25421] crystal_q41@127.0.0.1
+  void build_hashtable_s<(int)128, (int)4>(int *, int *, int, int *, int) (40, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/nsecond         4.71
+    SM Frequency            cycle/usecond       553.17
+    Elapsed Cycles                  cycle         3672
+    Memory Throughput                   %        15.36
+    DRAM Throughput                     %        15.36
+    Duration                      usecond         6.62
+    L1/TEX Cache Throughput             %         8.34
+    L2 Cache Throughput                 %         4.70
+    SM Active Cycles                cycle      2373.12
+    Compute (SM) Throughput             %         4.69
+    ----------------------- ------------- ------------
+
+    OPT   This kernel grid is too small to fill the available resources on this device, resulting in only 0.1 full      
+          waves across all SMs. Look at Launch Statistics for more details.                                             
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 32:1. The kernel achieved 0% of 
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.27
+    Executed Ipc Elapsed  inst/cycle         0.18
+    Issue Slots Busy               %         7.24
+    Issued Ipc Active     inst/cycle         0.29
+    SM Busy                        %         7.24
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 95.39%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second        46.34
+    Mem Busy                     %         4.70
+    Max Bandwidth                %        15.36
+    L1/TEX Hit Rate              %            0
+    L2 Hit Rate                  %        13.50
+    Mem Pipes Busy               %         2.81
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %         7.09
+    Issued Warp Per Scheduler                        0.07
+    No Eligible                            %        92.91
+    Active Warps Per Scheduler          warp         0.95
+    Eligible Warps Per Scheduler        warp         0.07
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 84.64%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 14.1 cycles. This might leave hardware resources underutilized and may lead to    
+          less optimal performance. Out of the maximum of 8 warps per scheduler, this kernel allocates an average of    
+          0.95 active warps per scheduler, which already limits the scheduler to less than a warp per instruction.      
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        13.40
+    Warp Cycles Per Executed Instruction           cycle        14.14
+    Avg. Active Threads Per Warp                                15.91
+    Avg. Not Predicated Off Threads Per Warp                    15.07
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 2.482%                                                                                          
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 15.9 threads being active per cycle. This is further reduced    
+          to 15.1 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst       162.91
+    Executed Instructions                           inst        26066
+    Avg. Issued Instructions Per Scheduler          inst       171.91
+    Issued Instructions                             inst        27506
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                     40
+    Registers Per Thread             register/thread              20
+    Shared Memory Configuration Size           Kbyte           32.77
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread            5120
+    Waves Per SM                                                0.12
+    -------------------------------- --------------- ---------------
+
+    OPT   If you execute __syncthreads() to synchronize the threads of a block, it is recommended to have more than the 
+          achieved 1 blocks per multiprocessor. This way, blocks that aren't waiting for __syncthreads() can keep the   
+          hardware busy.                                                                                                
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           16
+    Block Limit Registers                 block           21
+    Block Limit Shared Mem                block           16
+    Block Limit Warps                     block            8
+    Theoretical Active Warps per SM        warp           32
+    Theoretical Occupancy                     %          100
+    Achieved Occupancy                        %        12.27
+    Achieved Active Warps Per SM           warp         3.92
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 84.64%                                                                                          
+          The difference between calculated theoretical (100.0%) and measured achieved occupancy (12.3%) can be the     
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.11
+    Branch Instructions              inst         2876
+    Branch Efficiency                   %        58.57
+    Avg. Divergent Branches                       2.93
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 8.017%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 1441 excessive sectors (20% of the total  
+          7183 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations. The  
+          CUDA Programming Guide                                                                                        
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void build_hashtable_c<(int)128, (int)4>(int *, int *, int *, int, int *, int) (586, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/nsecond         4.63
+    SM Frequency            cycle/usecond       542.35
+    Elapsed Cycles                  cycle        18159
+    Memory Throughput                   %        74.19
+    DRAM Throughput                     %        74.19
+    Duration                      usecond        33.47
+    L1/TEX Cache Throughput             %        53.01
+    L2 Cache Throughput                 %        24.00
+    SM Active Cycles                cycle     15984.70
+    Compute (SM) Throughput             %        15.43
+    ----------------------- ------------- ------------
+
+    OPT   Memory is more heavily utilized than Compute: Look at the Memory Workload Analysis section to identify the    
+          DRAM bottleneck. Check memory replay (coalescing) metrics to make sure you're efficiently utilizing the       
+          bytes transferred. Also consider whether it is possible to do more work per memory access (kernel fusion) or  
+          whether there are values you can (re)compute.                                                                 
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 32:1. The kernel achieved 0% of 
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.69
+    Executed Ipc Elapsed  inst/cycle         0.61
+    Issue Slots Busy               %        17.52
+    Issued Ipc Active     inst/cycle         0.70
+    SM Busy                        %        17.52
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 88.65%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second       220.06
+    Mem Busy                     %        24.00
+    Max Bandwidth                %        74.19
+    L1/TEX Hit Rate              %         0.21
+    L2 Hit Rate                  %        23.56
+    Mem Pipes Busy               %        13.55
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 0.2943%                                                                                         
+          The memory access pattern for global stores in L1TEX might not be optimal. On average, this kernel accesses   
+          4.0 bytes per thread per memory request; but the address pattern, possibly caused by the stride between       
+          threads, results in 4.8 sectors per request, or 4.8*32 = 154.0 bytes of cache data transfers per request.     
+          The optimal thread address pattern for 4.0 byte accesses would result in 4.0*32 = 128.0 bytes of cache data   
+          transfers per request, to maximize L1TEX cache performance. Check the Source Counters section for             
+          uncoalesced global stores.                                                                                    
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 2.275%                                                                                          
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 2.3 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        15.83
+    Issued Warp Per Scheduler                        0.16
+    No Eligible                            %        84.17
+    Active Warps Per Scheduler          warp         6.01
+    Eligible Warps Per Scheduler        warp         0.22
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 25.81%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 6.3 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 8 warps per scheduler, this kernel allocates an average of    
+          6.01 active warps per scheduler, but only an average of 0.22 warps were eligible per cycle. Eligible warps    
+          are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible   
+          warp results in no instruction being issued and the issue slot remains unused. To increase the number of      
+          eligible warps, reduce the time the active warps are stalled by inspecting the top stall reasons on the Warp  
+          State Statistics and Source Counters sections.                                                                
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        37.99
+    Warp Cycles Per Executed Instruction           cycle        38.65
+    Avg. Active Threads Per Warp                                16.24
+    Avg. Not Predicated Off Threads Per Warp                    15.56
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 25.81%                                                                                          
+          On average, each warp of this kernel spends 16.1 cycles being stalled waiting for a scoreboard dependency on  
+          a L1TEX (local, global, surface, texture) operation. Find the instruction producing the data being waited     
+          upon to identify the culprit. To reduce the number of cycles waiting on L1TEX data accesses verify the        
+          memory access patterns are optimal for the target architecture, attempt to increase cache hit rates by        
+          increasing data locality (coalescing), or by changing the cache configuration. Consider moving frequently     
+          used data to shared memory. This stall type represents about 42.3% of the total average of 38.0 cycles        
+          between issuing two instructions.                                                                             
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 7.928%                                                                                          
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 16.2 threads being active per cycle. This is further reduced    
+          to 15.6 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst      2753.09
+    Executed Instructions                           inst       440494
+    Avg. Issued Instructions Per Scheduler          inst      2801.24
+    Issued Instructions                             inst       448198
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                    586
+    Registers Per Thread             register/thread              22
+    Shared Memory Configuration Size           Kbyte           32.77
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread           75008
+    Waves Per SM                                                1.83
+    -------------------------------- --------------- ---------------
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           16
+    Block Limit Registers                 block           21
+    Block Limit Shared Mem                block           16
+    Block Limit Warps                     block            8
+    Theoretical Active Warps per SM        warp           32
+    Theoretical Occupancy                     %          100
+    Achieved Occupancy                        %        83.74
+    Achieved Active Warps Per SM           warp        26.80
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 16.26%                                                                                          
+          The difference between calculated theoretical (100.0%) and measured achieved occupancy (83.7%) can be the     
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.11
+    Branch Instructions              inst        46897
+    Branch Efficiency                   %        62.60
+    Avg. Divergent Branches                      43.90
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 28.9%                                                                                           
+          This kernel has uncoalesced global accesses resulting in a total of 68196 excessive sectors (34% of the total 
+          202634 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations.    
+          The CUDA Programming Guide                                                                                    
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void build_hashtable_p<(int)128, (int)4>(int *, int *, int, int *, int) (1563, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/nsecond         4.57
+    SM Frequency            cycle/usecond       535.29
+    Elapsed Cycles                  cycle        28236
+    Memory Throughput                   %        84.32
+    DRAM Throughput                     %        84.32
+    Duration                      usecond        52.74
+    L1/TEX Cache Throughput             %        53.81
+    L2 Cache Throughput                 %        23.98
+    SM Active Cycles                cycle     26492.95
+    Compute (SM) Throughput             %        25.22
+    ----------------------- ------------- ------------
+
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing DRAM in the Memory Workload Analysis section.                                              
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 32:1. The kernel achieved 0% of 
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         1.07
+    Executed Ipc Elapsed  inst/cycle         1.00
+    Issue Slots Busy               %        26.88
+    Issued Ipc Active     inst/cycle         1.08
+    SM Busy                        %        26.88
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 82.01%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second       246.76
+    Mem Busy                     %        23.43
+    Max Bandwidth                %        84.32
+    L1/TEX Hit Rate              %            0
+    L2 Hit Rate                  %         5.49
+    Mem Pipes Busy               %        14.39
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        24.68
+    Issued Warp Per Scheduler                        0.25
+    No Eligible                            %        75.32
+    Active Warps Per Scheduler          warp         6.68
+    Eligible Warps Per Scheduler        warp         0.35
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 15.68%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 4.1 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 8 warps per scheduler, this kernel allocates an average of    
+          6.68 active warps per scheduler, but only an average of 0.35 warps were eligible per cycle. Eligible warps    
+          are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible   
+          warp results in no instruction being issued and the issue slot remains unused. To increase the number of      
+          eligible warps, avoid possible load imbalances due to highly different execution durations per warp.          
+          Reducing stalls indicated on the Warp State Statistics and Source Counters sections can help, too.            
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        27.06
+    Warp Cycles Per Executed Instruction           cycle        27.24
+    Avg. Active Threads Per Warp                                20.65
+    Avg. Not Predicated Off Threads Per Warp                    19.23
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 15.68%                                                                                          
+          On average, each warp of this kernel spends 12.9 cycles being stalled waiting for a scoreboard dependency on  
+          a L1TEX (local, global, surface, texture) operation. Find the instruction producing the data being waited     
+          upon to identify the culprit. To reduce the number of cycles waiting on L1TEX data accesses verify the        
+          memory access patterns are optimal for the target architecture, attempt to increase cache hit rates by        
+          increasing data locality (coalescing), or by changing the cache configuration. Consider moving frequently     
+          used data to shared memory. This stall type represents about 47.6% of the total average of 27.1 cycles        
+          between issuing two instructions.                                                                             
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 10.06%                                                                                          
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 20.6 threads being active per cycle. This is further reduced    
+          to 19.2 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst      7071.98
+    Executed Instructions                           inst      1131516
+    Avg. Issued Instructions Per Scheduler          inst      7120.14
+    Issued Instructions                             inst      1139222
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                   1563
+    Registers Per Thread             register/thread              20
+    Shared Memory Configuration Size           Kbyte           32.77
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread          200064
+    Waves Per SM                                                4.88
+    -------------------------------- --------------- ---------------
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           16
+    Block Limit Registers                 block           21
+    Block Limit Shared Mem                block           16
+    Block Limit Warps                     block            8
+    Theoretical Active Warps per SM        warp           32
+    Theoretical Occupancy                     %          100
+    Achieved Occupancy                        %        91.61
+    Achieved Active Warps Per SM           warp        29.31
+    ------------------------------- ----------- ------------
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.11
+    Branch Instructions              inst       125048
+    Branch Efficiency                   %        62.52
+    Avg. Divergent Branches                     117.20
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 17.23%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 57145 excessive sectors (19% of the total 
+          308036 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations.    
+          The CUDA Programming Guide                                                                                    
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void build_hashtable_d<(int)128, (int)4>(int *, int *, int, int *, int, int) (5, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/nsecond         4.75
+    SM Frequency            cycle/usecond       555.22
+    Elapsed Cycles                  cycle         3236
+    Memory Throughput                   %         3.73
+    DRAM Throughput                     %         3.73
+    Duration                      usecond         5.82
+    L1/TEX Cache Throughput             %        30.60
+    L2 Cache Throughput                 %         2.13
+    SM Active Cycles                cycle       251.35
+    Compute (SM) Throughput             %         0.57
+    ----------------------- ------------- ------------
+
+    OPT   This kernel grid is too small to fill the available resources on this device, resulting in only 0.0 full      
+          waves across all SMs. Look at Launch Statistics for more details.                                             
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 32:1. The kernel achieved 0% of 
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.28
+    Executed Ipc Elapsed  inst/cycle         0.02
+    Issue Slots Busy               %         7.33
+    Issued Ipc Active     inst/cycle         0.29
+    SM Busy                        %         7.33
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 95.01%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second        11.32
+    Mem Busy                     %         1.71
+    Max Bandwidth                %         3.73
+    L1/TEX Hit Rate              %         4.97
+    L2 Hit Rate                  %        50.66
+    Mem Pipes Busy               %         0.53
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 0.05653%                                                                                        
+          The memory access pattern for global stores in L1TEX might not be optimal. On average, this kernel accesses   
+          4.0 bytes per thread per memory request; but the address pattern, possibly caused by the stride between       
+          threads, results in 9.1 sectors per request, or 9.1*32 = 290.4 bytes of cache data transfers per request.     
+          The optimal thread address pattern for 4.0 byte accesses would result in 4.0*32 = 128.0 bytes of cache data   
+          transfers per request, to maximize L1TEX cache performance. Check the Source Counters section for             
+          uncoalesced global stores.                                                                                    
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 0.1445%                                                                                         
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 2.8 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %         7.13
+    Issued Warp Per Scheduler                        0.07
+    No Eligible                            %        92.87
+    Active Warps Per Scheduler          warp         0.96
+    Eligible Warps Per Scheduler        warp         0.07
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 92.87%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 14.0 cycles. This might leave hardware resources underutilized and may lead to    
+          less optimal performance. Out of the maximum of 8 warps per scheduler, this kernel allocates an average of    
+          0.96 active warps per scheduler, which already limits the scheduler to less than a warp per instruction.      
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        13.47
+    Warp Cycles Per Executed Instruction           cycle        14.26
+    Avg. Active Threads Per Warp                                31.92
+    Avg. Not Predicated Off Threads Per Warp                    28.58
+    ---------------------------------------- ----------- ------------
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst        17.39
+    Executed Instructions                           inst         2782
+    Avg. Issued Instructions Per Scheduler          inst        18.41
+    Issued Instructions                             inst         2946
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                      5
+    Registers Per Thread             register/thread              26
+    Shared Memory Configuration Size           Kbyte           32.77
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread             640
+    Waves Per SM                                                0.02
+    -------------------------------- --------------- ---------------
+
+    OPT   Est. Speedup: 87.5%                                                                                           
+          The grid for this launch is configured to execute only 5 blocks, which is less than the GPU's 40              
+          multiprocessors. This can underutilize some multiprocessors. If you do not intend to execute this kernel      
+          concurrently with other workloads, consider reducing the block size to have at least one block per            
+          multiprocessor or increase the size of the grid to fully utilize the available hardware resources. See the    
+          Hardware Model (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-hw-model)            
+          description for more details on launch configurations.                                                        
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           16
+    Block Limit Registers                 block           16
+    Block Limit Shared Mem                block           16
+    Block Limit Warps                     block            8
+    Theoretical Active Warps per SM        warp           32
+    Theoretical Occupancy                     %          100
+    Achieved Occupancy                        %        12.17
+    Achieved Active Warps Per SM           warp         3.89
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 87.83%                                                                                          
+          The difference between calculated theoretical (100.0%) and measured achieved occupancy (12.2%) can be the     
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.06
+    Branch Instructions              inst          166
+    Branch Efficiency                   %        97.73
+    Avg. Divergent Branches                       0.01
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 8.616%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 812 excessive sectors (39% of the total   
+          2092 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations. The  
+          CUDA Programming Guide                                                                                        
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void probe<(int)128, (int)4>(int *, int *, int *, int *, int *, int *, int, int *, int, int *, int, int *, int, int *, int, int *) (117161, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/nsecond         4.99
+    SM Frequency            cycle/usecond       584.66
+    Elapsed Cycles                  cycle      5303092
+    Memory Throughput                   %        96.15
+    DRAM Throughput                     %        96.15
+    Duration                      msecond         9.07
+    L1/TEX Cache Throughput             %        57.69
+    L2 Cache Throughput                 %        40.36
+    SM Active Cycles                cycle      5299679
+    Compute (SM) Throughput             %        27.57
+    ----------------------- ------------- ------------
+
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing DRAM in the Memory Workload Analysis section.                                              
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 32:1. The kernel achieved 0% of 
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         1.10
+    Executed Ipc Elapsed  inst/cycle         1.10
+    Issue Slots Busy               %        27.59
+    Issued Ipc Active     inst/cycle         1.10
+    SM Busy                        %        27.59
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 81.62%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second       307.26
+    Mem Busy                     %        40.36
+    Max Bandwidth                %        96.15
+    L1/TEX Hit Rate              %        16.85
+    L2 Hit Rate                  %        47.36
+    Mem Pipes Busy               %        18.30
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 0.9761%                                                                                         
+          The memory access pattern for global loads in L1TEX might not be optimal. On average, this kernel accesses    
+          4.7 bytes per thread per memory request; but the address pattern, possibly caused by the stride between       
+          threads, results in 6.6 sectors per request, or 6.6*32 = 211.5 bytes of cache data transfers per request.     
+          The optimal thread address pattern for 4.7 byte accesses would result in 4.7*32 = 150.1 bytes of cache data   
+          transfers per request, to maximize L1TEX cache performance. Check the Source Counters section for             
+          uncoalesced global loads.                                                                                     
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 24.27%                                                                                          
+          The memory access pattern for loads from L1TEX to L2 is not optimal. The granularity of an L1TEX request to   
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 1.5 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced loads and try to minimize how many cache lines need to be accessed per memory         
+          request.                                                                                                      
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 0.2971%                                                                                         
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 1.0 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 51.62%                                                                                          
+          The memory access pattern for loads from device memory causes 60,392,884 sectors to be read from DRAM, which  
+          is 1.1x of the 52,684,372 sectors which cause a miss in the L2 cache. The DRAM fetch granularity for read     
+          misses in L2 is 64 bytes, i.e. the lower or upper half of an L2 cache line. Try changing your access pattern  
+          to make use of both sectors returned by a DRAM read request for optimal usage of the DRAM throughput. For     
+          strided memory reads, avoid strides of 64 bytes or larger to avoid moving unused sectors from DRAM to L2.     
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        27.57
+    Issued Warp Per Scheduler                        0.28
+    No Eligible                            %        72.43
+    Active Warps Per Scheduler          warp         7.07
+    Eligible Warps Per Scheduler        warp         0.38
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 3.849%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 3.6 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 8 warps per scheduler, this kernel allocates an average of    
+          7.07 active warps per scheduler, but only an average of 0.38 warps were eligible per cycle. Eligible warps    
+          are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible   
+          warp results in no instruction being issued and the issue slot remains unused. To increase the number of      
+          eligible warps, avoid possible load imbalances due to highly different execution durations per warp.          
+          Reducing stalls indicated on the Warp State Statistics and Source Counters sections can help, too.            
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        25.65
+    Warp Cycles Per Executed Instruction           cycle        25.65
+    Avg. Active Threads Per Warp                                17.89
+    Avg. Not Predicated Off Threads Per Warp                    16.69
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 3.849%                                                                                          
+          On average, each warp of this kernel spends 17.2 cycles being stalled waiting for a scoreboard dependency on  
+          a L1TEX (local, global, surface, texture) operation. Find the instruction producing the data being waited     
+          upon to identify the culprit. To reduce the number of cycles waiting on L1TEX data accesses verify the        
+          memory access patterns are optimal for the target architecture, attempt to increase cache hit rates by        
+          increasing data locality (coalescing), or by changing the cache configuration. Consider moving frequently     
+          used data to shared memory. This stall type represents about 67.1% of the total average of 25.7 cycles        
+          between issuing two instructions.                                                                             
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 13.19%                                                                                          
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 17.9 threads being active per cycle. This is further reduced    
+          to 16.7 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst   1462103.29
+    Executed Instructions                           inst    233936527
+    Avg. Issued Instructions Per Scheduler          inst   1462162.15
+    Issued Instructions                             inst    233945944
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                 117161
+    Registers Per Thread             register/thread              29
+    Shared Memory Configuration Size           Kbyte           32.77
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread        14996608
+    Waves Per SM                                              366.13
+    -------------------------------- --------------- ---------------
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           16
+    Block Limit Registers                 block           16
+    Block Limit Shared Mem                block           16
+    Block Limit Warps                     block            8
+    Theoretical Active Warps per SM        warp           32
+    Theoretical Occupancy                     %          100
+    Achieved Occupancy                        %        88.62
+    Achieved Active Warps Per SM           warp        28.36
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 3.849%                                                                                          
+          The difference between calculated theoretical (100.0%) and measured achieved occupancy (88.6%) can be the     
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.12
+    Branch Instructions              inst     26921784
+    Branch Efficiency                   %        63.70
+    Avg. Divergent Branches                   26582.10
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 49.35%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 58762179 excessive sectors (49% of the    
+          total 118979581 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source      
+          locations. The CUDA Programming Guide                                                                         
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
diff --git a/results/V100/crystal-fls/crystal_fls_q21_sf10_2.txt b/results/V100/crystal-fls/crystal_fls_q21_sf10_2.txt
new file mode 100644
index 0000000..6ab8bdf
--- /dev/null
+++ b/results/V100/crystal-fls/crystal_fls_q21_sf10_2.txt
@@ -0,0 +1,1037 @@
+==PROF== Connected to process 3256 (/home/ubuntu/fff/cmake-build-release-g4dn/gpu/fastlanes/src/fls_q21_bitpacked)
+==PROF== Profiling "build_hashtable_s" - 0: 0%....50%....100% - 73 passes
+==PROF== Profiling "build_hashtable_p" - 1: 0%....50%....100% - 74 passes
+==PROF== Profiling "build_hashtable_d" - 2: 0%....50%....100% - 73 passes
+==PROF== Profiling "probe" - 3: 0%....50%....100% - 74 passes
+1992 40 5910703807
+1993 40 6221118002
+1994 40 5930589067
+1995 40 5935176587
+1996 40 5813459646
+1997 40 5932327551
+1998 40 3617033909
+1992 41 6142605437
+1993 41 6513938496
+1994 41 6255096718
+1995 41 6188290908
+1996 41 6088744091
+1997 41 6245316071
+1998 41 3627954097
+1992 42 5899765766
+1993 42 6325451795
+1994 42 6379855056
+1995 42 6253125905
+1996 42 6108666329
+1997 42 6157934476
+1998 42 3558798602
+1992 43 5806898037
+1993 43 5770931893
+1994 43 6096087079
+1995 43 6065752404
+1996 43 6002900479
+1997 43 5860606190
+1998 43 3678545331
+1992 44 5559682659
+1993 44 5813306579
+1994 44 5926068761
+1995 44 5608176605
+1996 44 5735975188
+1997 44 5836274168
+1998 44 3134706225
+1992 45 6474983674
+1993 45 6400588001
+1994 45 6331198167
+1995 45 6394371935
+1996 45 6559249979
+1997 45 6645487151
+1998 45 3850121319
+1992 46 6587090794
+1993 46 6382032832
+1994 46 6775614290
+1995 46 6442574114
+1996 46 6632812978
+1997 46 6814132782
+1998 46 3834827103
+1992 47 6808158717
+1993 47 6351643075
+1994 47 6804633795
+1995 47 6088726153
+1996 47 6623642056
+1997 47 6639575295
+1998 47 3406959500
+1992 48 6911435378
+1993 48 7053764786
+1994 48 6774194398
+1995 48 6814377370
+1996 48 6711754718
+1997 48 6709492543
+1998 48 4023772218
+1992 49 5727394828
+1993 49 5660353157
+1994 49 5642438266
+1995 49 5677870960
+1996 49 5681672438
+1997 49 5832554864
+1998 49 3345738545
+1992 50 6641309502
+1993 50 6681847719
+1994 50 6374542648
+1995 50 6686329221
+1996 50 6841710204
+1997 50 6289013167
+1998 50 3751716318
+1992 51 6804275373
+1993 51 6208468595
+1994 51 6046395349
+1995 51 6352880587
+1996 51 6285475695
+1997 51 6393365859
+1998 51 3535193333
+1992 52 6053138673
+1993 52 6270772376
+1994 52 6156757875
+1995 52 6158310037
+1996 52 6164411328
+1997 52 6113882230
+1998 52 3381521312
+1992 53 6221067837
+1993 53 5932049757
+1994 53 6175099472
+1995 53 6256597213
+1996 53 6265574087
+1997 53 6452419277
+1998 53 3298965819
+1992 54 6085873522
+1993 54 6268707214
+1994 54 6109955822
+1995 54 6011700445
+1996 54 6233626966
+1997 54 5902666460
+1998 54 3464767326
+1992 55 6589852474
+1993 55 6507948375
+1994 55 6707389575
+1995 55 6118847814
+1996 55 6369111228
+1997 55 6161915041
+1998 55 3610193272
+1992 56 6129000282
+1993 56 5790679619
+1994 56 5826402917
+1995 56 5908836912
+1996 56 5616763903
+1997 56 5902947686
+1998 56 3112058250
+1992 57 5824635290
+1993 57 5876999663
+1994 57 5484431421
+1995 57 5880695547
+1996 57 5815600477
+1997 57 5642596426
+1998 57 3179720586
+1992 58 5993521981
+1993 58 5698429434
+1994 58 6045778708
+1995 58 5596770464
+1996 58 5602902570
+1997 58 5827168921
+1998 58 3614692390
+1992 59 5901946523
+1993 59 5848707519
+1994 59 6043292500
+1995 59 5689679375
+1996 59 5658105294
+1997 59 5744356971
+1998 59 3517431277
+1992 60 6253547701
+1993 60 6295488516
+1994 60 6247585910
+1995 60 5946652692
+1996 60 6332958799
+1997 60 6426981826
+1998 60 3538237841
+1992 61 6523908450
+1993 61 6266002951
+1994 61 6229473288
+1995 61 6433574643
+1996 61 6470033667
+1997 61 6160852695
+1998 61 3815286652
+1992 62 6155409813
+1993 62 5944781347
+1994 62 5647531260
+1995 62 6146349885
+1996 62 5874259231
+1997 62 5771092581
+1998 62 3848799359
+1992 63 6015144810
+1993 63 6644358780
+1994 63 6303769219
+1995 63 6487157609
+1996 63 6260201621
+1997 63 5936323834
+1998 63 3746060156
+1992 64 6605730121
+1993 64 6375799970
+1994 64 6362984117
+1995 64 6166610415
+1996 64 6298505754
+1997 64 6795100051
+1998 64 4038091656
+1992 65 7082841759
+1993 65 7045037152
+1994 65 6308495084
+1995 65 6451506098
+1996 65 6985524790
+1997 65 7045234117
+1998 65 4103210270
+1992 66 6587314235
+1993 66 6717880192
+1994 66 6931875539
+1995 66 6823692842
+1996 66 6778966019
+1997 66 6938134368
+1998 66 3996123557
+1992 67 6456838047
+1993 67 6496616174
+1994 67 6208530373
+1995 67 6304469051
+1996 67 5895856332
+1997 67 6512779478
+1998 67 3797955327
+1992 68 6482867338
+1993 68 6522458684
+1994 68 6688539088
+1995 68 6975990289
+1996 68 6734296742
+1997 68 6443099863
+1998 68 3696911793
+1992 69 5850155722
+1993 69 6159169202
+1994 69 6179753713
+1995 69 6341671954
+1996 69 5846097201
+1997 69 6138039450
+1998 69 3430651458
+1992 70 6527884145
+1993 70 6341113918
+1994 70 6423480229
+1995 70 6493732171
+1996 70 6558022303
+1997 70 6502432501
+1998 70 4138272278
+1992 71 5977566231
+1993 71 6246083360
+1994 71 6589979243
+1995 71 6362728981
+1996 71 6331903373
+1997 71 6317891561
+1998 71 3970794721
+1992 72 6162472854
+1993 72 6102788096
+1994 72 5758527506
+1995 72 6073178181
+1996 72 5787199821
+1997 72 5752890900
+1998 72 3650913701
+1992 73 6040081567
+1993 73 5700964701
+1994 73 6185333070
+1995 73 6013769902
+1996 73 5662668717
+1997 73 5735470645
+1998 73 3598396119
+1992 74 6559831216
+1993 74 6527962564
+1994 74 6120118269
+1995 74 6043130080
+1996 74 6314023403
+1997 74 6351296659
+1998 74 3833513192
+1992 75 6063071801
+1993 75 6063519843
+1994 75 5998734727
+1995 75 5943670683
+1996 75 5750034048
+1997 75 6363803744
+1998 75 3522552499
+1992 76 6499618869
+1993 76 6633266405
+1994 76 6753951730
+1995 76 6501112403
+1996 76 6848513046
+1997 76 6758217754
+1998 76 3831187933
+1992 77 6306363056
+1993 77 6140348079
+1994 77 6221542876
+1995 77 6546487305
+1996 77 6274545265
+1997 77 6065634372
+1998 77 3740445348
+1992 78 5440997205
+1993 78 5646255796
+1994 78 5621336499
+1995 78 5559009358
+1996 78 5714095414
+1997 78 5668627674
+1998 78 3164884845
+1992 79 6429617430
+1993 79 6453234687
+1994 79 6437225149
+1995 79 6089389380
+1996 79 6421073409
+1997 79 6218229922
+1998 79 3547510598
+Res Count: 280
+Time Taken Total: 4492.53
+{"query":21,"time_query":4492.18}
+==PROF== Disconnected from process 3256
+[3256] fls_q21_bitpacked@127.0.0.1
+  void build_hashtable_s<(int)32, (int)32>(int *, int *, int, int *, int) (20, 1, 1)x(32, 1, 1), Context 1, Stream 7, Device 0, CC 7.0
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/usecond       504.72
+    SM Frequency            cycle/usecond       753.48
+    Elapsed Cycles                  cycle        10855
+    Memory Throughput                   %         3.20
+    DRAM Throughput                     %         3.20
+    Duration                      usecond        14.40
+    L1/TEX Cache Throughput             %         4.03
+    L2 Cache Throughput                 %         1.20
+    SM Active Cycles                cycle      2227.31
+    Compute (SM) Throughput             %         0.65
+    ----------------------- ------------- ------------
+
+    OPT   This kernel grid is too small to fill the available resources on this device, resulting in only 0.0 full      
+          waves across all SMs. Look at Launch Statistics for more details.                                             
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 2:1. The kernel achieved 0% of  
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.13
+    Executed Ipc Elapsed  inst/cycle         0.03
+    Issue Slots Busy               %         3.17
+    Issued Ipc Active     inst/cycle         0.13
+    SM Busy                        %         3.17
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 97.67%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second        16.55
+    Mem Busy                     %         1.10
+    Max Bandwidth                %         3.20
+    L1/TEX Hit Rate              %            0
+    L2 Hit Rate                  %         4.11
+    Mem Pipes Busy               %         0.22
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        12.33
+    Issued Warp Per Scheduler                        0.12
+    No Eligible                            %        87.67
+    Active Warps Per Scheduler          warp         0.97
+    Eligible Warps Per Scheduler        warp         0.12
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 87.67%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 8.1 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 16 warps per scheduler, this kernel allocates an average of   
+          0.97 active warps per scheduler, which already limits the scheduler to less than a warp per instruction.      
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle         7.85
+    Warp Cycles Per Executed Instruction           cycle         7.90
+    Avg. Active Threads Per Warp                                13.37
+    Avg. Not Predicated Off Threads Per Warp                    12.38
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 43.33%                                                                                          
+          On average, each warp of this kernel spends 3.4 cycles being stalled waiting on a fixed latency execution     
+          dependency. Typically, this stall reason should be very low and only shows up as a top contributor in         
+          already highly optimized kernels. Try to hide the corresponding instruction latencies by increasing the       
+          number of active warps, restructuring the code or unrolling loops. Furthermore, consider switching to         
+          lower-latency instructions, e.g. by making use of fast math compiler options. This stall type represents      
+          about 43.3% of the total average of 7.8 cycles between issuing two instructions.                              
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 0.3984%                                                                                         
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 13.4 threads being active per cycle. This is further reduced    
+          to 12.4 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst           70
+    Executed Instructions                           inst        22400
+    Avg. Issued Instructions Per Scheduler          inst        70.50
+    Issued Instructions                             inst        22561
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                    32
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                     20
+    Registers Per Thread             register/thread              72
+    Shared Memory Configuration Size            byte               0
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread             640
+    Waves Per SM                                                0.01
+    -------------------------------- --------------- ---------------
+
+    OPT   Est. Speedup: 75%                                                                                             
+          The grid for this launch is configured to execute only 20 blocks, which is less than the GPU's 80             
+          multiprocessors. This can underutilize some multiprocessors. If you do not intend to execute this kernel      
+          concurrently with other workloads, consider reducing the block size to have at least one block per            
+          multiprocessor or increase the size of the grid to fully utilize the available hardware resources. See the    
+          Hardware Model (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-hw-model)            
+          description for more details on launch configurations.                                                        
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           32
+    Block Limit Registers                 block           28
+    Block Limit Shared Mem                block           32
+    Block Limit Warps                     block           64
+    Theoretical Active Warps per SM        warp           28
+    Theoretical Occupancy                     %        43.75
+    Achieved Occupancy                        %         1.56
+    Achieved Active Warps Per SM           warp         1.00
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 87.67%                                                                                          
+          The difference between calculated theoretical (43.8%) and measured achieved occupancy (1.6%) can be the       
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 56.25%                                                                                          
+          The 7.00 theoretical warps per scheduler this kernel can issue according to its occupancy are below the       
+          hardware maximum of 16. This kernel's theoretical occupancy (43.8%) is limited by the number of required      
+          registers.                                                                                                    
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.09
+    Branch Instructions              inst         2027
+    Branch Efficiency                   %        13.94
+    Avg. Divergent Branches                       1.89
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 5.339%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 1441 excessive sectors (20% of the total  
+          7183 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations. The  
+          CUDA Programming Guide                                                                                        
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void build_hashtable_p<(int)32, (int)32>(int *, int *, int *, int, int *, int) (782, 1, 1)x(32, 1, 1), Context 1, Stream 7, Device 0, CC 7.0
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/usecond       553.78
+    SM Frequency            cycle/usecond       826.18
+    Elapsed Cycles                  cycle        24854
+    Memory Throughput                   %        67.80
+    DRAM Throughput                     %        67.80
+    Duration                      usecond        30.05
+    L1/TEX Cache Throughput             %        18.86
+    L2 Cache Throughput                 %        23.81
+    SM Active Cycles                cycle     21885.15
+    Compute (SM) Throughput             %         9.90
+    ----------------------- ------------- ------------
+
+    OPT   Memory is more heavily utilized than Compute: Look at the Memory Workload Analysis section to identify the    
+          DRAM bottleneck. Check memory replay (coalescing) metrics to make sure you're efficiently utilizing the       
+          bytes transferred. Also consider whether it is possible to do more work per memory access (kernel fusion) or  
+          whether there are values you can (re)compute.                                                                 
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 2:1. The kernel achieved 0% of  
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.45
+    Executed Ipc Elapsed  inst/cycle         0.39
+    Issue Slots Busy               %        11.23
+    Issued Ipc Active     inst/cycle         0.45
+    SM Busy                        %        11.23
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 92.53%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second       384.47
+    Mem Busy                     %        23.81
+    Max Bandwidth                %        67.80
+    L1/TEX Hit Rate              %            0
+    L2 Hit Rate                  %         8.60
+    Mem Pipes Busy               %         5.69
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 1.399%                                                                                          
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 1.2 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        11.69
+    Issued Warp Per Scheduler                        0.12
+    No Eligible                            %        88.31
+    Active Warps Per Scheduler          warp         2.28
+    Eligible Warps Per Scheduler        warp         0.13
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 32.2%                                                                                     
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 8.6 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 16 warps per scheduler, this kernel allocates an average of   
+          2.28 active warps per scheduler, but only an average of 0.13 warps were eligible per cycle. Eligible warps    
+          are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible   
+          warp results in no instruction being issued and the issue slot remains unused. To increase the number of      
+          eligible warps, reduce the time the active warps are stalled by inspecting the top stall reasons on the Warp  
+          State Statistics and Source Counters sections.                                                                
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        19.51
+    Warp Cycles Per Executed Instruction           cycle        19.58
+    Avg. Active Threads Per Warp                                12.26
+    Avg. Not Predicated Off Threads Per Warp                    12.02
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 32.2%                                                                                           
+          On average, each warp of this kernel spends 6.9 cycles being stalled waiting for a scoreboard dependency on a 
+          L1TEX (local, global, surface, texture) operation. Find the instruction producing the data being waited upon  
+          to identify the culprit. To reduce the number of cycles waiting on L1TEX data accesses verify the memory      
+          access patterns are optimal for the target architecture, attempt to increase cache hit rates by increasing    
+          data locality (coalescing), or by changing the cache configuration. Consider moving frequently used data to   
+          shared memory. This stall type represents about 35.4% of the total average of 19.5 cycles between issuing     
+          two instructions.                                                                                             
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 6.179%                                                                                          
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 12.3 threads being active per cycle. This is further reduced    
+          to 12.0 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst      2448.54
+    Executed Instructions                           inst       783534
+    Avg. Issued Instructions Per Scheduler          inst      2457.04
+    Issued Instructions                             inst       786253
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                    32
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                    782
+    Registers Per Thread             register/thread             108
+    Shared Memory Configuration Size            byte               0
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread           25024
+    Waves Per SM                                                0.61
+    -------------------------------- --------------- ---------------
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           32
+    Block Limit Registers                 block           16
+    Block Limit Shared Mem                block           32
+    Block Limit Warps                     block           64
+    Theoretical Active Warps per SM        warp           16
+    Theoretical Occupancy                     %           25
+    Achieved Occupancy                        %        13.69
+    Achieved Active Warps Per SM           warp         8.76
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 32.2%                                                                                           
+          The difference between calculated theoretical (25.0%) and measured achieved occupancy (13.7%) can be the      
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 32.2%                                                                                           
+          The 4.00 theoretical warps per scheduler this kernel can issue according to its occupancy are below the       
+          hardware maximum of 16. This kernel's theoretical occupancy (25.0%) is limited by the number of required      
+          registers.                                                                                                    
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.10
+    Branch Instructions              inst        74518
+    Branch Efficiency                   %        37.41
+    Avg. Divergent Branches                      55.07
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 5.74%                                                                                           
+          This kernel has uncoalesced global accesses resulting in a total of 23924 excessive sectors (7% of the total  
+          360344 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations.    
+          The CUDA Programming Guide                                                                                    
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void build_hashtable_d<(int)32, (int)32>(int *, int *, int, int *, int, int) (3, 1, 1)x(32, 1, 1), Context 1, Stream 7, Device 0, CC 7.0
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/usecond       360.86
+    SM Frequency            cycle/usecond       538.19
+    Elapsed Cycles                  cycle         7615
+    Memory Throughput                   %         0.85
+    DRAM Throughput                     %         0.85
+    Duration                      usecond        14.14
+    L1/TEX Cache Throughput             %        22.33
+    L2 Cache Throughput                 %         0.73
+    SM Active Cycles                cycle       180.54
+    Compute (SM) Throughput             %         0.09
+    ----------------------- ------------- ------------
+
+    OPT   This kernel grid is too small to fill the available resources on this device, resulting in only 0.0 full      
+          waves across all SMs. Look at Launch Statistics for more details.                                             
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 2:1. The kernel achieved 0% of  
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.15
+    Executed Ipc Elapsed  inst/cycle         0.00
+    Issue Slots Busy               %         3.88
+    Issued Ipc Active     inst/cycle         0.16
+    SM Busy                        %         3.88
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 97.1%                                                                                     
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second         3.14
+    Mem Busy                     %         0.46
+    Max Bandwidth                %         0.85
+    L1/TEX Hit Rate              %            0
+    L2 Hit Rate                  %        40.65
+    Mem Pipes Busy               %         0.09
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 0.01201%                                                                                        
+          The memory access pattern for global stores in L1TEX might not be optimal. On average, this kernel accesses   
+          4.0 bytes per thread per memory request; but the address pattern, possibly caused by the stride between       
+          threads, results in 9.1 sectors per request, or 9.1*32 = 290.4 bytes of cache data transfers per request.     
+          The optimal thread address pattern for 4.0 byte accesses would result in 4.0*32 = 128.0 bytes of cache data   
+          transfers per request, to maximize L1TEX cache performance. Check the Source Counters section for             
+          uncoalesced global stores.                                                                                    
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 0.06189%                                                                                        
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 2.5 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        15.63
+    Issued Warp Per Scheduler                        0.16
+    No Eligible                            %        84.37
+    Active Warps Per Scheduler          warp         1.00
+    Eligible Warps Per Scheduler        warp         0.16
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 84.37%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 6.4 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 16 warps per scheduler, this kernel allocates an average of   
+          1.00 active warps per scheduler, which already limits the scheduler to less than a warp per instruction.      
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle         6.39
+    Warp Cycles Per Executed Instruction           cycle         6.47
+    Avg. Active Threads Per Warp                                31.93
+    Avg. Not Predicated Off Threads Per Warp                    27.49
+    ---------------------------------------- ----------- ------------
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst         6.92
+    Executed Instructions                           inst         2215
+    Avg. Issued Instructions Per Scheduler          inst         7.01
+    Issued Instructions                             inst         2243
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                    32
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                      3
+    Registers Per Thread             register/thread              80
+    Shared Memory Configuration Size            byte               0
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread              96
+    Waves Per SM                                                0.00
+    -------------------------------- --------------- ---------------
+
+    OPT   Est. Speedup: 96.25%                                                                                          
+          The grid for this launch is configured to execute only 3 blocks, which is less than the GPU's 80              
+          multiprocessors. This can underutilize some multiprocessors. If you do not intend to execute this kernel      
+          concurrently with other workloads, consider reducing the block size to have at least one block per            
+          multiprocessor or increase the size of the grid to fully utilize the available hardware resources. See the    
+          Hardware Model (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-hw-model)            
+          description for more details on launch configurations.                                                        
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           32
+    Block Limit Registers                 block           24
+    Block Limit Shared Mem                block           32
+    Block Limit Warps                     block           64
+    Theoretical Active Warps per SM        warp           24
+    Theoretical Occupancy                     %        37.50
+    Achieved Occupancy                        %         1.56
+    Achieved Active Warps Per SM           warp         1.00
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 84.37%                                                                                          
+          The difference between calculated theoretical (37.5%) and measured achieved occupancy (1.6%) can be the       
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 62.5%                                                                                           
+          The 6.00 theoretical warps per scheduler this kernel can issue according to its occupancy are below the       
+          hardware maximum of 16. This kernel's theoretical occupancy (37.5%) is limited by the number of required      
+          registers.                                                                                                    
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.04
+    Branch Instructions              inst           83
+    Branch Efficiency                   %        97.62
+    Avg. Divergent Branches                       0.00
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 4.351%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 812 excessive sectors (39% of the total   
+          2092 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations. The  
+          CUDA Programming Guide                                                                                        
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void probe<(int)32, (int)32>(int *, int *, int *, int *, int, int *, int, int *, int, int *, int, int *) (58581, 1, 1)x(32, 1, 1), Context 1, Stream 7, Device 0, CC 7.0
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/usecond       862.89
+    SM Frequency            cycle/nsecond         1.29
+    Elapsed Cycles                  cycle      2487573
+    Memory Throughput                   %        37.27
+    DRAM Throughput                     %        37.27
+    Duration                      msecond         1.93
+    L1/TEX Cache Throughput             %        20.80
+    L2 Cache Throughput                 %        19.99
+    SM Active Cycles                cycle   2458614.38
+    Compute (SM) Throughput             %        21.03
+    ----------------------- ------------- ------------
+
+    OPT   This kernel exhibits low compute throughput and memory bandwidth utilization relative to the peak performance 
+          of this device. Achieved compute throughput and/or memory bandwidth below 60.0% of peak typically indicate    
+          latency issues. Look at Scheduler Statistics and Warp State Statistics for potential reasons.                 
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 2:1. The kernel achieved 0% of  
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.85
+    Executed Ipc Elapsed  inst/cycle         0.84
+    Issue Slots Busy               %        21.28
+    Issued Ipc Active     inst/cycle         0.85
+    SM Busy                        %        21.28
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 84.21%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second       329.32
+    Mem Busy                     %        20.56
+    Max Bandwidth                %        37.27
+    L1/TEX Hit Rate              %        65.21
+    L2 Hit Rate                  %        40.66
+    Mem Pipes Busy               %         8.55
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 1.477%                                                                                          
+          The memory access pattern for global loads in L1TEX might not be optimal. On average, this kernel accesses    
+          4.6 bytes per thread per memory request; but the address pattern, possibly caused by the stride between       
+          threads, results in 10.3 sectors per request, or 10.3*32 = 329.1 bytes of cache data transfers per request.   
+          The optimal thread address pattern for 4.6 byte accesses would result in 4.6*32 = 148.3 bytes of cache data   
+          transfers per request, to maximize L1TEX cache performance. Check the Source Counters section for             
+          uncoalesced global loads.                                                                                     
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 10.85%                                                                                          
+          The memory access pattern for loads from L1TEX to L2 is not optimal. The granularity of an L1TEX request to   
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 1.7 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced loads and try to minimize how many cache lines need to be accessed per memory         
+          request.                                                                                                      
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 0.4447%                                                                                         
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 1.0 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        21.28
+    Issued Warp Per Scheduler                        0.21
+    No Eligible                            %        78.72
+    Active Warps Per Scheduler          warp         2.93
+    Eligible Warps Per Scheduler        warp         0.25
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 62.73%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 4.7 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 16 warps per scheduler, this kernel allocates an average of   
+          2.93 active warps per scheduler, but only an average of 0.25 warps were eligible per cycle. Eligible warps    
+          are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible   
+          warp results in no instruction being issued and the issue slot remains unused. To increase the number of      
+          eligible warps, avoid possible load imbalances due to highly different execution durations per warp.          
+          Reducing stalls indicated on the Warp State Statistics and Source Counters sections can help, too.            
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        13.76
+    Warp Cycles Per Executed Instruction           cycle        13.76
+    Avg. Active Threads Per Warp                                20.27
+    Avg. Not Predicated Off Threads Per Warp                    18.70
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 44.73%                                                                                          
+          On average, each warp of this kernel spends 6.2 cycles being stalled waiting for a scoreboard dependency on a 
+          L1TEX (local, global, surface, texture) operation. Find the instruction producing the data being waited upon  
+          to identify the culprit. To reduce the number of cycles waiting on L1TEX data accesses verify the memory      
+          access patterns are optimal for the target architecture, attempt to increase cache hit rates by increasing    
+          data locality (coalescing), or by changing the cache configuration. Consider moving frequently used data to   
+          shared memory. This stall type represents about 44.7% of the total average of 13.8 cycles between issuing     
+          two instructions.                                                                                             
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 8.741%                                                                                          
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 20.3 threads being active per cycle. This is further reduced    
+          to 18.7 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst    523065.33
+    Executed Instructions                           inst    167380904
+    Avg. Issued Instructions Per Scheduler          inst    523086.21
+    Issued Instructions                             inst    167387586
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                    32
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                  58581
+    Registers Per Thread             register/thread             152
+    Shared Memory Configuration Size           Kbyte           65.54
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block       Kbyte/block            4.10
+    Threads                                   thread         1874592
+    Waves Per SM                                               61.02
+    -------------------------------- --------------- ---------------
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           32
+    Block Limit Registers                 block           12
+    Block Limit Shared Mem                block           16
+    Block Limit Warps                     block           64
+    Theoretical Active Warps per SM        warp           12
+    Theoretical Occupancy                     %        18.75
+    Achieved Occupancy                        %        18.28
+    Achieved Active Warps Per SM           warp        11.70
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 62.73%                                                                                          
+          The 3.00 theoretical warps per scheduler this kernel can issue according to its occupancy are below the       
+          hardware maximum of 16. This kernel's theoretical occupancy (18.8%) is limited by the number of required      
+          registers.                                                                                                    
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.08
+    Branch Instructions              inst     14217715
+    Branch Efficiency                   %        54.52
+    Avg. Divergent Branches                    8326.73
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 65.18%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 56937633 excessive sectors (66% of the    
+          total 85739880 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source       
+          locations. The CUDA Programming Guide                                                                         
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
diff --git a/results/V100/crystal-fls/crystal_fls_q31_sf10.txt b/results/V100/crystal-fls/crystal_fls_q31_sf10.txt
new file mode 100644
index 0000000..68929eb
--- /dev/null
+++ b/results/V100/crystal-fls/crystal_fls_q31_sf10.txt
@@ -0,0 +1,955 @@
+==PROF== Connected to process 3120 (/home/ubuntu/fff/cmake-build-release-g4dn/gpu/fastlanes/src/fls_q31_bitpacked)
+[33m-- lo_custkey_min: 1[39m
+[33m-- lo_custkey_max: 299999[39m
+[33m-- x: 19[39m
+** LOADED DATA **
+** LOADED DATA TO GPU **
+==PROF== Profiling "build_hashtable_s" - 0: 0%....50%....100% - 73 passes
+==PROF== Profiling "build_hashtable_c" - 1: 0%....50%....100% - 73 passes
+==PROF== Profiling "build_hashtable_d" - 2: 0%....50%....100% - 73 passes
+==PROF== Profiling "probe" - 3: 0%....50%....100% - 74 passes
+Result:
+1992 8 8 53664098547
+1993 8 8 53051563726
+1994 8 8 53551966681
+1995 8 8 53338395993
+1996 8 8 53781237952
+1997 8 8 53558132271
+1992 9 8 55867859815
+1993 9 8 55345162638
+1994 9 8 55589883121
+1995 9 8 54871630692
+1996 9 8 55620205618
+1997 9 8 54852742519
+1992 12 8 52867359425
+1993 12 8 53435367523
+1994 12 8 52283824959
+1995 12 8 52956472988
+1996 12 8 52948768521
+1997 12 8 52962165616
+1992 18 8 53592522758
+1993 18 8 52996000810
+1994 18 8 52962120320
+1995 18 8 53924104344
+1996 18 8 53634737856
+1997 18 8 54307983851
+1992 21 8 53816502394
+1993 21 8 54349264842
+1994 21 8 54119359035
+1995 21 8 53961984627
+1996 21 8 54294333705
+1997 21 8 53703384515
+1992 8 9 55444214883
+1993 8 9 55740793389
+1994 8 9 55137400588
+1995 8 9 55784172640
+1996 8 9 56378453713
+1997 8 9 55399009353
+1992 9 9 57271740148
+1993 9 9 58216495642
+1994 9 9 57507217082
+1995 9 9 57860170696
+1996 9 9 58662284841
+1997 9 9 56940173344
+1992 12 9 55432874858
+1993 12 9 55398755151
+1994 12 9 55206960389
+1995 12 9 55581754250
+1996 12 9 55487324569
+1997 12 9 53582297974
+1992 18 9 56370007920
+1993 18 9 56166403334
+1994 18 9 55432079732
+1995 18 9 55973419507
+1996 18 9 56254723722
+1997 18 9 55830709236
+1992 21 9 56359335068
+1993 21 9 56885558074
+1994 21 9 56507097670
+1995 21 9 57465742525
+1996 21 9 56177166557
+1997 21 9 56333135444
+1992 8 12 51295873912
+1993 8 12 52384079867
+1994 8 12 52254716872
+1995 8 12 51669051730
+1996 8 12 52670597733
+1997 8 12 53782563068
+1992 9 12 54255769995
+1993 9 12 53477912258
+1994 9 12 53868848846
+1995 9 12 54310027205
+1996 9 12 55409865859
+1997 9 12 54099065304
+1992 12 12 52584065821
+1993 12 12 52637339531
+1994 12 12 50154194273
+1995 12 12 51904425056
+1996 12 12 52493537142
+1997 12 12 50634790895
+1992 18 12 52896145835
+1993 18 12 53112435531
+1994 18 12 52021625515
+1995 18 12 52031180987
+1996 18 12 53022298730
+1997 18 12 53294469049
+1992 21 12 53284643553
+1993 21 12 53900783410
+1994 21 12 53648011682
+1995 21 12 53376554374
+1996 21 12 52174060166
+1997 21 12 52785883863
+1992 8 18 51873441494
+1993 8 18 51961213538
+1994 8 18 52868608376
+1995 8 18 52738284867
+1996 8 18 51678789303
+1997 8 18 51787339279
+1992 9 18 53893325353
+1993 9 18 54178339670
+1994 9 18 54059232642
+1995 9 18 53920766480
+1996 9 18 54128092218
+1997 9 18 54349079982
+1992 12 18 51449505308
+1993 12 18 51384752707
+1994 12 18 52195482938
+1995 12 18 51205040497
+1996 12 18 51165908280
+1997 12 18 52167794260
+1992 18 18 53246367726
+1993 18 18 52211194809
+1994 18 18 52388807873
+1995 18 18 52459889035
+1996 18 18 53737304610
+1997 18 18 52772297391
+1992 21 18 53752784633
+1993 21 18 53723459056
+1994 21 18 52734575706
+1995 21 18 52810670641
+1996 21 18 53606892262
+1997 21 18 52841307001
+1992 8 21 49589186930
+1993 8 21 50874540178
+1994 8 21 50484052905
+1995 8 21 50476123376
+1996 8 21 51102099810
+1997 8 21 51376581082
+1992 9 21 51183086614
+1993 9 21 51849557513
+1994 9 21 51912335762
+1995 9 21 51737313715
+1996 9 21 52987320706
+1997 9 21 51870436294
+1992 12 21 49502367103
+1993 12 21 49962826767
+1994 12 21 50112754286
+1995 12 21 48732674673
+1996 12 21 50123146827
+1997 12 21 49094088315
+1992 18 21 50957655153
+1993 18 21 50627753769
+1994 18 21 50537890156
+1995 18 21 50265160335
+1996 18 21 50774431442
+1997 18 21 51103107061
+1992 21 21 49934446612
+1993 21 21 51562382531
+1994 21 21 50180119681
+1995 21 21 51221558310
+1996 21 21 50423672514
+1997 21 21 50461561884
+Res Count: 150
+Time Taken Total: 4859.22
+{"query":31,"time_query":4859.02}
+==PROF== Disconnected from process 3120
+[3120] fls_q31_bitpacked@127.0.0.1
+  void build_hashtable_s<(int)32, (int)32>(int *, int *, int *, int, int *, int) (20, 1, 1)x(32, 1, 1), Context 1, Stream 7, Device 0, CC 7.0
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/usecond       486.79
+    SM Frequency            cycle/usecond       724.70
+    Elapsed Cycles                  cycle        11415
+    Memory Throughput                   %         4.75
+    DRAM Throughput                     %         4.75
+    Duration                      usecond        15.74
+    L1/TEX Cache Throughput             %         7.04
+    L2 Cache Throughput                 %         2.05
+    SM Active Cycles                cycle      2404.12
+    Compute (SM) Throughput             %         0.68
+    ----------------------- ------------- ------------
+
+    OPT   This kernel grid is too small to fill the available resources on this device, resulting in only 0.0 full      
+          waves across all SMs. Look at Launch Statistics for more details.                                             
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 2:1. The kernel achieved 0% of  
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.13
+    Executed Ipc Elapsed  inst/cycle         0.03
+    Issue Slots Busy               %         3.25
+    Issued Ipc Active     inst/cycle         0.13
+    SM Busy                        %         3.25
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 97.63%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second        23.68
+    Mem Busy                     %         1.96
+    Max Bandwidth                %         4.75
+    L1/TEX Hit Rate              %            0
+    L2 Hit Rate                  %        23.88
+    Mem Pipes Busy               %         0.35
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 0.01549%                                                                                        
+          The memory access pattern for global stores in L1TEX might not be optimal. On average, this kernel accesses   
+          4.0 bytes per thread per memory request; but the address pattern, possibly caused by the stride between       
+          threads, results in 4.8 sectors per request, or 4.8*32 = 153.8 bytes of cache data transfers per request.     
+          The optimal thread address pattern for 4.0 byte accesses would result in 4.0*32 = 128.0 bytes of cache data   
+          transfers per request, to maximize L1TEX cache performance. Check the Source Counters section for             
+          uncoalesced global stores.                                                                                    
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 0.1885%                                                                                         
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 2.3 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        13.06
+    Issued Warp Per Scheduler                        0.13
+    No Eligible                            %        86.94
+    Active Warps Per Scheduler          warp         1.00
+    Eligible Warps Per Scheduler        warp         0.13
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 86.94%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 7.7 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 16 warps per scheduler, this kernel allocates an average of   
+          1.00 active warps per scheduler, but only an average of 0.13 warps were eligible per cycle. Eligible warps    
+          are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible   
+          warp results in no instruction being issued and the issue slot remains unused. To increase the number of      
+          eligible warps, reduce the time the active warps are stalled by inspecting the top stall reasons on the Warp  
+          State Statistics and Source Counters sections.                                                                
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle         7.68
+    Warp Cycles Per Executed Instruction           cycle         7.73
+    Avg. Active Threads Per Warp                                13.77
+    Avg. Not Predicated Off Threads Per Warp                    12.88
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 44.32%                                                                                          
+          On average, each warp of this kernel spends 3.4 cycles being stalled waiting on a fixed latency execution     
+          dependency. Typically, this stall reason should be very low and only shows up as a top contributor in         
+          already highly optimized kernels. Try to hide the corresponding instruction latencies by increasing the       
+          number of active warps, restructuring the code or unrolling loops. Furthermore, consider switching to         
+          lower-latency instructions, e.g. by making use of fast math compiler options. This stall type represents      
+          about 44.3% of the total average of 7.7 cycles between issuing two instructions.                              
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 0.4087%                                                                                         
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 13.8 threads being active per cycle. This is further reduced    
+          to 12.9 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst        77.53
+    Executed Instructions                           inst        24810
+    Avg. Issued Instructions Per Scheduler          inst        78.03
+    Issued Instructions                             inst        24971
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                    32
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                     20
+    Registers Per Thread             register/thread             108
+    Shared Memory Configuration Size            byte               0
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread             640
+    Waves Per SM                                                0.02
+    -------------------------------- --------------- ---------------
+
+    OPT   Est. Speedup: 75%                                                                                             
+          The grid for this launch is configured to execute only 20 blocks, which is less than the GPU's 80             
+          multiprocessors. This can underutilize some multiprocessors. If you do not intend to execute this kernel      
+          concurrently with other workloads, consider reducing the block size to have at least one block per            
+          multiprocessor or increase the size of the grid to fully utilize the available hardware resources. See the    
+          Hardware Model (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-hw-model)            
+          description for more details on launch configurations.                                                        
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           32
+    Block Limit Registers                 block           16
+    Block Limit Shared Mem                block           32
+    Block Limit Warps                     block           64
+    Theoretical Active Warps per SM        warp           16
+    Theoretical Occupancy                     %           25
+    Achieved Occupancy                        %         1.56
+    Achieved Active Warps Per SM           warp         1.00
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 86.94%                                                                                          
+          The difference between calculated theoretical (25.0%) and measured achieved occupancy (1.6%) can be the       
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 75%                                                                                             
+          The 4.00 theoretical warps per scheduler this kernel can issue according to its occupancy are below the       
+          hardware maximum of 16. This kernel's theoretical occupancy (25.0%) is limited by the number of required      
+          registers.                                                                                                    
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.08
+    Branch Instructions              inst         2069
+    Branch Efficiency                   %        16.30
+    Avg. Divergent Branches                       1.89
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 9.778%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 4534 excessive sectors (34% of the total  
+          13506 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations.     
+          The CUDA Programming Guide                                                                                    
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void build_hashtable_c<(int)32, (int)32>(int *, int *, int *, int, int *, int) (293, 1, 1)x(32, 1, 1), Context 1, Stream 7, Device 0, CC 7.0
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/usecond       432.18
+    SM Frequency            cycle/usecond       643.14
+    Elapsed Cycles                  cycle        15987
+    Memory Throughput                   %        50.99
+    DRAM Throughput                     %        50.99
+    Duration                      usecond        24.83
+    L1/TEX Cache Throughput             %        31.81
+    L2 Cache Throughput                 %        21.00
+    SM Active Cycles                cycle     13789.73
+    Compute (SM) Throughput             %         7.19
+    ----------------------- ------------- ------------
+
+    OPT   This kernel grid is too small to fill the available resources on this device, resulting in only 0.2 full      
+          waves across all SMs. Look at Launch Statistics for more details.                                             
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 2:1. The kernel achieved 0% of  
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.33
+    Executed Ipc Elapsed  inst/cycle         0.29
+    Issue Slots Busy               %         8.33
+    Issued Ipc Active     inst/cycle         0.33
+    SM Busy                        %         8.33
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 93.8%                                                                                     
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second       225.65
+    Mem Busy                     %        20.83
+    Max Bandwidth                %        50.99
+    L1/TEX Hit Rate              %            0
+    L2 Hit Rate                  %        23.38
+    Mem Pipes Busy               %         3.71
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 0.1734%                                                                                         
+          The memory access pattern for global stores in L1TEX might not be optimal. On average, this kernel accesses   
+          4.0 bytes per thread per memory request; but the address pattern, possibly caused by the stride between       
+          threads, results in 4.8 sectors per request, or 4.8*32 = 155.0 bytes of cache data transfers per request.     
+          The optimal thread address pattern for 4.0 byte accesses would result in 4.0*32 = 128.0 bytes of cache data   
+          transfers per request, to maximize L1TEX cache performance. Check the Source Counters section for             
+          uncoalesced global stores.                                                                                    
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 2.022%                                                                                          
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 2.3 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %         9.39
+    Issued Warp Per Scheduler                        0.09
+    No Eligible                            %        90.61
+    Active Warps Per Scheduler          warp         1.00
+    Eligible Warps Per Scheduler        warp         0.09
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 49.01%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 10.6 cycles. This might leave hardware resources underutilized and may lead to    
+          less optimal performance. Out of the maximum of 16 warps per scheduler, this kernel allocates an average of   
+          1.00 active warps per scheduler, but only an average of 0.09 warps were eligible per cycle. Eligible warps    
+          are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible   
+          warp results in no instruction being issued and the issue slot remains unused. To increase the number of      
+          eligible warps, reduce the time the active warps are stalled by inspecting the top stall reasons on the Warp  
+          State Statistics and Source Counters sections.                                                                
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        10.67
+    Warp Cycles Per Executed Instruction           cycle        10.73
+    Avg. Active Threads Per Warp                                13.47
+    Avg. Not Predicated Off Threads Per Warp                    12.62
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 32.14%                                                                                          
+          On average, each warp of this kernel spends 3.4 cycles being stalled waiting on a fixed latency execution     
+          dependency. Typically, this stall reason should be very low and only shows up as a top contributor in         
+          already highly optimized kernels. Try to hide the corresponding instruction latencies by increasing the       
+          number of active warps, restructuring the code or unrolling loops. Furthermore, consider switching to         
+          lower-latency instructions, e.g. by making use of fast math compiler options. This stall type represents      
+          about 32.1% of the total average of 10.7 cycles between issuing two instructions.                             
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 4.356%                                                                                          
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 13.5 threads being active per cycle. This is further reduced    
+          to 12.6 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst      1141.31
+    Executed Instructions                           inst       365220
+    Avg. Issued Instructions Per Scheduler          inst      1148.64
+    Issued Instructions                             inst       367565
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                    32
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                    293
+    Registers Per Thread             register/thread             108
+    Shared Memory Configuration Size            byte               0
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread            9376
+    Waves Per SM                                                0.23
+    -------------------------------- --------------- ---------------
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           32
+    Block Limit Registers                 block           16
+    Block Limit Shared Mem                block           32
+    Block Limit Warps                     block           64
+    Theoretical Active Warps per SM        warp           16
+    Theoretical Occupancy                     %           25
+    Achieved Occupancy                        %         5.59
+    Achieved Active Warps Per SM           warp         3.58
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 49.01%                                                                                          
+          The difference between calculated theoretical (25.0%) and measured achieved occupancy (5.6%) can be the       
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 49.01%                                                                                          
+          The 4.00 theoretical warps per scheduler this kernel can issue according to its occupancy are below the       
+          hardware maximum of 16. This kernel's theoretical occupancy (25.0%) is limited by the number of required      
+          registers.                                                                                                    
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.08
+    Branch Instructions              inst        30470
+    Branch Efficiency                   %        13.96
+    Avg. Divergent Branches                      28.37
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 27.78%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 68642 excessive sectors (34% of the total 
+          203290 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations.    
+          The CUDA Programming Guide                                                                                    
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void build_hashtable_d<(int)32, (int)32>(int *, int *, int, int *, int, int) (3, 1, 1)x(32, 1, 1), Context 1, Stream 7, Device 0, CC 7.0
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/usecond       395.79
+    SM Frequency            cycle/usecond       593.73
+    Elapsed Cycles                  cycle         9941
+    Memory Throughput                   %         0.61
+    DRAM Throughput                     %         0.61
+    Duration                      usecond        16.74
+    L1/TEX Cache Throughput             %        12.19
+    L2 Cache Throughput                 %         0.49
+    SM Active Cycles                cycle       289.07
+    Compute (SM) Throughput             %         0.10
+    ----------------------- ------------- ------------
+
+    OPT   This kernel grid is too small to fill the available resources on this device, resulting in only 0.0 full      
+          waves across all SMs. Look at Launch Statistics for more details.                                             
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 2:1. The kernel achieved 0% of  
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.14
+    Executed Ipc Elapsed  inst/cycle         0.00
+    Issue Slots Busy               %         3.52
+    Issued Ipc Active     inst/cycle         0.14
+    SM Busy                        %         3.52
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 97.39%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second         2.46
+    Mem Busy                     %         0.32
+    Max Bandwidth                %         0.61
+    L1/TEX Hit Rate              %            0
+    L2 Hit Rate                  %        39.21
+    Mem Pipes Busy               %         0.06
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 0.008278%                                                                                       
+          The memory access pattern for global stores in L1TEX might not be optimal. On average, this kernel accesses   
+          4.0 bytes per thread per memory request; but the address pattern, possibly caused by the stride between       
+          threads, results in 9.0 sectors per request, or 9.0*32 = 289.4 bytes of cache data transfers per request.     
+          The optimal thread address pattern for 4.0 byte accesses would result in 4.0*32 = 128.0 bytes of cache data   
+          transfers per request, to maximize L1TEX cache performance. Check the Source Counters section for             
+          uncoalesced global stores.                                                                                    
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 0.04108%                                                                                        
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 2.4 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        14.72
+    Issued Warp Per Scheduler                        0.15
+    No Eligible                            %        85.28
+    Active Warps Per Scheduler          warp         1.01
+    Eligible Warps Per Scheduler        warp         0.15
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 85.28%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 6.8 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 16 warps per scheduler, this kernel allocates an average of   
+          1.01 active warps per scheduler, but only an average of 0.15 warps were eligible per cycle. Eligible warps    
+          are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible   
+          warp results in no instruction being issued and the issue slot remains unused. To increase the number of      
+          eligible warps, reduce the time the active warps are stalled by inspecting the top stall reasons on the Warp  
+          State Statistics and Source Counters sections.                                                                
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle         6.86
+    Warp Cycles Per Executed Instruction           cycle         6.91
+    Avg. Active Threads Per Warp                                31.85
+    Avg. Not Predicated Off Threads Per Warp                    27.92
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 36.83%                                                                                          
+          On average, each warp of this kernel spends 2.5 cycles being stalled waiting on a fixed latency execution     
+          dependency. Typically, this stall reason should be very low and only shows up as a top contributor in         
+          already highly optimized kernels. Try to hide the corresponding instruction latencies by increasing the       
+          number of active warps, restructuring the code or unrolling loops. Furthermore, consider switching to         
+          lower-latency instructions, e.g. by making use of fast math compiler options. This stall type represents      
+          about 36.8% of the total average of 6.9 cycles between issuing two instructions.                              
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst        10.08
+    Executed Instructions                           inst         3227
+    Avg. Issued Instructions Per Scheduler          inst        10.17
+    Issued Instructions                             inst         3254
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                    32
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                      3
+    Registers Per Thread             register/thread             110
+    Shared Memory Configuration Size            byte               0
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread              96
+    Waves Per SM                                                0.00
+    -------------------------------- --------------- ---------------
+
+    OPT   Est. Speedup: 96.25%                                                                                          
+          The grid for this launch is configured to execute only 3 blocks, which is less than the GPU's 80              
+          multiprocessors. This can underutilize some multiprocessors. If you do not intend to execute this kernel      
+          concurrently with other workloads, consider reducing the block size to have at least one block per            
+          multiprocessor or increase the size of the grid to fully utilize the available hardware resources. See the    
+          Hardware Model (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-hw-model)            
+          description for more details on launch configurations.                                                        
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           32
+    Block Limit Registers                 block           16
+    Block Limit Shared Mem                block           32
+    Block Limit Warps                     block           64
+    Theoretical Active Warps per SM        warp           16
+    Theoretical Occupancy                     %           25
+    Achieved Occupancy                        %         1.56
+    Achieved Active Warps Per SM           warp            1
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 85.28%                                                                                          
+          The difference between calculated theoretical (25.0%) and measured achieved occupancy (1.6%) can be the       
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 75%                                                                                             
+          The 4.00 theoretical warps per scheduler this kernel can issue according to its occupancy are below the       
+          hardware maximum of 16. This kernel's theoretical occupancy (25.0%) is limited by the number of required      
+          registers.                                                                                                    
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.07
+    Branch Instructions              inst          223
+    Branch Efficiency                   %        99.11
+    Avg. Divergent Branches                       0.00
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 2.968%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 700 excessive sectors (37% of the total   
+          1888 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations. The  
+          CUDA Programming Guide                                                                                        
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void probe<(int)32, (int)32>(int *, int *, int *, int *, int, int *, int, int *, int, int *, int, int *) (58581, 1, 1)x(32, 1, 1), Context 1, Stream 7, Device 0, CC 7.0
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/usecond       886.67
+    SM Frequency            cycle/nsecond         1.32
+    Elapsed Cycles                  cycle      5156070
+    Memory Throughput                   %        25.52
+    DRAM Throughput                     %        18.14
+    Duration                      msecond         3.89
+    L1/TEX Cache Throughput             %        35.42
+    L2 Cache Throughput                 %        25.52
+    SM Active Cycles                cycle   5001212.85
+    Compute (SM) Throughput             %        12.50
+    ----------------------- ------------- ------------
+
+    OPT   This kernel exhibits low compute throughput and memory bandwidth utilization relative to the peak performance 
+          of this device. Achieved compute throughput and/or memory bandwidth below 60.0% of peak typically indicate    
+          latency issues. Look at Scheduler Statistics and Warp State Statistics for potential reasons.                 
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 2:1. The kernel achieved 0% of  
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.52
+    Executed Ipc Elapsed  inst/cycle         0.50
+    Issue Slots Busy               %        12.88
+    Issued Ipc Active     inst/cycle         0.52
+    SM Busy                        %        12.88
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 90.53%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second       164.70
+    Mem Busy                     %        25.52
+    Max Bandwidth                %        22.87
+    L1/TEX Hit Rate              %        21.51
+    L2 Hit Rate                  %        76.91
+    Mem Pipes Busy               %         5.15
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 0.5746%                                                                                         
+          The memory access pattern for global loads in L1TEX might not be optimal. On average, this kernel accesses    
+          5.1 bytes per thread per memory request; but the address pattern, possibly caused by the stride between       
+          threads, results in 8.9 sectors per request, or 8.9*32 = 283.6 bytes of cache data transfers per request.     
+          The optimal thread address pattern for 5.1 byte accesses would result in 5.1*32 = 164.4 bytes of cache data   
+          transfers per request, to maximize L1TEX cache performance. Check the Source Counters section for             
+          uncoalesced global loads.                                                                                     
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 15.85%                                                                                          
+          The memory access pattern for loads from L1TEX to L2 is not optimal. The granularity of an L1TEX request to   
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 1.2 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced loads and try to minimize how many cache lines need to be accessed per memory         
+          request.                                                                                                      
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 1.45%                                                                                           
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 1.0 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        13.04
+    Issued Warp Per Scheduler                        0.13
+    No Eligible                            %        86.96
+    Active Warps Per Scheduler          warp         1.95
+    Eligible Warps Per Scheduler        warp         0.14
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 74.48%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 7.7 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 16 warps per scheduler, this kernel allocates an average of   
+          1.95 active warps per scheduler, but only an average of 0.14 warps were eligible per cycle. Eligible warps    
+          are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible   
+          warp results in no instruction being issued and the issue slot remains unused. To increase the number of      
+          eligible warps, avoid possible load imbalances due to highly different execution durations per warp.          
+          Reducing stalls indicated on the Warp State Statistics and Source Counters sections can help, too.            
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        14.98
+    Warp Cycles Per Executed Instruction           cycle        14.98
+    Avg. Active Threads Per Warp                                17.75
+    Avg. Not Predicated Off Threads Per Warp                    16.40
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 46.6%                                                                                           
+          On average, each warp of this kernel spends 7.0 cycles being stalled waiting for a scoreboard dependency on a 
+          L1TEX (local, global, surface, texture) operation. Find the instruction producing the data being waited upon  
+          to identify the culprit. To reduce the number of cycles waiting on L1TEX data accesses verify the memory      
+          access patterns are optimal for the target architecture, attempt to increase cache hit rates by increasing    
+          data locality (coalescing), or by changing the cache configuration. Consider moving frequently used data to   
+          shared memory. This stall type represents about 46.6% of the total average of 15.0 cycles between issuing     
+          two instructions.                                                                                             
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 6.095%                                                                                          
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 17.7 threads being active per cycle. This is further reduced    
+          to 16.4 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst    644298.60
+    Executed Instructions                           inst    206175551
+    Avg. Issued Instructions Per Scheduler          inst    644329.69
+    Issued Instructions                             inst    206185502
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                    32
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                  58581
+    Registers Per Thread             register/thread             186
+    Shared Memory Configuration Size           Kbyte           32.77
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block       Kbyte/block            4.10
+    Threads                                   thread         1874592
+    Waves Per SM                                               91.53
+    -------------------------------- --------------- ---------------
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           32
+    Block Limit Registers                 block            8
+    Block Limit Shared Mem                block            8
+    Block Limit Warps                     block           64
+    Theoretical Active Warps per SM        warp            8
+    Theoretical Occupancy                     %        12.50
+    Achieved Occupancy                        %        12.12
+    Achieved Active Warps Per SM           warp         7.76
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 74.48%                                                                                          
+          The 2.00 theoretical warps per scheduler this kernel can issue according to its occupancy are below the       
+          hardware maximum of 16. This kernel's theoretical occupancy (12.5%) is limited by the number of required      
+          registers. This kernel's theoretical occupancy (12.5%) is limited by the required amount of shared memory.    
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.08
+    Branch Instructions              inst     15667576
+    Branch Efficiency                   %        30.13
+    Avg. Divergent Branches                   12790.60
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 53.7%                                                                                           
+          This kernel has uncoalesced global accesses resulting in a total of 53746246 excessive sectors (56% of the    
+          total 95432386 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source       
+          locations. The CUDA Programming Guide                                                                         
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
diff --git a/results/V100/crystal-fls/crystal_fls_q41_sf10.txt b/results/V100/crystal-fls/crystal_fls_q41_sf10.txt
new file mode 100644
index 0000000..79f379f
--- /dev/null
+++ b/results/V100/crystal-fls/crystal_fls_q41_sf10.txt
@@ -0,0 +1,983 @@
+==PROF== Connected to process 5976 (/home/ubuntu/fff/cmake-build-release-g4dn/gpu/fastlanes/src/fls_q41_bitpacked)
+** LOADED DATA **
+** LOADED DATA TO GPU **
+==PROF== Profiling "build_hashtable_s" - 0: 0%....50%....100% - 73 passes
+==PROF== Profiling "build_hashtable_c" - 1: 0%....50%....100% - 73 passes
+==PROF== Profiling "build_hashtable_p" - 2: 0%....50%....100% - 73 passes
+==PROF== Profiling "build_hashtable_d" - 3: 0%....50%....100% - 73 passes
+==PROF== Profiling "probe" - 4: 0%....50%....100% - 74 passes
+Result:
+1992 1 103225040658
+1993 1 105193302842
+1994 1 103837804124
+1995 1 103659981621
+1996 1 103616722233
+1997 1 103157005314
+1998 1 61159340206
+1992 2 106678259181
+1993 2 105849020253
+1994 2 106216978529
+1995 2 107035371791
+1996 2 105292362331
+1997 2 105381211263
+1998 2 61616122837
+1992 3 106953585129
+1993 3 106242432020
+1994 3 105405953212
+1995 3 106496045663
+1996 3 106452120723
+1997 3 106618275297
+1998 3 61766210322
+1992 17 103623138817
+1993 17 104974876956
+1994 17 103731557899
+1995 17 103730419480
+1996 17 104874194133
+1997 17 102847514868
+1998 17 61002354487
+1992 24 106223564390
+1993 24 105649036141
+1994 24 106076726307
+1995 24 105177111217
+1996 24 103976579696
+1997 24 104638539353
+1998 24 60962148771
+Res Count: 35
+Time Taken Total: 5667.16
+{"query":41,"time_query":5666.95}
+==PROF== Disconnected from process 5976
+[5976] fls_q41_bitpacked@127.0.0.1
+  void build_hashtable_s<(int)32, (int)32>(int *, int *, int, int *, int) (20, 1, 1)x(32, 1, 1), Context 1, Stream 7, Device 0, CC 7.0
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/usecond       516.59
+    SM Frequency            cycle/usecond       769.12
+    Elapsed Cycles                  cycle        11134
+    Memory Throughput                   %         3.11
+    DRAM Throughput                     %         3.11
+    Duration                      usecond        14.46
+    L1/TEX Cache Throughput             %         3.98
+    L2 Cache Throughput                 %         1.17
+    SM Active Cycles                cycle      2255.89
+    Compute (SM) Throughput             %         0.63
+    ----------------------- ------------- ------------
+
+    OPT   This kernel grid is too small to fill the available resources on this device, resulting in only 0.0 full      
+          waves across all SMs. Look at Launch Statistics for more details.                                             
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 2:1. The kernel achieved 0% of  
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.12
+    Executed Ipc Elapsed  inst/cycle         0.03
+    Issue Slots Busy               %         3.13
+    Issued Ipc Active     inst/cycle         0.13
+    SM Busy                        %         3.13
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 97.7%                                                                                     
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second        16.48
+    Mem Busy                     %         1.07
+    Max Bandwidth                %         3.11
+    L1/TEX Hit Rate              %            0
+    L2 Hit Rate                  %         4.11
+    Mem Pipes Busy               %         0.22
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        12.44
+    Issued Warp Per Scheduler                        0.12
+    No Eligible                            %        87.56
+    Active Warps Per Scheduler          warp         0.99
+    Eligible Warps Per Scheduler        warp         0.12
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 87.56%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 8.0 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 16 warps per scheduler, this kernel allocates an average of   
+          0.99 active warps per scheduler, which already limits the scheduler to less than a warp per instruction.      
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle         7.94
+    Warp Cycles Per Executed Instruction           cycle         8.00
+    Avg. Active Threads Per Warp                                13.37
+    Avg. Not Predicated Off Threads Per Warp                    12.38
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 42.79%                                                                                          
+          On average, each warp of this kernel spends 3.4 cycles being stalled waiting on a fixed latency execution     
+          dependency. Typically, this stall reason should be very low and only shows up as a top contributor in         
+          already highly optimized kernels. Try to hide the corresponding instruction latencies by increasing the       
+          number of active warps, restructuring the code or unrolling loops. Furthermore, consider switching to         
+          lower-latency instructions, e.g. by making use of fast math compiler options. This stall type represents      
+          about 42.8% of the total average of 7.9 cycles between issuing two instructions.                              
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 0.3887%                                                                                         
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 13.4 threads being active per cycle. This is further reduced    
+          to 12.4 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst           70
+    Executed Instructions                           inst        22400
+    Avg. Issued Instructions Per Scheduler          inst        70.52
+    Issued Instructions                             inst        22566
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                    32
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                     20
+    Registers Per Thread             register/thread              72
+    Shared Memory Configuration Size            byte               0
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread             640
+    Waves Per SM                                                0.01
+    -------------------------------- --------------- ---------------
+
+    OPT   Est. Speedup: 75%                                                                                             
+          The grid for this launch is configured to execute only 20 blocks, which is less than the GPU's 80             
+          multiprocessors. This can underutilize some multiprocessors. If you do not intend to execute this kernel      
+          concurrently with other workloads, consider reducing the block size to have at least one block per            
+          multiprocessor or increase the size of the grid to fully utilize the available hardware resources. See the    
+          Hardware Model (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-hw-model)            
+          description for more details on launch configurations.                                                        
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           32
+    Block Limit Registers                 block           28
+    Block Limit Shared Mem                block           32
+    Block Limit Warps                     block           64
+    Theoretical Active Warps per SM        warp           28
+    Theoretical Occupancy                     %        43.75
+    Achieved Occupancy                        %         1.56
+    Achieved Active Warps Per SM           warp         1.00
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 87.56%                                                                                          
+          The difference between calculated theoretical (43.8%) and measured achieved occupancy (1.6%) can be the       
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 56.25%                                                                                          
+          The 7.00 theoretical warps per scheduler this kernel can issue according to its occupancy are below the       
+          hardware maximum of 16. This kernel's theoretical occupancy (43.8%) is limited by the number of required      
+          registers.                                                                                                    
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.09
+    Branch Instructions              inst         2027
+    Branch Efficiency                   %        13.94
+    Avg. Divergent Branches                       1.89
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 5.171%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 1441 excessive sectors (20% of the total  
+          7183 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations. The  
+          CUDA Programming Guide                                                                                        
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void build_hashtable_c<(int)32, (int)32>(int *, int *, int *, int, int *, int) (293, 1, 1)x(32, 1, 1), Context 1, Stream 7, Device 0, CC 7.0
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/usecond       448.80
+    SM Frequency            cycle/usecond       664.69
+    Elapsed Cycles                  cycle        16053
+    Memory Throughput                   %        50.54
+    DRAM Throughput                     %        50.54
+    Duration                      usecond        24.06
+    L1/TEX Cache Throughput             %        31.57
+    L2 Cache Throughput                 %        20.91
+    SM Active Cycles                cycle     13620.96
+    Compute (SM) Throughput             %         7.18
+    ----------------------- ------------- ------------
+
+    OPT   This kernel grid is too small to fill the available resources on this device, resulting in only 0.2 full      
+          waves across all SMs. Look at Launch Statistics for more details.                                             
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 2:1. The kernel achieved 0% of  
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.34
+    Executed Ipc Elapsed  inst/cycle         0.29
+    Issue Slots Busy               %         8.43
+    Issued Ipc Active     inst/cycle         0.34
+    SM Busy                        %         8.43
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 93.73%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second       232.27
+    Mem Busy                     %        20.78
+    Max Bandwidth                %        50.54
+    L1/TEX Hit Rate              %            0
+    L2 Hit Rate                  %        23.30
+    Mem Pipes Busy               %         3.71
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 0.1671%                                                                                         
+          The memory access pattern for global stores in L1TEX might not be optimal. On average, this kernel accesses   
+          4.0 bytes per thread per memory request; but the address pattern, possibly caused by the stride between       
+          threads, results in 4.8 sectors per request, or 4.8*32 = 154.0 bytes of cache data transfers per request.     
+          The optimal thread address pattern for 4.0 byte accesses would result in 4.0*32 = 128.0 bytes of cache data   
+          transfers per request, to maximize L1TEX cache performance. Check the Source Counters section for             
+          uncoalesced global stores.                                                                                    
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 2.015%                                                                                          
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 2.3 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %         9.40
+    Issued Warp Per Scheduler                        0.09
+    No Eligible                            %        90.60
+    Active Warps Per Scheduler          warp         1.00
+    Eligible Warps Per Scheduler        warp         0.09
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 49.46%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 10.6 cycles. This might leave hardware resources underutilized and may lead to    
+          less optimal performance. Out of the maximum of 16 warps per scheduler, this kernel allocates an average of   
+          1.00 active warps per scheduler, which already limits the scheduler to less than a warp per instruction.      
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        10.63
+    Warp Cycles Per Executed Instruction           cycle        10.70
+    Avg. Active Threads Per Warp                                13.42
+    Avg. Not Predicated Off Threads Per Warp                    12.58
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 32.26%                                                                                          
+          On average, each warp of this kernel spends 3.4 cycles being stalled waiting on a fixed latency execution     
+          dependency. Typically, this stall reason should be very low and only shows up as a top contributor in         
+          already highly optimized kernels. Try to hide the corresponding instruction latencies by increasing the       
+          number of active warps, restructuring the code or unrolling loops. Furthermore, consider switching to         
+          lower-latency instructions, e.g. by making use of fast math compiler options. This stall type represents      
+          about 32.3% of the total average of 10.6 cycles between issuing two instructions.                             
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 4.357%                                                                                          
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 13.4 threads being active per cycle. This is further reduced    
+          to 12.6 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst      1140.87
+    Executed Instructions                           inst       365077
+    Avg. Issued Instructions Per Scheduler          inst      1148.19
+    Issued Instructions                             inst       367422
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                    32
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                    293
+    Registers Per Thread             register/thread             108
+    Shared Memory Configuration Size            byte               0
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread            9376
+    Waves Per SM                                                0.23
+    -------------------------------- --------------- ---------------
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           32
+    Block Limit Registers                 block           16
+    Block Limit Shared Mem                block           32
+    Block Limit Warps                     block           64
+    Theoretical Active Warps per SM        warp           16
+    Theoretical Occupancy                     %           25
+    Achieved Occupancy                        %         5.58
+    Achieved Active Warps Per SM           warp         3.57
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 49.46%                                                                                          
+          The difference between calculated theoretical (25.0%) and measured achieved occupancy (5.6%) can be the       
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 49.46%                                                                                          
+          The 4.00 theoretical warps per scheduler this kernel can issue according to its occupancy are below the       
+          hardware maximum of 16. This kernel's theoretical occupancy (25.0%) is limited by the number of required      
+          registers.                                                                                                    
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.08
+    Branch Instructions              inst        30465
+    Branch Efficiency                   %        14.02
+    Avg. Divergent Branches                      28.35
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 27.22%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 68196 excessive sectors (34% of the total 
+          202634 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations.    
+          The CUDA Programming Guide                                                                                    
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void build_hashtable_p<(int)32, (int)32>(int *, int *, int, int *, int) (782, 1, 1)x(32, 1, 1), Context 1, Stream 7, Device 0, CC 7.0
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/usecond       552.31
+    SM Frequency            cycle/usecond       815.95
+    Elapsed Cycles                  cycle        21551
+    Memory Throughput                   %        67.41
+    DRAM Throughput                     %        67.41
+    Duration                      usecond        26.30
+    L1/TEX Cache Throughput             %        37.79
+    L2 Cache Throughput                 %        24.94
+    SM Active Cycles                cycle     18706.75
+    Compute (SM) Throughput             %        13.35
+    ----------------------- ------------- ------------
+
+    OPT   Memory is more heavily utilized than Compute: Look at the Memory Workload Analysis section to identify the    
+          DRAM bottleneck. Check memory replay (coalescing) metrics to make sure you're efficiently utilizing the       
+          bytes transferred. Also consider whether it is possible to do more work per memory access (kernel fusion) or  
+          whether there are values you can (re)compute.                                                                 
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 2:1. The kernel achieved 0% of  
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.61
+    Executed Ipc Elapsed  inst/cycle         0.53
+    Issue Slots Busy               %        15.31
+    Issued Ipc Active     inst/cycle         0.61
+    SM Busy                        %        15.31
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 88.78%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second       381.25
+    Mem Busy                     %        23.56
+    Max Bandwidth                %        67.41
+    L1/TEX Hit Rate              %            0
+    L2 Hit Rate                  %         6.32
+    Mem Pipes Busy               %         4.46
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        15.89
+    Issued Warp Per Scheduler                        0.16
+    No Eligible                            %        84.11
+    Active Warps Per Scheduler          warp         2.29
+    Eligible Warps Per Scheduler        warp         0.18
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 32.59%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 6.3 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 16 warps per scheduler, this kernel allocates an average of   
+          2.29 active warps per scheduler, but only an average of 0.18 warps were eligible per cycle. Eligible warps    
+          are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible   
+          warp results in no instruction being issued and the issue slot remains unused. To increase the number of      
+          eligible warps, reduce the time the active warps are stalled by inspecting the top stall reasons on the Warp  
+          State Statistics and Source Counters sections.                                                                
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        14.40
+    Warp Cycles Per Executed Instruction           cycle        14.44
+    Avg. Active Threads Per Warp                                18.36
+    Avg. Not Predicated Off Threads Per Warp                    16.58
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 32.59%                                                                                          
+          On average, each warp of this kernel spends 6.5 cycles being stalled waiting for a scoreboard dependency on a 
+          L1TEX (local, global, surface, texture) operation. Find the instruction producing the data being waited upon  
+          to identify the culprit. To reduce the number of cycles waiting on L1TEX data accesses verify the memory      
+          access patterns are optimal for the target architecture, attempt to increase cache hit rates by increasing    
+          data locality (coalescing), or by changing the cache configuration. Consider moving frequently used data to   
+          shared memory. This stall type represents about 45.1% of the total average of 14.4 cycles between issuing     
+          two instructions.                                                                                             
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 6.432%                                                                                          
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 18.4 threads being active per cycle. This is further reduced    
+          to 16.6 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst      2855.74
+    Executed Instructions                           inst       913837
+    Avg. Issued Instructions Per Scheduler          inst      2864.82
+    Issued Instructions                             inst       916741
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                    32
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                    782
+    Registers Per Thread             register/thread              78
+    Shared Memory Configuration Size            byte               0
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread           25024
+    Waves Per SM                                                0.41
+    -------------------------------- --------------- ---------------
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           32
+    Block Limit Registers                 block           24
+    Block Limit Shared Mem                block           32
+    Block Limit Warps                     block           64
+    Theoretical Active Warps per SM        warp           24
+    Theoretical Occupancy                     %        37.50
+    Achieved Occupancy                        %        13.84
+    Achieved Active Warps Per SM           warp         8.86
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 32.59%                                                                                          
+          The difference between calculated theoretical (37.5%) and measured achieved occupancy (13.8%) can be the      
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 32.59%                                                                                          
+          The 6.00 theoretical warps per scheduler this kernel can issue according to its occupancy are below the       
+          hardware maximum of 16. This kernel's theoretical occupancy (37.5%) is limited by the number of required      
+          registers.                                                                                                    
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.09
+    Branch Instructions              inst        81308
+    Branch Efficiency                   %        13.98
+    Avg. Divergent Branches                      75.68
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 15.84%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 57145 excessive sectors (19% of the total 
+          308036 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations.    
+          The CUDA Programming Guide                                                                                    
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void build_hashtable_d<(int)32, (int)32>(int *, int *, int, int *, int, int) (3, 1, 1)x(32, 1, 1), Context 1, Stream 7, Device 0, CC 7.0
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/usecond       345.89
+    SM Frequency            cycle/usecond       516.79
+    Elapsed Cycles                  cycle         7255
+    Memory Throughput                   %         0.90
+    DRAM Throughput                     %         0.90
+    Duration                      usecond        14.02
+    L1/TEX Cache Throughput             %        21.87
+    L2 Cache Throughput                 %         0.77
+    SM Active Cycles                cycle       184.29
+    Compute (SM) Throughput             %         0.10
+    ----------------------- ------------- ------------
+
+    OPT   This kernel grid is too small to fill the available resources on this device, resulting in only 0.0 full      
+          waves across all SMs. Look at Launch Statistics for more details.                                             
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 2:1. The kernel achieved 0% of  
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.15
+    Executed Ipc Elapsed  inst/cycle         0.00
+    Issue Slots Busy               %         3.80
+    Issued Ipc Active     inst/cycle         0.15
+    SM Busy                        %         3.80
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 97.16%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second         3.17
+    Mem Busy                     %         0.49
+    Max Bandwidth                %         0.90
+    L1/TEX Hit Rate              %            0
+    L2 Hit Rate                  %        40.65
+    Mem Pipes Busy               %         0.09
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 0.01262%                                                                                        
+          The memory access pattern for global stores in L1TEX might not be optimal. On average, this kernel accesses   
+          4.0 bytes per thread per memory request; but the address pattern, possibly caused by the stride between       
+          threads, results in 9.1 sectors per request, or 9.1*32 = 290.4 bytes of cache data transfers per request.     
+          The optimal thread address pattern for 4.0 byte accesses would result in 4.0*32 = 128.0 bytes of cache data   
+          transfers per request, to maximize L1TEX cache performance. Check the Source Counters section for             
+          uncoalesced global stores.                                                                                    
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 0.06497%                                                                                        
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 2.5 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        15.78
+    Issued Warp Per Scheduler                        0.16
+    No Eligible                            %        84.22
+    Active Warps Per Scheduler          warp         1.01
+    Eligible Warps Per Scheduler        warp         0.16
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 84.22%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 6.3 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 16 warps per scheduler, this kernel allocates an average of   
+          1.01 active warps per scheduler, but only an average of 0.16 warps were eligible per cycle. Eligible warps    
+          are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible   
+          warp results in no instruction being issued and the issue slot remains unused. To increase the number of      
+          eligible warps, reduce the time the active warps are stalled by inspecting the top stall reasons on the Warp  
+          State Statistics and Source Counters sections.                                                                
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle         6.43
+    Warp Cycles Per Executed Instruction           cycle         6.51
+    Avg. Active Threads Per Warp                                31.93
+    Avg. Not Predicated Off Threads Per Warp                    27.49
+    ---------------------------------------- ----------- ------------
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst         6.92
+    Executed Instructions                           inst         2215
+    Avg. Issued Instructions Per Scheduler          inst         7.01
+    Issued Instructions                             inst         2243
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                    32
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                      3
+    Registers Per Thread             register/thread              80
+    Shared Memory Configuration Size            byte               0
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread              96
+    Waves Per SM                                                0.00
+    -------------------------------- --------------- ---------------
+
+    OPT   Est. Speedup: 96.25%                                                                                          
+          The grid for this launch is configured to execute only 3 blocks, which is less than the GPU's 80              
+          multiprocessors. This can underutilize some multiprocessors. If you do not intend to execute this kernel      
+          concurrently with other workloads, consider reducing the block size to have at least one block per            
+          multiprocessor or increase the size of the grid to fully utilize the available hardware resources. See the    
+          Hardware Model (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-hw-model)            
+          description for more details on launch configurations.                                                        
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           32
+    Block Limit Registers                 block           24
+    Block Limit Shared Mem                block           32
+    Block Limit Warps                     block           64
+    Theoretical Active Warps per SM        warp           24
+    Theoretical Occupancy                     %        37.50
+    Achieved Occupancy                        %         1.56
+    Achieved Active Warps Per SM           warp         1.00
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 84.22%                                                                                          
+          The difference between calculated theoretical (37.5%) and measured achieved occupancy (1.6%) can be the       
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 62.5%                                                                                           
+          The 6.00 theoretical warps per scheduler this kernel can issue according to its occupancy are below the       
+          hardware maximum of 16. This kernel's theoretical occupancy (37.5%) is limited by the number of required      
+          registers.                                                                                                    
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.04
+    Branch Instructions              inst           83
+    Branch Efficiency                   %        97.62
+    Avg. Divergent Branches                       0.00
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 4.408%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 812 excessive sectors (39% of the total   
+          2092 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations. The  
+          CUDA Programming Guide                                                                                        
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void probe<(int)32, (int)32>(int *, int *, int *, int *, int *, int *, int, int *, int, int *, int, int *, int, int *, int, int *) (58581, 1, 1)x(32, 1, 1), Context 1, Stream 7, Device 0, CC 7.0
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/usecond       866.70
+    SM Frequency            cycle/nsecond         1.29
+    Elapsed Cycles                  cycle      5810347
+    Memory Throughput                   %        28.62
+    DRAM Throughput                     %        28.62
+    Duration                      msecond         4.48
+    L1/TEX Cache Throughput             %        17.08
+    L2 Cache Throughput                 %        16.30
+    SM Active Cycles                cycle   5829008.41
+    Compute (SM) Throughput             %        12.86
+    ----------------------- ------------- ------------
+
+    OPT   This kernel exhibits low compute throughput and memory bandwidth utilization relative to the peak performance 
+          of this device. Achieved compute throughput and/or memory bandwidth below 60.0% of peak typically indicate    
+          latency issues. Look at Scheduler Statistics and Warp State Statistics for potential reasons.                 
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 2:1. The kernel achieved 0% of  
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.51
+    Executed Ipc Elapsed  inst/cycle         0.51
+    Issue Slots Busy               %        12.81
+    Issued Ipc Active     inst/cycle         0.51
+    SM Busy                        %        12.81
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 90.98%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second       254.00
+    Mem Busy                     %        16.30
+    Max Bandwidth                %        28.62
+    L1/TEX Hit Rate              %        43.84
+    L2 Hit Rate                  %        47.03
+    Mem Pipes Busy               %         5.50
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 0.5746%                                                                                         
+          The memory access pattern for global loads in L1TEX might not be optimal. On average, this kernel accesses    
+          4.5 bytes per thread per memory request; but the address pattern, possibly caused by the stride between       
+          threads, results in 8.0 sectors per request, or 8.0*32 = 254.4 bytes of cache data transfers per request.     
+          The optimal thread address pattern for 4.5 byte accesses would result in 4.5*32 = 144.4 bytes of cache data   
+          transfers per request, to maximize L1TEX cache performance. Check the Source Counters section for             
+          uncoalesced global loads.                                                                                     
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 9.531%                                                                                          
+          The memory access pattern for loads from L1TEX to L2 is not optimal. The granularity of an L1TEX request to   
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 1.6 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced loads and try to minimize how many cache lines need to be accessed per memory         
+          request.                                                                                                      
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 0.3489%                                                                                         
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 1.0 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        12.82
+    Issued Warp Per Scheduler                        0.13
+    No Eligible                            %        87.18
+    Active Warps Per Scheduler          warp         1.95
+    Eligible Warps Per Scheduler        warp         0.14
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 71.38%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 7.8 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 16 warps per scheduler, this kernel allocates an average of   
+          1.95 active warps per scheduler, but only an average of 0.14 warps were eligible per cycle. Eligible warps    
+          are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible   
+          warp results in no instruction being issued and the issue slot remains unused. To increase the number of      
+          eligible warps, avoid possible load imbalances due to highly different execution durations per warp.          
+          Reducing stalls indicated on the Warp State Statistics and Source Counters sections can help, too.            
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        15.18
+    Warp Cycles Per Executed Instruction           cycle        15.18
+    Avg. Active Threads Per Warp                                18.35
+    Avg. Not Predicated Off Threads Per Warp                    17.17
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 49.67%                                                                                          
+          On average, each warp of this kernel spends 7.5 cycles being stalled waiting for a scoreboard dependency on a 
+          L1TEX (local, global, surface, texture) operation. Find the instruction producing the data being waited upon  
+          to identify the culprit. To reduce the number of cycles waiting on L1TEX data accesses verify the memory      
+          access patterns are optimal for the target architecture, attempt to increase cache hit rates by increasing    
+          data locality (coalescing), or by changing the cache configuration. Consider moving frequently used data to   
+          shared memory. This stall type represents about 49.7% of the total average of 15.2 cycles between issuing     
+          two instructions.                                                                                             
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 5.959%                                                                                          
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 18.3 threads being active per cycle. This is further reduced    
+          to 17.2 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst    746492.98
+    Executed Instructions                           inst    238877754
+    Avg. Issued Instructions Per Scheduler          inst    746543.47
+    Issued Instructions                             inst    238893909
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                    32
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                  58581
+    Registers Per Thread             register/thread             216
+    Shared Memory Configuration Size           Kbyte           32.77
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block       Kbyte/block            4.10
+    Threads                                   thread         1874592
+    Waves Per SM                                               91.53
+    -------------------------------- --------------- ---------------
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           32
+    Block Limit Registers                 block            8
+    Block Limit Shared Mem                block            8
+    Block Limit Warps                     block           64
+    Theoretical Active Warps per SM        warp            8
+    Theoretical Occupancy                     %        12.50
+    Achieved Occupancy                        %        12.17
+    Achieved Active Warps Per SM           warp         7.79
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 71.38%                                                                                          
+          The 2.00 theoretical warps per scheduler this kernel can issue according to its occupancy are below the       
+          hardware maximum of 16. This kernel's theoretical occupancy (12.5%) is limited by the number of required      
+          registers. This kernel's theoretical occupancy (12.5%) is limited by the required amount of shared memory.    
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.08
+    Branch Instructions              inst     19892267
+    Branch Efficiency                   %        44.30
+    Avg. Divergent Branches                   13766.62
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 58.15%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 58762179 excessive sectors (59% of the    
+          total 99999883 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source       
+          locations. The CUDA Programming Guide                                                                         
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
diff --git a/results/V100/crystal-opt-fls/crystal_opt_fls_q11_sf10.txt b/results/V100/crystal-opt-fls/crystal_opt_fls_q11_sf10.txt
new file mode 100644
index 0000000..412132a
--- /dev/null
+++ b/results/V100/crystal-opt-fls/crystal_opt_fls_q11_sf10.txt
@@ -0,0 +1,178 @@
+==PROF== Connected to process 9356 (/home/ubuntu/fff/cmake-build-release-g4dn/gpu/fastlanes/src/fls_q11_bp_crystal_opt)
+==PROF== Profiling "QueryKernelOpt" - 0: 0%....50%....100% - 74 passes
+[33m-- lo_orderdate_min: 19920101[39m
+[33m-- lo_orderdate_max: 19980802[39m
+[33m-- lo_discount_min: 0[39m
+[33m-- lo_discount_max: 10[39m
+[33m-- lo_quantity_min: 1[39m
+[33m-- lo_quantity_max: 50[39m
+[33m-- lo_extendedprice_min: 90097[39m
+[33m-- lo_extendedprice_max: 10494900[39m
+[33m-- x: 16[39m
+[33m-- LOADED DATA[39m
+[33m-- LOADED DATA TO GPU[39m
+[33m-- total_time_taken: 1466.55[39m
+[33m-- revenue: 4471898856447[39m
+[32m-- SF_10[39m
+[1m[34m-- 1466.47[39m
+==PROF== Disconnected from process 9356
+[9356] fls_q11_bp_crystal_opt@127.0.0.1
+  void QueryKernelOpt<(int)32, (int)32>(const int *, const int *, const int *, const int *, fastlanes::ssb::SSB, unsigned long long *) (58581, 1, 1)x(32, 1, 1), Context 1, Stream 7, Device 0, CC 7.0
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/usecond       852.91
+    SM Frequency            cycle/nsecond         1.27
+    Elapsed Cycles                  cycle       828175
+    Memory Throughput                   %        66.08
+    DRAM Throughput                     %        66.08
+    Duration                      usecond       649.60
+    L1/TEX Cache Throughput             %        19.66
+    L2 Cache Throughput                 %        23.28
+    SM Active Cycles                cycle    822936.74
+    Compute (SM) Throughput             %        42.44
+    ----------------------- ------------- ------------
+
+    OPT   Memory is more heavily utilized than Compute: Look at the Memory Workload Analysis section to identify the    
+          DRAM bottleneck. Check memory replay (coalescing) metrics to make sure you're efficiently utilizing the       
+          bytes transferred. Also consider whether it is possible to do more work per memory access (kernel fusion) or  
+          whether there are values you can (re)compute.                                                                 
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 2:1. The kernel achieved 0% of  
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         1.14
+    Executed Ipc Elapsed  inst/cycle         1.13
+    Issue Slots Busy               %        28.45
+    Issued Ipc Active     inst/cycle         1.14
+    SM Busy                        %        42.71
+    -------------------- ----------- ------------
+
+    INF   ALU is the highest-utilized pipeline (42.7%) based on active cycles, taking into account the rates of its     
+          different instructions. It executes integer and logic operations. It is well-utilized, but should not be a    
+          bottleneck.                                                                                                   
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second       577.15
+    Mem Busy                     %        23.28
+    Max Bandwidth                %        66.08
+    L1/TEX Hit Rate              %            0
+    L2 Hit Rate                  %         0.50
+    Mem Pipes Busy               %        18.30
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        28.44
+    Issued Warp Per Scheduler                        0.28
+    No Eligible                            %        71.56
+    Active Warps Per Scheduler          warp         3.89
+    Eligible Warps Per Scheduler        warp         0.40
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 33.92%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 3.5 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 16 warps per scheduler, this kernel allocates an average of   
+          3.89 active warps per scheduler, but only an average of 0.40 warps were eligible per cycle. Eligible warps    
+          are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible   
+          warp results in no instruction being issued and the issue slot remains unused. To increase the number of      
+          eligible warps, avoid possible load imbalances due to highly different execution durations per warp.          
+          Reducing stalls indicated on the Warp State Statistics and Source Counters sections can help, too.            
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        13.67
+    Warp Cycles Per Executed Instruction           cycle        13.68
+    Avg. Active Threads Per Warp                                31.85
+    Avg. Not Predicated Off Threads Per Warp                    31.70
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 33.92%                                                                                          
+          On average, each warp of this kernel spends 9.1 cycles being stalled waiting for a scoreboard dependency on a 
+          L1TEX (local, global, surface, texture) operation. Find the instruction producing the data being waited upon  
+          to identify the culprit. To reduce the number of cycles waiting on L1TEX data accesses verify the memory      
+          access patterns are optimal for the target architecture, attempt to increase cache hit rates by increasing    
+          data locality (coalescing), or by changing the cache configuration. Consider moving frequently used data to   
+          shared memory. This stall type represents about 66.8% of the total average of 13.7 cycles between issuing     
+          two instructions.                                                                                             
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst    234142.72
+    Executed Instructions                           inst     74925669
+    Avg. Issued Instructions Per Scheduler          inst    234159.96
+    Issued Instructions                             inst     74931186
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                    32
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                  58581
+    Registers Per Thread             register/thread             120
+    Shared Memory Configuration Size           Kbyte           98.30
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block       Kbyte/block            4.35
+    Threads                                   thread         1874592
+    Waves Per SM                                               45.77
+    -------------------------------- --------------- ---------------
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           32
+    Block Limit Registers                 block           16
+    Block Limit Shared Mem                block           22
+    Block Limit Warps                     block           64
+    Theoretical Active Warps per SM        warp           16
+    Theoretical Occupancy                     %           25
+    Achieved Occupancy                        %        24.37
+    Achieved Active Warps Per SM           warp        15.60
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 33.92%                                                                                          
+          The 4.00 theoretical warps per scheduler this kernel can issue according to its occupancy are below the       
+          hardware maximum of 16. This kernel's theoretical occupancy (25.0%) is limited by the number of required      
+          registers.                                                                                                    
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.01
+    Branch Instructions              inst       995884
+    Branch Efficiency                   %          100
+    Avg. Divergent Branches                          0
+    ------------------------- ----------- ------------
+
diff --git a/results/V100/crystal-opt/crystal_opt_q11_sf10.txt b/results/V100/crystal-opt/crystal_opt_q11_sf10.txt
new file mode 100644
index 0000000..ffcc783
--- /dev/null
+++ b/results/V100/crystal-opt/crystal_opt_q11_sf10.txt
@@ -0,0 +1,186 @@
+==PROF== Connected to process 6800 (/home/ubuntu/fff/cmake-build-release-g4dn/gpu/crystal-opt/src/crystal_opt_q11)
+Using device 0: Tesla V100-SXM2-16GB (PTX version 700, SM700, 80 SMs, 15754 free / 16151 total MB physmem, 898.048 GB/s @ 877000 kHz mem clock, ECC on)
+** LOADED DATA **
+LO_LEN 59986214
+** LOADED DATA TO GPU **
+==PROF== Profiling "QueryKernel" - 0: 0%....50%....100% - 74 passes
+Revenue: 4471898856447
+Time Taken Total: 1109.52
+{"query":11,"time_query":1109.46}
+==PROF== Disconnected from process 6800
+[6800] crystal_opt_q11@127.0.0.1
+  void QueryKernel<(int)128, (int)4>(int *, int *, int *, int *, int, unsigned long long *) (117161, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 7.0
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/usecond       872.74
+    SM Frequency            cycle/nsecond         1.30
+    Elapsed Cycles                  cycle       791550
+    Memory Throughput                   %        98.27
+    DRAM Throughput                     %        98.27
+    Duration                      usecond       608.38
+    L1/TEX Cache Throughput             %        26.78
+    L2 Cache Throughput                 %        34.56
+    SM Active Cycles                cycle    787984.97
+    Compute (SM) Throughput             %        44.16
+    ----------------------- ------------- ------------
+
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing DRAM in the Memory Workload Analysis section.                                              
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 2:1. The kernel achieved 0% of  
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         1.64
+    Executed Ipc Elapsed  inst/cycle         1.63
+    Issue Slots Busy               %        40.94
+    Issued Ipc Active     inst/cycle         1.64
+    SM Busy                        %        44.33
+    -------------------- ----------- ------------
+
+    INF   ALU is the highest-utilized pipeline (44.3%) based on active cycles, taking into account the rates of its     
+          different instructions. It executes integer and logic operations. It is well-utilized, but should not be a    
+          bottleneck.                                                                                                   
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second       878.21
+    Mem Busy                     %        27.78
+    Max Bandwidth                %        98.27
+    L1/TEX Hit Rate              %            0
+    L2 Hit Rate                  %         0.87
+    Mem Pipes Busy               %        27.22
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 10.39%                                                                                          
+          The memory access pattern for loads from L1TEX to L2 is not optimal. The granularity of an L1TEX request to   
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 2.5 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced loads and try to minimize how many cache lines need to be accessed per memory         
+          request.                                                                                                      
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 98.27%                                                                                          
+          The memory access pattern for loads from device memory causes 16,696,536 sectors to be read from DRAM, which  
+          is 1.3x of the 13,301,780 sectors which cause a miss in the L2 cache. The DRAM fetch granularity for read     
+          misses in L2 is 64 bytes, i.e. the lower or upper half of an L2 cache line. Try changing your access pattern  
+          to make use of both sectors returned by a DRAM read request for optimal usage of the DRAM throughput. For     
+          strided memory reads, avoid strides of 64 bytes or larger to avoid moving unused sectors from DRAM to L2.     
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        40.94
+    Issued Warp Per Scheduler                        0.41
+    No Eligible                            %        59.06
+    Active Warps Per Scheduler          warp        15.47
+    Eligible Warps Per Scheduler        warp         1.22
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 1.731%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 2.4 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 16 warps per scheduler, this kernel allocates an average of   
+          15.47 active warps per scheduler, but only an average of 1.22 warps were eligible per cycle. Eligible warps   
+          are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible   
+          warp results in no instruction being issued and the issue slot remains unused. To increase the number of      
+          eligible warps, avoid possible load imbalances due to highly different execution durations per warp.          
+          Reducing stalls indicated on the Warp State Statistics and Source Counters sections can help, too.            
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        37.79
+    Warp Cycles Per Executed Instruction           cycle        37.80
+    Avg. Active Threads Per Warp                                31.86
+    Avg. Not Predicated Off Threads Per Warp                    29.51
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 1.731%                                                                                          
+          On average, each warp of this kernel spends 27.7 cycles being stalled waiting for a scoreboard dependency on  
+          a L1TEX (local, global, surface, texture) operation. Find the instruction producing the data being waited     
+          upon to identify the culprit. To reduce the number of cycles waiting on L1TEX data accesses verify the        
+          memory access patterns are optimal for the target architecture, attempt to increase cache hit rates by        
+          increasing data locality (coalescing), or by changing the cache configuration. Consider moving frequently     
+          used data to shared memory. This stall type represents about 73.4% of the total average of 37.8 cycles        
+          between issuing two instructions.                                                                             
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst    322559.64
+    Executed Instructions                           inst    103219085
+    Avg. Issued Instructions Per Scheduler          inst    322593.33
+    Issued Instructions                             inst    103229864
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                 117161
+    Registers Per Thread             register/thread              22
+    Shared Memory Configuration Size           Kbyte            8.19
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block             256
+    Threads                                   thread        14996608
+    Waves Per SM                                               91.53
+    -------------------------------- --------------- ---------------
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           32
+    Block Limit Registers                 block           21
+    Block Limit Shared Mem                block           32
+    Block Limit Warps                     block           16
+    Theoretical Active Warps per SM        warp           64
+    Theoretical Occupancy                     %          100
+    Achieved Occupancy                        %        96.95
+    Achieved Active Warps Per SM           warp        62.05
+    ------------------------------- ----------- ------------
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.10
+    Branch Instructions              inst     10075882
+    Branch Efficiency                   %          100
+    Avg. Divergent Branches                          0
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 14.17%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 1907462 excessive sectors (14% of the     
+          total 13418941 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source       
+          locations. The CUDA Programming Guide                                                                         
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
diff --git a/results/V100/crystal-opt/crystal_opt_q21_sf10.txt b/results/V100/crystal-opt/crystal_opt_q21_sf10.txt
new file mode 100644
index 0000000..7bedb38
--- /dev/null
+++ b/results/V100/crystal-opt/crystal_opt_q21_sf10.txt
@@ -0,0 +1,1045 @@
+==PROF== Connected to process 3738 (/home/ubuntu/fff/cmake-build-release-g4dn/gpu/crystal-opt/src/crystal_opt_q21)
+Using device 0: Tesla V100-SXM2-16GB (PTX version 700, SM700, 80 SMs, 15754 free / 16151 total MB physmem, 898.048 GB/s @ 877000 kHz mem clock, ECC on)
+==PROF== Profiling "build_hashtable_s" - 0: 0%....50%....100% - 73 passes
+==PROF== Profiling "build_hashtable_p" - 1: 0%....50%....100% - 74 passes
+==PROF== Profiling "build_hashtable_d" - 2: 0%....50%....100% - 73 passes
+==PROF== Profiling "probe" - 3: 0%....50%....100% - 74 passes
+1992 40 6574868694
+1993 40 6952043914
+1994 40 6525239576
+1995 40 6764559245
+1996 40 6725548424
+1997 40 6596102991
+1998 40 3988851825
+1992 41 7047701749
+1993 41 6909841940
+1994 41 6978800980
+1995 41 7036474627
+1996 41 7233045193
+1997 41 6938053628
+1998 41 4065391978
+1992 42 6450484539
+1993 42 6886094182
+1994 42 6852294265
+1995 42 6749813918
+1996 42 6568551778
+1997 42 6845017761
+1998 42 3773836113
+1992 43 6918393482
+1993 43 6621428714
+1994 43 7068738463
+1995 43 6820930145
+1996 43 6762634261
+1997 43 6849537060
+1998 43 3882704011
+1992 44 6343659176
+1993 44 6094791212
+1994 44 6661136530
+1995 44 6085276694
+1996 44 6176324016
+1997 44 6315911460
+1998 44 3925731952
+1992 45 6499025385
+1993 45 6779833973
+1994 45 6435942251
+1995 45 6738626764
+1996 45 6763207154
+1997 45 6889101910
+1998 45 3879170338
+1992 46 6833102567
+1993 46 7017493760
+1994 46 7015998639
+1995 46 6897957727
+1996 46 6948998143
+1997 46 6510502742
+1998 46 3911656234
+1992 47 6922095842
+1993 47 7061777324
+1994 47 6877252420
+1995 47 6575484550
+1996 47 6517266740
+1997 47 6651228318
+1998 47 3835254989
+1992 48 6818173454
+1993 48 6961952133
+1994 48 7051587760
+1995 48 7329421356
+1996 48 7164243172
+1997 48 7052687209
+1998 48 4132526586
+1992 49 6907633511
+1993 49 6614194460
+1994 49 6773107666
+1995 49 6954065693
+1996 49 6747336514
+1997 49 6947116463
+1998 49 3906763122
+1992 50 7098282117
+1993 50 7263350231
+1994 50 7199754789
+1995 50 7246399314
+1996 50 6860318803
+1997 50 7184653230
+1998 50 4293359981
+1992 51 7474015795
+1993 51 7031859249
+1994 51 6749353264
+1995 51 7395439319
+1996 51 7118371952
+1997 51 7427932834
+1998 51 4080129102
+1992 52 7001985495
+1993 52 6734276751
+1994 52 6965715192
+1995 52 6934765252
+1996 52 6895454124
+1997 52 6802928999
+1998 52 3916065107
+1992 53 6531087764
+1993 53 6258171804
+1994 53 6197787972
+1995 53 6605279401
+1996 53 6722321819
+1997 53 6879971631
+1998 53 3561102555
+1992 54 7041216650
+1993 54 6601732879
+1994 54 6737632272
+1995 54 6483760392
+1996 54 6778740509
+1997 54 6950964366
+1998 54 3960525994
+1992 55 7034325953
+1993 55 7070112383
+1994 55 6835473512
+1995 55 6681873420
+1996 55 6755919599
+1997 55 6883879790
+1998 55 3842444977
+1992 56 6672842875
+1993 56 6362926487
+1994 56 6787572691
+1995 56 6941448166
+1996 56 6349041382
+1997 56 6831022793
+1998 56 3750580610
+1992 57 6762940511
+1993 57 6200194110
+1994 57 6360354225
+1995 57 6799718937
+1996 57 6500504812
+1997 57 6464594869
+1998 57 3690857660
+1992 58 6367358727
+1993 58 6519991362
+1994 58 6228367674
+1995 58 6522760927
+1996 58 6043428578
+1997 58 6386892483
+1998 58 3888948778
+1992 59 6542091138
+1993 59 6669384898
+1994 59 6566921738
+1995 59 6725584633
+1996 59 6678854924
+1997 59 6518974991
+1998 59 3661443815
+1992 60 7397021390
+1993 60 6985315570
+1994 60 7171226221
+1995 60 7409511342
+1996 60 7217054942
+1997 60 7241219598
+1998 60 4134876965
+1992 61 6439487815
+1993 61 6190501096
+1994 61 6658242784
+1995 61 6300444895
+1996 61 6394989839
+1997 61 6372986872
+1998 61 3692782928
+1992 62 7142709582
+1993 62 6575099186
+1994 62 6577906605
+1995 62 6758016505
+1996 62 6713821475
+1997 62 7061699626
+1998 62 3911733232
+1992 63 6684932832
+1993 63 6784872415
+1994 63 6771692541
+1995 63 6832689629
+1996 63 6769695502
+1997 63 6801959247
+1998 63 3916910435
+1992 64 6403427844
+1993 64 6686657397
+1994 64 6560285004
+1995 64 6654877138
+1996 64 6403809726
+1997 64 6364910756
+1998 64 3757788047
+1992 65 6800534485
+1993 65 6932192888
+1994 65 6599703796
+1995 65 6950320978
+1996 65 6745507185
+1997 65 6965554062
+1998 65 3856421228
+1992 66 6608507118
+1993 66 6720022834
+1994 66 7249477139
+1995 66 6982989122
+1996 66 6895681155
+1997 66 7131587724
+1998 66 4050936159
+1992 67 6789994724
+1993 67 7034832635
+1994 67 6533866956
+1995 67 7089400123
+1996 67 6950690822
+1997 67 6872602250
+1998 67 3798832673
+1992 68 6761138392
+1993 68 7117328614
+1994 68 7003067656
+1995 68 6916376148
+1996 68 6810961498
+1997 68 6421432868
+1998 68 4365901362
+1992 69 6333970291
+1993 69 6591672386
+1994 69 6491372066
+1995 69 6759048824
+1996 69 6636341404
+1997 69 6396375726
+1998 69 3755850783
+1992 70 6863351080
+1993 70 7236349480
+1994 70 7065985619
+1995 70 6799040388
+1996 70 7281402064
+1997 70 6735307561
+1998 70 4062655575
+1992 71 6978088606
+1993 71 6615095404
+1994 71 6642491845
+1995 71 7135465638
+1996 71 6904578270
+1997 71 6886861519
+1998 71 3971062487
+1992 72 6077239048
+1993 72 6379459453
+1994 72 6452415472
+1995 72 6170313509
+1996 72 5916688379
+1997 72 5963369350
+1998 72 3683718797
+1992 73 6671048755
+1993 73 6565112476
+1994 73 6641285247
+1995 73 6887663633
+1996 73 6439642020
+1997 73 6675192946
+1998 73 3814007830
+1992 74 6999195521
+1993 74 7007686388
+1994 74 6670519880
+1995 74 6744064671
+1996 74 6614217057
+1997 74 6523268368
+1998 74 4023666133
+1992 75 6627416528
+1993 75 6758016664
+1994 75 6751975322
+1995 75 7047693486
+1996 75 6567430366
+1997 75 6781762704
+1998 75 4063152322
+1992 76 6785625804
+1993 76 6930340135
+1994 76 6382873777
+1995 76 6206415993
+1996 76 6805542040
+1997 76 6422414358
+1998 76 4087738859
+1992 77 6848387744
+1993 77 6623249454
+1994 77 6588036917
+1995 77 6589295276
+1996 77 6603676047
+1997 77 6383121125
+1998 77 4063691471
+1992 78 6240883199
+1993 78 6551226256
+1994 78 6647824791
+1995 78 6494311762
+1996 78 6358269587
+1997 78 6349078074
+1998 78 3890548095
+1992 79 6948601533
+1993 79 7058895576
+1994 79 7280306702
+1995 79 7174749606
+1996 79 7134521672
+1997 79 7009756092
+1998 79 4233289127
+Res Count: 280
+Time Taken Total: 2577.93
+{"query":21,"time_query":2577.8}
+==PROF== Disconnected from process 3738
+[3738] crystal_opt_q21@127.0.0.1
+  void build_hashtable_s<(int)128, (int)4>(int *, int *, int, int *, int) (40, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 7.0
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/usecond       720.80
+    SM Frequency            cycle/nsecond         1.07
+    Elapsed Cycles                  cycle         4696
+    Memory Throughput                   %         7.37
+    DRAM Throughput                     %         7.37
+    Duration                      usecond         4.38
+    L1/TEX Cache Throughput             %         6.32
+    L2 Cache Throughput                 %         2.78
+    SM Active Cycles                cycle         1421
+    Compute (SM) Throughput             %         1.86
+    ----------------------- ------------- ------------
+
+    OPT   This kernel grid is too small to fill the available resources on this device, resulting in only 0.0 full      
+          waves across all SMs. Look at Launch Statistics for more details.                                             
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 2:1. The kernel achieved 0% of  
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.23
+    Executed Ipc Elapsed  inst/cycle         0.07
+    Issue Slots Busy               %         6.15
+    Issued Ipc Active     inst/cycle         0.25
+    SM Busy                        %         6.15
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 96.08%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second        54.38
+    Mem Busy                     %         2.55
+    Max Bandwidth                %         7.37
+    L1/TEX Hit Rate              %            0
+    L2 Hit Rate                  %         3.99
+    Mem Pipes Busy               %         0.59
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %         6.34
+    Issued Warp Per Scheduler                        0.06
+    No Eligible                            %        93.66
+    Active Warps Per Scheduler          warp         1.00
+    Eligible Warps Per Scheduler        warp         0.06
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 92.63%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 15.8 cycles. This might leave hardware resources underutilized and may lead to    
+          less optimal performance. Out of the maximum of 16 warps per scheduler, this kernel allocates an average of   
+          1.00 active warps per scheduler, which already limits the scheduler to less than a warp per instruction.      
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        15.77
+    Warp Cycles Per Executed Instruction           cycle        16.63
+    Avg. Active Threads Per Warp                                16.20
+    Avg. Not Predicated Off Threads Per Warp                    15.19
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 31.03%                                                                                          
+          On average, each warp of this kernel spends 4.9 cycles being stalled waiting for an immediate constant cache  
+          (IMC) miss. A read from constant memory costs one memory read from device memory only on a cache miss;        
+          otherwise, it just costs one read from the constant cache. Immediate constants are encoded into the SASS      
+          instruction as 'c[bank][offset]'. Accesses to different addresses by threads within a warp are serialized,    
+          thus the cost scales linearly with the number of unique addresses read by all threads within a warp. As       
+          such, the constant cache is best when threads in the same warp access only a few distinct locations. If all   
+          threads of a warp access the same location, then constant memory can be as fast as a register access. This    
+          stall type represents about 31.0% of the total average of 15.8 cycles between issuing two instructions.       
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 0.9793%                                                                                         
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 16.2 threads being active per cycle. This is further reduced    
+          to 15.2 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst        82.96
+    Executed Instructions                           inst        26546
+    Avg. Issued Instructions Per Scheduler          inst        87.46
+    Issued Instructions                             inst        27986
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                     40
+    Registers Per Thread             register/thread              16
+    Shared Memory Configuration Size            byte               0
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread            5120
+    Waves Per SM                                                0.03
+    -------------------------------- --------------- ---------------
+
+    OPT   Est. Speedup: 50%                                                                                             
+          The grid for this launch is configured to execute only 40 blocks, which is less than the GPU's 80             
+          multiprocessors. This can underutilize some multiprocessors. If you do not intend to execute this kernel      
+          concurrently with other workloads, consider reducing the block size to have at least one block per            
+          multiprocessor or increase the size of the grid to fully utilize the available hardware resources. See the    
+          Hardware Model (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-hw-model)            
+          description for more details on launch configurations.                                                        
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           32
+    Block Limit Registers                 block           32
+    Block Limit Shared Mem                block           32
+    Block Limit Warps                     block           16
+    Theoretical Active Warps per SM        warp           64
+    Theoretical Occupancy                     %          100
+    Achieved Occupancy                        %         6.09
+    Achieved Active Warps Per SM           warp         3.90
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 92.63%                                                                                          
+          The difference between calculated theoretical (100.0%) and measured achieved occupancy (6.1%) can be the      
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.11
+    Branch Instructions              inst         2876
+    Branch Efficiency                   %        58.57
+    Avg. Divergent Branches                       1.47
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 7.14%                                                                                           
+          This kernel has uncoalesced global accesses resulting in a total of 1441 excessive sectors (20% of the total  
+          7183 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations. The  
+          CUDA Programming Guide                                                                                        
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void build_hashtable_p<(int)128, (int)4>(int *, int *, int *, int, int *, int) (1563, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 7.0
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/usecond       744.82
+    SM Frequency            cycle/nsecond         1.11
+    Elapsed Cycles                  cycle        22266
+    Memory Throughput                   %        75.85
+    DRAM Throughput                     %        75.85
+    Duration                      usecond        20.06
+    L1/TEX Cache Throughput             %        21.13
+    L2 Cache Throughput                 %        26.67
+    SM Active Cycles                cycle     19529.47
+    Compute (SM) Throughput             %        13.70
+    ----------------------- ------------- ------------
+
+    OPT   Memory is more heavily utilized than Compute: Look at the Memory Workload Analysis section to identify the    
+          DRAM bottleneck. Check memory replay (coalescing) metrics to make sure you're efficiently utilizing the       
+          bytes transferred. Also consider whether it is possible to do more work per memory access (kernel fusion) or  
+          whether there are values you can (re)compute.                                                                 
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 2:1. The kernel achieved 0% of  
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.62
+    Executed Ipc Elapsed  inst/cycle         0.54
+    Issue Slots Busy               %        15.59
+    Issued Ipc Active     inst/cycle         0.62
+    SM Busy                        %        15.59
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 90.81%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second       578.50
+    Mem Busy                     %        26.67
+    Max Bandwidth                %        75.85
+    L1/TEX Hit Rate              %         0.01
+    L2 Hit Rate                  %         8.83
+    Mem Pipes Busy               %         6.97
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 1.607%                                                                                          
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 1.2 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        15.78
+    Issued Warp Per Scheduler                        0.16
+    No Eligible                            %        84.22
+    Active Warps Per Scheduler          warp        12.15
+    Eligible Warps Per Scheduler        warp         0.27
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 24.15%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 6.3 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 16 warps per scheduler, this kernel allocates an average of   
+          12.15 active warps per scheduler, but only an average of 0.27 warps were eligible per cycle. Eligible warps   
+          are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible   
+          warp results in no instruction being issued and the issue slot remains unused. To increase the number of      
+          eligible warps, reduce the time the active warps are stalled by inspecting the top stall reasons on the Warp  
+          State Statistics and Source Counters sections.                                                                
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        76.99
+    Warp Cycles Per Executed Instruction           cycle        77.61
+    Avg. Active Threads Per Warp                                16.02
+    Avg. Not Predicated Off Threads Per Warp                    15.64
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 24.15%                                                                                          
+          On average, each warp of this kernel spends 39.1 cycles being stalled waiting for a scoreboard dependency on  
+          a L1TEX (local, global, surface, texture) operation. Find the instruction producing the data being waited     
+          upon to identify the culprit. To reduce the number of cycles waiting on L1TEX data accesses verify the        
+          memory access patterns are optimal for the target architecture, attempt to increase cache hit rates by        
+          increasing data locality (coalescing), or by changing the cache configuration. Consider moving frequently     
+          used data to shared memory. This stall type represents about 50.7% of the total average of 77.0 cycles        
+          between issuing two instructions.                                                                             
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 7.002%                                                                                          
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 16.0 threads being active per cycle. This is further reduced    
+          to 15.6 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst      3019.96
+    Executed Instructions                           inst       966388
+    Avg. Issued Instructions Per Scheduler          inst      3044.49
+    Issued Instructions                             inst       974237
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                   1563
+    Registers Per Thread             register/thread              18
+    Shared Memory Configuration Size            byte               0
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread          200064
+    Waves Per SM                                                1.22
+    -------------------------------- --------------- ---------------
+
+    OPT   Est. Speedup: 50%                                                                                             
+          A wave of thread blocks is defined as the maximum number of blocks that can be executed in parallel on the    
+          target GPU. The number of blocks in a wave depends on the number of multiprocessors and the theoretical       
+          occupancy of the kernel. This kernel launch results in 1 full waves and a partial wave of 283 thread blocks.  
+          Under the assumption of a uniform execution duration of all thread blocks, the partial wave may account for   
+          up to 50.0% of the total kernel runtime with a lower occupancy of 24.3%. Try launching a grid with no         
+          partial wave. The overall impact of this tail effect also lessens with the number of full waves executed for  
+          a grid. See the Hardware Model                                                                                
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-hw-model) description for more      
+          details on launch configurations.                                                                             
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           32
+    Block Limit Registers                 block           21
+    Block Limit Shared Mem                block           32
+    Block Limit Warps                     block           16
+    Theoretical Active Warps per SM        warp           64
+    Theoretical Occupancy                     %          100
+    Achieved Occupancy                        %        75.69
+    Achieved Active Warps Per SM           warp        48.44
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 24.15%                                                                                          
+          The difference between calculated theoretical (100.0%) and measured achieved occupancy (75.7%) can be the     
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.12
+    Branch Instructions              inst       118258
+    Branch Efficiency                   %        72.76
+    Avg. Divergent Branches                      42.58
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 5.813%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 23924 excessive sectors (7% of the total  
+          360344 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations.    
+          The CUDA Programming Guide                                                                                    
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void build_hashtable_d<(int)128, (int)4>(int *, int *, int, int *, int, int) (5, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 7.0
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/usecond       671.64
+    SM Frequency            cycle/usecond       997.12
+    Elapsed Cycles                  cycle         4279
+    Memory Throughput                   %         1.51
+    DRAM Throughput                     %         1.51
+    Duration                      usecond         4.29
+    L1/TEX Cache Throughput             %        25.60
+    L2 Cache Throughput                 %         1.32
+    SM Active Cycles                cycle       157.47
+    Compute (SM) Throughput             %         0.22
+    ----------------------- ------------- ------------
+
+    OPT   This kernel grid is too small to fill the available resources on this device, resulting in only 0.0 full      
+          waves across all SMs. Look at Launch Statistics for more details.                                             
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 2:1. The kernel achieved 0% of  
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.22
+    Executed Ipc Elapsed  inst/cycle         0.01
+    Issue Slots Busy               %         5.88
+    Issued Ipc Active     inst/cycle         0.24
+    SM Busy                        %         5.88
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 95.86%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second        10.37
+    Mem Busy                     %         0.84
+    Max Bandwidth                %         1.51
+    L1/TEX Hit Rate              %         5.21
+    L2 Hit Rate                  %        41.57
+    Mem Pipes Busy               %         0.16
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 0.02194%                                                                                        
+          The memory access pattern for global stores in L1TEX might not be optimal. On average, this kernel accesses   
+          4.0 bytes per thread per memory request; but the address pattern, possibly caused by the stride between       
+          threads, results in 9.2 sectors per request, or 9.2*32 = 293.6 bytes of cache data transfers per request.     
+          The optimal thread address pattern for 4.0 byte accesses would result in 4.0*32 = 128.0 bytes of cache data   
+          transfers per request, to maximize L1TEX cache performance. Check the Source Counters section for             
+          uncoalesced global stores.                                                                                    
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 0.11%                                                                                           
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 2.5 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %         6.11
+    Issued Warp Per Scheduler                        0.06
+    No Eligible                            %        93.89
+    Active Warps Per Scheduler          warp         0.99
+    Eligible Warps Per Scheduler        warp         0.06
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 93.89%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 16.4 cycles. This might leave hardware resources underutilized and may lead to    
+          less optimal performance. Out of the maximum of 16 warps per scheduler, this kernel allocates an average of   
+          0.99 active warps per scheduler, which already limits the scheduler to less than a warp per instruction.      
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        16.25
+    Warp Cycles Per Executed Instruction           cycle        17.30
+    Avg. Active Threads Per Warp                                31.93
+    Avg. Not Predicated Off Threads Per Warp                    28.35
+    ---------------------------------------- ----------- ------------
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst         8.69
+    Executed Instructions                           inst         2782
+    Avg. Issued Instructions Per Scheduler          inst         9.26
+    Issued Instructions                             inst         2962
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                      5
+    Registers Per Thread             register/thread              30
+    Shared Memory Configuration Size            byte               0
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread             640
+    Waves Per SM                                                0.00
+    -------------------------------- --------------- ---------------
+
+    OPT   Est. Speedup: 93.75%                                                                                          
+          The grid for this launch is configured to execute only 5 blocks, which is less than the GPU's 80              
+          multiprocessors. This can underutilize some multiprocessors. If you do not intend to execute this kernel      
+          concurrently with other workloads, consider reducing the block size to have at least one block per            
+          multiprocessor or increase the size of the grid to fully utilize the available hardware resources. See the    
+          Hardware Model (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-hw-model)            
+          description for more details on launch configurations.                                                        
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           32
+    Block Limit Registers                 block           16
+    Block Limit Shared Mem                block           32
+    Block Limit Warps                     block           16
+    Theoretical Active Warps per SM        warp           64
+    Theoretical Occupancy                     %          100
+    Achieved Occupancy                        %         6.09
+    Achieved Active Warps Per SM           warp         3.90
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 93.89%                                                                                          
+          The difference between calculated theoretical (100.0%) and measured achieved occupancy (6.1%) can be the      
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.06
+    Branch Instructions              inst          166
+    Branch Efficiency                   %        97.73
+    Avg. Divergent Branches                       0.01
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 6.883%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 812 excessive sectors (39% of the total   
+          2092 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations. The  
+          CUDA Programming Guide                                                                                        
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void probe<(int)128, (int)4>(int *, int *, int *, int *, int, int *, int, int *, int, int *, int, int *) (117161, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 7.0
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/usecond       873.75
+    SM Frequency            cycle/nsecond         1.30
+    Elapsed Cycles                  cycle      1531013
+    Memory Throughput                   %        92.86
+    DRAM Throughput                     %        92.86
+    Duration                      msecond         1.17
+    L1/TEX Cache Throughput             %        52.19
+    L2 Cache Throughput                 %        44.12
+    SM Active Cycles                cycle   1528368.90
+    Compute (SM) Throughput             %        34.55
+    ----------------------- ------------- ------------
+
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing DRAM in the Memory Workload Analysis section.                                              
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 2:1. The kernel achieved 0% of  
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         1.38
+    Executed Ipc Elapsed  inst/cycle         1.38
+    Issue Slots Busy               %        34.61
+    Issued Ipc Active     inst/cycle         1.38
+    SM Busy                        %        34.61
+    -------------------- ----------- ------------
+
+    INF   ALU is the highest-utilized pipeline (24.1%) based on active cycles, taking into account the rates of its     
+          different instructions. It executes integer and logic operations. It is well-utilized, but should not be a    
+          bottleneck.                                                                                                   
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second       830.81
+    Mem Busy                     %        44.12
+    Max Bandwidth                %        92.86
+    L1/TEX Hit Rate              %        54.53
+    L2 Hit Rate                  %        47.53
+    Mem Pipes Busy               %        10.29
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 2.223%                                                                                          
+          The memory access pattern for global loads in L1TEX might not be optimal. On average, this kernel accesses    
+          4.9 bytes per thread per memory request; but the address pattern, possibly caused by the stride between       
+          threads, results in 9.8 sectors per request, or 9.8*32 = 313.9 bytes of cache data transfers per request.     
+          The optimal thread address pattern for 4.9 byte accesses would result in 4.9*32 = 158.1 bytes of cache data   
+          transfers per request, to maximize L1TEX cache performance. Check the Source Counters section for             
+          uncoalesced global loads.                                                                                     
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 28.23%                                                                                          
+          The memory access pattern for loads from L1TEX to L2 is not optimal. The granularity of an L1TEX request to   
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 1.3 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced loads and try to minimize how many cache lines need to be accessed per memory         
+          request.                                                                                                      
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 0.7841%                                                                                         
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 1.0 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 50.52%                                                                                          
+          The memory access pattern for loads from device memory causes 30,456,682 sectors to be read from DRAM, which  
+          is 1.4x of the 21,645,747 sectors which cause a miss in the L2 cache. The DRAM fetch granularity for read     
+          misses in L2 is 64 bytes, i.e. the lower or upper half of an L2 cache line. Try changing your access pattern  
+          to make use of both sectors returned by a DRAM read request for optimal usage of the DRAM throughput. For     
+          strided memory reads, avoid strides of 64 bytes or larger to avoid moving unused sectors from DRAM to L2.     
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        34.63
+    Issued Warp Per Scheduler                        0.35
+    No Eligible                            %        65.37
+    Active Warps Per Scheduler          warp        14.34
+    Eligible Warps Per Scheduler        warp         0.63
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 7.143%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 2.9 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 16 warps per scheduler, this kernel allocates an average of   
+          14.34 active warps per scheduler, but only an average of 0.63 warps were eligible per cycle. Eligible warps   
+          are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible   
+          warp results in no instruction being issued and the issue slot remains unused. To increase the number of      
+          eligible warps, avoid possible load imbalances due to highly different execution durations per warp.          
+          Reducing stalls indicated on the Warp State Statistics and Source Counters sections can help, too.            
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        41.42
+    Warp Cycles Per Executed Instruction           cycle        41.42
+    Avg. Active Threads Per Warp                                14.38
+    Avg. Not Predicated Off Threads Per Warp                    12.49
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 7.143%                                                                                          
+          On average, each warp of this kernel spends 34.0 cycles being stalled waiting for a scoreboard dependency on  
+          a L1TEX (local, global, surface, texture) operation. Find the instruction producing the data being waited     
+          upon to identify the culprit. To reduce the number of cycles waiting on L1TEX data accesses verify the        
+          memory access patterns are optimal for the target architecture, attempt to increase cache hit rates by        
+          increasing data locality (coalescing), or by changing the cache configuration. Consider moving frequently     
+          used data to shared memory. This stall type represents about 82.1% of the total average of 41.4 cycles        
+          between issuing two instructions.                                                                             
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 21.07%                                                                                          
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 14.4 threads being active per cycle. This is further reduced    
+          to 12.5 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst    528865.90
+    Executed Instructions                           inst    169237087
+    Avg. Issued Instructions Per Scheduler          inst    528913.22
+    Issued Instructions                             inst    169252229
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                 117161
+    Registers Per Thread             register/thread              29
+    Shared Memory Configuration Size            byte               0
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread        14996608
+    Waves Per SM                                               91.53
+    -------------------------------- --------------- ---------------
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           32
+    Block Limit Registers                 block           16
+    Block Limit Shared Mem                block           32
+    Block Limit Warps                     block           16
+    Theoretical Active Warps per SM        warp           64
+    Theoretical Occupancy                     %          100
+    Achieved Occupancy                        %        89.67
+    Achieved Active Warps Per SM           warp        57.39
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 7.143%                                                                                          
+          The difference between calculated theoretical (100.0%) and measured achieved occupancy (89.7%) can be the     
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.10
+    Branch Instructions              inst     16750526
+    Branch Efficiency                   %        66.51
+    Avg. Divergent Branches                    7113.95
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 73.2%                                                                                           
+          This kernel has uncoalesced global accesses resulting in a total of 65007931 excessive sectors (73% of the    
+          total 88620164 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source       
+          locations. The CUDA Programming Guide                                                                         
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
diff --git a/results/V100/crystal-opt/crystal_opt_q31_sf10.txt b/results/V100/crystal-opt/crystal_opt_q31_sf10.txt
new file mode 100644
index 0000000..af5dcb8
--- /dev/null
+++ b/results/V100/crystal-opt/crystal_opt_q31_sf10.txt
@@ -0,0 +1,926 @@
+==PROF== Connected to process 5983 (/home/ubuntu/fff/cmake-build-release-g4dn/gpu/crystal-opt/src/crystal_opt_q31)
+Using device 0: Tesla V100-SXM2-16GB (PTX version 700, SM700, 80 SMs, 15754 free / 16151 total MB physmem, 898.048 GB/s @ 877000 kHz mem clock, ECC on)
+** LOADED DATA **
+** LOADED DATA TO GPU **
+==PROF== Profiling "build_hashtable_s" - 0: 0%....50%....100% - 73 passes
+==PROF== Profiling "build_hashtable_c" - 1: 0%....50%....100% - 73 passes
+==PROF== Profiling "build_hashtable_d" - 2: 0%....50%....100% - 73 passes
+==PROF== Profiling "probe" - 3: 0%....50%....100% - 74 passes
+Result:
+1992 8 8 53664098547
+1993 8 8 53051563726
+1994 8 8 53551966681
+1995 8 8 53338395993
+1996 8 8 53781237952
+1997 8 8 53558132271
+1992 9 8 55867859815
+1993 9 8 55345162638
+1994 9 8 55589883121
+1995 9 8 54871630692
+1996 9 8 55620205618
+1997 9 8 54852742519
+1992 12 8 52867359425
+1993 12 8 53435367523
+1994 12 8 52283824959
+1995 12 8 52956472988
+1996 12 8 52948768521
+1997 12 8 52962165616
+1992 18 8 53592522758
+1993 18 8 52996000810
+1994 18 8 52962120320
+1995 18 8 53924104344
+1996 18 8 53634737856
+1997 18 8 54307983851
+1992 21 8 53816502394
+1993 21 8 54349264842
+1994 21 8 54119359035
+1995 21 8 53961984627
+1996 21 8 54294333705
+1997 21 8 53703384515
+1992 8 9 55444214883
+1993 8 9 55740793389
+1994 8 9 55137400588
+1995 8 9 55784172640
+1996 8 9 56378453713
+1997 8 9 55399009353
+1992 9 9 57271740148
+1993 9 9 58216495642
+1994 9 9 57507217082
+1995 9 9 57860170696
+1996 9 9 58662284841
+1997 9 9 56940173344
+1992 12 9 55432874858
+1993 12 9 55398755151
+1994 12 9 55206960389
+1995 12 9 55581754250
+1996 12 9 55487324569
+1997 12 9 53582297974
+1992 18 9 56370007920
+1993 18 9 56166403334
+1994 18 9 55432079732
+1995 18 9 55973419507
+1996 18 9 56254723722
+1997 18 9 55830709236
+1992 21 9 56359335068
+1993 21 9 56885558074
+1994 21 9 56507097670
+1995 21 9 57465742525
+1996 21 9 56177166557
+1997 21 9 56333135444
+1992 8 12 51295873912
+1993 8 12 52384079867
+1994 8 12 52254716872
+1995 8 12 51669051730
+1996 8 12 52670597733
+1997 8 12 53782563068
+1992 9 12 54255769995
+1993 9 12 53477912258
+1994 9 12 53868848846
+1995 9 12 54310027205
+1996 9 12 55409865859
+1997 9 12 54099065304
+1992 12 12 52584065821
+1993 12 12 52637339531
+1994 12 12 50154194273
+1995 12 12 51904425056
+1996 12 12 52493537142
+1997 12 12 50634790895
+1992 18 12 52896145835
+1993 18 12 53112435531
+1994 18 12 52021625515
+1995 18 12 52031180987
+1996 18 12 53022298730
+1997 18 12 53294469049
+1992 21 12 53284643553
+1993 21 12 53900783410
+1994 21 12 53648011682
+1995 21 12 53376554374
+1996 21 12 52174060166
+1997 21 12 52785883863
+1992 8 18 51873441494
+1993 8 18 51961213538
+1994 8 18 52868608376
+1995 8 18 52738284867
+1996 8 18 51678789303
+1997 8 18 51787339279
+1992 9 18 53893325353
+1993 9 18 54178339670
+1994 9 18 54059232642
+1995 9 18 53920766480
+1996 9 18 54128092218
+1997 9 18 54349079982
+1992 12 18 51449505308
+1993 12 18 51384752707
+1994 12 18 52195482938
+1995 12 18 51205040497
+1996 12 18 51165908280
+1997 12 18 52167794260
+1992 18 18 53246367726
+1993 18 18 52211194809
+1994 18 18 52388807873
+1995 18 18 52459889035
+1996 18 18 53737304610
+1997 18 18 52772297391
+1992 21 18 53752784633
+1993 21 18 53723459056
+1994 21 18 52734575706
+1995 21 18 52810670641
+1996 21 18 53606892262
+1997 21 18 52841307001
+1992 8 21 49589186930
+1993 8 21 50874540178
+1994 8 21 50484052905
+1995 8 21 50476123376
+1996 8 21 51102099810
+1997 8 21 51376581082
+1992 9 21 51183086614
+1993 9 21 51849557513
+1994 9 21 51912335762
+1995 9 21 51737313715
+1996 9 21 52987320706
+1997 9 21 51870436294
+1992 12 21 49502367103
+1993 12 21 49962826767
+1994 12 21 50112754286
+1995 12 21 48732674673
+1996 12 21 50123146827
+1997 12 21 49094088315
+1992 18 21 50957655153
+1993 18 21 50627753769
+1994 18 21 50537890156
+1995 18 21 50265160335
+1996 18 21 50774431442
+1997 18 21 51103107061
+1992 21 21 49934446612
+1993 21 21 51562382531
+1994 21 21 50180119681
+1995 21 21 51221558310
+1996 21 21 50423672514
+1997 21 21 50461561884
+Res Count: 150
+Time Taken Total: 2719.94
+{"query":31,"time_query":2719.79}
+==PROF== Disconnected from process 5983
+[5983] crystal_opt_q31@127.0.0.1
+  void build_hashtable_s<(int)128, (int)4>(int *, int *, int *, int, int *, int) (40, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 7.0
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/usecond       693.33
+    SM Frequency            cycle/nsecond         1.03
+    Elapsed Cycles                  cycle         4970
+    Memory Throughput                   %        10.94
+    DRAM Throughput                     %        10.94
+    Duration                      usecond         4.80
+    L1/TEX Cache Throughput             %        10.85
+    L2 Cache Throughput                 %         4.73
+    SM Active Cycles                cycle      1559.39
+    Compute (SM) Throughput             %         1.93
+    ----------------------- ------------- ------------
+
+    OPT   This kernel grid is too small to fill the available resources on this device, resulting in only 0.0 full      
+          waves across all SMs. Look at Launch Statistics for more details.                                             
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 2:1. The kernel achieved 0% of  
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.23
+    Executed Ipc Elapsed  inst/cycle         0.07
+    Issue Slots Busy               %         6.16
+    Issued Ipc Active     inst/cycle         0.25
+    SM Busy                        %         6.16
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 96.17%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second        77.67
+    Mem Busy                     %         4.49
+    Max Bandwidth                %        10.94
+    L1/TEX Hit Rate              %         0.36
+    L2 Hit Rate                  %        23.82
+    Mem Pipes Busy               %         0.88
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 0.03344%                                                                                        
+          The memory access pattern for global stores in L1TEX might not be optimal. On average, this kernel accesses   
+          4.0 bytes per thread per memory request; but the address pattern, possibly caused by the stride between       
+          threads, results in 4.7 sectors per request, or 4.7*32 = 152.0 bytes of cache data transfers per request.     
+          The optimal thread address pattern for 4.0 byte accesses would result in 4.0*32 = 128.0 bytes of cache data   
+          transfers per request, to maximize L1TEX cache performance. Check the Source Counters section for             
+          uncoalesced global stores.                                                                                    
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 0.433%                                                                                          
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 2.3 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %         6.47
+    Issued Warp Per Scheduler                        0.06
+    No Eligible                            %        93.53
+    Active Warps Per Scheduler          warp         0.99
+    Eligible Warps Per Scheduler        warp         0.06
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 89.06%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 15.4 cycles. This might leave hardware resources underutilized and may lead to    
+          less optimal performance. Out of the maximum of 16 warps per scheduler, this kernel allocates an average of   
+          0.99 active warps per scheduler, which already limits the scheduler to less than a warp per instruction.      
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        15.34
+    Warp Cycles Per Executed Instruction           cycle        16.10
+    Avg. Active Threads Per Warp                                16.61
+    Avg. Not Predicated Off Threads Per Warp                    15.69
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 33.24%                                                                                          
+          On average, each warp of this kernel spends 5.1 cycles being stalled waiting for a scoreboard dependency on a 
+          L1TEX (local, global, surface, texture) operation. Find the instruction producing the data being waited upon  
+          to identify the culprit. To reduce the number of cycles waiting on L1TEX data accesses verify the memory      
+          access patterns are optimal for the target architecture, attempt to increase cache hit rates by increasing    
+          data locality (coalescing), or by changing the cache configuration. Consider moving frequently used data to   
+          shared memory. This stall type represents about 33.2% of the total average of 15.3 cycles between issuing     
+          two instructions.                                                                                             
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 0.9857%                                                                                         
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 16.6 threads being active per cycle. This is further reduced    
+          to 15.7 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst        91.55
+    Executed Instructions                           inst        29295
+    Avg. Issued Instructions Per Scheduler          inst        96.05
+    Issued Instructions                             inst        30735
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                     40
+    Registers Per Thread             register/thread              18
+    Shared Memory Configuration Size            byte               0
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread            5120
+    Waves Per SM                                                0.03
+    -------------------------------- --------------- ---------------
+
+    OPT   Est. Speedup: 50%                                                                                             
+          The grid for this launch is configured to execute only 40 blocks, which is less than the GPU's 80             
+          multiprocessors. This can underutilize some multiprocessors. If you do not intend to execute this kernel      
+          concurrently with other workloads, consider reducing the block size to have at least one block per            
+          multiprocessor or increase the size of the grid to fully utilize the available hardware resources. See the    
+          Hardware Model (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-hw-model)            
+          description for more details on launch configurations.                                                        
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           32
+    Block Limit Registers                 block           21
+    Block Limit Shared Mem                block           32
+    Block Limit Warps                     block           16
+    Theoretical Active Warps per SM        warp           64
+    Theoretical Occupancy                     %          100
+    Achieved Occupancy                        %         6.12
+    Achieved Active Warps Per SM           warp         3.91
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 89.06%                                                                                          
+          The difference between calculated theoretical (100.0%) and measured achieved occupancy (6.1%) can be the      
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.11
+    Branch Instructions              inst         3201
+    Branch Efficiency                   %        63.81
+    Avg. Divergent Branches                       1.47
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 12.25%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 4534 excessive sectors (34% of the total  
+          13506 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations.     
+          The CUDA Programming Guide                                                                                    
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void build_hashtable_c<(int)128, (int)4>(int *, int *, int *, int, int *, int) (586, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 7.0
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/usecond       765.16
+    SM Frequency            cycle/nsecond         1.13
+    Elapsed Cycles                  cycle        13498
+    Memory Throughput                   %        60.24
+    DRAM Throughput                     %        60.24
+    Duration                      usecond        11.87
+    L1/TEX Cache Throughput             %        37.76
+    L2 Cache Throughput                 %        25.04
+    SM Active Cycles                cycle     10896.27
+    Compute (SM) Throughput             %        10.28
+    ----------------------- ------------- ------------
+
+    OPT   Memory is more heavily utilized than Compute: Look at the Memory Workload Analysis section to identify the    
+          DRAM bottleneck. Check memory replay (coalescing) metrics to make sure you're efficiently utilizing the       
+          bytes transferred. Also consider whether it is possible to do more work per memory access (kernel fusion) or  
+          whether there are values you can (re)compute.                                                                 
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 2:1. The kernel achieved 0% of  
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.50
+    Executed Ipc Elapsed  inst/cycle         0.40
+    Issue Slots Busy               %        12.70
+    Issued Ipc Active     inst/cycle         0.51
+    SM Busy                        %        12.70
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 91.82%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second       471.98
+    Mem Busy                     %        24.80
+    Max Bandwidth                %        60.24
+    L1/TEX Hit Rate              %         0.20
+    L2 Hit Rate                  %        23.28
+    Mem Pipes Busy               %         4.79
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 0.2042%                                                                                         
+          The memory access pattern for global stores in L1TEX might not be optimal. On average, this kernel accesses   
+          4.0 bytes per thread per memory request; but the address pattern, possibly caused by the stride between       
+          threads, results in 4.8 sectors per request, or 4.8*32 = 154.8 bytes of cache data transfers per request.     
+          The optimal thread address pattern for 4.0 byte accesses would result in 4.0*32 = 128.0 bytes of cache data   
+          transfers per request, to maximize L1TEX cache performance. Check the Source Counters section for             
+          uncoalesced global stores.                                                                                    
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 2.407%                                                                                          
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 2.3 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        12.89
+    Issued Warp Per Scheduler                        0.13
+    No Eligible                            %        87.11
+    Active Warps Per Scheduler          warp         5.91
+    Eligible Warps Per Scheduler        warp         0.17
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 39.76%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 7.8 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 16 warps per scheduler, this kernel allocates an average of   
+          5.91 active warps per scheduler, but only an average of 0.17 warps were eligible per cycle. Eligible warps    
+          are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible   
+          warp results in no instruction being issued and the issue slot remains unused. To increase the number of      
+          eligible warps, reduce the time the active warps are stalled by inspecting the top stall reasons on the Warp  
+          State Statistics and Source Counters sections.                                                                
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        45.88
+    Warp Cycles Per Executed Instruction           cycle        46.84
+    Avg. Active Threads Per Warp                                16.44
+    Avg. Not Predicated Off Threads Per Warp                    15.57
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 39.76%                                                                                          
+          On average, each warp of this kernel spends 23.9 cycles being stalled waiting for a scoreboard dependency on  
+          a L1TEX (local, global, surface, texture) operation. Find the instruction producing the data being waited     
+          upon to identify the culprit. To reduce the number of cycles waiting on L1TEX data accesses verify the        
+          memory access patterns are optimal for the target architecture, attempt to increase cache hit rates by        
+          increasing data locality (coalescing), or by changing the cache configuration. Consider moving frequently     
+          used data to shared memory. This stall type represents about 52.2% of the total average of 45.9 cycles        
+          between issuing two instructions.                                                                             
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 5.28%                                                                                           
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 16.4 threads being active per cycle. This is further reduced    
+          to 15.6 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst      1355.03
+    Executed Instructions                           inst       433611
+    Avg. Issued Instructions Per Scheduler          inst      1383.28
+    Issued Instructions                             inst       442651
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                    586
+    Registers Per Thread             register/thread              18
+    Shared Memory Configuration Size            byte               0
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread           75008
+    Waves Per SM                                                0.46
+    -------------------------------- --------------- ---------------
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           32
+    Block Limit Registers                 block           21
+    Block Limit Shared Mem                block           32
+    Block Limit Warps                     block           16
+    Theoretical Active Warps per SM        warp           64
+    Theoretical Occupancy                     %          100
+    Achieved Occupancy                        %        36.53
+    Achieved Active Warps Per SM           warp        23.38
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 39.76%                                                                                          
+          The difference between calculated theoretical (100.0%) and measured achieved occupancy (36.5%) can be the     
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.11
+    Branch Instructions              inst        46902
+    Branch Efficiency                   %        62.57
+    Avg. Divergent Branches                      21.97
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 26.3%                                                                                           
+          This kernel has uncoalesced global accesses resulting in a total of 68642 excessive sectors (34% of the total 
+          203290 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations.    
+          The CUDA Programming Guide                                                                                    
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void build_hashtable_d<(int)128, (int)4>(int *, int *, int, int *, int, int) (5, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 7.0
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/usecond       633.06
+    SM Frequency            cycle/usecond       946.20
+    Elapsed Cycles                  cycle         4695
+    Memory Throughput                   %         1.28
+    DRAM Throughput                     %         1.28
+    Duration                      usecond         4.96
+    L1/TEX Cache Throughput             %        19.70
+    L2 Cache Throughput                 %         1.06
+    SM Active Cycles                cycle       178.79
+    Compute (SM) Throughput             %         0.26
+    ----------------------- ------------- ------------
+
+    OPT   This kernel grid is too small to fill the available resources on this device, resulting in only 0.0 full      
+          waves across all SMs. Look at Launch Statistics for more details.                                             
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 2:1. The kernel achieved 0% of  
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.26
+    Executed Ipc Elapsed  inst/cycle         0.01
+    Issue Slots Busy               %         6.77
+    Issued Ipc Active     inst/cycle         0.27
+    SM Busy                        %         6.77
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 95.47%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second         8.28
+    Mem Busy                     %         0.69
+    Max Bandwidth                %         1.28
+    L1/TEX Hit Rate              %            0
+    L2 Hit Rate                  %        40.14
+    Mem Pipes Busy               %         0.12
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 0.01775%                                                                                        
+          The memory access pattern for global stores in L1TEX might not be optimal. On average, this kernel accesses   
+          4.0 bytes per thread per memory request; but the address pattern, possibly caused by the stride between       
+          threads, results in 9.2 sectors per request, or 9.2*32 = 294.0 bytes of cache data transfers per request.     
+          The optimal thread address pattern for 4.0 byte accesses would result in 4.0*32 = 128.0 bytes of cache data   
+          transfers per request, to maximize L1TEX cache performance. Check the Source Counters section for             
+          uncoalesced global stores.                                                                                    
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 0.08701%                                                                                        
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 2.4 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %         6.62
+    Issued Warp Per Scheduler                        0.07
+    No Eligible                            %        93.38
+    Active Warps Per Scheduler          warp         0.95
+    Eligible Warps Per Scheduler        warp         0.07
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 93.38%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 15.1 cycles. This might leave hardware resources underutilized and may lead to    
+          less optimal performance. Out of the maximum of 16 warps per scheduler, this kernel allocates an average of   
+          0.95 active warps per scheduler, which already limits the scheduler to less than a warp per instruction.      
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        14.40
+    Warp Cycles Per Executed Instruction           cycle        15.11
+    Avg. Active Threads Per Warp                                31.82
+    Avg. Not Predicated Off Threads Per Warp                    28.27
+    ---------------------------------------- ----------- ------------
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst        11.54
+    Executed Instructions                           inst         3693
+    Avg. Issued Instructions Per Scheduler          inst        12.10
+    Issued Instructions                             inst         3873
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                      5
+    Registers Per Thread             register/thread              21
+    Shared Memory Configuration Size            byte               0
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread             640
+    Waves Per SM                                                0.00
+    -------------------------------- --------------- ---------------
+
+    OPT   Est. Speedup: 93.75%                                                                                          
+          The grid for this launch is configured to execute only 5 blocks, which is less than the GPU's 80              
+          multiprocessors. This can underutilize some multiprocessors. If you do not intend to execute this kernel      
+          concurrently with other workloads, consider reducing the block size to have at least one block per            
+          multiprocessor or increase the size of the grid to fully utilize the available hardware resources. See the    
+          Hardware Model (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-hw-model)            
+          description for more details on launch configurations.                                                        
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           32
+    Block Limit Registers                 block           21
+    Block Limit Shared Mem                block           32
+    Block Limit Warps                     block           16
+    Theoretical Active Warps per SM        warp           64
+    Theoretical Occupancy                     %          100
+    Achieved Occupancy                        %         6.16
+    Achieved Active Warps Per SM           warp         3.94
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 93.38%                                                                                          
+          The difference between calculated theoretical (100.0%) and measured achieved occupancy (6.2%) can be the      
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.10
+    Branch Instructions              inst          373
+    Branch Efficiency                   %        97.40
+    Avg. Divergent Branches                       0.02
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 5.364%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 700 excessive sectors (37% of the total   
+          1888 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations. The  
+          CUDA Programming Guide                                                                                        
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void probe<(int)128, (int)4>(int *, int *, int *, int *, int, int *, int, int *, int, int *, int, int *) (117161, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 7.0
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/usecond       871.41
+    SM Frequency            cycle/nsecond         1.30
+    Elapsed Cycles                  cycle      3621310
+    Memory Throughput                   %        33.95
+    DRAM Throughput                     %        30.39
+    Duration                      msecond         2.78
+    L1/TEX Cache Throughput             %        48.24
+    L2 Cache Throughput                 %        33.95
+    SM Active Cycles                cycle   3632551.35
+    Compute (SM) Throughput             %        19.38
+    ----------------------- ------------- ------------
+
+    OPT   This kernel exhibits low compute throughput and memory bandwidth utilization relative to the peak performance 
+          of this device. Achieved compute throughput and/or memory bandwidth below 60.0% of peak typically indicate    
+          latency issues. Look at Scheduler Statistics and Warp State Statistics for potential reasons.                 
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 2:1. The kernel achieved 0% of  
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.77
+    Executed Ipc Elapsed  inst/cycle         0.78
+    Issue Slots Busy               %        19.31
+    Issued Ipc Active     inst/cycle         0.77
+    SM Busy                        %        19.31
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 86.41%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second       271.15
+    Mem Busy                     %        33.95
+    Max Bandwidth                %        30.60
+    L1/TEX Hit Rate              %        27.06
+    L2 Hit Rate                  %        75.23
+    Mem Pipes Busy               %         6.08
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 0.5342%                                                                                         
+          The memory access pattern for global loads in L1TEX might not be optimal. On average, this kernel accesses    
+          5.4 bytes per thread per memory request; but the address pattern, possibly caused by the stride between       
+          threads, results in 7.5 sectors per request, or 7.5*32 = 240.0 bytes of cache data transfers per request.     
+          The optimal thread address pattern for 5.4 byte accesses would result in 5.4*32 = 174.0 bytes of cache data   
+          transfers per request, to maximize L1TEX cache performance. Check the Source Counters section for             
+          uncoalesced global loads.                                                                                     
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 21.11%                                                                                          
+          The memory access pattern for loads from L1TEX to L2 is not optimal. The granularity of an L1TEX request to   
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 1.2 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced loads and try to minimize how many cache lines need to be accessed per memory         
+          request.                                                                                                      
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 2.064%                                                                                          
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 1.0 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        19.31
+    Issued Warp Per Scheduler                        0.19
+    No Eligible                            %        80.69
+    Active Warps Per Scheduler          warp        14.54
+    Eligible Warps Per Scheduler        warp         0.38
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 66.05%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 5.2 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 16 warps per scheduler, this kernel allocates an average of   
+          14.54 active warps per scheduler, but only an average of 0.38 warps were eligible per cycle. Eligible warps   
+          are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible   
+          warp results in no instruction being issued and the issue slot remains unused. To increase the number of      
+          eligible warps, avoid possible load imbalances due to highly different execution durations per warp.          
+          Reducing stalls indicated on the Warp State Statistics and Source Counters sections can help, too.            
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        75.31
+    Warp Cycles Per Executed Instruction           cycle        75.32
+    Avg. Active Threads Per Warp                                12.81
+    Avg. Not Predicated Off Threads Per Warp                    11.20
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 57.93%                                                                                          
+          On average, each warp of this kernel spends 43.6 cycles being stalled waiting for a scoreboard dependency on  
+          a L1TEX (local, global, surface, texture) operation. Find the instruction producing the data being waited     
+          upon to identify the culprit. To reduce the number of cycles waiting on L1TEX data accesses verify the        
+          memory access patterns are optimal for the target architecture, attempt to increase cache hit rates by        
+          increasing data locality (coalescing), or by changing the cache configuration. Consider moving frequently     
+          used data to shared memory. This stall type represents about 57.9% of the total average of 75.3 cycles        
+          between issuing two instructions.                                                                             
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 31.66%                                                                                          
+          On average, each warp of this kernel spends 23.8 cycles being stalled waiting for the L1 instruction queue    
+          for local and global (LG) memory operations to be not full. Typically, this stall occurs only when executing  
+          local or global memory instructions extremely frequently. Avoid redundant global memory accesses. Try to      
+          avoid using thread-local memory by checking if dynamically indexed arrays are declared in local scope, of if  
+          the kernel has excessive register pressure causing by spills. If applicable, consider combining multiple      
+          lower-width memory operations into fewer wider memory operations and try interleaving memory operations and   
+          math instructions. This stall type represents about 31.7% of the total average of 75.3 cycles between         
+          issuing two instructions.                                                                                     
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 12.6%                                                                                           
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 12.8 threads being active per cycle. This is further reduced    
+          to 11.2 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst    701430.46
+    Executed Instructions                           inst    224457748
+    Avg. Issued Instructions Per Scheduler          inst    701490.44
+    Issued Instructions                             inst    224476942
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                 117161
+    Registers Per Thread             register/thread              32
+    Shared Memory Configuration Size            byte               0
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread        14996608
+    Waves Per SM                                               91.53
+    -------------------------------- --------------- ---------------
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           32
+    Block Limit Registers                 block           16
+    Block Limit Shared Mem                block           32
+    Block Limit Warps                     block           16
+    Theoretical Active Warps per SM        warp           64
+    Theoretical Occupancy                     %          100
+    Achieved Occupancy                        %        91.04
+    Achieved Active Warps Per SM           warp        58.26
+    ------------------------------- ----------- ------------
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.10
+    Branch Instructions              inst     21548967
+    Branch Efficiency                   %        55.37
+    Avg. Divergent Branches                   11603.16
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 62.08%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 58809051 excessive sectors (62% of the    
+          total 95109570 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source       
+          locations. The CUDA Programming Guide                                                                         
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
diff --git a/results/V100/crystal-opt/crystal_opt_q41_sf10_v100.txt b/results/V100/crystal-opt/crystal_opt_q41_sf10_v100.txt
new file mode 100644
index 0000000..e778d6c
--- /dev/null
+++ b/results/V100/crystal-opt/crystal_opt_q41_sf10_v100.txt
@@ -0,0 +1,987 @@
+==PROF== Connected to process 2898 (/home/ubuntu/fff/cmake-build-release-g4dn/gpu/crystal-opt/src/crystal_opt_q41)
+Using device 0: Tesla V100-SXM2-16GB (PTX version 700, SM700, 80 SMs, 15754 free / 16151 total MB physmem, 898.048 GB/s @ 877000 kHz mem clock, ECC on)
+** LOADED DATA **
+** LOADED DATA TO GPU **
+==PROF== Profiling "build_hashtable_s" - 0: 0%....50%....100% - 73 passes
+==PROF== Profiling "build_hashtable_c" - 1: 0%....50%....100% - 73 passes
+==PROF== Profiling "build_hashtable_p" - 2: 0%....50%....100% - 73 passes
+==PROF== Profiling "build_hashtable_d" - 3: 0%....50%....100% - 73 passes
+==PROF== Profiling "probe" - 4: 0%....50%....100% - 74 passes
+Result:
+1992 1 103225040658
+1993 1 105193302842
+1994 1 103837804124
+1995 1 103659981621
+1996 1 103616722233
+1997 1 103157005314
+1998 1 61159340206
+1992 2 106678259181
+1993 2 105849020253
+1994 2 106216978529
+1995 2 107035371791
+1996 2 105292362331
+1997 2 105381211263
+1998 2 61616122837
+1992 3 106953585129
+1993 3 106242432020
+1994 3 105405953212
+1995 3 106496045663
+1996 3 106452120723
+1997 3 106618275297
+1998 3 61766210322
+1992 17 103623138817
+1993 17 104974876956
+1994 17 103731557899
+1995 17 103730419480
+1996 17 104874194133
+1997 17 102847514868
+1998 17 61002354487
+1992 24 106223564390
+1993 24 105649036141
+1994 24 106076726307
+1995 24 105177111217
+1996 24 103976579696
+1997 24 104638539353
+1998 24 60962148771
+Res Count: 35
+Time Taken Total: 3350.47
+{"query":41,"time_query":3350.34}
+==PROF== Disconnected from process 2898
+[2898] crystal_opt_q41@127.0.0.1
+  void build_hashtable_s<(int)128, (int)4>(int *, int *, int, int *, int) (40, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 7.0
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/usecond       699.28
+    SM Frequency            cycle/nsecond         1.05
+    Elapsed Cycles                  cycle         4629
+    Memory Throughput                   %         7.54
+    DRAM Throughput                     %         7.54
+    Duration                      usecond         4.42
+    L1/TEX Cache Throughput             %         6.20
+    L2 Cache Throughput                 %         2.82
+    SM Active Cycles                cycle      1447.67
+    Compute (SM) Throughput             %         1.89
+    ----------------------- ------------- ------------
+
+    OPT   This kernel grid is too small to fill the available resources on this device, resulting in only 0.0 full      
+          waves across all SMs. Look at Launch Statistics for more details.                                             
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 2:1. The kernel achieved 0% of  
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.23
+    Executed Ipc Elapsed  inst/cycle         0.07
+    Issue Slots Busy               %         6.04
+    Issued Ipc Active     inst/cycle         0.24
+    SM Busy                        %         6.04
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 96.16%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second        53.97
+    Mem Busy                     %         2.59
+    Max Bandwidth                %         7.54
+    L1/TEX Hit Rate              %            0
+    L2 Hit Rate                  %         3.95
+    Mem Pipes Busy               %         0.60
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %         6.17
+    Issued Warp Per Scheduler                        0.06
+    No Eligible                            %        93.83
+    Active Warps Per Scheduler          warp         0.99
+    Eligible Warps Per Scheduler        warp         0.06
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 92.46%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 16.2 cycles. This might leave hardware resources underutilized and may lead to    
+          less optimal performance. Out of the maximum of 16 warps per scheduler, this kernel allocates an average of   
+          0.99 active warps per scheduler, which already limits the scheduler to less than a warp per instruction.      
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        16.04
+    Warp Cycles Per Executed Instruction           cycle        16.91
+    Avg. Active Threads Per Warp                                16.20
+    Avg. Not Predicated Off Threads Per Warp                    15.19
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 30.34%                                                                                          
+          On average, each warp of this kernel spends 4.9 cycles being stalled waiting for a scoreboard dependency on a 
+          L1TEX (local, global, surface, texture) operation. Find the instruction producing the data being waited upon  
+          to identify the culprit. To reduce the number of cycles waiting on L1TEX data accesses verify the memory      
+          access patterns are optimal for the target architecture, attempt to increase cache hit rates by increasing    
+          data locality (coalescing), or by changing the cache configuration. Consider moving frequently used data to   
+          shared memory. This stall type represents about 30.3% of the total average of 16.0 cycles between issuing     
+          two instructions.                                                                                             
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 0.9942%                                                                                         
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 16.2 threads being active per cycle. This is further reduced    
+          to 15.2 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst        82.96
+    Executed Instructions                           inst        26546
+    Avg. Issued Instructions Per Scheduler          inst        87.46
+    Issued Instructions                             inst        27986
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                     40
+    Registers Per Thread             register/thread              16
+    Shared Memory Configuration Size            byte               0
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread            5120
+    Waves Per SM                                                0.03
+    -------------------------------- --------------- ---------------
+
+    OPT   Est. Speedup: 50%                                                                                             
+          The grid for this launch is configured to execute only 40 blocks, which is less than the GPU's 80             
+          multiprocessors. This can underutilize some multiprocessors. If you do not intend to execute this kernel      
+          concurrently with other workloads, consider reducing the block size to have at least one block per            
+          multiprocessor or increase the size of the grid to fully utilize the available hardware resources. See the    
+          Hardware Model (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-hw-model)            
+          description for more details on launch configurations.                                                        
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           32
+    Block Limit Registers                 block           32
+    Block Limit Shared Mem                block           32
+    Block Limit Warps                     block           16
+    Theoretical Active Warps per SM        warp           64
+    Theoretical Occupancy                     %          100
+    Achieved Occupancy                        %         6.11
+    Achieved Active Warps Per SM           warp         3.91
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 92.46%                                                                                          
+          The difference between calculated theoretical (100.0%) and measured achieved occupancy (6.1%) can be the      
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.11
+    Branch Instructions              inst         2876
+    Branch Efficiency                   %        58.57
+    Avg. Divergent Branches                       1.47
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 7.17%                                                                                           
+          This kernel has uncoalesced global accesses resulting in a total of 1441 excessive sectors (20% of the total  
+          7183 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations. The  
+          CUDA Programming Guide                                                                                        
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void build_hashtable_c<(int)128, (int)4>(int *, int *, int *, int, int *, int) (586, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 7.0
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/usecond       766.13
+    SM Frequency            cycle/nsecond         1.13
+    Elapsed Cycles                  cycle        13549
+    Memory Throughput                   %        59.85
+    DRAM Throughput                     %        59.85
+    Duration                      usecond        11.90
+    L1/TEX Cache Throughput             %        37.41
+    L2 Cache Throughput                 %        24.91
+    SM Active Cycles                cycle     10786.06
+    Compute (SM) Throughput             %        10.22
+    ----------------------- ------------- ------------
+
+    OPT   This kernel grid is too small to fill the available resources on this device, resulting in only 0.5 full      
+          waves across all SMs. Look at Launch Statistics for more details.                                             
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 2:1. The kernel achieved 0% of  
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.50
+    Executed Ipc Elapsed  inst/cycle         0.40
+    Issue Slots Busy               %        12.79
+    Issued Ipc Active     inst/cycle         0.51
+    SM Busy                        %        12.79
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 91.74%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second       469.52
+    Mem Busy                     %        24.72
+    Max Bandwidth                %        59.85
+    L1/TEX Hit Rate              %         0.25
+    L2 Hit Rate                  %        23.18
+    Mem Pipes Busy               %         4.77
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 0.2005%                                                                                         
+          The memory access pattern for global stores in L1TEX might not be optimal. On average, this kernel accesses   
+          4.0 bytes per thread per memory request; but the address pattern, possibly caused by the stride between       
+          threads, results in 4.8 sectors per request, or 4.8*32 = 154.4 bytes of cache data transfers per request.     
+          The optimal thread address pattern for 4.0 byte accesses would result in 4.0*32 = 128.0 bytes of cache data   
+          transfers per request, to maximize L1TEX cache performance. Check the Source Counters section for             
+          uncoalesced global stores.                                                                                    
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 2.397%                                                                                          
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 2.3 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        13.01
+    Issued Warp Per Scheduler                        0.13
+    No Eligible                            %        86.99
+    Active Warps Per Scheduler          warp         6.07
+    Eligible Warps Per Scheduler        warp         0.17
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 40.15%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 7.7 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 16 warps per scheduler, this kernel allocates an average of   
+          6.07 active warps per scheduler, but only an average of 0.17 warps were eligible per cycle. Eligible warps    
+          are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible   
+          warp results in no instruction being issued and the issue slot remains unused. To increase the number of      
+          eligible warps, reduce the time the active warps are stalled by inspecting the top stall reasons on the Warp  
+          State Statistics and Source Counters sections.                                                                
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        46.66
+    Warp Cycles Per Executed Instruction           cycle        47.51
+    Avg. Active Threads Per Warp                                16.40
+    Avg. Not Predicated Off Threads Per Warp                    15.53
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 40.15%                                                                                          
+          On average, each warp of this kernel spends 22.9 cycles being stalled waiting for a scoreboard dependency on  
+          a L1TEX (local, global, surface, texture) operation. Find the instruction producing the data being waited     
+          upon to identify the culprit. To reduce the number of cycles waiting on L1TEX data accesses verify the        
+          memory access patterns are optimal for the target architecture, attempt to increase cache hit rates by        
+          increasing data locality (coalescing), or by changing the cache configuration. Consider moving frequently     
+          used data to shared memory. This stall type represents about 49.0% of the total average of 46.7 cycles        
+          between issuing two instructions.                                                                             
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 5.257%                                                                                          
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 16.4 threads being active per cycle. This is further reduced    
+          to 15.5 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst      1354.60
+    Executed Instructions                           inst       433471
+    Avg. Issued Instructions Per Scheduler          inst      1379.16
+    Issued Instructions                             inst       441330
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                    586
+    Registers Per Thread             register/thread              18
+    Shared Memory Configuration Size            byte               0
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread           75008
+    Waves Per SM                                                0.46
+    -------------------------------- --------------- ---------------
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           32
+    Block Limit Registers                 block           21
+    Block Limit Shared Mem                block           32
+    Block Limit Warps                     block           16
+    Theoretical Active Warps per SM        warp           64
+    Theoretical Occupancy                     %          100
+    Achieved Occupancy                        %        36.73
+    Achieved Active Warps Per SM           warp        23.51
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 40.15%                                                                                          
+          The difference between calculated theoretical (100.0%) and measured achieved occupancy (36.7%) can be the     
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.11
+    Branch Instructions              inst        46897
+    Branch Efficiency                   %        62.60
+    Avg. Divergent Branches                      21.95
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 26.8%                                                                                           
+          This kernel has uncoalesced global accesses resulting in a total of 68196 excessive sectors (34% of the total 
+          202634 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations.    
+          The CUDA Programming Guide                                                                                    
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void build_hashtable_p<(int)128, (int)4>(int *, int *, int, int *, int) (1563, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 7.0
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/usecond       691.37
+    SM Frequency            cycle/nsecond         1.02
+    Elapsed Cycles                  cycle        20082
+    Memory Throughput                   %        72.12
+    DRAM Throughput                     %        72.12
+    Duration                      usecond        19.65
+    L1/TEX Cache Throughput             %        40.55
+    L2 Cache Throughput                 %        26.87
+    SM Active Cycles                cycle     17693.59
+    Compute (SM) Throughput             %        18.09
+    ----------------------- ------------- ------------
+
+    OPT   Memory is more heavily utilized than Compute: Look at the Memory Workload Analysis section to identify the    
+          DRAM bottleneck. Check memory replay (coalescing) metrics to make sure you're efficiently utilizing the       
+          bytes transferred. Also consider whether it is possible to do more work per memory access (kernel fusion) or  
+          whether there are values you can (re)compute.                                                                 
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 2:1. The kernel achieved 0% of  
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.81
+    Executed Ipc Elapsed  inst/cycle         0.72
+    Issue Slots Busy               %        20.45
+    Issued Ipc Active     inst/cycle         0.82
+    SM Busy                        %        20.45
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 86.31%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second       510.60
+    Mem Busy                     %        25.37
+    Max Bandwidth                %        72.12
+    L1/TEX Hit Rate              %            0
+    L2 Hit Rate                  %         5.07
+    Mem Pipes Busy               %         5.47
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        20.63
+    Issued Warp Per Scheduler                        0.21
+    No Eligible                            %        79.37
+    Active Warps Per Scheduler          warp        11.24
+    Eligible Warps Per Scheduler        warp         0.31
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 27.88%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 4.8 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 16 warps per scheduler, this kernel allocates an average of   
+          11.24 active warps per scheduler, but only an average of 0.31 warps were eligible per cycle. Eligible warps   
+          are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible   
+          warp results in no instruction being issued and the issue slot remains unused. To increase the number of      
+          eligible warps, reduce the time the active warps are stalled by inspecting the top stall reasons on the Warp  
+          State Statistics and Source Counters sections.                                                                
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        54.48
+    Warp Cycles Per Executed Instruction           cycle        54.85
+    Avg. Active Threads Per Warp                                20.83
+    Avg. Not Predicated Off Threads Per Warp                    19.27
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 27.88%                                                                                          
+          On average, each warp of this kernel spends 28.9 cycles being stalled waiting for a scoreboard dependency on  
+          an L1TEX (local, global, surface, texture) operation. Find the instruction producing the data being waited    
+          upon to identify the culprit. To reduce the number of cycles waiting on L1TEX data accesses verify the        
+          memory access patterns are optimal for the target architecture, attempt to increase cache hit rates by        
+          increasing data locality (coalescing), or by changing the cache configuration. Consider moving frequently     
+          used data to shared memory. This stall type represents about 53.1% of the total average of 54.5 cycles        
+          between issuing two instructions.                                                                             
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 7.2%                                                                                            
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 20.8 threads being active per cycle. This is further reduced    
+          to 19.3 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst      3594.60
+    Executed Instructions                           inst      1150272
+    Avg. Issued Instructions Per Scheduler          inst      3618.90
+    Issued Instructions                             inst      1158048
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                   1563
+    Registers Per Thread             register/thread              19
+    Shared Memory Configuration Size            byte               0
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread          200064
+    Waves Per SM                                                1.22
+    -------------------------------- --------------- ---------------
+
+    OPT   Est. Speedup: 50%                                                                                             
+          A wave of thread blocks is defined as the maximum number of blocks that can be executed in parallel on the    
+          target GPU. The number of blocks in a wave depends on the number of multiprocessors and the theoretical       
+          occupancy of the kernel. This kernel launch results in 1 full wave and a partial wave of 283 thread blocks.   
+          Under the assumption of a uniform execution duration of all thread blocks, the partial wave may account for   
+          up to 50.0% of the total kernel runtime with a lower occupancy of 29.8%. Try launching a grid with no         
+          partial wave. The overall impact of this tail effect also lessens with the number of full waves executed for  
+          a grid. See the Hardware Model                                                                                
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-hw-model) description for more      
+          details on launch configurations.                                                                             
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           32
+    Block Limit Registers                 block           21
+    Block Limit Shared Mem                block           32
+    Block Limit Warps                     block           16
+    Theoretical Active Warps per SM        warp           64
+    Theoretical Occupancy                     %          100
+    Achieved Occupancy                        %        70.24
+    Achieved Active Warps Per SM           warp        44.96
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 27.88%                                                                                          
+          The difference between calculated theoretical (100.0%) and measured achieved occupancy (70.2%) can be the     
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.11
+    Branch Instructions              inst       125048
+    Branch Efficiency                   %        62.52
+    Avg. Divergent Branches                      58.60
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 15.71%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 57145 excessive sectors (19% of the total 
+          308036 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations.    
+          The CUDA Programming Guide                                                                                    
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void build_hashtable_d<(int)128, (int)4>(int *, int *, int, int *, int, int) (5, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 7.0
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/usecond       636.03
+    SM Frequency            cycle/usecond       959.56
+    Elapsed Cycles                  cycle         4183
+    Memory Throughput                   %         1.57
+    DRAM Throughput                     %         1.57
+    Duration                      usecond         4.35
+    L1/TEX Cache Throughput             %        25.99
+    L2 Cache Throughput                 %         1.35
+    SM Active Cycles                cycle       155.09
+    Compute (SM) Throughput             %         0.22
+    ----------------------- ------------- ------------
+
+    OPT   This kernel grid is too small to fill the available resources on this device, resulting in only 0.0 full      
+          waves across all SMs. Look at Launch Statistics for more details.                                             
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 2:1. The kernel achieved 0% of  
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.22
+    Executed Ipc Elapsed  inst/cycle         0.01
+    Issue Slots Busy               %         5.97
+    Issued Ipc Active     inst/cycle         0.24
+    SM Busy                        %         5.97
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 95.79%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second        10.22
+    Mem Busy                     %         0.86
+    Max Bandwidth                %         1.57
+    L1/TEX Hit Rate              %         3.49
+    L2 Hit Rate                  %        41.53
+    Mem Pipes Busy               %         0.16
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 0.022%                                                                                          
+          The memory access pattern for global stores in L1TEX might not be optimal. On average, this kernel accesses   
+          4.0 bytes per thread per memory request; but the address pattern, possibly caused by the stride between       
+          threads, results in 9.2 sectors per request, or 9.2*32 = 292.8 bytes of cache data transfers per request.     
+          The optimal thread address pattern for 4.0 byte accesses would result in 4.0*32 = 128.0 bytes of cache data   
+          transfers per request, to maximize L1TEX cache performance. Check the Source Counters section for             
+          uncoalesced global stores.                                                                                    
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 0.1128%                                                                                         
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 2.5 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %         6.11
+    Issued Warp Per Scheduler                        0.06
+    No Eligible                            %        93.89
+    Active Warps Per Scheduler          warp         1.00
+    Eligible Warps Per Scheduler        warp         0.06
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 93.89%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 16.4 cycles. This might leave hardware resources underutilized and may lead to    
+          less optimal performance. Out of the maximum of 16 warps per scheduler, this kernel allocates an average of   
+          1.00 active warps per scheduler, which already limits the scheduler to less than a warp per instruction.      
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        16.33
+    Warp Cycles Per Executed Instruction           cycle        17.38
+    Avg. Active Threads Per Warp                                31.93
+    Avg. Not Predicated Off Threads Per Warp                    28.35
+    ---------------------------------------- ----------- ------------
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst         8.69
+    Executed Instructions                           inst         2782
+    Avg. Issued Instructions Per Scheduler          inst         9.26
+    Issued Instructions                             inst         2962
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                      5
+    Registers Per Thread             register/thread              30
+    Shared Memory Configuration Size            byte               0
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread             640
+    Waves Per SM                                                0.00
+    -------------------------------- --------------- ---------------
+
+    OPT   Est. Speedup: 93.75%                                                                                          
+          The grid for this launch is configured to execute only 5 blocks, which is less than the GPU's 80              
+          multiprocessors. This can underutilize some multiprocessors. If you do not intend to execute this kernel      
+          concurrently with other workloads, consider reducing the block size to have at least one block per            
+          multiprocessor or increase the size of the grid to fully utilize the available hardware resources. See the    
+          Hardware Model (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-hw-model)            
+          description for more details on launch configurations.                                                        
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           32
+    Block Limit Registers                 block           16
+    Block Limit Shared Mem                block           32
+    Block Limit Warps                     block           16
+    Theoretical Active Warps per SM        warp           64
+    Theoretical Occupancy                     %          100
+    Achieved Occupancy                        %         6.06
+    Achieved Active Warps Per SM           warp         3.88
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 93.89%                                                                                          
+          The difference between calculated theoretical (100.0%) and measured achieved occupancy (6.1%) can be the      
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.06
+    Branch Instructions              inst          166
+    Branch Efficiency                   %        97.73
+    Avg. Divergent Branches                       0.01
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 6.672%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 812 excessive sectors (39% of the total   
+          2092 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations. The  
+          CUDA Programming Guide                                                                                        
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void probe<(int)128, (int)4>(int *, int *, int *, int *, int *, int *, int, int *, int, int *, int, int *, int, int *, int, int *) (117161, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 7.0
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/usecond       875.55
+    SM Frequency            cycle/nsecond         1.31
+    Elapsed Cycles                  cycle      1996250
+    Memory Throughput                   %        74.64
+    DRAM Throughput                     %        74.64
+    Duration                      msecond         1.53
+    L1/TEX Cache Throughput             %        44.40
+    L2 Cache Throughput                 %        36.90
+    SM Active Cycles                cycle   1988025.25
+    Compute (SM) Throughput             %        38.71
+    ----------------------- ------------- ------------
+
+    OPT   Memory is more heavily utilized than Compute: Look at the Memory Workload Analysis section to identify the    
+          DRAM bottleneck. Check memory replay (coalescing) metrics to make sure you're efficiently utilizing the       
+          bytes transferred. Also consider whether it is possible to do more work per memory access (kernel fusion) or  
+          whether there are values you can (re)compute.                                                                 
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 2:1. The kernel achieved 0% of  
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         1.55
+    Executed Ipc Elapsed  inst/cycle         1.55
+    Issue Slots Busy               %        38.84
+    Issued Ipc Active     inst/cycle         1.55
+    SM Busy                        %        38.84
+    -------------------- ----------- ------------
+
+    INF   FMA is the highest-utilized pipeline (25.9%) based on active cycles, taking into account the rates of its     
+          different instructions. It executes 32-bit floating point (FADD, FMUL, FMAD, ...) and integer (IMUL, IMAD)    
+          operations. It is well-utilized, but should not be a bottleneck.                                              
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second       669.18
+    Mem Busy                     %        36.90
+    Max Bandwidth                %        74.64
+    L1/TEX Hit Rate              %        52.71
+    L2 Hit Rate                  %        49.33
+    Mem Pipes Busy               %        11.88
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 1.172%                                                                                          
+          The memory access pattern for global loads in L1TEX might not be optimal. On average, this kernel accesses    
+          4.7 bytes per thread per memory request; but the address pattern, possibly caused by the stride between       
+          threads, results in 7.0 sectors per request, or 7.0*32 = 225.0 bytes of cache data transfers per request.     
+          The optimal thread address pattern for 4.7 byte accesses would result in 4.7*32 = 151.2 bytes of cache data   
+          transfers per request, to maximize L1TEX cache performance. Check the Source Counters section for             
+          uncoalesced global loads.                                                                                     
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 22.93%                                                                                          
+          The memory access pattern for loads from L1TEX to L2 is not optimal. The granularity of an L1TEX request to   
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 1.4 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced loads and try to minimize how many cache lines need to be accessed per memory         
+          request.                                                                                                      
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 1.016%                                                                                          
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 1.0 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 40.17%                                                                                          
+          The memory access pattern for loads from device memory causes 31,893,130 sectors to be read from DRAM, which  
+          is 1.4x of the 22,778,318 sectors which cause a miss in the L2 cache. The DRAM fetch granularity for read     
+          misses in L2 is 64 bytes, i.e. the lower or upper half of an L2 cache line. Try changing your access pattern  
+          to make use of both sectors returned by a DRAM read request for optimal usage of the DRAM throughput. For     
+          strided memory reads, avoid strides of 64 bytes or larger to avoid moving unused sectors from DRAM to L2.     
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        38.83
+    Issued Warp Per Scheduler                        0.39
+    No Eligible                            %        61.17
+    Active Warps Per Scheduler          warp        14.38
+    Eligible Warps Per Scheduler        warp         0.82
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 25.36%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 2.6 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 16 warps per scheduler, this kernel allocates an average of   
+          14.38 active warps per scheduler, but only an average of 0.82 warps were eligible per cycle. Eligible warps   
+          are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible   
+          warp results in no instruction being issued and the issue slot remains unused. To increase the number of      
+          eligible warps, avoid possible load imbalances due to highly different execution durations per warp.          
+          Reducing stalls indicated on the Warp State Statistics and Source Counters sections can help, too.            
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        37.03
+    Warp Cycles Per Executed Instruction           cycle        37.03
+    Avg. Active Threads Per Warp                                10.89
+    Avg. Not Predicated Off Threads Per Warp                     9.49
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 25.36%                                                                                          
+          On average, each warp of this kernel spends 28.1 cycles being stalled waiting for a scoreboard dependency on  
+          a L1TEX (local, global, surface, texture) operation. Find the instruction producing the data being waited     
+          upon to identify the culprit. To reduce the number of cycles waiting on L1TEX data accesses verify the        
+          memory access patterns are optimal for the target architecture, attempt to increase cache hit rates by        
+          increasing data locality (coalescing), or by changing the cache configuration. Consider moving frequently     
+          used data to shared memory. This stall type represents about 75.9% of the total average of 37.0 cycles        
+          between issuing two instructions.                                                                             
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 27.23%                                                                                          
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 10.9 threads being active per cycle. This is further reduced    
+          to 9.5 threads per warp due to predication. The compiler may use predication to avoid an actual branch.       
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst    772031.52
+    Executed Instructions                           inst    247050086
+    Avg. Issued Instructions Per Scheduler          inst    772102.56
+    Issued Instructions                             inst    247072819
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                 117161
+    Registers Per Thread             register/thread              32
+    Shared Memory Configuration Size            byte               0
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread        14996608
+    Waves Per SM                                               91.53
+    -------------------------------- --------------- ---------------
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           32
+    Block Limit Registers                 block           16
+    Block Limit Shared Mem                block           32
+    Block Limit Warps                     block           16
+    Theoretical Active Warps per SM        warp           64
+    Theoretical Occupancy                     %          100
+    Achieved Occupancy                        %        89.97
+    Achieved Active Warps Per SM           warp        57.58
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 10.03%                                                                                          
+          The difference between calculated theoretical (100.0%) and measured achieved occupancy (90.0%) can be the     
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.11
+    Branch Instructions              inst     26599495
+    Branch Efficiency                   %        64.43
+    Avg. Divergent Branches                   12061.69
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 69.12%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 63822318 excessive sectors (69% of the    
+          total 92036223 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source       
+          locations. The CUDA Programming Guide                                                                         
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
diff --git a/results/V100/crystal/crystal_q21_sf10.txt b/results/V100/crystal/crystal_q21_sf10.txt
new file mode 100644
index 0000000..23e9fd5
--- /dev/null
+++ b/results/V100/crystal/crystal_q21_sf10.txt
@@ -0,0 +1,1056 @@
+==PROF== Connected to process 4547 (/home/ubuntu/fff/cmake-build-release-g4dn/gpu/crystal/src/crystal_q21)
+Using device 0: Tesla V100-SXM2-16GB (PTX version 700, SM700, 80 SMs, 15754 free / 16151 total MB physmem, 898.048 GB/s @ 877000 kHz mem clock, ECC on)
+==PROF== Profiling "build_hashtable_s" - 0: 0%....50%....100% - 73 passes
+==PROF== Profiling "build_hashtable_p" - 1: 0%....50%....100% - 74 passes
+==PROF== Profiling "build_hashtable_d" - 2: 0%....50%....100% - 73 passes
+==PROF== Profiling "probe" - 3: 0%....50%....100% - 74 passes
+1992 40 6574868694
+1993 40 6952043914
+1994 40 6525239576
+1995 40 6764559245
+1996 40 6725548424
+1997 40 6596102991
+1998 40 3988851825
+1992 41 7047701749
+1993 41 6909841940
+1994 41 6978800980
+1995 41 7036474627
+1996 41 7233045193
+1997 41 6938053628
+1998 41 4065391978
+1992 42 6450484539
+1993 42 6886094182
+1994 42 6852294265
+1995 42 6749813918
+1996 42 6568551778
+1997 42 6845017761
+1998 42 3773836113
+1992 43 6918393482
+1993 43 6621428714
+1994 43 7068738463
+1995 43 6820930145
+1996 43 6762634261
+1997 43 6849537060
+1998 43 3882704011
+1992 44 6343659176
+1993 44 6094791212
+1994 44 6661136530
+1995 44 6085276694
+1996 44 6176324016
+1997 44 6315911460
+1998 44 3925731952
+1992 45 6499025385
+1993 45 6779833973
+1994 45 6435942251
+1995 45 6738626764
+1996 45 6763207154
+1997 45 6889101910
+1998 45 3879170338
+1992 46 6833102567
+1993 46 7017493760
+1994 46 7015998639
+1995 46 6897957727
+1996 46 6948998143
+1997 46 6510502742
+1998 46 3911656234
+1992 47 6922095842
+1993 47 7061777324
+1994 47 6877252420
+1995 47 6575484550
+1996 47 6517266740
+1997 47 6651228318
+1998 47 3835254989
+1992 48 6818173454
+1993 48 6961952133
+1994 48 7051587760
+1995 48 7329421356
+1996 48 7164243172
+1997 48 7052687209
+1998 48 4132526586
+1992 49 6907633511
+1993 49 6614194460
+1994 49 6773107666
+1995 49 6954065693
+1996 49 6747336514
+1997 49 6947116463
+1998 49 3906763122
+1992 50 7098282117
+1993 50 7263350231
+1994 50 7199754789
+1995 50 7246399314
+1996 50 6860318803
+1997 50 7184653230
+1998 50 4293359981
+1992 51 7474015795
+1993 51 7031859249
+1994 51 6749353264
+1995 51 7395439319
+1996 51 7118371952
+1997 51 7427932834
+1998 51 4080129102
+1992 52 7001985495
+1993 52 6734276751
+1994 52 6965715192
+1995 52 6934765252
+1996 52 6895454124
+1997 52 6802928999
+1998 52 3916065107
+1992 53 6531087764
+1993 53 6258171804
+1994 53 6197787972
+1995 53 6605279401
+1996 53 6722321819
+1997 53 6879971631
+1998 53 3561102555
+1992 54 7041216650
+1993 54 6601732879
+1994 54 6737632272
+1995 54 6483760392
+1996 54 6778740509
+1997 54 6950964366
+1998 54 3960525994
+1992 55 7034325953
+1993 55 7070112383
+1994 55 6835473512
+1995 55 6681873420
+1996 55 6755919599
+1997 55 6883879790
+1998 55 3842444977
+1992 56 6672842875
+1993 56 6362926487
+1994 56 6787572691
+1995 56 6941448166
+1996 56 6349041382
+1997 56 6831022793
+1998 56 3750580610
+1992 57 6762940511
+1993 57 6200194110
+1994 57 6360354225
+1995 57 6799718937
+1996 57 6500504812
+1997 57 6464594869
+1998 57 3690857660
+1992 58 6367358727
+1993 58 6519991362
+1994 58 6228367674
+1995 58 6522760927
+1996 58 6043428578
+1997 58 6386892483
+1998 58 3888948778
+1992 59 6542091138
+1993 59 6669384898
+1994 59 6566921738
+1995 59 6725584633
+1996 59 6678854924
+1997 59 6518974991
+1998 59 3661443815
+1992 60 7397021390
+1993 60 6985315570
+1994 60 7171226221
+1995 60 7409511342
+1996 60 7217054942
+1997 60 7241219598
+1998 60 4134876965
+1992 61 6439487815
+1993 61 6190501096
+1994 61 6658242784
+1995 61 6300444895
+1996 61 6394989839
+1997 61 6372986872
+1998 61 3692782928
+1992 62 7142709582
+1993 62 6575099186
+1994 62 6577906605
+1995 62 6758016505
+1996 62 6713821475
+1997 62 7061699626
+1998 62 3911733232
+1992 63 6684932832
+1993 63 6784872415
+1994 63 6771692541
+1995 63 6832689629
+1996 63 6769695502
+1997 63 6801959247
+1998 63 3916910435
+1992 64 6403427844
+1993 64 6686657397
+1994 64 6560285004
+1995 64 6654877138
+1996 64 6403809726
+1997 64 6364910756
+1998 64 3757788047
+1992 65 6800534485
+1993 65 6932192888
+1994 65 6599703796
+1995 65 6950320978
+1996 65 6745507185
+1997 65 6965554062
+1998 65 3856421228
+1992 66 6608507118
+1993 66 6720022834
+1994 66 7249477139
+1995 66 6982989122
+1996 66 6895681155
+1997 66 7131587724
+1998 66 4050936159
+1992 67 6789994724
+1993 67 7034832635
+1994 67 6533866956
+1995 67 7089400123
+1996 67 6950690822
+1997 67 6872602250
+1998 67 3798832673
+1992 68 6761138392
+1993 68 7117328614
+1994 68 7003067656
+1995 68 6916376148
+1996 68 6810961498
+1997 68 6421432868
+1998 68 4365901362
+1992 69 6333970291
+1993 69 6591672386
+1994 69 6491372066
+1995 69 6759048824
+1996 69 6636341404
+1997 69 6396375726
+1998 69 3755850783
+1992 70 6863351080
+1993 70 7236349480
+1994 70 7065985619
+1995 70 6799040388
+1996 70 7281402064
+1997 70 6735307561
+1998 70 4062655575
+1992 71 6978088606
+1993 71 6615095404
+1994 71 6642491845
+1995 71 7135465638
+1996 71 6904578270
+1997 71 6886861519
+1998 71 3971062487
+1992 72 6077239048
+1993 72 6379459453
+1994 72 6452415472
+1995 72 6170313509
+1996 72 5916688379
+1997 72 5963369350
+1998 72 3683718797
+1992 73 6671048755
+1993 73 6565112476
+1994 73 6641285247
+1995 73 6887663633
+1996 73 6439642020
+1997 73 6675192946
+1998 73 3814007830
+1992 74 6999195521
+1993 74 7007686388
+1994 74 6670519880
+1995 74 6744064671
+1996 74 6614217057
+1997 74 6523268368
+1998 74 4023666133
+1992 75 6627416528
+1993 75 6758016664
+1994 75 6751975322
+1995 75 7047693486
+1996 75 6567430366
+1997 75 6781762704
+1998 75 4063152322
+1992 76 6785625804
+1993 76 6930340135
+1994 76 6382873777
+1995 76 6206415993
+1996 76 6805542040
+1997 76 6422414358
+1998 76 4087738859
+1992 77 6848387744
+1993 77 6623249454
+1994 77 6588036917
+1995 77 6589295276
+1996 77 6603676047
+1997 77 6383121125
+1998 77 4063691471
+1992 78 6240883199
+1993 78 6551226256
+1994 78 6647824791
+1995 78 6494311762
+1996 78 6358269587
+1997 78 6349078074
+1998 78 3890548095
+1992 79 6948601533
+1993 79 7058895576
+1994 79 7280306702
+1995 79 7174749606
+1996 79 7134521672
+1997 79 7009756092
+1998 79 4233289127
+Res Count: 280
+Time Taken Total: 2593.88
+{"query":21,"time_query":2593.77}
+==PROF== Disconnected from process 4547
+[4547] crystal_q21@127.0.0.1
+  void build_hashtable_s<(int)128, (int)4>(int *, int *, int, int *, int) (40, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 7.0
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/usecond       724.26
+    SM Frequency            cycle/nsecond         1.09
+    Elapsed Cycles                  cycle         4760
+    Memory Throughput                   %         7.38
+    DRAM Throughput                     %         7.38
+    Duration                      usecond         4.35
+    L1/TEX Cache Throughput             %         6.14
+    L2 Cache Throughput                 %         2.74
+    SM Active Cycles                cycle      1461.66
+    Compute (SM) Throughput             %         1.84
+    ----------------------- ------------- ------------
+
+    OPT   This kernel grid is too small to fill the available resources on this device, resulting in only 0.0 full      
+          waves across all SMs. Look at Launch Statistics for more details.                                             
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 2:1. The kernel achieved 0% of  
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.23
+    Executed Ipc Elapsed  inst/cycle         0.07
+    Issue Slots Busy               %         5.98
+    Issued Ipc Active     inst/cycle         0.24
+    SM Busy                        %         5.98
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 96.19%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second        54.76
+    Mem Busy                     %         2.51
+    Max Bandwidth                %         7.38
+    L1/TEX Hit Rate              %            0
+    L2 Hit Rate                  %         4.02
+    Mem Pipes Busy               %         0.58
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %         6.24
+    Issued Warp Per Scheduler                        0.06
+    No Eligible                            %        93.76
+    Active Warps Per Scheduler          warp         0.99
+    Eligible Warps Per Scheduler        warp         0.06
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 92.62%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 16.0 cycles. This might leave hardware resources underutilized and may lead to    
+          less optimal performance. Out of the maximum of 16 warps per scheduler, this kernel allocates an average of   
+          0.99 active warps per scheduler, which already limits the scheduler to less than a warp per instruction.      
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        15.86
+    Warp Cycles Per Executed Instruction           cycle        16.72
+    Avg. Active Threads Per Warp                                16.20
+    Avg. Not Predicated Off Threads Per Warp                    15.19
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 32.29%                                                                                          
+          On average, each warp of this kernel spends 5.1 cycles being stalled waiting for a scoreboard dependency on a 
+          L1TEX (local, global, surface, texture) operation. Find the instruction producing the data being waited upon  
+          to identify the culprit. To reduce the number of cycles waiting on L1TEX data accesses verify the memory      
+          access patterns are optimal for the target architecture, attempt to increase cache hit rates by increasing    
+          data locality (coalescing), or by changing the cache configuration. Consider moving frequently used data to   
+          shared memory. This stall type represents about 32.3% of the total average of 15.9 cycles between issuing     
+          two instructions.                                                                                             
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 0.9662%                                                                                         
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 16.2 threads being active per cycle. This is further reduced    
+          to 15.2 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst        82.96
+    Executed Instructions                           inst        26546
+    Avg. Issued Instructions Per Scheduler          inst        87.46
+    Issued Instructions                             inst        27986
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                     40
+    Registers Per Thread             register/thread              16
+    Shared Memory Configuration Size            byte               0
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread            5120
+    Waves Per SM                                                0.03
+    -------------------------------- --------------- ---------------
+
+    OPT   Est. Speedup: 50%                                                                                             
+          The grid for this launch is configured to execute only 40 blocks, which is less than the GPU's 80             
+          multiprocessors. This can underutilize some multiprocessors. If you do not intend to execute this kernel      
+          concurrently with other workloads, consider reducing the block size to have at least one block per            
+          multiprocessor or increase the size of the grid to fully utilize the available hardware resources. See the    
+          Hardware Model (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-hw-model)            
+          description for more details on launch configurations.                                                        
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           32
+    Block Limit Registers                 block           32
+    Block Limit Shared Mem                block           32
+    Block Limit Warps                     block           16
+    Theoretical Active Warps per SM        warp           64
+    Theoretical Occupancy                     %          100
+    Achieved Occupancy                        %         6.10
+    Achieved Active Warps Per SM           warp         3.90
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 92.62%                                                                                          
+          The difference between calculated theoretical (100.0%) and measured achieved occupancy (6.1%) can be the      
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.11
+    Branch Instructions              inst         2876
+    Branch Efficiency                   %        58.57
+    Avg. Divergent Branches                       1.47
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 6.989%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 1441 excessive sectors (20% of the total  
+          7183 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations. The  
+          CUDA Programming Guide                                                                                        
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void build_hashtable_p<(int)128, (int)4>(int *, int *, int *, int, int *, int) (1563, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 7.0
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/usecond       742.03
+    SM Frequency            cycle/nsecond         1.10
+    Elapsed Cycles                  cycle        22119
+    Memory Throughput                   %        76.12
+    DRAM Throughput                     %        76.12
+    Duration                      usecond        20.06
+    L1/TEX Cache Throughput             %        21.14
+    L2 Cache Throughput                 %        26.95
+    SM Active Cycles                cycle     19523.90
+    Compute (SM) Throughput             %        13.82
+    ----------------------- ------------- ------------
+
+    OPT   Memory is more heavily utilized than Compute: Look at the Memory Workload Analysis section to identify the    
+          DRAM bottleneck. Check memory replay (coalescing) metrics to make sure you're efficiently utilizing the       
+          bytes transferred. Also consider whether it is possible to do more work per memory access (kernel fusion) or  
+          whether there are values you can (re)compute.                                                                 
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 2:1. The kernel achieved 0% of  
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.62
+    Executed Ipc Elapsed  inst/cycle         0.55
+    Issue Slots Busy               %        15.59
+    Issued Ipc Active     inst/cycle         0.62
+    SM Busy                        %        15.59
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 90.81%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second       578.36
+    Mem Busy                     %        26.95
+    Max Bandwidth                %        76.12
+    L1/TEX Hit Rate              %         0.01
+    L2 Hit Rate                  %         8.85
+    Mem Pipes Busy               %         7.03
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 1.631%                                                                                          
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 1.2 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        15.62
+    Issued Warp Per Scheduler                        0.16
+    No Eligible                            %        84.38
+    Active Warps Per Scheduler          warp        11.94
+    Eligible Warps Per Scheduler        warp         0.27
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 23.88%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 6.4 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 16 warps per scheduler, this kernel allocates an average of   
+          11.94 active warps per scheduler, but only an average of 0.27 warps were eligible per cycle. Eligible warps   
+          are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible   
+          warp results in no instruction being issued and the issue slot remains unused. To increase the number of      
+          eligible warps, reduce the time the active warps are stalled by inspecting the top stall reasons on the Warp  
+          State Statistics and Source Counters sections.                                                                
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        76.44
+    Warp Cycles Per Executed Instruction           cycle        77.06
+    Avg. Active Threads Per Warp                                16.02
+    Avg. Not Predicated Off Threads Per Warp                    15.64
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 23.88%                                                                                          
+          On average, each warp of this kernel spends 39.2 cycles being stalled waiting for a scoreboard dependency on  
+          an L1TEX (local, global, surface, texture) operation. Find the instruction producing the data being waited    
+          upon to identify the culprit. To reduce the number of cycles waiting on L1TEX data accesses verify the        
+          memory access patterns are optimal for the target architecture, attempt to increase cache hit rates by        
+          increasing data locality (coalescing), or by changing the cache configuration. Consider moving frequently     
+          used data to shared memory. This stall type represents about 51.3% of the total average of 76.4 cycles        
+          between issuing two instructions.                                                                             
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 7.064%                                                                                          
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 16.0 threads being active per cycle. This is further reduced    
+          to 15.6 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst      3019.96
+    Executed Instructions                           inst       966388
+    Avg. Issued Instructions Per Scheduler          inst      3044.38
+    Issued Instructions                             inst       974200
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                   1563
+    Registers Per Thread             register/thread              18
+    Shared Memory Configuration Size            byte               0
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread          200064
+    Waves Per SM                                                1.22
+    -------------------------------- --------------- ---------------
+
+    OPT   Est. Speedup: 50%                                                                                             
+          A wave of thread blocks is defined as the maximum number of blocks that can be executed in parallel on the    
+          target GPU. The number of blocks in a wave depends on the number of multiprocessors and the theoretical       
+          occupancy of the kernel. This kernel launch results in 1 full wave and a partial wave of 283 thread blocks.   
+          Under the assumption of a uniform execution duration of all thread blocks, the partial wave may account for   
+          up to 50.0% of the total kernel runtime with a lower occupancy of 24.8%. Try launching a grid with no         
+          partial wave. The overall impact of this tail effect also lessens with the number of full waves executed for  
+          a grid. See the Hardware Model                                                                                
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-hw-model) description for more      
+          details on launch configurations.                                                                             
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           32
+    Block Limit Registers                 block           21
+    Block Limit Shared Mem                block           32
+    Block Limit Warps                     block           16
+    Theoretical Active Warps per SM        warp           64
+    Theoretical Occupancy                     %          100
+    Achieved Occupancy                        %        75.15
+    Achieved Active Warps Per SM           warp        48.10
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 23.88%                                                                                          
+          The difference between calculated theoretical (100.0%) and measured achieved occupancy (75.2%) can be the     
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.12
+    Branch Instructions              inst       118258
+    Branch Efficiency                   %        72.76
+    Avg. Divergent Branches                      42.58
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 5.784%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 23924 excessive sectors (7% of the total  
+          360344 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations.    
+          The CUDA Programming Guide                                                                                    
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void build_hashtable_d<(int)128, (int)4>(int *, int *, int, int *, int, int) (5, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 7.0
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/usecond       635.34
+    SM Frequency            cycle/usecond       948.46
+    Elapsed Cycles                  cycle         4038
+    Memory Throughput                   %         1.61
+    DRAM Throughput                     %         1.61
+    Duration                      usecond         4.26
+    L1/TEX Cache Throughput             %        25.95
+    L2 Cache Throughput                 %         1.40
+    SM Active Cycles                cycle       155.38
+    Compute (SM) Throughput             %         0.23
+    ----------------------- ------------- ------------
+
+    OPT   This kernel grid is too small to fill the available resources on this device, resulting in only 0.0 full      
+          waves across all SMs. Look at Launch Statistics for more details.                                             
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 2:1. The kernel achieved 0% of  
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.22
+    Executed Ipc Elapsed  inst/cycle         0.01
+    Issue Slots Busy               %         5.96
+    Issued Ipc Active     inst/cycle         0.24
+    SM Busy                        %         5.96
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 95.8%                                                                                     
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second        10.45
+    Mem Busy                     %         0.89
+    Max Bandwidth                %         1.61
+    L1/TEX Hit Rate              %         3.89
+    L2 Hit Rate                  %        41.49
+    Mem Pipes Busy               %         0.17
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 0.02324%                                                                                        
+          The memory access pattern for global stores in L1TEX might not be optimal. On average, this kernel accesses   
+          4.0 bytes per thread per memory request; but the address pattern, possibly caused by the stride between       
+          threads, results in 9.3 sectors per request, or 9.3*32 = 297.2 bytes of cache data transfers per request.     
+          The optimal thread address pattern for 4.0 byte accesses would result in 4.0*32 = 128.0 bytes of cache data   
+          transfers per request, to maximize L1TEX cache performance. Check the Source Counters section for             
+          uncoalesced global stores.                                                                                    
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 0.1165%                                                                                         
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 2.5 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %         6.23
+    Issued Warp Per Scheduler                        0.06
+    No Eligible                            %        93.77
+    Active Warps Per Scheduler          warp         1.04
+    Eligible Warps Per Scheduler        warp         0.06
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 93.77%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 16.1 cycles. This might leave hardware resources underutilized and may lead to    
+          less optimal performance. Out of the maximum of 16 warps per scheduler, this kernel allocates an average of   
+          1.04 active warps per scheduler, but only an average of 0.06 warps were eligible per cycle. Eligible warps    
+          are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible   
+          warp results in no instruction being issued and the issue slot remains unused. To increase the number of      
+          eligible warps, reduce the time the active warps are stalled by inspecting the top stall reasons on the Warp  
+          State Statistics and Source Counters sections.                                                                
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        16.69
+    Warp Cycles Per Executed Instruction           cycle        17.77
+    Avg. Active Threads Per Warp                                31.93
+    Avg. Not Predicated Off Threads Per Warp                    28.35
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 31.43%                                                                                          
+          On average, each warp of this kernel spends 5.2 cycles being stalled waiting for an immediate constant cache  
+          (IMC) miss. A read from constant memory costs one memory read from device memory only on a cache miss;        
+          otherwise, it just costs one read from the constant cache. Immediate constants are encoded into the SASS      
+          instruction as 'c[bank][offset]'. Accesses to different addresses by threads within a warp are serialized,    
+          thus the cost scales linearly with the number of unique addresses read by all threads within a warp. As       
+          such, the constant cache is best when threads in the same warp access only a few distinct locations. If all   
+          threads of a warp access the same location, then constant memory can be as fast as a register access. This    
+          stall type represents about 31.4% of the total average of 16.7 cycles between issuing two instructions.       
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst         8.69
+    Executed Instructions                           inst         2782
+    Avg. Issued Instructions Per Scheduler          inst         9.26
+    Issued Instructions                             inst         2962
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                      5
+    Registers Per Thread             register/thread              30
+    Shared Memory Configuration Size            byte               0
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread             640
+    Waves Per SM                                                0.00
+    -------------------------------- --------------- ---------------
+
+    OPT   Est. Speedup: 93.75%                                                                                          
+          The grid for this launch is configured to execute only 5 blocks, which is less than the GPU's 80              
+          multiprocessors. This can underutilize some multiprocessors. If you do not intend to execute this kernel      
+          concurrently with other workloads, consider reducing the block size to have at least one block per            
+          multiprocessor or increase the size of the grid to fully utilize the available hardware resources. See the    
+          Hardware Model (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-hw-model)            
+          description for more details on launch configurations.                                                        
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           32
+    Block Limit Registers                 block           16
+    Block Limit Shared Mem                block           32
+    Block Limit Warps                     block           16
+    Theoretical Active Warps per SM        warp           64
+    Theoretical Occupancy                     %          100
+    Achieved Occupancy                        %         6.16
+    Achieved Active Warps Per SM           warp         3.94
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 93.77%                                                                                          
+          The difference between calculated theoretical (100.0%) and measured achieved occupancy (6.2%) can be the      
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.06
+    Branch Instructions              inst          166
+    Branch Efficiency                   %        97.73
+    Avg. Divergent Branches                       0.01
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 6.84%                                                                                           
+          This kernel has uncoalesced global accesses resulting in a total of 812 excessive sectors (39% of the total   
+          2092 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations. The  
+          CUDA Programming Guide                                                                                        
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void probe<(int)128, (int)4>(int *, int *, int *, int *, int, int *, int, int *, int, int *, int, int *) (117161, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 7.0
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/usecond       875.26
+    SM Frequency            cycle/nsecond         1.31
+    Elapsed Cycles                  cycle      2172047
+    Memory Throughput                   %        98.91
+    DRAM Throughput                     %        98.91
+    Duration                      msecond         1.66
+    L1/TEX Cache Throughput             %        42.30
+    L2 Cache Throughput                 %        44.05
+    SM Active Cycles                cycle   2167696.45
+    Compute (SM) Throughput             %        24.73
+    ----------------------- ------------- ------------
+
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing DRAM in the Memory Workload Analysis section.                                              
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 2:1. The kernel achieved 0% of  
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.99
+    Executed Ipc Elapsed  inst/cycle         0.99
+    Issue Slots Busy               %        24.77
+    Issued Ipc Active     inst/cycle         0.99
+    SM Busy                        %        24.77
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 83.3%                                                                                     
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second       886.49
+    Mem Busy                     %        44.05
+    Max Bandwidth                %        98.91
+    L1/TEX Hit Rate              %        44.72
+    L2 Hit Rate                  %        34.93
+    Mem Pipes Busy               %         8.01
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 1.632%                                                                                          
+          The memory access pattern for global loads in L1TEX might not be optimal. On average, this kernel accesses    
+          4.8 bytes per thread per memory request; but the address pattern, possibly caused by the stride between       
+          threads, results in 8.7 sectors per request, or 8.7*32 = 277.5 bytes of cache data transfers per request.     
+          The optimal thread address pattern for 4.8 byte accesses would result in 4.8*32 = 155.0 bytes of cache data   
+          transfers per request, to maximize L1TEX cache performance. Check the Source Counters section for             
+          uncoalesced global loads.                                                                                     
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 25.12%                                                                                          
+          The memory access pattern for loads from L1TEX to L2 is not optimal. The granularity of an L1TEX request to   
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 1.7 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced loads and try to minimize how many cache lines need to be accessed per memory         
+          request.                                                                                                      
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 0.5529%                                                                                         
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 1.0 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 66.02%                                                                                          
+          The memory access pattern for loads from device memory causes 46,014,514 sectors to be read from DRAM, which  
+          is 1.2x of the 38,009,011 sectors which cause a miss in the L2 cache. The DRAM fetch granularity for read     
+          misses in L2 is 64 bytes, i.e. the lower or upper half of an L2 cache line. Try changing your access pattern  
+          to make use of both sectors returned by a DRAM read request for optimal usage of the DRAM throughput. For     
+          strided memory reads, avoid strides of 64 bytes or larger to avoid moving unused sectors from DRAM to L2.     
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        24.77
+    Issued Warp Per Scheduler                        0.25
+    No Eligible                            %        75.23
+    Active Warps Per Scheduler          warp        14.62
+    Eligible Warps Per Scheduler        warp         0.43
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 1.091%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 4.0 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 16 warps per scheduler, this kernel allocates an average of   
+          14.62 active warps per scheduler, but only an average of 0.43 warps were eligible per cycle. Eligible warps   
+          are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible   
+          warp results in no instruction being issued and the issue slot remains unused. To increase the number of      
+          eligible warps, avoid possible load imbalances due to highly different execution durations per warp.          
+          Reducing stalls indicated on the Warp State Statistics and Source Counters sections can help, too.            
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        59.03
+    Warp Cycles Per Executed Instruction           cycle        59.03
+    Avg. Active Threads Per Warp                                20.46
+    Avg. Not Predicated Off Threads Per Warp                    18.81
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 1.091%                                                                                          
+          On average, each warp of this kernel spends 50.0 cycles being stalled waiting for a scoreboard dependency on  
+          a L1TEX (local, global, surface, texture) operation. Find the instruction producing the data being waited     
+          upon to identify the culprit. To reduce the number of cycles waiting on L1TEX data accesses verify the        
+          memory access patterns are optimal for the target architecture, attempt to increase cache hit rates by        
+          increasing data locality (coalescing), or by changing the cache configuration. Consider moving frequently     
+          used data to shared memory. This stall type represents about 84.6% of the total average of 59.0 cycles        
+          between issuing two instructions.                                                                             
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 10.19%                                                                                          
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 20.5 threads being active per cycle. This is further reduced    
+          to 18.8 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst    536828.58
+    Executed Instructions                           inst    171785146
+    Avg. Issued Instructions Per Scheduler          inst    536883.60
+    Issued Instructions                             inst    171802752
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                 117161
+    Registers Per Thread             register/thread              29
+    Shared Memory Configuration Size            byte               0
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread        14996608
+    Waves Per SM                                               91.53
+    -------------------------------- --------------- ---------------
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           32
+    Block Limit Registers                 block           16
+    Block Limit Shared Mem                block           32
+    Block Limit Warps                     block           16
+    Theoretical Active Warps per SM        warp           64
+    Theoretical Occupancy                     %          100
+    Achieved Occupancy                        %        91.42
+    Achieved Active Warps Per SM           warp        58.51
+    ------------------------------- ----------- ------------
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.11
+    Branch Instructions              inst     19141534
+    Branch Efficiency                   %        68.81
+    Avg. Divergent Branches                    8222.73
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 58.5%                                                                                           
+          This kernel has uncoalesced global accesses resulting in a total of 60857602 excessive sectors (59% of the    
+          total 103860683 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source      
+          locations. The CUDA Programming Guide                                                                         
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
diff --git a/results/V100/crystal/crystal_q31_sf10.txt b/results/V100/crystal/crystal_q31_sf10.txt
new file mode 100644
index 0000000..ce87ca8
--- /dev/null
+++ b/results/V100/crystal/crystal_q31_sf10.txt
@@ -0,0 +1,920 @@
+==PROF== Connected to process 5507 (/home/ubuntu/fff/cmake-build-release-g4dn/gpu/crystal/src/crystal_q31)
+Using device 0: Tesla V100-SXM2-16GB (PTX version 700, SM700, 80 SMs, 15754 free / 16151 total MB physmem, 898.048 GB/s @ 877000 kHz mem clock, ECC on)
+** LOADED DATA **
+** LOADED DATA TO GPU **
+==PROF== Profiling "build_hashtable_s" - 0: 0%....50%....100% - 73 passes
+==PROF== Profiling "build_hashtable_c" - 1: 0%....50%....100% - 73 passes
+==PROF== Profiling "build_hashtable_d" - 2: 0%....50%....100% - 73 passes
+==PROF== Profiling "probe" - 3: 0%....50%....100% - 74 passes
+Result:
+1992 8 8 53664098547
+1993 8 8 53051563726
+1994 8 8 53551966681
+1995 8 8 53338395993
+1996 8 8 53781237952
+1997 8 8 53558132271
+1992 9 8 55867859815
+1993 9 8 55345162638
+1994 9 8 55589883121
+1995 9 8 54871630692
+1996 9 8 55620205618
+1997 9 8 54852742519
+1992 12 8 52867359425
+1993 12 8 53435367523
+1994 12 8 52283824959
+1995 12 8 52956472988
+1996 12 8 52948768521
+1997 12 8 52962165616
+1992 18 8 53592522758
+1993 18 8 52996000810
+1994 18 8 52962120320
+1995 18 8 53924104344
+1996 18 8 53634737856
+1997 18 8 54307983851
+1992 21 8 53816502394
+1993 21 8 54349264842
+1994 21 8 54119359035
+1995 21 8 53961984627
+1996 21 8 54294333705
+1997 21 8 53703384515
+1992 8 9 55444214883
+1993 8 9 55740793389
+1994 8 9 55137400588
+1995 8 9 55784172640
+1996 8 9 56378453713
+1997 8 9 55399009353
+1992 9 9 57271740148
+1993 9 9 58216495642
+1994 9 9 57507217082
+1995 9 9 57860170696
+1996 9 9 58662284841
+1997 9 9 56940173344
+1992 12 9 55432874858
+1993 12 9 55398755151
+1994 12 9 55206960389
+1995 12 9 55581754250
+1996 12 9 55487324569
+1997 12 9 53582297974
+1992 18 9 56370007920
+1993 18 9 56166403334
+1994 18 9 55432079732
+1995 18 9 55973419507
+1996 18 9 56254723722
+1997 18 9 55830709236
+1992 21 9 56359335068
+1993 21 9 56885558074
+1994 21 9 56507097670
+1995 21 9 57465742525
+1996 21 9 56177166557
+1997 21 9 56333135444
+1992 8 12 51295873912
+1993 8 12 52384079867
+1994 8 12 52254716872
+1995 8 12 51669051730
+1996 8 12 52670597733
+1997 8 12 53782563068
+1992 9 12 54255769995
+1993 9 12 53477912258
+1994 9 12 53868848846
+1995 9 12 54310027205
+1996 9 12 55409865859
+1997 9 12 54099065304
+1992 12 12 52584065821
+1993 12 12 52637339531
+1994 12 12 50154194273
+1995 12 12 51904425056
+1996 12 12 52493537142
+1997 12 12 50634790895
+1992 18 12 52896145835
+1993 18 12 53112435531
+1994 18 12 52021625515
+1995 18 12 52031180987
+1996 18 12 53022298730
+1997 18 12 53294469049
+1992 21 12 53284643553
+1993 21 12 53900783410
+1994 21 12 53648011682
+1995 21 12 53376554374
+1996 21 12 52174060166
+1997 21 12 52785883863
+1992 8 18 51873441494
+1993 8 18 51961213538
+1994 8 18 52868608376
+1995 8 18 52738284867
+1996 8 18 51678789303
+1997 8 18 51787339279
+1992 9 18 53893325353
+1993 9 18 54178339670
+1994 9 18 54059232642
+1995 9 18 53920766480
+1996 9 18 54128092218
+1997 9 18 54349079982
+1992 12 18 51449505308
+1993 12 18 51384752707
+1994 12 18 52195482938
+1995 12 18 51205040497
+1996 12 18 51165908280
+1997 12 18 52167794260
+1992 18 18 53246367726
+1993 18 18 52211194809
+1994 18 18 52388807873
+1995 18 18 52459889035
+1996 18 18 53737304610
+1997 18 18 52772297391
+1992 21 18 53752784633
+1993 21 18 53723459056
+1994 21 18 52734575706
+1995 21 18 52810670641
+1996 21 18 53606892262
+1997 21 18 52841307001
+1992 8 21 49589186930
+1993 8 21 50874540178
+1994 8 21 50484052905
+1995 8 21 50476123376
+1996 8 21 51102099810
+1997 8 21 51376581082
+1992 9 21 51183086614
+1993 9 21 51849557513
+1994 9 21 51912335762
+1995 9 21 51737313715
+1996 9 21 52987320706
+1997 9 21 51870436294
+1992 12 21 49502367103
+1993 12 21 49962826767
+1994 12 21 50112754286
+1995 12 21 48732674673
+1996 12 21 50123146827
+1997 12 21 49094088315
+1992 18 21 50957655153
+1993 18 21 50627753769
+1994 18 21 50537890156
+1995 18 21 50265160335
+1996 18 21 50774431442
+1997 18 21 51103107061
+1992 21 21 49934446612
+1993 21 21 51562382531
+1994 21 21 50180119681
+1995 21 21 51221558310
+1996 21 21 50423672514
+1997 21 21 50461561884
+Res Count: 150
+Time Taken Total: 5725.49
+{"query":31,"time_query":5725.32}
+==PROF== Disconnected from process 5507
+[5507] crystal_q31@127.0.0.1
+  void build_hashtable_s<(int)128, (int)4>(int *, int *, int *, int, int *, int) (40, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 7.0
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/usecond       636.94
+    SM Frequency            cycle/usecond       953.12
+    Elapsed Cycles                  cycle         4792
+    Memory Throughput                   %        11.38
+    DRAM Throughput                     %        11.38
+    Duration                      usecond         5.02
+    L1/TEX Cache Throughput             %        10.74
+    L2 Cache Throughput                 %         4.91
+    SM Active Cycles                cycle      1575.46
+    Compute (SM) Throughput             %         2.01
+    ----------------------- ------------- ------------
+
+    OPT   This kernel grid is too small to fill the available resources on this device, resulting in only 0.0 full      
+          waves across all SMs. Look at Launch Statistics for more details.                                             
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 2:1. The kernel achieved 0% of  
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.23
+    Executed Ipc Elapsed  inst/cycle         0.08
+    Issue Slots Busy               %         6.10
+    Issued Ipc Active     inst/cycle         0.24
+    SM Busy                        %         6.10
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 96.21%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second        74.20
+    Mem Busy                     %         4.66
+    Max Bandwidth                %        11.38
+    L1/TEX Hit Rate              %         0.39
+    L2 Hit Rate                  %        23.84
+    Mem Pipes Busy               %         0.91
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 0.03593%                                                                                        
+          The memory access pattern for global stores in L1TEX might not be optimal. On average, this kernel accesses   
+          4.0 bytes per thread per memory request; but the address pattern, possibly caused by the stride between       
+          threads, results in 4.8 sectors per request, or 4.8*32 = 153.0 bytes of cache data transfers per request.     
+          The optimal thread address pattern for 4.0 byte accesses would result in 4.0*32 = 128.0 bytes of cache data   
+          transfers per request, to maximize L1TEX cache performance. Check the Source Counters section for             
+          uncoalesced global stores.                                                                                    
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 0.4492%                                                                                         
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 2.3 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %         6.52
+    Issued Warp Per Scheduler                        0.07
+    No Eligible                            %        93.48
+    Active Warps Per Scheduler          warp         1.01
+    Eligible Warps Per Scheduler        warp         0.07
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 88.62%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 15.3 cycles. This might leave hardware resources underutilized and may lead to    
+          less optimal performance. Out of the maximum of 16 warps per scheduler, this kernel allocates an average of   
+          1.01 active warps per scheduler, but only an average of 0.07 warps were eligible per cycle. Eligible warps    
+          are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible   
+          warp results in no instruction being issued and the issue slot remains unused. To increase the number of      
+          eligible warps, reduce the time the active warps are stalled by inspecting the top stall reasons on the Warp  
+          State Statistics and Source Counters sections.                                                                
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        15.46
+    Warp Cycles Per Executed Instruction           cycle        16.22
+    Avg. Active Threads Per Warp                                16.61
+    Avg. Not Predicated Off Threads Per Warp                    15.69
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 32.84%                                                                                          
+          On average, each warp of this kernel spends 5.1 cycles being stalled waiting for a scoreboard dependency on a 
+          L1TEX (local, global, surface, texture) operation. Find the instruction producing the data being waited upon  
+          to identify the culprit. To reduce the number of cycles waiting on L1TEX data accesses verify the memory      
+          access patterns are optimal for the target architecture, attempt to increase cache hit rates by increasing    
+          data locality (coalescing), or by changing the cache configuration. Consider moving frequently used data to   
+          shared memory. This stall type represents about 32.8% of the total average of 15.5 cycles between issuing     
+          two instructions.                                                                                             
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 1.022%                                                                                          
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 16.6 threads being active per cycle. This is further reduced    
+          to 15.7 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst        91.55
+    Executed Instructions                           inst        29295
+    Avg. Issued Instructions Per Scheduler          inst        96.05
+    Issued Instructions                             inst        30735
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                     40
+    Registers Per Thread             register/thread              18
+    Shared Memory Configuration Size            byte               0
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread            5120
+    Waves Per SM                                                0.03
+    -------------------------------- --------------- ---------------
+
+    OPT   Est. Speedup: 50%                                                                                             
+          The grid for this launch is configured to execute only 40 blocks, which is less than the GPU's 80             
+          multiprocessors. This can underutilize some multiprocessors. If you do not intend to execute this kernel      
+          concurrently with other workloads, consider reducing the block size to have at least one block per            
+          multiprocessor or increase the size of the grid to fully utilize the available hardware resources. See the    
+          Hardware Model (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-hw-model)            
+          description for more details on launch configurations.                                                        
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           32
+    Block Limit Registers                 block           21
+    Block Limit Shared Mem                block           32
+    Block Limit Warps                     block           16
+    Theoretical Active Warps per SM        warp           64
+    Theoretical Occupancy                     %          100
+    Achieved Occupancy                        %         6.08
+    Achieved Active Warps Per SM           warp         3.89
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 88.62%                                                                                          
+          The difference between calculated theoretical (100.0%) and measured achieved occupancy (6.1%) can be the      
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.11
+    Branch Instructions              inst         3201
+    Branch Efficiency                   %        63.81
+    Avg. Divergent Branches                       1.47
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 12.45%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 4534 excessive sectors (34% of the total  
+          13506 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations.     
+          The CUDA Programming Guide                                                                                    
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void build_hashtable_c<(int)128, (int)4>(int *, int *, int *, int, int *, int) (586, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 7.0
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/usecond       746.67
+    SM Frequency            cycle/nsecond         1.10
+    Elapsed Cycles                  cycle        13268
+    Memory Throughput                   %        61.07
+    DRAM Throughput                     %        61.07
+    Duration                      usecond           12
+    L1/TEX Cache Throughput             %        38.39
+    L2 Cache Throughput                 %        25.50
+    SM Active Cycles                cycle     11053.23
+    Compute (SM) Throughput             %        10.43
+    ----------------------- ------------- ------------
+
+    OPT   Memory is more heavily utilized than Compute: Look at the Memory Workload Analysis section to identify the    
+          DRAM bottleneck. Check memory replay (coalescing) metrics to make sure you're efficiently utilizing the       
+          bytes transferred. Also consider whether it is possible to do more work per memory access (kernel fusion) or  
+          whether there are values you can (re)compute.                                                                 
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 2:1. The kernel achieved 0% of  
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.49
+    Executed Ipc Elapsed  inst/cycle         0.41
+    Issue Slots Busy               %        12.48
+    Issued Ipc Active     inst/cycle         0.50
+    SM Busy                        %        12.48
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 91.93%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second       466.95
+    Mem Busy                     %        25.24
+    Max Bandwidth                %        61.07
+    L1/TEX Hit Rate              %         0.18
+    L2 Hit Rate                  %        23.28
+    Mem Pipes Busy               %         4.87
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 0.2085%                                                                                         
+          The memory access pattern for global stores in L1TEX might not be optimal. On average, this kernel accesses   
+          4.0 bytes per thread per memory request; but the address pattern, possibly caused by the stride between       
+          threads, results in 4.8 sectors per request, or 4.8*32 = 154.9 bytes of cache data transfers per request.     
+          The optimal thread address pattern for 4.0 byte accesses would result in 4.0*32 = 128.0 bytes of cache data   
+          transfers per request, to maximize L1TEX cache performance. Check the Source Counters section for             
+          uncoalesced global stores.                                                                                    
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 2.451%                                                                                          
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 2.3 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        12.85
+    Issued Warp Per Scheduler                        0.13
+    No Eligible                            %        87.15
+    Active Warps Per Scheduler          warp         5.90
+    Eligible Warps Per Scheduler        warp         0.18
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 38.93%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 7.8 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 16 warps per scheduler, this kernel allocates an average of   
+          5.90 active warps per scheduler, but only an average of 0.18 warps were eligible per cycle. Eligible warps    
+          are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible   
+          warp results in no instruction being issued and the issue slot remains unused. To increase the number of      
+          eligible warps, reduce the time the active warps are stalled by inspecting the top stall reasons on the Warp  
+          State Statistics and Source Counters sections.                                                                
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        45.89
+    Warp Cycles Per Executed Instruction           cycle        46.73
+    Avg. Active Threads Per Warp                                16.44
+    Avg. Not Predicated Off Threads Per Warp                    15.57
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 38.93%                                                                                          
+          On average, each warp of this kernel spends 24.0 cycles being stalled waiting for a scoreboard dependency on  
+          a L1TEX (local, global, surface, texture) operation. Find the instruction producing the data being waited     
+          upon to identify the culprit. To reduce the number of cycles waiting on L1TEX data accesses verify the        
+          memory access patterns are optimal for the target architecture, attempt to increase cache hit rates by        
+          increasing data locality (coalescing), or by changing the cache configuration. Consider moving frequently     
+          used data to shared memory. This stall type represents about 52.3% of the total average of 45.9 cycles        
+          between issuing two instructions.                                                                             
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 5.355%                                                                                          
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 16.4 threads being active per cycle. This is further reduced    
+          to 15.6 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst      1355.03
+    Executed Instructions                           inst       433611
+    Avg. Issued Instructions Per Scheduler          inst      1379.77
+    Issued Instructions                             inst       441525
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                    586
+    Registers Per Thread             register/thread              18
+    Shared Memory Configuration Size            byte               0
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread           75008
+    Waves Per SM                                                0.46
+    -------------------------------- --------------- ---------------
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           32
+    Block Limit Registers                 block           21
+    Block Limit Shared Mem                block           32
+    Block Limit Warps                     block           16
+    Theoretical Active Warps per SM        warp           64
+    Theoretical Occupancy                     %          100
+    Achieved Occupancy                        %        36.35
+    Achieved Active Warps Per SM           warp        23.27
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 38.93%                                                                                          
+          The difference between calculated theoretical (100.0%) and measured achieved occupancy (36.4%) can be the     
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.11
+    Branch Instructions              inst        46902
+    Branch Efficiency                   %        62.57
+    Avg. Divergent Branches                      21.97
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 26.59%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 68642 excessive sectors (34% of the total 
+          203290 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations.    
+          The CUDA Programming Guide                                                                                    
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void build_hashtable_d<(int)128, (int)4>(int *, int *, int, int *, int, int) (5, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 7.0
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/usecond       633.81
+    SM Frequency            cycle/usecond       941.84
+    Elapsed Cycles                  cycle         4703
+    Memory Throughput                   %         1.27
+    DRAM Throughput                     %         1.27
+    Duration                      usecond         4.99
+    L1/TEX Cache Throughput             %        19.79
+    L2 Cache Throughput                 %         1.05
+    SM Active Cycles                cycle       177.95
+    Compute (SM) Throughput             %         0.26
+    ----------------------- ------------- ------------
+
+    OPT   This kernel grid is too small to fill the available resources on this device, resulting in only 0.0 full      
+          waves across all SMs. Look at Launch Statistics for more details.                                             
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 2:1. The kernel achieved 0% of  
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.26
+    Executed Ipc Elapsed  inst/cycle         0.01
+    Issue Slots Busy               %         6.80
+    Issued Ipc Active     inst/cycle         0.27
+    SM Busy                        %         6.80
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 95.45%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second         8.23
+    Mem Busy                     %         0.69
+    Max Bandwidth                %         1.27
+    L1/TEX Hit Rate              %         0.90
+    L2 Hit Rate                  %        40.16
+    Mem Pipes Busy               %         0.12
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 0.01775%                                                                                        
+          The memory access pattern for global stores in L1TEX might not be optimal. On average, this kernel accesses   
+          4.0 bytes per thread per memory request; but the address pattern, possibly caused by the stride between       
+          threads, results in 9.2 sectors per request, or 9.2*32 = 293.1 bytes of cache data transfers per request.     
+          The optimal thread address pattern for 4.0 byte accesses would result in 4.0*32 = 128.0 bytes of cache data   
+          transfers per request, to maximize L1TEX cache performance. Check the Source Counters section for             
+          uncoalesced global stores.                                                                                    
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 0.08651%                                                                                        
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 2.4 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %         6.57
+    Issued Warp Per Scheduler                        0.07
+    No Eligible                            %        93.43
+    Active Warps Per Scheduler          warp         0.97
+    Eligible Warps Per Scheduler        warp         0.07
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 93.43%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 15.2 cycles. This might leave hardware resources underutilized and may lead to    
+          less optimal performance. Out of the maximum of 16 warps per scheduler, this kernel allocates an average of   
+          0.97 active warps per scheduler, which already limits the scheduler to less than a warp per instruction.      
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        14.80
+    Warp Cycles Per Executed Instruction           cycle        15.52
+    Avg. Active Threads Per Warp                                31.82
+    Avg. Not Predicated Off Threads Per Warp                    28.27
+    ---------------------------------------- ----------- ------------
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst        11.54
+    Executed Instructions                           inst         3693
+    Avg. Issued Instructions Per Scheduler          inst        12.10
+    Issued Instructions                             inst         3873
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                      5
+    Registers Per Thread             register/thread              21
+    Shared Memory Configuration Size            byte               0
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread             640
+    Waves Per SM                                                0.00
+    -------------------------------- --------------- ---------------
+
+    OPT   Est. Speedup: 93.75%                                                                                          
+          The grid for this launch is configured to execute only 5 blocks, which is less than the GPU's 80              
+          multiprocessors. This can underutilize some multiprocessors. If you do not intend to execute this kernel      
+          concurrently with other workloads, consider reducing the block size to have at least one block per            
+          multiprocessor or increase the size of the grid to fully utilize the available hardware resources. See the    
+          Hardware Model (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-hw-model)            
+          description for more details on launch configurations.                                                        
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           32
+    Block Limit Registers                 block           21
+    Block Limit Shared Mem                block           32
+    Block Limit Warps                     block           16
+    Theoretical Active Warps per SM        warp           64
+    Theoretical Occupancy                     %          100
+    Achieved Occupancy                        %         6.16
+    Achieved Active Warps Per SM           warp         3.94
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 93.43%                                                                                          
+          The difference between calculated theoretical (100.0%) and measured achieved occupancy (6.2%) can be the      
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.10
+    Branch Instructions              inst          373
+    Branch Efficiency                   %        97.40
+    Avg. Divergent Branches                       0.02
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 5.217%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 700 excessive sectors (37% of the total   
+          1888 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations. The  
+          CUDA Programming Guide                                                                                        
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void probe<(int)128, (int)4>(int *, int *, int *, int *, int, int *, int, int *, int, int *, int, int *) (117161, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 7.0
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/usecond       875.58
+    SM Frequency            cycle/nsecond         1.31
+    Elapsed Cycles                  cycle      3667299
+    Memory Throughput                   %        43.86
+    DRAM Throughput                     %        43.86
+    Duration                      msecond         2.80
+    L1/TEX Cache Throughput             %        48.89
+    L2 Cache Throughput                 %        39.47
+    SM Active Cycles                cycle   3679861.62
+    Compute (SM) Throughput             %        18.19
+    ----------------------- ------------- ------------
+
+    OPT   This kernel exhibits low compute throughput and memory bandwidth utilization relative to the peak performance 
+          of this device. Achieved compute throughput and/or memory bandwidth below 60.0% of peak typically indicate    
+          latency issues. Look at Scheduler Statistics and Warp State Statistics for potential reasons.                 
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 2:1. The kernel achieved 0% of  
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.72
+    Executed Ipc Elapsed  inst/cycle         0.73
+    Issue Slots Busy               %        18.12
+    Issued Ipc Active     inst/cycle         0.72
+    SM Busy                        %        18.12
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 87.46%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second       393.21
+    Mem Busy                     %        39.47
+    Max Bandwidth                %        43.86
+    L1/TEX Hit Rate              %        23.34
+    L2 Hit Rate                  %        63.55
+    Mem Pipes Busy               %         6.04
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 0.635%                                                                                          
+          The memory access pattern for global loads in L1TEX might not be optimal. On average, this kernel accesses    
+          5.4 bytes per thread per memory request; but the address pattern, possibly caused by the stride between       
+          threads, results in 7.6 sectors per request, or 7.6*32 = 244.4 bytes of cache data transfers per request.     
+          The optimal thread address pattern for 5.4 byte accesses would result in 5.4*32 = 173.8 bytes of cache data   
+          transfers per request, to maximize L1TEX cache performance. Check the Source Counters section for             
+          uncoalesced global loads.                                                                                     
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 23.27%                                                                                          
+          The memory access pattern for loads from L1TEX to L2 is not optimal. The granularity of an L1TEX request to   
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 1.4 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced loads and try to minimize how many cache lines need to be accessed per memory         
+          request.                                                                                                      
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 2.039%                                                                                          
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 1.0 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        18.23
+    Issued Warp Per Scheduler                        0.18
+    No Eligible                            %        81.77
+    Active Warps Per Scheduler          warp        14.64
+    Eligible Warps Per Scheduler        warp         0.35
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 56.14%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 5.5 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 16 warps per scheduler, this kernel allocates an average of   
+          14.64 active warps per scheduler, but only an average of 0.35 warps were eligible per cycle. Eligible warps   
+          are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible   
+          warp results in no instruction being issued and the issue slot remains unused. To increase the number of      
+          eligible warps, avoid possible load imbalances due to highly different execution durations per warp.          
+          Reducing stalls indicated on the Warp State Statistics and Source Counters sections can help, too.            
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        80.33
+    Warp Cycles Per Executed Instruction           cycle        80.34
+    Avg. Active Threads Per Warp                                17.95
+    Avg. Not Predicated Off Threads Per Warp                    16.49
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 56.14%                                                                                          
+          On average, each warp of this kernel spends 49.5 cycles being stalled waiting for a scoreboard dependency on  
+          a L1TEX (local, global, surface, texture) operation. Find the instruction producing the data being waited     
+          upon to identify the culprit. To reduce the number of cycles waiting on L1TEX data accesses verify the        
+          memory access patterns are optimal for the target architecture, attempt to increase cache hit rates by        
+          increasing data locality (coalescing), or by changing the cache configuration. Consider moving frequently     
+          used data to shared memory. This stall type represents about 61.6% of the total average of 80.3 cycles        
+          between issuing two instructions.                                                                             
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 8.82%                                                                                           
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 18.0 threads being active per cycle. This is further reduced    
+          to 16.5 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst    666832.43
+    Executed Instructions                           inst    213386379
+    Avg. Issued Instructions Per Scheduler          inst    666893.85
+    Issued Instructions                             inst    213406031
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                 117161
+    Registers Per Thread             register/thread              32
+    Shared Memory Configuration Size            byte               0
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread        14996608
+    Waves Per SM                                               91.53
+    -------------------------------- --------------- ---------------
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           32
+    Block Limit Registers                 block           16
+    Block Limit Shared Mem                block           32
+    Block Limit Warps                     block           16
+    Theoretical Active Warps per SM        warp           64
+    Theoretical Occupancy                     %          100
+    Achieved Occupancy                        %        91.56
+    Achieved Active Warps Per SM           warp        58.60
+    ------------------------------- ----------- ------------
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.10
+    Branch Instructions              inst     20529640
+    Branch Efficiency                   %        54.31
+    Avg. Divergent Branches                   12043.76
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 50.11%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 53746246 excessive sectors (50% of the    
+          total 108085518 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source      
+          locations. The CUDA Programming Guide                                                                         
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
diff --git a/results/V100/crystal/crystal_q41_sf10_v100.txt b/results/V100/crystal/crystal_q41_sf10_v100.txt
new file mode 100644
index 0000000..378e0f0
--- /dev/null
+++ b/results/V100/crystal/crystal_q41_sf10_v100.txt
@@ -0,0 +1,971 @@
+==PROF== Connected to process 3449 (/home/ubuntu/fff/cmake-build-release-g4dn/gpu/crystal/src/crystal_q41)
+Using device 0: Tesla V100-SXM2-16GB (PTX version 700, SM700, 80 SMs, 15754 free / 16151 total MB physmem, 898.048 GB/s @ 877000 kHz mem clock, ECC on)
+** LOADED DATA **
+** LOADED DATA TO GPU **
+==PROF== Profiling "build_hashtable_s" - 0: 0%....50%....100% - 73 passes
+==PROF== Profiling "build_hashtable_c" - 1: 0%....50%....100% - 73 passes
+==PROF== Profiling "build_hashtable_p" - 2: 0%....50%....100% - 73 passes
+==PROF== Profiling "build_hashtable_d" - 3: 0%....50%....100% - 73 passes
+==PROF== Profiling "probe" - 4: 0%....50%....100% - 74 passes
+Result:
+1992 1 103225040658
+1993 1 105193302842
+1994 1 103837804124
+1995 1 103659981621
+1996 1 103616722233
+1997 1 103157005314
+1998 1 61159340206
+1992 2 106678259181
+1993 2 105849020253
+1994 2 106216978529
+1995 2 107035371791
+1996 2 105292362331
+1997 2 105381211263
+1998 2 61616122837
+1992 3 106953585129
+1993 3 106242432020
+1994 3 105405953212
+1995 3 106496045663
+1996 3 106452120723
+1997 3 106618275297
+1998 3 61766210322
+1992 17 103623138817
+1993 17 104974876956
+1994 17 103731557899
+1995 17 103730419480
+1996 17 104874194133
+1997 17 102847514868
+1998 17 61002354487
+1992 24 106223564390
+1993 24 105649036141
+1994 24 106076726307
+1995 24 105177111217
+1996 24 103976579696
+1997 24 104638539353
+1998 24 60962148771
+Res Count: 35
+Time Taken Total: 3177.99
+{"query":41,"time_query":3177.86}
+==PROF== Disconnected from process 3449
+[3449] crystal_q41@127.0.0.1
+  void build_hashtable_s<(int)128, (int)4>(int *, int *, int, int *, int) (40, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 7.0
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/usecond       766.67
+    SM Frequency            cycle/nsecond         1.15
+    Elapsed Cycles                  cycle         4964
+    Memory Throughput                   %         7.03
+    DRAM Throughput                     %         7.03
+    Duration                      usecond         4.32
+    L1/TEX Cache Throughput             %         6.27
+    L2 Cache Throughput                 %         2.63
+    SM Active Cycles                cycle      1432.25
+    Compute (SM) Throughput             %         1.76
+    ----------------------- ------------- ------------
+
+    OPT   This kernel grid is too small to fill the available resources on this device, resulting in only 0.0 full      
+          waves across all SMs. Look at Launch Statistics for more details.                                             
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 2:1. The kernel achieved 0% of  
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.23
+    Executed Ipc Elapsed  inst/cycle         0.07
+    Issue Slots Busy               %         6.11
+    Issued Ipc Active     inst/cycle         0.24
+    SM Busy                        %         6.11
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 96.11%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second        55.17
+    Mem Busy                     %         2.41
+    Max Bandwidth                %         7.03
+    L1/TEX Hit Rate              %            0
+    L2 Hit Rate                  %         4.04
+    Mem Pipes Busy               %         0.56
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %         6.33
+    Issued Warp Per Scheduler                        0.06
+    No Eligible                            %        93.67
+    Active Warps Per Scheduler          warp         1.01
+    Eligible Warps Per Scheduler        warp         0.06
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 92.97%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 15.8 cycles. This might leave hardware resources underutilized and may lead to    
+          less optimal performance. Out of the maximum of 16 warps per scheduler, this kernel allocates an average of   
+          1.01 active warps per scheduler, but only an average of 0.06 warps were eligible per cycle. Eligible warps    
+          are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible   
+          warp results in no instruction being issued and the issue slot remains unused. To increase the number of      
+          eligible warps, reduce the time the active warps are stalled by inspecting the top stall reasons on the Warp  
+          State Statistics and Source Counters sections.                                                                
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        16.02
+    Warp Cycles Per Executed Instruction           cycle        16.89
+    Avg. Active Threads Per Warp                                16.20
+    Avg. Not Predicated Off Threads Per Warp                    15.19
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 0.9262%                                                                                         
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 16.2 threads being active per cycle. This is further reduced    
+          to 15.2 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst        82.96
+    Executed Instructions                           inst        26546
+    Avg. Issued Instructions Per Scheduler          inst        87.46
+    Issued Instructions                             inst        27986
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                     40
+    Registers Per Thread             register/thread              16
+    Shared Memory Configuration Size            byte               0
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread            5120
+    Waves Per SM                                                0.03
+    -------------------------------- --------------- ---------------
+
+    OPT   Est. Speedup: 50%                                                                                             
+          The grid for this launch is configured to execute only 40 blocks, which is less than the GPU's 80             
+          multiprocessors. This can underutilize some multiprocessors. If you do not intend to execute this kernel      
+          concurrently with other workloads, consider reducing the block size to have at least one block per            
+          multiprocessor or increase the size of the grid to fully utilize the available hardware resources. See the    
+          Hardware Model (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-hw-model)            
+          description for more details on launch configurations.                                                        
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           32
+    Block Limit Registers                 block           32
+    Block Limit Shared Mem                block           32
+    Block Limit Warps                     block           16
+    Theoretical Active Warps per SM        warp           64
+    Theoretical Occupancy                     %          100
+    Achieved Occupancy                        %         6.10
+    Achieved Active Warps Per SM           warp         3.90
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 92.97%                                                                                          
+          The difference between calculated theoretical (100.0%) and measured achieved occupancy (6.1%) can be the      
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.11
+    Branch Instructions              inst         2876
+    Branch Efficiency                   %        58.57
+    Avg. Divergent Branches                       1.47
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 6.76%                                                                                           
+          This kernel has uncoalesced global accesses resulting in a total of 1441 excessive sectors (20% of the total  
+          7183 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations. The  
+          CUDA Programming Guide                                                                                        
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void build_hashtable_c<(int)128, (int)4>(int *, int *, int *, int, int *, int) (586, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 7.0
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/usecond       738.93
+    SM Frequency            cycle/nsecond         1.10
+    Elapsed Cycles                  cycle        13484
+    Memory Throughput                   %        60.11
+    DRAM Throughput                     %        60.11
+    Duration                      usecond        12.29
+    L1/TEX Cache Throughput             %        37.49
+    L2 Cache Throughput                 %        24.86
+    SM Active Cycles                cycle     11023.86
+    Compute (SM) Throughput             %        10.24
+    ----------------------- ------------- ------------
+
+    OPT   Memory is more heavily utilized than Compute: Look at the Memory Workload Analysis section to identify the    
+          DRAM bottleneck. Check memory replay (coalescing) metrics to make sure you're efficiently utilizing the       
+          bytes transferred. Also consider whether it is possible to do more work per memory access (kernel fusion) or  
+          whether there are values you can (re)compute.                                                                 
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 2:1. The kernel achieved 0% of  
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.49
+    Executed Ipc Elapsed  inst/cycle         0.40
+    Issue Slots Busy               %        12.51
+    Issued Ipc Active     inst/cycle         0.50
+    SM Busy                        %        12.51
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 91.92%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second       454.85
+    Mem Busy                     %        24.67
+    Max Bandwidth                %        60.11
+    L1/TEX Hit Rate              %         0.21
+    L2 Hit Rate                  %        23.18
+    Mem Pipes Busy               %         4.78
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 0.1989%                                                                                         
+          The memory access pattern for global stores in L1TEX might not be optimal. On average, this kernel accesses   
+          4.0 bytes per thread per memory request; but the address pattern, possibly caused by the stride between       
+          threads, results in 4.8 sectors per request, or 4.8*32 = 154.1 bytes of cache data transfers per request.     
+          The optimal thread address pattern for 4.0 byte accesses would result in 4.0*32 = 128.0 bytes of cache data   
+          transfers per request, to maximize L1TEX cache performance. Check the Source Counters section for             
+          uncoalesced global stores.                                                                                    
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 2.392%                                                                                          
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 2.3 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        12.86
+    Issued Warp Per Scheduler                        0.13
+    No Eligible                            %        87.14
+    Active Warps Per Scheduler          warp         5.95
+    Eligible Warps Per Scheduler        warp         0.17
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 39.89%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 7.8 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 16 warps per scheduler, this kernel allocates an average of   
+          5.95 active warps per scheduler, but only an average of 0.17 warps were eligible per cycle. Eligible warps    
+          are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible   
+          warp results in no instruction being issued and the issue slot remains unused. To increase the number of      
+          eligible warps, reduce the time the active warps are stalled by inspecting the top stall reasons on the Warp  
+          State Statistics and Source Counters sections.                                                                
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        46.24
+    Warp Cycles Per Executed Instruction           cycle        47.08
+    Avg. Active Threads Per Warp                                16.40
+    Avg. Not Predicated Off Threads Per Warp                    15.53
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 39.89%                                                                                          
+          On average, each warp of this kernel spends 23.0 cycles being stalled waiting for a scoreboard dependency on  
+          a L1TEX (local, global, surface, texture) operation. Find the instruction producing the data being waited     
+          upon to identify the culprit. To reduce the number of cycles waiting on L1TEX data accesses verify the        
+          memory access patterns are optimal for the target architecture, attempt to increase cache hit rates by        
+          increasing data locality (coalescing), or by changing the cache configuration. Consider moving frequently     
+          used data to shared memory. This stall type represents about 49.8% of the total average of 46.2 cycles        
+          between issuing two instructions.                                                                             
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 5.268%                                                                                          
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 16.4 threads being active per cycle. This is further reduced    
+          to 15.5 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst      1354.60
+    Executed Instructions                           inst       433471
+    Avg. Issued Instructions Per Scheduler          inst      1379.10
+    Issued Instructions                             inst       441313
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                    586
+    Registers Per Thread             register/thread              18
+    Shared Memory Configuration Size            byte               0
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread           75008
+    Waves Per SM                                                0.46
+    -------------------------------- --------------- ---------------
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           32
+    Block Limit Registers                 block           21
+    Block Limit Shared Mem                block           32
+    Block Limit Warps                     block           16
+    Theoretical Active Warps per SM        warp           64
+    Theoretical Occupancy                     %          100
+    Achieved Occupancy                        %        36.80
+    Achieved Active Warps Per SM           warp        23.55
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 39.89%                                                                                          
+          The difference between calculated theoretical (100.0%) and measured achieved occupancy (36.8%) can be the     
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.11
+    Branch Instructions              inst        46897
+    Branch Efficiency                   %        62.60
+    Avg. Divergent Branches                      21.95
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 26.57%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 68196 excessive sectors (34% of the total 
+          202634 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations.    
+          The CUDA Programming Guide                                                                                    
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void build_hashtable_p<(int)128, (int)4>(int *, int *, int, int *, int) (1563, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 7.0
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/usecond       704.74
+    SM Frequency            cycle/nsecond         1.04
+    Elapsed Cycles                  cycle        19802
+    Memory Throughput                   %        73.50
+    DRAM Throughput                     %        73.50
+    Duration                      usecond        18.91
+    L1/TEX Cache Throughput             %        41.05
+    L2 Cache Throughput                 %        27.10
+    SM Active Cycles                cycle     17691.62
+    Compute (SM) Throughput             %        18.31
+    ----------------------- ------------- ------------
+
+    OPT   Memory is more heavily utilized than Compute: Look at the Memory Workload Analysis section to identify the    
+          DRAM bottleneck. Check memory replay (coalescing) metrics to make sure you're efficiently utilizing the       
+          bytes transferred. Also consider whether it is possible to do more work per memory access (kernel fusion) or  
+          whether there are values you can (re)compute.                                                                 
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 2:1. The kernel achieved 0% of  
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.81
+    Executed Ipc Elapsed  inst/cycle         0.73
+    Issue Slots Busy               %        20.46
+    Issued Ipc Active     inst/cycle         0.82
+    SM Busy                        %        20.46
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 86.3%                                                                                     
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second       530.43
+    Mem Busy                     %        25.59
+    Max Bandwidth                %        73.50
+    L1/TEX Hit Rate              %            0
+    L2 Hit Rate                  %         5.04
+    Mem Pipes Busy               %         5.54
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        20.54
+    Issued Warp Per Scheduler                        0.21
+    No Eligible                            %        79.46
+    Active Warps Per Scheduler          warp        11.37
+    Eligible Warps Per Scheduler        warp         0.31
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 26.5%                                                                                     
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 4.9 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 16 warps per scheduler, this kernel allocates an average of   
+          11.37 active warps per scheduler, but only an average of 0.31 warps were eligible per cycle. Eligible warps   
+          are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible   
+          warp results in no instruction being issued and the issue slot remains unused. To increase the number of      
+          eligible warps, reduce the time the active warps are stalled by inspecting the top stall reasons on the Warp  
+          State Statistics and Source Counters sections.                                                                
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        55.34
+    Warp Cycles Per Executed Instruction           cycle        55.72
+    Avg. Active Threads Per Warp                                20.83
+    Avg. Not Predicated Off Threads Per Warp                    19.27
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 26.5%                                                                                           
+          On average, each warp of this kernel spends 29.1 cycles being stalled waiting for a scoreboard dependency on  
+          a L1TEX (local, global, surface, texture) operation. Find the instruction producing the data being waited     
+          upon to identify the culprit. To reduce the number of cycles waiting on L1TEX data accesses verify the        
+          memory access patterns are optimal for the target architecture, attempt to increase cache hit rates by        
+          increasing data locality (coalescing), or by changing the cache configuration. Consider moving frequently     
+          used data to shared memory. This stall type represents about 52.6% of the total average of 55.3 cycles        
+          between issuing two instructions.                                                                             
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 7.288%                                                                                          
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 20.8 threads being active per cycle. This is further reduced    
+          to 19.3 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst      3594.60
+    Executed Instructions                           inst      1150272
+    Avg. Issued Instructions Per Scheduler          inst      3618.85
+    Issued Instructions                             inst      1158033
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                   1563
+    Registers Per Thread             register/thread              19
+    Shared Memory Configuration Size            byte               0
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread          200064
+    Waves Per SM                                                1.22
+    -------------------------------- --------------- ---------------
+
+    OPT   Est. Speedup: 50%                                                                                             
+          A wave of thread blocks is defined as the maximum number of blocks that can be executed in parallel on the    
+          target GPU. The number of blocks in a wave depends on the number of multiprocessors and the theoretical       
+          occupancy of the kernel. This kernel launch results in 1 full waves and a partial wave of 283 thread blocks.  
+          Under the assumption of a uniform execution duration of all thread blocks, the partial wave may account for   
+          up to 50.0% of the total kernel runtime with a lower occupancy of 29.7%. Try launching a grid with no         
+          partial wave. The overall impact of this tail effect also lessens with the number of full waves executed for  
+          a grid. See the Hardware Model                                                                                
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-hw-model) description for more      
+          details on launch configurations.                                                                             
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           32
+    Block Limit Registers                 block           21
+    Block Limit Shared Mem                block           32
+    Block Limit Warps                     block           16
+    Theoretical Active Warps per SM        warp           64
+    Theoretical Occupancy                     %          100
+    Achieved Occupancy                        %        70.29
+    Achieved Active Warps Per SM           warp        44.98
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 26.5%                                                                                           
+          The difference between calculated theoretical (100.0%) and measured achieved occupancy (70.3%) can be the     
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.11
+    Branch Instructions              inst       125048
+    Branch Efficiency                   %        62.52
+    Avg. Divergent Branches                      58.60
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 16.04%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 57145 excessive sectors (19% of the total 
+          308036 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations.    
+          The CUDA Programming Guide                                                                                    
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void build_hashtable_d<(int)128, (int)4>(int *, int *, int, int *, int, int) (5, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 7.0
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/usecond       656.72
+    SM Frequency            cycle/usecond       977.61
+    Elapsed Cycles                  cycle         4193
+    Memory Throughput                   %         1.54
+    DRAM Throughput                     %         1.54
+    Duration                      usecond         4.29
+    L1/TEX Cache Throughput             %        25.61
+    L2 Cache Throughput                 %         1.34
+    SM Active Cycles                cycle       157.39
+    Compute (SM) Throughput             %         0.22
+    ----------------------- ------------- ------------
+
+    OPT   This kernel grid is too small to fill the available resources on this device, resulting in only 0.0 full      
+          waves across all SMs. Look at Launch Statistics for more details.                                             
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 2:1. The kernel achieved 0% of  
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         0.22
+    Executed Ipc Elapsed  inst/cycle         0.01
+    Issue Slots Busy               %         5.88
+    Issued Ipc Active     inst/cycle         0.24
+    SM Busy                        %         5.88
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 95.85%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second        10.37
+    Mem Busy                     %         0.86
+    Max Bandwidth                %         1.54
+    L1/TEX Hit Rate              %         3.07
+    L2 Hit Rate                  %        41.51
+    Mem Pipes Busy               %         0.16
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 0.02128%                                                                                        
+          The memory access pattern for global stores in L1TEX might not be optimal. On average, this kernel accesses   
+          4.0 bytes per thread per memory request; but the address pattern, possibly caused by the stride between       
+          threads, results in 9.0 sectors per request, or 9.0*32 = 288.0 bytes of cache data transfers per request.     
+          The optimal thread address pattern for 4.0 byte accesses would result in 4.0*32 = 128.0 bytes of cache data   
+          transfers per request, to maximize L1TEX cache performance. Check the Source Counters section for             
+          uncoalesced global stores.                                                                                    
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 0.1124%                                                                                         
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 2.5 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %         6.13
+    Issued Warp Per Scheduler                        0.06
+    No Eligible                            %        93.87
+    Active Warps Per Scheduler          warp         0.99
+    Eligible Warps Per Scheduler        warp         0.06
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 93.87%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 16.3 cycles. This might leave hardware resources underutilized and may lead to    
+          less optimal performance. Out of the maximum of 16 warps per scheduler, this kernel allocates an average of   
+          0.99 active warps per scheduler, which already limits the scheduler to less than a warp per instruction.      
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        16.21
+    Warp Cycles Per Executed Instruction           cycle        17.25
+    Avg. Active Threads Per Warp                                31.93
+    Avg. Not Predicated Off Threads Per Warp                    28.35
+    ---------------------------------------- ----------- ------------
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst         8.69
+    Executed Instructions                           inst         2782
+    Avg. Issued Instructions Per Scheduler          inst         9.26
+    Issued Instructions                             inst         2962
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                      5
+    Registers Per Thread             register/thread              30
+    Shared Memory Configuration Size            byte               0
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread             640
+    Waves Per SM                                                0.00
+    -------------------------------- --------------- ---------------
+
+    OPT   Est. Speedup: 93.75%                                                                                          
+          The grid for this launch is configured to execute only 5 blocks, which is less than the GPU's 80              
+          multiprocessors. This can underutilize some multiprocessors. If you do not intend to execute this kernel      
+          concurrently with other workloads, consider reducing the block size to have at least one block per            
+          multiprocessor or increase the size of the grid to fully utilize the available hardware resources. See the    
+          Hardware Model (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-hw-model)            
+          description for more details on launch configurations.                                                        
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           32
+    Block Limit Registers                 block           16
+    Block Limit Shared Mem                block           32
+    Block Limit Warps                     block           16
+    Theoretical Active Warps per SM        warp           64
+    Theoretical Occupancy                     %          100
+    Achieved Occupancy                        %         6.10
+    Achieved Active Warps Per SM           warp         3.91
+    ------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 93.87%                                                                                          
+          The difference between calculated theoretical (100.0%) and measured achieved occupancy (6.1%) can be the      
+          result of warp scheduling overheads or workload imbalances during the kernel execution. Load imbalances can   
+          occur between warps within a block as well as across blocks of the same kernel. See the CUDA Best Practices   
+          Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on     
+          optimizing occupancy.                                                                                         
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.06
+    Branch Instructions              inst          166
+    Branch Efficiency                   %        97.73
+    Avg. Divergent Branches                       0.01
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 6.634%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 812 excessive sectors (39% of the total   
+          2092 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations. The  
+          CUDA Programming Guide                                                                                        
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
+  void probe<(int)128, (int)4>(int *, int *, int *, int *, int *, int *, int, int *, int, int *, int, int *, int, int *, int, int *) (117161, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 7.0
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/usecond       875.89
+    SM Frequency            cycle/nsecond         1.31
+    Elapsed Cycles                  cycle      2715257
+    Memory Throughput                   %        98.24
+    DRAM Throughput                     %        98.24
+    Duration                      msecond         2.07
+    L1/TEX Cache Throughput             %        38.61
+    L2 Cache Throughput                 %        44.74
+    SM Active Cycles                cycle   2709322.20
+    Compute (SM) Throughput             %        27.37
+    ----------------------- ------------- ------------
+
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing DRAM in the Memory Workload Analysis section.                                              
+
+    Section: GPU Speed Of Light Roofline Chart
+    INF   The ratio of peak float (fp32) to double (fp64) performance on this device is 2:1. The kernel achieved 0% of  
+          this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide       
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline      
+          analysis.                                                                                                     
+
+    Section: Compute Workload Analysis
+    -------------------- ----------- ------------
+    Metric Name          Metric Unit Metric Value
+    -------------------- ----------- ------------
+    Executed Ipc Active   inst/cycle         1.10
+    Executed Ipc Elapsed  inst/cycle         1.09
+    Issue Slots Busy               %        27.43
+    Issued Ipc Active     inst/cycle         1.10
+    SM Busy                        %        27.43
+    -------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 82.06%                                                                                    
+          All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps   
+          per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.             
+
+    Section: Memory Workload Analysis
+    ----------------- ------------ ------------
+    Metric Name        Metric Unit Metric Value
+    ----------------- ------------ ------------
+    Memory Throughput Gbyte/second       881.09
+    Mem Busy                     %        44.74
+    Max Bandwidth                %        98.24
+    L1/TEX Hit Rate              %        38.83
+    L2 Hit Rate                  %        31.19
+    Mem Pipes Busy               %         9.15
+    ----------------- ------------ ------------
+
+    Section: Memory Workload Analysis Chart
+    WRN   The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an  
+          additional metric could enable the rule to provide more guidance.                                             
+
+    Section: Memory Workload Analysis Tables
+    OPT   Est. Speedup: 1.066%                                                                                          
+          The memory access pattern for global loads in L1TEX might not be optimal. On average, this kernel accesses    
+          4.7 bytes per thread per memory request; but the address pattern, possibly caused by the stride between       
+          threads, results in 6.8 sectors per request, or 6.8*32 = 218.9 bytes of cache data transfers per request.     
+          The optimal thread address pattern for 4.7 byte accesses would result in 4.7*32 = 150.1 bytes of cache data   
+          transfers per request, to maximize L1TEX cache performance. Check the Source Counters section for             
+          uncoalesced global loads.                                                                                     
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 22.64%                                                                                          
+          The memory access pattern for loads from L1TEX to L2 is not optimal. The granularity of an L1TEX request to   
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 1.9 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced loads and try to minimize how many cache lines need to be accessed per memory         
+          request.                                                                                                      
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 0.7463%                                                                                         
+          The memory access pattern for stores from L1TEX to L2 is not optimal. The granularity of an L1TEX request to  
+          L2 is a 128 byte cache line. That is 4 consecutive 32-byte sectors per L2 request. However, this kernel only  
+          accesses an average of 1.0 sectors out of the possible 4 sectors per cache line. Check the Source Counters    
+          section for uncoalesced stores and try to minimize how many cache lines need to be accessed per memory        
+          request.                                                                                                      
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 70.09%                                                                                          
+          The memory access pattern for loads from device memory causes 57,115,204 sectors to be read from DRAM, which  
+          is 1.1x of the 51,057,653 sectors which cause a miss in the L2 cache. The DRAM fetch granularity for read     
+          misses in L2 is 64 bytes, i.e. the lower or upper half of an L2 cache line. Try changing your access pattern  
+          to make use of both sectors returned by a DRAM read request for optimal usage of the DRAM throughput. For     
+          strided memory reads, avoid strides of 64 bytes or larger to avoid moving unused sectors from DRAM to L2.     
+
+    Section: Scheduler Statistics
+    ---------------------------- ----------- ------------
+    Metric Name                  Metric Unit Metric Value
+    ---------------------------- ----------- ------------
+    One or More Eligible                   %        27.43
+    Issued Warp Per Scheduler                        0.27
+    No Eligible                            %        72.57
+    Active Warps Per Scheduler          warp        14.58
+    Eligible Warps Per Scheduler        warp         0.51
+    ---------------------------- ----------- ------------
+
+    OPT   Est. Local Speedup: 1.763%                                                                                    
+          Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 3.6 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 16 warps per scheduler, this kernel allocates an average of   
+          14.58 active warps per scheduler, but only an average of 0.51 warps were eligible per cycle. Eligible warps   
+          are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible   
+          warp results in no instruction being issued and the issue slot remains unused. To increase the number of      
+          eligible warps, avoid possible load imbalances due to highly different execution durations per warp.          
+          Reducing stalls indicated on the Warp State Statistics and Source Counters sections can help, too.            
+
+    Section: Warp State Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Warp Cycles Per Issued Instruction             cycle        53.16
+    Warp Cycles Per Executed Instruction           cycle        53.16
+    Avg. Active Threads Per Warp                                18.12
+    Avg. Not Predicated Off Threads Per Warp                    16.89
+    ---------------------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 1.763%                                                                                          
+          On average, each warp of this kernel spends 45.1 cycles being stalled waiting for a scoreboard dependency on  
+          a L1TEX (local, global, surface, texture) operation. Find the instruction producing the data being waited     
+          upon to identify the culprit. To reduce the number of cycles waiting on L1TEX data accesses verify the        
+          memory access patterns are optimal for the target architecture, attempt to increase cache hit rates by        
+          increasing data locality (coalescing), or by changing the cache configuration. Consider moving frequently     
+          used data to shared memory. This stall type represents about 84.8% of the total average of 53.2 cycles        
+          between issuing two instructions.                                                                             
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on         
+          sampling data. The Kernel Profiling Guide                                                                     
+          (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details    
+          on each stall reason.                                                                                         
+    ----- --------------------------------------------------------------------------------------------------------------
+    OPT   Est. Speedup: 12.93%                                                                                          
+          Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 18.1 threads being active per cycle. This is further reduced    
+          to 16.9 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible.                 
+
+    Section: Instruction Statistics
+    ---------------------------------------- ----------- ------------
+    Metric Name                              Metric Unit Metric Value
+    ---------------------------------------- ----------- ------------
+    Avg. Executed Instructions Per Scheduler        inst    742969.15
+    Executed Instructions                           inst    237750128
+    Avg. Issued Instructions Per Scheduler          inst    743036.20
+    Issued Instructions                             inst    237771585
+    ---------------------------------------- ----------- ------------
+
+    Section: Launch Statistics
+    -------------------------------- --------------- ---------------
+    Metric Name                          Metric Unit    Metric Value
+    -------------------------------- --------------- ---------------
+    Block Size                                                   128
+    Function Cache Configuration                     CachePreferNone
+    Grid Size                                                 117161
+    Registers Per Thread             register/thread              32
+    Shared Memory Configuration Size            byte               0
+    Driver Shared Memory Per Block        byte/block               0
+    Dynamic Shared Memory Per Block       byte/block               0
+    Static Shared Memory Per Block        byte/block               0
+    Threads                                   thread        14996608
+    Waves Per SM                                               91.53
+    -------------------------------- --------------- ---------------
+
+    Section: Occupancy
+    ------------------------------- ----------- ------------
+    Metric Name                     Metric Unit Metric Value
+    ------------------------------- ----------- ------------
+    Block Limit SM                        block           32
+    Block Limit Registers                 block           16
+    Block Limit Shared Mem                block           32
+    Block Limit Warps                     block           16
+    Theoretical Active Warps per SM        warp           64
+    Theoretical Occupancy                     %          100
+    Achieved Occupancy                        %        91.18
+    Achieved Active Warps Per SM           warp        58.36
+    ------------------------------- ----------- ------------
+
+    Section: Source Counters
+    ------------------------- ----------- ------------
+    Metric Name               Metric Unit Metric Value
+    ------------------------- ----------- ------------
+    Branch Instructions Ratio           %         0.11
+    Branch Instructions              inst     26921784
+    Branch Efficiency                   %        63.70
+    Avg. Divergent Branches                   13291.05
+    ------------------------- ----------- ------------
+
+    OPT   Est. Speedup: 49.27%                                                                                          
+          This kernel has uncoalesced global accesses resulting in a total of 58762179 excessive sectors (49% of the    
+          total 118979581 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source      
+          locations. The CUDA Programming Guide                                                                         
+          (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional      
+          information on reducing uncoalesced device memory accesses.                                                   
+
diff --git a/scripts/ssb_on_duckdb.py b/scripts/ssb_on_duckdb.py
new file mode 100644
index 0000000..fa59ce4
--- /dev/null
+++ b/scripts/ssb_on_duckdb.py
@@ -0,0 +1,9 @@
+from ssb_on_duckdb import *
+
+if __name__ == '__main__':  # run only when executed as a script, not on import
+    # run_query(Query11)
+    # run_query(Query12)
+    # run_query(Query13)
+    run_query(Query21)  # only Query21 is enabled; the three runs above are commented out
+
+
diff --git a/scripts/ssb_on_duckdb/__init__.py b/scripts/ssb_on_duckdb/__init__.py
new file mode 100644
index 0000000..94d93ee
--- /dev/null
+++ b/scripts/ssb_on_duckdb/__init__.py
@@ -0,0 +1,33 @@
+import duckdb
+
+from .load import *
+from .query_11 import *
+from .query_12 import *
+from .query_13 import *
+from .query_21 import *
+
+
+class Query:  # pairs a query's display name with the function that executes it
+    name = " "  # class-level placeholder; __init__ assigns the per-instance name
+    run = ""  # class-level placeholder; __init__ assigns the per-instance callable
+    # `query` is the name printed by run_query; `run` is invoked there as run(con).
+    def __init__(self, query, run):
+        self.name = query
+        self.run = run
+
+
+Query11 = Query("query_11", run_modified_query_11)  # runs MODIFIED_QUERY_11, not the original QUERY_11
+Query12 = Query("query_12", run_query_12)
+Query13 = Query("query_13", run_query_13)
+Query21 = Query("query_21", run_query_21)
+
+
+def run_query(query):
+    sf = [1]
+    print("-- name : {0}".format(query.name))
+
+    for sf in SFs:
+        print("---- sf : {0}".format(SFs[sf]))
+        con = duckdb.connect()
+        load_table(con, SFs[sf])
+        query.run(con)
diff --git a/scripts/ssb_on_duckdb/load.py b/scripts/ssb_on_duckdb/load.py
new file mode 100644
index 0000000..2227b3b
--- /dev/null
+++ b/scripts/ssb_on_duckdb/load.py
@@ -0,0 +1,93 @@
+SFs = {
+    # scale-factor key -> suffix of the data directory ("s<suffix>") that load_table reads from
+    1: "1"
+}
+
+CREATE_CUSTOMER_QUERY = """
+  CREATE TABLE customer AS SELECT *
+  FROM
+  read_csv('{0}/customer.tbl'
+     ,AUTO_DETECT=TRUE
+     ,delim = '|')
+      """  # {0} is filled with the data directory by load_table; no column list is declared (AUTO_DETECT)
+
+CREATE_DATE_TABLE = '''
+        CREATE TABLE ddate (
+        d_datekey integer not null,
+       d_date char(18) not null,
+       d_dayofweek char(9) not null,
+       d_month char(9) not null,
+       d_year integer not null,
+       d_yearmonthnum integer not null,
+       d_yearmonth char(7) not null,
+       d_daynuminweek integer not null,
+       d_daynuminmonth integer not null,
+       d_daynuminyear integer not null,
+       d_monthnuminyear integer not null,
+       d_weeknuminyear integer not null,
+       d_sellingseasin varchar(12) not null,
+       d_lastdayinweekfl integer not null,
+       d_lastdayinmonthfl integer not null,
+       d_holidayfl integer not null,
+       d_weekdayfl integer not null);
+       COPY ddate FROM '{0}/date.tbl' WITH (HEADER false, DELIMITER '|');
+    '''
+
+CREATE_LINEORDER_TABLE = '''
+        CREATE TABLE lineorder (
+        LO_ORDERKEY UINT32, 
+        LO_LINENUMBER  UINT8,  
+        LO_CUSTKEY  UINT32,  
+        LO_PARTKEY  UINT32,
+        LO_SUPPKEY  UINT32, 
+        LO_ORDERDATE   INT32,  
+        LO_ORDERPRIORITY   string, 
+        LO_SHIPPRIORITY   UINT8,  
+        LO_QUANTITY   INT32,  
+        LO_EXTENDEDPRICE   INT32, 
+        LO_ORDTOTALPRICE   UINT32,  
+        LO_DISCOUNT   INT32,  
+        LO_REVENUE   UINT32, 
+        LO_SUPPLYCOST   UINT32, 
+        LO_TAX   UINT8,  
+        LO_COMMITDATE   UINT64, 
+        LO_SHIPMODE   string);
+        COPY lineorder FROM '{0}/lineorder.tbl' WITH (HEADER false, DELIMITER '|');
+        '''  # {0} is filled with the data directory by load_table; creates the table, then bulk-loads it
+
+CREATE_PART_TABLE = '''
+        CREATE TABLE part (
+        P_PARTKEY       UINT32,
+        P_NAME          String,
+        P_MFGR          String,
+        P_CATEGORY      String,
+        P_BRAND1        String,
+        P_COLOR         String,
+        P_TYPE          String,
+        P_SIZE          UINT32,
+        P_CONTAINER     String
+           );
+       COPY part FROM '{0}/part.tbl.p' WITH (HEADER false, DELIMITER '|');
+    '''  # {0} is filled with the data directory by load_table; note the input file is part.tbl.p, not part.tbl
+
+CREATE_SUPPLIER_TABLE = '''
+        CREATE TABLE supplier (
+        S_SUPPKEY       UINT32,
+        S_NAME          String,
+        S_ADDRESS       String,
+        S_CITY          String,
+        S_NATION        String,
+        S_REGION        String,
+        S_PHONE         String
+           );
+       COPY supplier FROM '{0}/supplier.tbl.p' WITH (HEADER false, DELIMITER '|');
+    '''  # {0} is filled with the data directory by load_table; note the input file is supplier.tbl.p
+
+
+def load_table(con, sf):
+    relative_path = "../../gpu/data/ssb/data/s{0}/".format(sf)
+    con.sql(CREATE_CUSTOMER_QUERY.format(relative_path))
+    con.sql(CREATE_LINEORDER_TABLE.format(relative_path))
+    con.sql(CREATE_PART_TABLE.format(relative_path))
+    con.sql(CREATE_SUPPLIER_TABLE.format(relative_path))
+    con.sql(CREATE_DATE_TABLE.format(relative_path))
diff --git a/scripts/ssb_on_duckdb/query_11.py b/scripts/ssb_on_duckdb/query_11.py
new file mode 100644
index 0000000..7471705
--- /dev/null
+++ b/scripts/ssb_on_duckdb/query_11.py
@@ -0,0 +1,26 @@
+QUERY_11 = """
+  select sum(lo_extendedprice * lo_discount) as revenue
+  from lineorder
+  where lo_orderdate >= 19930101 and lo_orderdate <= 19940101 and lo_discount>=1
+  and lo_discount<=3
+  and lo_quantity<25;
+  """  # executed by run_query_11; MODIFIED_QUERY_11 differs only in the lo_orderdate bounds
+
+# The result of this query matches fls_q11.
+MODIFIED_QUERY_11 = """
+  select sum(lo_extendedprice * lo_discount) as revenue
+  from lineorder
+  where lo_orderdate >= 19930000 and lo_orderdate <= 19940000 and lo_discount>=1
+  and lo_discount<=3
+  and lo_quantity<25;
+  """
+
+
+def run_query_11(con):
+    tbl = con.sql(QUERY_11).fetch_arrow_table()
+    print(tbl.to_pandas())
+
+
+def run_modified_query_11(con):
+    tbl = con.sql(MODIFIED_QUERY_11).fetch_arrow_table()
+    print(tbl.to_pandas())
diff --git a/scripts/ssb_on_duckdb/query_12.py b/scripts/ssb_on_duckdb/query_12.py
new file mode 100644
index 0000000..dbc6ed0
--- /dev/null
+++ b/scripts/ssb_on_duckdb/query_12.py
@@ -0,0 +1,13 @@
+QUERY_12 = """
+select sum(lo_extendedprice * lo_discount) as revenue
+from lineorder
+where lo_orderdate >= 19940101 and lo_orderdate <= 19940131
+and lo_discount>=4 and lo_discount<=6
+and lo_quantity>=26
+and lo_quantity<=35;
+  """  # single-table aggregate over lineorder; executed verbatim by run_query_12
+
+
+def run_query_12(con):
+    tbl = con.sql(QUERY_12).fetch_arrow_table()
+    print(tbl.to_pandas())
diff --git a/scripts/ssb_on_duckdb/query_13.py b/scripts/ssb_on_duckdb/query_13.py
new file mode 100644
index 0000000..f768dff
--- /dev/null
+++ b/scripts/ssb_on_duckdb/query_13.py
@@ -0,0 +1,15 @@
+QUERY_13 = """
+select sum(lo_extendedprice * lo_discount) as revenue
+from lineorder
+where lo_orderdate >= 19940204
+and lo_orderdate <= 19940210
+and lo_discount>=5
+and lo_discount<=7
+and lo_quantity>=26
+and lo_quantity<=35;
+  """  # single-table aggregate over lineorder; executed verbatim by run_query_13
+
+
+def run_query_13(con):
+    tbl = con.sql(QUERY_13).fetch_arrow_table()
+    print(tbl.to_pandas())
diff --git a/scripts/ssb_on_duckdb/query_21.py b/scripts/ssb_on_duckdb/query_21.py
new file mode 100644
index 0000000..2d18e3b
--- /dev/null
+++ b/scripts/ssb_on_duckdb/query_21.py
@@ -0,0 +1,15 @@
+QUERY_21 = """
+select sum(lo_revenue),d_year,p_brand1
+from lineorder,part,supplier,ddate
+where lo_orderdate = d_datekey
+and lo_partkey = p_partkey
+and lo_suppkey = s_suppkey
+and p_category = 1
+and s_region = 1
+group by d_year,p_brand1;
+  """  # NOTE(review): load.py declares p_category/s_region as String but they are compared to 1 here — confirm the .tbl.p files hold integer codes
+
+
+def run_query_21(con):
+    tbl = con.sql(QUERY_21).fetch_arrow_table()
+    print(tbl.to_pandas())
diff --git a/tile_based/CMakeLists.txt b/tile_based/CMakeLists.txt
new file mode 100644
index 0000000..febd4f0
--- /dev/null
+++ b/tile_based/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(src)  # this directory only forwards to src/CMakeLists.txt
diff --git a/tile_based/README.md b/tile_based/README.md
new file mode 100644
index 0000000..9433f9c
--- /dev/null
+++ b/tile_based/README.md
@@ -0,0 +1 @@
+# does not work.
\ No newline at end of file
diff --git a/tile_based/src/CMakeLists.txt b/tile_based/src/CMakeLists.txt
new file mode 100644
index 0000000..ef92a93
--- /dev/null
+++ b/tile_based/src/CMakeLists.txt
@@ -0,0 +1,43 @@
+add_library(tilebased STATIC tile_based.cu)  # static library that every enabled target below links against
+target_include_directories(tilebased PUBLIC include)
+
+# RLE : ----------------------------------------------------------------------------------------------------------------
+#add_executable(tile_based_binpack_query_11 tile_based_binpack_query_11.cu)
+#target_link_libraries(tile_based_binpack_query_11 PUBLIC fastlanes tilebased)
+
+# FOR : ----------------------------------------------------------------------------------------------------------------
+#add_executable(tile_based_binpack tile_based_binpack.cu)
+#target_include_directories(tile_based_binpack PUBLIC include)
+
+#add_executable(tile_based_bench_bitpack tile_based_bench_bitpack.cu)
+#target_link_libraries(tile_based_bench_bitpack PUBLIC tilebased)
+
+add_executable(tile_based_bench_bp_sum tile_based_bench_bp_sum.cu)
+target_link_libraries(tile_based_bench_bp_sum PUBLIC tilebased)
+
+#add_executable(tile_based_bitpack_shared_memory tile_based_bitpack_shared_memory.cu)
+#target_link_libraries(tile_based_bitpack_shared_memory PRIVATE fastlanes gtest_main fastlanes_gpu tilebased)
+
+# Delta : --------------------------------------------------------------------------------------------------------------
+add_executable(tile_based_bench_delta tile_based_bench_delta.cu)
+target_link_libraries(tile_based_bench_delta PUBLIC tilebased)
+
+#add_executable(tile_based_bench_delta_sum tile_based_bench_delta_sum.cu)
+#target_link_libraries(tile_based_bench_delta_sum PUBLIC tilebased)
+
+# RLE : ----------------------------------------------------------------------------------------------------------------
+#add_executable(test_rle_on_cpu test_rle_on_cpu.cpp)
+#target_link_libraries(test_rle_on_cpu PUBLIC tilebased)
+
+# Disabled: loadEncodedColumnToGPURLE does not exist.
+#add_executable(tile_based_bench_rle tile_based_bench_rle.cu)
+#target_link_libraries(tile_based_bench_rle PUBLIC tilebased)
+
+add_executable(rlebinpack rlebinpack.cpp)
+target_link_libraries(rlebinpack PUBLIC tilebased)
+
+add_executable(test_match_rle test_match_rle.cu)
+target_link_libraries(test_match_rle PUBLIC tilebased)
+
+add_executable(test_perf_rle test_perf_rle.cu)
+target_link_libraries(test_perf_rle PUBLIC tilebased)
\ No newline at end of file
diff --git a/tile_based/src/config.hpp b/tile_based/src/config.hpp
new file mode 100644
index 0000000..48ae12c
--- /dev/null
+++ b/tile_based/src/config.hpp
@@ -0,0 +1,213 @@
+/*
+-- DATE : 25/09/2023
+-- FILE_PATH : tile_based/src/config.hpp
+-- PROJECT_NAME : fastlanes_fileformat
+*/
+
+#ifndef FASTLANES_FILEFORMAT_CONFIG_HPP
+#define FASTLANES_FILEFORMAT_CONFIG_HPP
+
+#include <fcntl.h>
+#include <fstream>
+#include <iostream>
+#include <istream>
+#include <string>
+#include <sys/stat.h>
+
+namespace tile_based {
+int delta = 7; // NOTE(review): these are non-inline, non-const globals defined in a header — confirm only one translation unit includes it
+/* Used in gen.cpp */
+int num_bits = 3; // also part of the file names built by get_gen_file_path / get_binpack_file_path / get_binfos_file_path
+/* Used in gen_d1.cpp */
+int num_distinct = 28;
+/* Used in gen_d2.cpp */
+int mean = 4;
+/* Used in gen_d3.cpp */
+int alpha = 4;
+
+struct encoded_column {
+	// block_start[i] = position at which the ith block starts (described as a byte offset — TODO confirm the unit)
+	uint* block_start;
+	// raw encoded data, read verbatim from the column file by loadEncodedColumn
+	uint* data;
+	// number of bytes of raw data
+	uint64_t data_size;
+};
+
+// Linear search: index of the first of arr[0..len) equal to val, or -1. `inline` because this is a header.
+inline int index_of(std::string* arr, int len, std::string val) {
+	int pos = 0;
+	while (pos < len && !(arr[pos] == val)) { ++pos; }
+
+	return (pos < len) ? pos : -1;
+}
+
+// Maps a column name to its file stem "<TABLE><column index>"; 16 / 6 / 7 / 8 - not integer columns
+inline std::string lookup(std::string col_name) {
+	std::string lineorder[] = {"lo_orderkey",
+	                           "lo_linenumber",
+	                           "lo_custkey",
+	                           "lo_partkey",
+	                           "lo_suppkey",
+	                           "lo_orderdate",
+	                           "lo_orderpriority",
+	                           "lo_shippriority",
+	                           "lo_quantity",
+	                           "lo_extendedprice",
+	                           "lo_ordtotalprice",
+	                           "lo_discount",
+	                           "lo_revenue",
+	                           "lo_supplycost",
+	                           "lo_tax",
+	                           "lo_commitdate",
+	                           "lo_shipmode"};
+	std::string part[]      = {
+        "p_partkey", "p_name", "p_mfgr", "p_category", "p_brand1", "p_color", "p_type", "p_size", "p_container"};
+	std::string supplier[] = {"s_suppkey", "s_name", "s_address", "s_city", "s_nation", "s_region", "s_phone"};
+	std::string customer[] = {
+	    "c_custkey", "c_name", "c_address", "c_city", "c_nation", "c_region", "c_phone", "c_mktsegment"};
+	std::string date[] = {"d_datekey",
+	                      "d_date",
+	                      "d_dayofweek",
+	                      "d_month",
+	                      "d_year",
+	                      "d_yearmonthnum",
+	                      "d_yearmonth",
+	                      "d_daynuminweek",
+	                      "d_daynuminmonth",
+	                      "d_daynuminyear",
+	                      "d_sellingseason",
+	                      "d_lastdayinweekfl",
+	                      "d_lastdayinmonthfl",
+	                      "d_holidayfl",
+	                      "d_weekdayfl"};
+
+	// Resolves col_name within one table; a name the table lacks is fatal instead of yielding "<TABLE>-1".
+	auto file_stem = [&col_name](const std::string& table, std::string* columns, int len) -> std::string {
+		int index = index_of(columns, len, col_name);
+		if (index < 0) {
+			std::cout << "Unknown column " << col_name << '\n';
+			exit(1);
+		}
+		return table + std::to_string(index);
+	};
+
+	// The first letter of the column name selects the table.
+	switch (col_name[0]) {
+	case 'l': return file_stem("LINEORDER", lineorder, 17);
+	case 's': return file_stem("SUPPLIER", supplier, 7);
+	case 'c': return file_stem("CUSTOMER", customer, 8);
+	case 'p': return file_stem("PART", part, 9);
+	case 'd': return file_stem("DDATE", date, 15);
+	case 't': return "tile_based_" + col_name + "_"; // test columns
+	default:
+		std::cout << "Unknown column " << col_name << '\n';
+		exit(1);
+	}
+
+	return "";
+}
+
+// Loads num_entries raw T values from file_path into a caller-owned new[] array; NULL if it cannot be opened.
+template <typename T>
+T* loadColumn(int num_entries, std::string file_path) {
+	std::ifstream col_data(file_path.c_str(), std::ios::in | std::ios::binary);
+	if (!col_data) { return NULL; } // checked before allocating, so the failure path no longer leaks the buffer
+	T* h_col = new T[num_entries];
+	col_data.read((char*)h_col, num_entries * sizeof(T));
+	return h_col;
+}
+
+// Writes num_entries raw T values from h_col to file_path; returns 0, or -1 if the file cannot be opened.
+template <typename T>
+int storeColumn(int num_entries, T* h_col, std::string file_path) {
+	std::ofstream sink(file_path.c_str(), std::ios::out | std::ios::binary);
+	const bool    opened = static_cast<bool>(sink);
+	if (opened) { sink.write((char*)h_col, num_entries * sizeof(T)); }
+	return opened ? 0 : -1;
+}
+
+// Returns "<data dir>binary/tile_based/", where <data dir> is $FLS_DATA_DIR_PATH or, when that
+// variable is unset, a hard-coded default path.
+inline std::string get_tile_based_dir() {
+	const char* env_value = std::getenv("FLS_DATA_DIR_PATH");
+
+	// todo : remove the hard-coded fallback and fail instead:
+	//		throw std::runtime_error("FLS_DATA_DIR_PATH IS NOT SET.");
+	const bool  has_env = (env_value != nullptr);
+	std::string base(has_env ? env_value : "/home/ubuntu/fff/benchmark/data/");
+
+	return base + "binary/tile_based/";
+}
+
+// Path of the generated column file: <tile_based dir>gen_<num_bits>.tle
+inline std::string get_gen_file_path() {
+	const std::string suffix = "_" + std::to_string(tile_based::num_bits) + ".tle";
+
+	return get_tile_based_dir() + "gen" + suffix;
+}
+
+// Path of the bit-packed column file: <tile_based dir>binpack_<num_bits>.tle
+inline std::string get_binpack_file_path() {
+	const std::string suffix = "_" + std::to_string(tile_based::num_bits) + ".tle";
+
+	return get_tile_based_dir() + "binpack" + suffix;
+}
+
+// Path of the offsets file: <tile_based dir>binofs_<num_bits>.tle (the stem is "binofs", unlike this function's name)
+inline std::string get_binfos_file_path() {
+	const std::string suffix = "_" + std::to_string(tile_based::num_bits) + ".tle";
+
+	return get_tile_based_dir() + "binofs" + suffix;
+}
+
+/***
+ * Loads the bit-packed column file and its block-offset file from disk into memory.
+ * Both arrays of the returned encoded_column are new[]-allocated and owned by the caller.
+ **/
+inline encoded_column loadEncodedColumn(int num_entries) {
+	std::string filename         = get_binpack_file_path();
+	std::string offsets_filename = get_binfos_file_path();
+
+	std::ifstream col_data(filename.c_str(), std::ios::in | std::ios::binary);
+	if (!col_data) {
+		std::cout << "Unable to open encoded column file " << filename << std::endl;
+		exit(1);
+	}
+
+	// Get size of file. stat() on the path replaces the earlier open()/fstat() pair, which never
+	// closed its descriptor and read the stat buffer without checking that fstat() had succeeded.
+	struct stat s;
+	if (stat(filename.c_str(), &s) != 0) {
+		std::cout << "Unable to stat encoded column file " << filename << std::endl;
+		exit(1);
+	}
+	uint64_t filesize = s.st_size;
+
+	encoded_column col;
+	// Round up to whole words so a size that is not a multiple of 4 cannot overrun the buffer.
+	col.data = new uint[(filesize + 3) / 4];
+	col_data.read((char*)col.data, static_cast<std::streamsize>(filesize));
+	col_data.close();
+	col.data_size = filesize;
+
+	// num_blocks + 1 offsets are read: one per 128-value block of the column padded to a whole tile, plus one.
+	int block_size      = 128;
+	int elem_per_thread = 4;
+	int tile_size       = block_size * elem_per_thread;
+	int adjusted_len    = ((num_entries + tile_size - 1) / tile_size) * tile_size;
+	int num_blocks      = adjusted_len / block_size;
+
+	col.block_start = new uint[num_blocks + 1];
+	std::ifstream offsets_data(offsets_filename.c_str(), std::ios::in | std::ios::binary);
+	if (!offsets_data) {
+		std::cout << "Unable to open encoded column file " << offsets_filename << std::endl;
+		exit(1);
+	}
+
+	offsets_data.read((char*)col.block_start, (num_blocks + 1) * sizeof(int));
+	offsets_data.close();
+	return col;
+}
+} // namespace tile_based
+#endif // FASTLANES_FILEFORMAT_CONFIG_HPP
diff --git a/tile_based/src/include/binpack_kernel.cuh b/tile_based/src/include/binpack_kernel.cuh
new file mode 100644
index 0000000..f7c8761
--- /dev/null
+++ b/tile_based/src/include/binpack_kernel.cuh
@@ -0,0 +1,84 @@
+#pragma once
+
+__forceinline__ __device__ int decodeElement(int i, uint miniblock_index, uint index_into_miniblock, uint* data_block, uint* bitwidths, uint* offsets) {
+  // Reference for the frame
+  int reference = reinterpret_cast<int*>(data_block)[0];
+  // Word offset and bit width of the selected miniblock. (The parameter `i` is not used in this body.)
+  uint miniblock_offset = offsets[miniblock_index];
+  uint bitwidth = bitwidths[miniblock_index];
+  // Bit position of the element within its miniblock, and the word holding it (2 words are skipped first).
+  uint start_bitindex = (bitwidth * index_into_miniblock);
+  uint start_intindex = 2 + (start_bitindex >> 5);
+  // Keep only the bit position inside that 32-bit word.
+  start_bitindex = start_bitindex & (32-1);
+  // Join two adjacent words into 64 bits so an element straddling a word boundary is covered, then mask it out.
+  unsigned long long element_block = (((unsigned long long)data_block[miniblock_offset + start_intindex + 1]) << 32) | data_block[miniblock_offset + start_intindex];
+  uint element = (element_block >> start_bitindex) & ((1LL<<bitwidth) - 1LL);
+  // Decoded value = frame reference + unpacked bits.
+  return reference + element;
+}
+
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__forceinline__ __device__ void LoadBinPack(uint* block_start,
+    uint* data, uint* shared_buffer, uint32_t (&items)[ITEMS_PER_THREAD]) {
+  int tile_idx = blockIdx.x;
+  int threadId = threadIdx.x;
+  // Decodes ITEMS_PER_THREAD values per thread into `items`, staging everything through shared_buffer.
+  // Step 1: copy this tile's ITEMS_PER_THREAD + 1 block start offsets to the front of shared_buffer.
+  uint *block_starts = &shared_buffer[0];
+  if (threadId < ITEMS_PER_THREAD + 1) {
+    block_starts[threadIdx.x] = block_start[tile_idx * ITEMS_PER_THREAD + threadIdx.x];
+  }
+  __syncthreads();
+  // Step 2: stage the tile's encoded words after the bookkeeping area.
+  // Bookkeeping = (ITEMS_PER_THREAD + 1) starts + ITEMS_PER_THREAD*4 bitwidths + ITEMS_PER_THREAD*4 offsets
+  // (the original note "5 + 32" is this sum for ITEMS_PER_THREAD == 4).
+  uint* data_block = &shared_buffer[ITEMS_PER_THREAD + 1 + (ITEMS_PER_THREAD << 3)];
+  // NOTE(review): the (i << 7) stride below hard-codes 128 threads rather than BLOCK_THREADS — confirm callers use 128.
+  // Copy the words in [block_starts[0], block_starts[ITEMS_PER_THREAD]) from `data`.
+  uint start_offset = block_starts[0];
+  uint end_offset = block_starts[ITEMS_PER_THREAD];
+  for (int i=0; i<ITEMS_PER_THREAD; i++) {
+    uint index = start_offset + threadIdx.x + (i << 7); // i * 128
+    if (index < end_offset)
+      data_block[threadIdx.x + (i << 7)] = data[index];
+  }
+  __syncthreads();
+  // Step 3: unpack each block's four per-miniblock bit widths and derive each miniblock's word offset.
+  uint* bitwidths = &shared_buffer[ITEMS_PER_THREAD + 1];
+  uint* offsets = &shared_buffer[ITEMS_PER_THREAD + 1 + (ITEMS_PER_THREAD << 2)];
+  // One thread per (block i, miniblock) pair: 4 miniblocks for each of the ITEMS_PER_THREAD blocks.
+  if (threadId < (ITEMS_PER_THREAD << 2)) {
+    int i = threadId >> 2;
+    int miniblock_index = threadId & 3;
+    // Word 1 of block i packs the four miniblock bit widths, one per byte.
+    // Miniblock bitwidths
+    uint miniblock_bitwidths = *(data_block + block_starts[i] - block_starts[0] + 1);
+    // Shifted copies summed per byte give running totals: byte k holds the sum of the widths of miniblocks 0..k-1.
+    // Miniblock offsets (running totals) and this miniblock's own bit width
+    uint miniblock_offsets = (miniblock_bitwidths << 8) + (miniblock_bitwidths << 16) + (miniblock_bitwidths << 24);
+    uint miniblock_offset = (miniblock_offsets >> (miniblock_index << 3)) & 255;
+    uint bitwidth = (miniblock_bitwidths >> (miniblock_index << 3)) & 255;
+    // Publish both values for the decode step below.
+    offsets[threadId] = miniblock_offset;
+    bitwidths[threadId] = bitwidth;
+  }
+  __syncthreads();
+  // Step 4: each thread decodes one element from each of the ITEMS_PER_THREAD blocks.
+  // Index of the miniblock containing this thread's element
+  uint miniblock_index = threadIdx.x >> 5; // i / 32
+  // (32 elements per miniblock)
+  // Entry index in the miniblock
+  uint index_into_miniblock = threadIdx.x & (32 - 1);
+  // The commented-out lines are a disabled last-tile bounds check; every thread currently decodes unconditionally.
+  for (int i=0; i<ITEMS_PER_THREAD; i++) {
+    /*if (is_last_tile) {*/
+      /*if (threadIdx.x + i*128 < num_tile_items) {*/
+        /*items[i] = decodeElement(threadIdx.x, data_block + block_starts[i] - block_starts[0]);*/
+      /*}*/
+    /*}*/
+    /*else {*/
+      items[i] = decodeElement(threadIdx.x, miniblock_index, index_into_miniblock, data_block + block_starts[i] - block_starts[0], bitwidths + (i<<2), offsets + (i<<2));
+    /*}*/
+  }
+}
diff --git a/tile_based/src/include/crystal/crystal.cuh b/tile_based/src/include/crystal/crystal.cuh
new file mode 100644
index 0000000..ddce5b8
--- /dev/null
+++ b/tile_based/src/include/crystal/crystal.cuh
@@ -0,0 +1,9 @@
+#pragma once
+
+// Block-wide functions
+#include "load.cuh"
+#include "pred.cuh"
+#include "store.cuh"
+#include "reduce.cuh"
+#include "join.cuh"
+
diff --git a/tile_based/src/include/crystal/join.cuh b/tile_based/src/include/crystal/join.cuh
new file mode 100644
index 0000000..eabb74c
--- /dev/null
+++ b/tile_based/src/include/crystal/join.cuh
@@ -0,0 +1,311 @@
+#pragma once
+
+#define HASH(X,Y,Z) (((X)-(Z)) % (Y))  // slot index: key X minus offset Z, modulo table length Y
+
+// Probes a key-only hash table. For each item whose selection flag is set, the
+// flag becomes 1 when slot HASH(item, ht_len, keys_min) of `ht` is non-zero
+// and 0 otherwise. Items whose flag is already 0 are left untouched.
+// `tid` is not used by this overload.
+template<typename K, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockProbeDirectAndPHT_1(
+    int tid, K  (&items)[ITEMS_PER_THREAD],
+    int  (&selection_flags)[ITEMS_PER_THREAD],
+    K* ht, int ht_len, K keys_min) {
+  #pragma unroll
+  for (int idx = 0; idx < ITEMS_PER_THREAD; idx++) {
+    // Flag already cleared: nothing to probe.
+    if (!selection_flags[idx]) {
+      continue;
+    }
+
+    const int bucket = HASH(items[idx], ht_len, keys_min);
+    const K occupant = ht[bucket];
+
+    // A zero slot counts as "no match".
+    selection_flags[idx] = (occupant != 0) ? 1 : 0;
+  }
+}
+
+// Bounds-checked key-only probe for a partially filled tile. Item idx of this
+// thread is considered only if its tile position tid + idx * BLOCK_THREADS is
+// below num_items and its selection flag is set; the flag then becomes 1 when
+// slot HASH(item, ht_len, keys_min) of `ht` is non-zero and 0 otherwise.
+// All other flags are left untouched.
+template<typename K, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockProbeDirectAndPHT_1(
+    int tid, K  (&items)[ITEMS_PER_THREAD],
+    int  (&selection_flags)[ITEMS_PER_THREAD],
+    K* ht, int ht_len,
+    K keys_min, int num_items) {
+  #pragma unroll
+  for (int idx = 0; idx < ITEMS_PER_THREAD; idx++) {
+    // Skip positions at or beyond num_items and items with a cleared flag.
+    const bool in_range = tid + (idx * BLOCK_THREADS) < num_items;
+    if (!in_range || !selection_flags[idx]) {
+      continue;
+    }
+
+    const int bucket = HASH(items[idx], ht_len, keys_min);
+    const K occupant = ht[bucket];
+
+    // A zero slot counts as "no match".
+    selection_flags[idx] = (occupant != 0) ? 1 : 0;
+  }
+}
+
+// Block-level key-only probe. When num_items equals the full tile size
+// BLOCK_THREADS * ITEMS_PER_THREAD the unchecked overload of
+// BlockProbeDirectAndPHT_1 is used; otherwise the bounds-checked overload is
+// used. threadIdx.x is passed as the thread's position within the block.
+template<typename K, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockProbeAndPHT_1(
+    K  (&items)[ITEMS_PER_THREAD], int  (&selection_flags)[ITEMS_PER_THREAD],
+    K* ht, int ht_len, K keys_min, int num_items) {
+  const bool full_tile = (BLOCK_THREADS * ITEMS_PER_THREAD) == num_items;
+
+  if (!full_tile) {
+    BlockProbeDirectAndPHT_1<K, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, items, selection_flags, ht, ht_len, keys_min, num_items);
+    return;
+  }
+  BlockProbeDirectAndPHT_1<K, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, items, selection_flags, ht, ht_len, keys_min);
+}
+
+// Overload without keys_min: forwards to the six-argument BlockProbeAndPHT_1
+// with keys_min = 0, so keys are hashed without an offset.
+template<typename K, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockProbeAndPHT_1(
+    K  (&items)[ITEMS_PER_THREAD], int  (&selection_flags)[ITEMS_PER_THREAD],
+    K* ht, int ht_len, int num_items) {
+  // The literal 0 fills the keys_min parameter.
+  BlockProbeAndPHT_1<K, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, selection_flags, ht, ht_len, 0, num_items);
+}
+
+// Probes a key/value hash table. For each item whose selection flag is set,
+// the 64-bit word starting at ht[2 * HASH(key, ht_len, keys_min)] is read:
+// if it is non-zero its upper 32 bits are stored in res, otherwise the item's
+// flag is cleared. Items whose flag is already 0 are skipped.
+// NOTE(review): the 64-bit read presumably assumes a 4-byte K and an 8-byte
+// aligned table - confirm against the callers. `tid` is not used.
+template<typename K, typename V, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockProbeDirectAndPHT_2(
+    int tid, K  (&keys)[ITEMS_PER_THREAD], V  (&res)[ITEMS_PER_THREAD],
+    int  (&selection_flags)[ITEMS_PER_THREAD], K* ht, int ht_len, K keys_min) {
+  #pragma unroll
+  for (int idx = 0; idx < ITEMS_PER_THREAD; idx++) {
+    if (!selection_flags[idx]) {
+      continue;
+    }
+    const int bucket = HASH(keys[idx], ht_len, keys_min);
+    const uint64_t entry = *reinterpret_cast<uint64_t*>(&ht[bucket << 1]);
+    if (entry == 0) {
+      selection_flags[idx] = 0;
+    } else {
+      res[idx] = (entry >> 32);
+    }
+  }
+}
+
+// Bounds-checked key/value probe. Item idx of this thread is considered only
+// if its tile position tid + idx * BLOCK_THREADS is below num_items and its
+// selection flag is set. For such an item the 64-bit word starting at
+// ht[2 * HASH(item, ht_len, keys_min)] is read: if it is non-zero its upper
+// 32 bits are stored in res, otherwise the item's flag is cleared.
+// NOTE(review): the 64-bit read presumably assumes a 4-byte K and an 8-byte
+// aligned table - confirm against the callers.
+template<typename K, typename V, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockProbeDirectAndPHT_2(
+    int tid, K  (&items)[ITEMS_PER_THREAD], V  (&res)[ITEMS_PER_THREAD],
+    int  (&selection_flags)[ITEMS_PER_THREAD], K* ht, int ht_len, K keys_min,
+    int num_items) {
+  #pragma unroll
+  for (int idx = 0; idx < ITEMS_PER_THREAD; idx++) {
+    const bool in_range = tid + (idx * BLOCK_THREADS) < num_items;
+    if (!in_range || !selection_flags[idx]) {
+      continue;
+    }
+    const int bucket = HASH(items[idx], ht_len, keys_min);
+    const uint64_t entry = *reinterpret_cast<uint64_t*>(&ht[bucket << 1]);
+    if (entry == 0) {
+      selection_flags[idx] = 0;
+    } else {
+      res[idx] = (entry >> 32);
+    }
+  }
+}
+
+// Block-level key/value probe. When num_items equals the full tile size
+// BLOCK_THREADS * ITEMS_PER_THREAD the unchecked overload of
+// BlockProbeDirectAndPHT_2 is used; otherwise the bounds-checked overload is
+// used. threadIdx.x is passed as the thread's position within the block.
+template<typename K, typename V, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockProbeAndPHT_2(
+    K  (&keys)[ITEMS_PER_THREAD], V  (&res)[ITEMS_PER_THREAD],
+    int  (&selection_flags)[ITEMS_PER_THREAD], K* ht, int ht_len, K keys_min,
+    int num_items) {
+  const bool full_tile = (BLOCK_THREADS * ITEMS_PER_THREAD) == num_items;
+
+  if (!full_tile) {
+    BlockProbeDirectAndPHT_2<K, V, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, keys, res, selection_flags, ht, ht_len, keys_min, num_items);
+    return;
+  }
+  BlockProbeDirectAndPHT_2<K, V, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, keys, res, selection_flags, ht, ht_len, keys_min);
+}
+
+// Overload without keys_min: forwards to the seven-argument BlockProbeAndPHT_2
+// with keys_min = 0, so keys are hashed without an offset.
+template<typename K, typename V, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockProbeAndPHT_2(
+    K  (&keys)[ITEMS_PER_THREAD], V  (&res)[ITEMS_PER_THREAD],
+    int  (&selection_flags)[ITEMS_PER_THREAD], K* ht, int ht_len,
+    int num_items) {
+  // The literal 0 fills the keys_min parameter.
+  BlockProbeAndPHT_2<K, V, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, res, selection_flags, ht, ht_len, 0, num_items);
+}
+
+// Inserts the selected keys of this thread into a key-only hash table.
+// For each item whose selection flag is set, atomicCAS stores the key in slot
+// HASH(key, ht_len, keys_min) of `ht` if that slot currently holds 0; a slot
+// that is already non-zero is left unchanged. `tid` is not used.
+template<typename K, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockBuildDirectSelectivePHT_1(
+    int tid, K  (&keys)[ITEMS_PER_THREAD],
+    int  (&selection_flags)[ITEMS_PER_THREAD],
+    K* ht, int ht_len, K keys_min) {
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+    if (selection_flags[ITEM]) {
+      int hash = HASH(keys[ITEM], ht_len, keys_min);
+      // The previous slot value returned by atomicCAS is not needed here.
+      atomicCAS(&ht[hash], 0, keys[ITEM]);
+    }
+  }
+}
+
+// Bounds-checked insert into a key-only hash table. Item ITEM of this thread
+// is inserted only if its tile position tid + ITEM * BLOCK_THREADS is below
+// num_items and its selection flag is set. atomicCAS stores the key in slot
+// HASH(key, ht_len, keys_min) of `ht` if that slot currently holds 0; a slot
+// that is already non-zero is left unchanged.
+template<typename K, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockBuildDirectSelectivePHT_1(
+    int tid, K  (&items)[ITEMS_PER_THREAD],
+    int  (&selection_flags)[ITEMS_PER_THREAD],
+    K* ht, int ht_len, K keys_min, int num_items) {
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+    if (tid + (ITEM * BLOCK_THREADS) < num_items) {
+      if (selection_flags[ITEM]) {
+        int hash = HASH(items[ITEM], ht_len, keys_min);
+        // The previous slot value returned by atomicCAS is not needed here.
+        atomicCAS(&ht[hash], 0, items[ITEM]);
+      }
+    }
+  }
+}
+
+// Block-level insert into a key-only hash table. When num_items equals the
+// full tile size BLOCK_THREADS * ITEMS_PER_THREAD the unchecked overload of
+// BlockBuildDirectSelectivePHT_1 is used; otherwise the bounds-checked
+// overload is used. threadIdx.x is passed as the thread's block position.
+template<typename K, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockBuildSelectivePHT_1(
+    K  (&keys)[ITEMS_PER_THREAD], int  (&selection_flags)[ITEMS_PER_THREAD],
+    K* ht, int ht_len, K keys_min, int num_items) {
+  const bool full_tile = (BLOCK_THREADS * ITEMS_PER_THREAD) == num_items;
+
+  if (!full_tile) {
+    BlockBuildDirectSelectivePHT_1<K, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, keys, selection_flags, ht, ht_len, keys_min, num_items);
+    return;
+  }
+  BlockBuildDirectSelectivePHT_1<K, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, keys, selection_flags, ht, ht_len, keys_min);
+}
+
+// Overload without keys_min: forwards to the six-argument
+// BlockBuildSelectivePHT_1 with keys_min = 0 (keys hashed without an offset).
+template<typename K, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockBuildSelectivePHT_1(
+    K  (&keys)[ITEMS_PER_THREAD], int  (&selection_flags)[ITEMS_PER_THREAD],
+    K* ht, int ht_len, int num_items) {
+  // The literal 0 fills the keys_min parameter.
+  BlockBuildSelectivePHT_1<K, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, selection_flags, ht, ht_len, 0, num_items);
+}
+
+// Inserts the selected key/value pairs of this thread into a key/value hash
+// table laid out as consecutive (key, value) element pairs. For each item
+// whose selection flag is set, atomicCAS stores the key at
+// ht[2 * HASH(key, ht_len, keys_min)] if that element currently holds 0, and
+// res is then written to the following element regardless of whether the
+// compare-and-swap stored the key. `tid` is not used.
+template<typename K, typename V, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockBuildDirectSelectivePHT_2(
+    int tid, K  (&keys)[ITEMS_PER_THREAD], V  (&res)[ITEMS_PER_THREAD],
+    int  (&selection_flags)[ITEMS_PER_THREAD], K* ht, int ht_len, K keys_min) {
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+    if (selection_flags[ITEM]) {
+      int hash = HASH(keys[ITEM], ht_len, keys_min);
+      // The previous key returned by atomicCAS is not needed here.
+      atomicCAS(&ht[hash << 1], 0, keys[ITEM]);
+      ht[(hash << 1) + 1] = res[ITEM];
+    }
+  }
+}
+
+// Bounds-checked insert into a key/value hash table laid out as consecutive
+// (key, value) element pairs. Item ITEM of this thread is inserted only if its
+// tile position tid + ITEM * BLOCK_THREADS is below num_items and its flag is
+// set. atomicCAS stores the key at ht[2 * HASH(key, ht_len, keys_min)] if that
+// element currently holds 0, and res is then written to the following element
+// regardless of whether the compare-and-swap stored the key.
+template<typename K, typename V, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockBuildDirectSelectivePHT_2(
+    int tid, K  (&keys)[ITEMS_PER_THREAD], V  (&res)[ITEMS_PER_THREAD],
+    int  (&selection_flags)[ITEMS_PER_THREAD], K* ht, int ht_len, K keys_min,
+    int num_items) {
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+    if (tid + (ITEM * BLOCK_THREADS) < num_items) {
+      if (selection_flags[ITEM]) {
+        int hash = HASH(keys[ITEM], ht_len, keys_min);
+        // The previous key returned by atomicCAS is not needed here.
+        atomicCAS(&ht[hash << 1], 0, keys[ITEM]);
+        ht[(hash << 1) + 1] = res[ITEM];
+      }
+    }
+  }
+}
+
+// Block-level insert into a key/value hash table. When num_items equals the
+// full tile size BLOCK_THREADS * ITEMS_PER_THREAD the unchecked overload of
+// BlockBuildDirectSelectivePHT_2 is used; otherwise the bounds-checked
+// overload is used. threadIdx.x is passed as the thread's block position.
+template<typename K, typename V, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockBuildSelectivePHT_2(
+    K  (&keys)[ITEMS_PER_THREAD], V  (&res)[ITEMS_PER_THREAD],
+    int  (&selection_flags)[ITEMS_PER_THREAD], K* ht, int ht_len, K keys_min,
+    int num_items) {
+  const bool full_tile = (BLOCK_THREADS * ITEMS_PER_THREAD) == num_items;
+
+  if (!full_tile) {
+    BlockBuildDirectSelectivePHT_2<K, V, BLOCK_THREADS, ITEMS_PER_THREAD>(
+        threadIdx.x, keys, res, selection_flags, ht, ht_len, keys_min, num_items);
+    return;
+  }
+  BlockBuildDirectSelectivePHT_2<K, V, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      threadIdx.x, keys, res, selection_flags, ht, ht_len, keys_min);
+}
+
+// Overload without keys_min: forwards to the seven-argument
+// BlockBuildSelectivePHT_2 with keys_min = 0 (keys hashed without an offset).
+template<typename K, typename V, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockBuildSelectivePHT_2(
+    K  (&keys)[ITEMS_PER_THREAD], V  (&res)[ITEMS_PER_THREAD],
+    int  (&selection_flags)[ITEMS_PER_THREAD], K* ht, int ht_len,
+    int num_items) {
+  // The literal 0 fills the keys_min parameter.
+  BlockBuildSelectivePHT_2<K, V, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      keys, res, selection_flags, ht, ht_len, 0, num_items);
+}
diff --git a/tile_based/src/include/crystal/load.cuh b/tile_based/src/include/crystal/load.cuh
new file mode 100644
index 0000000..bf18fe8
--- /dev/null
+++ b/tile_based/src/include/crystal/load.cuh
@@ -0,0 +1,97 @@
+#pragma once
+
+// Loads ITEMS_PER_THREAD elements for one thread without bounds checks:
+// items[k] = block_itr[tid + k * BLOCK_THREADS] for every k.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockLoadDirect(
+    const unsigned int tid, T* block_itr, T  (&items)[ITEMS_PER_THREAD]) {
+  // First element read by this thread; later ones are BLOCK_THREADS apart.
+  T* const first = block_itr + tid;
+
+  #pragma unroll
+  for (int k = 0; k < ITEMS_PER_THREAD; k++) {
+    items[k] = *(first + k * BLOCK_THREADS);
+  }
+}
+
+// Bounds-checked load for a partially filled tile: items[k] is read from
+// block_itr[tid + k * BLOCK_THREADS] only when that position is below
+// num_items; other entries of items are left unmodified.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockLoadDirect(
+    const unsigned int tid, T* block_itr, T  (&items)[ITEMS_PER_THREAD],
+    int num_items) {
+  T* const first = block_itr + tid;
+  #pragma unroll
+  for (int k = 0; k < ITEMS_PER_THREAD; k++) {
+    const unsigned int position = tid + (k * BLOCK_THREADS);
+    if (position < num_items) {
+      items[k] = first[k * BLOCK_THREADS];
+    }
+  }
+}
+
+// Block-level load of one tile starting at inp. A full tile
+// (num_items == BLOCK_THREADS * ITEMS_PER_THREAD) uses the unchecked
+// BlockLoadDirect overload; any other count uses the bounds-checked one.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockLoad(
+    T* inp, T  (&items)[ITEMS_PER_THREAD], int num_items) {
+  const bool full_tile = (BLOCK_THREADS * ITEMS_PER_THREAD) == num_items;
+
+  if (!full_tile) {
+    BlockLoadDirect<T, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, inp, items, num_items);
+    return;
+  }
+  BlockLoadDirect<T, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, inp, items);
+}
+
+#if 0  // Disabled earlier variants of the loaders above; excluded from compilation.
+// NOTE(review): two parameter lists below lack a comma before "int num_items".
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockLoadDirect(
+    int tid,
+    T* block_itr,
+    T  (&items)[ITEMS_PER_THREAD]
+    ) {
+  T* thread_itr = block_itr + tid;
+
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+    items[ITEM] = thread_itr[ITEM * BLOCK_THREADS];
+  }
+}
+
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockLoadDirect(
+    int tid,
+    T* block_itr,
+    T  (&items)[ITEMS_PER_THREAD]
+    int num_items
+    ) {
+  T* thread_itr = block_itr + tid;
+
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+    if (tid + (ITEM * BLOCK_THREADS) < num_items) {
+      items[ITEM] = thread_itr[ITEM * BLOCK_THREADS];
+    }
+  }
+}
+
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockLoad(
+    T* inp,
+    T  (&items)[ITEMS_PER_THREAD]
+    int num_items
+    ) {
+  T* block_itr = inp + blockIdx.x * blockDim.x;
+
+  if ((BLOCK_THREADS * ITEMS_PER_THREAD) == num_items) {
+    BlockLoadDirect(threadIdx.x, block_itr, items);
+  } else {
+    BlockLoadDirect(threadIdx.x, block_itr, items, num_items);
+  }
+}
+
+#endif
diff --git a/tile_based/src/include/crystal/pred.cuh b/tile_based/src/include/crystal/pred.cuh
new file mode 100644
index 0000000..491f96e
--- /dev/null
+++ b/tile_based/src/include/crystal/pred.cuh
@@ -0,0 +1,335 @@
+#pragma once
+
+// Marks every item of the calling thread as selected by setting all
+// ITEMS_PER_THREAD selection flags to 1.
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void InitFlags(int  (&selection_flags)[ITEMS_PER_THREAD]) {
+  #pragma unroll
+  for (int idx = 0; idx < ITEMS_PER_THREAD; ++idx) {
+    selection_flags[idx] = 1;
+  }
+}
+
+// Unchecked predicate pass: evaluates select_op on each of the thread's
+// ITEMS_PER_THREAD items and stores the result as that item's selection flag,
+// overwriting the previous flag. `tid` is not used by this overload.
+template<typename T, typename SelectOp, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredDirect(
+    int tid, T  (&items)[ITEMS_PER_THREAD], SelectOp select_op,
+    int  (&selection_flags)[ITEMS_PER_THREAD]) {
+  #pragma unroll
+  for (int idx = 0; idx < ITEMS_PER_THREAD; ++idx) {
+    selection_flags[idx] = select_op(items[idx]);
+  }
+}
+
+// Bounds-checked predicate pass: item idx gets its flag overwritten with
+// select_op(item) only if its tile position tid + idx * BLOCK_THREADS is
+// below num_items. Flags of other positions are left unmodified.
+template<typename T, typename SelectOp, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredDirect(
+    int tid, T  (&items)[ITEMS_PER_THREAD], SelectOp select_op,
+    int  (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+  #pragma unroll
+  for (int idx = 0; idx < ITEMS_PER_THREAD; ++idx) {
+    const int position = tid + (idx * BLOCK_THREADS);
+    if (position < num_items) {
+      selection_flags[idx] = select_op(items[idx]);
+    }
+  }
+}
+
+// Block-level predicate: overwrites selection_flags with select_op(item).
+// A full tile (num_items == BLOCK_THREADS * ITEMS_PER_THREAD) uses the
+// unchecked overload; any other count uses the bounds-checked overload.
+template<typename T, typename SelectOp, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPred(
+    T  (&items)[ITEMS_PER_THREAD], SelectOp select_op,
+    int  (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+  const bool full_tile = (BLOCK_THREADS * ITEMS_PER_THREAD) == num_items;
+  if (!full_tile) {
+    BlockPredDirect<T, SelectOp, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, items, select_op, selection_flags, num_items);
+    return;
+  }
+  BlockPredDirect<T, SelectOp, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, items, select_op, selection_flags);
+}
+
+// Unchecked AND-combine: each flag becomes (old flag && select_op(item));
+// select_op is not evaluated for items whose flag is already 0.
+template<typename T, typename SelectOp, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredAndDirect(
+    int tid, T  (&items)[ITEMS_PER_THREAD], SelectOp select_op,
+    int  (&selection_flags)[ITEMS_PER_THREAD]) {
+  #pragma unroll
+  for (int idx = 0; idx < ITEMS_PER_THREAD; ++idx) {
+    const bool survives = selection_flags[idx] && select_op(items[idx]);
+    selection_flags[idx] = survives;
+  }
+}
+
+// Bounds-checked AND-combine: for tile positions tid + idx * BLOCK_THREADS
+// below num_items, the flag becomes (old flag && select_op(item)).
+template<typename T, typename SelectOp, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredAndDirect(
+    int tid, T  (&items)[ITEMS_PER_THREAD], SelectOp select_op,
+    int  (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+  #pragma unroll
+  for (int idx = 0; idx < ITEMS_PER_THREAD; ++idx) {
+    const int position = tid + (idx * BLOCK_THREADS);
+    if (position < num_items) {
+      const bool survives = selection_flags[idx] && select_op(items[idx]);
+      selection_flags[idx] = survives;
+    }
+  }
+}
+
+// Block-level AND-combine of selection_flags with select_op(item).
+// A full tile (num_items == BLOCK_THREADS * ITEMS_PER_THREAD) uses the
+// unchecked overload; any other count uses the bounds-checked overload.
+template<typename T, typename SelectOp, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredAnd(
+    T  (&items)[ITEMS_PER_THREAD], SelectOp select_op,
+    int  (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+  const bool full_tile = (BLOCK_THREADS * ITEMS_PER_THREAD) == num_items;
+  if (!full_tile) {
+    BlockPredAndDirect<T, SelectOp, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, items, select_op, selection_flags, num_items);
+    return;
+  }
+  BlockPredAndDirect<T, SelectOp, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, items, select_op, selection_flags);
+}
+
+// Unchecked OR-combine: each flag becomes (old flag || select_op(item));
+// select_op is not evaluated for items whose flag is already non-zero.
+template<typename T, typename SelectOp, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredOrDirect(
+    int tid, T  (&items)[ITEMS_PER_THREAD], SelectOp select_op,
+    int  (&selection_flags)[ITEMS_PER_THREAD]) {
+  #pragma unroll
+  for (int idx = 0; idx < ITEMS_PER_THREAD; ++idx) {
+    const bool selected = selection_flags[idx] || select_op(items[idx]);
+    selection_flags[idx] = selected;
+  }
+}
+
+// Bounds-checked OR-combine: for tile positions tid + idx * BLOCK_THREADS
+// below num_items, the flag becomes (old flag || select_op(item)).
+template<typename T, typename SelectOp, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredOrDirect(
+    int tid, T  (&items)[ITEMS_PER_THREAD], SelectOp select_op,
+    int  (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+  #pragma unroll
+  for (int idx = 0; idx < ITEMS_PER_THREAD; ++idx) {
+    const int position = tid + (idx * BLOCK_THREADS);
+    if (position < num_items) {
+      const bool selected = selection_flags[idx] || select_op(items[idx]);
+      selection_flags[idx] = selected;
+    }
+  }
+}
+
+// Block-level OR-combine of selection_flags with select_op(item).
+// A full tile (num_items == BLOCK_THREADS * ITEMS_PER_THREAD) uses the
+// unchecked overload; any other count uses the bounds-checked overload.
+template<typename T, typename SelectOp, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredOr(
+    T  (&items)[ITEMS_PER_THREAD], SelectOp select_op,
+    int  (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+  const bool full_tile = (BLOCK_THREADS * ITEMS_PER_THREAD) == num_items;
+  if (!full_tile) {
+    BlockPredOrDirect<T, SelectOp, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, items, select_op, selection_flags, num_items);
+    return;
+  }
+  BlockPredOrDirect<T, SelectOp, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, items, select_op, selection_flags);
+}
+
+// Unary predicate functor: operator() returns true when its argument is
+// strictly less than the value captured at construction.
+template<typename T>
+struct LessThan {
+  T compare;  // right-hand side of every comparison
+
+  __device__ __forceinline__ LessThan(T compare) : compare(compare) {}
+
+  __device__ __forceinline__ bool operator()(const T &a) const {
+    const bool is_below = a < compare;
+    return is_below;
+  }
+};
+
+// Unary predicate functor: operator() returns true when its argument is
+// strictly greater than the value captured at construction.
+template<typename T>
+struct GreaterThan {
+  T compare;  // right-hand side of every comparison
+
+  __device__ __forceinline__ GreaterThan(T compare) : compare(compare) {}
+
+  __device__ __forceinline__ bool operator()(const T &a) const {
+    const bool is_above = a > compare;
+    return is_above;
+  }
+};
+
+// Unary predicate functor: operator() returns true when its argument is
+// less than or equal to the value captured at construction.
+template<typename T>
+struct LessThanEq {
+  T compare;  // right-hand side of every comparison
+
+  __device__ __forceinline__ LessThanEq(T compare) : compare(compare) {}
+
+  __device__ __forceinline__ bool operator()(const T &a) const {
+    const bool at_most = a <= compare;
+    return at_most;
+  }
+};
+
+// Unary predicate functor: operator() returns true when its argument is
+// greater than or equal to the value captured at construction.
+template<typename T>
+struct GreaterThanEq {
+  T compare;  // right-hand side of every comparison
+
+  __device__ __forceinline__ GreaterThanEq(T compare) : compare(compare) {}
+
+  __device__ __forceinline__ bool operator()(const T &a) const {
+    const bool at_least = a >= compare;
+    return at_least;
+  }
+};
+
+// Unary predicate functor: operator() returns true when its argument
+// compares equal to the value captured at construction.
+template<typename T>
+struct Eq {
+  T compare;  // right-hand side of every comparison
+
+  __device__ __forceinline__ Eq(T compare) : compare(compare) {}
+
+  __device__ __forceinline__ bool operator()(const T &a) const {
+    const bool is_same = a == compare;
+    return is_same;
+  }
+};
+
+// Overwrites each processed item's flag with (item < compare) by running
+// BlockPred with a LessThan<T> functor.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredLT(
+    T  (&items)[ITEMS_PER_THREAD], T compare,
+    int  (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+  typedef LessThan<T> Op;
+  BlockPred<T, Op, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, Op(compare), selection_flags, num_items);
+}
+
+// ANDs each processed item's flag with (item < compare) by running
+// BlockPredAnd with a LessThan<T> functor.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredAndLT(
+    T  (&items)[ITEMS_PER_THREAD], T compare,
+    int  (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+  typedef LessThan<T> Op;
+  BlockPredAnd<T, Op, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, Op(compare), selection_flags, num_items);
+}
+
+// Overwrites each processed item's flag with (item > compare) by running
+// BlockPred with a GreaterThan<T> functor.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredGT(
+    T  (&items)[ITEMS_PER_THREAD], T compare,
+    int  (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+  typedef GreaterThan<T> Op;
+  BlockPred<T, Op, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, Op(compare), selection_flags, num_items);
+}
+
+// ANDs each processed item's flag with (item > compare) by running
+// BlockPredAnd with a GreaterThan<T> functor.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredAndGT(
+    T  (&items)[ITEMS_PER_THREAD], T compare,
+    int  (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+  typedef GreaterThan<T> Op;
+  BlockPredAnd<T, Op, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, Op(compare), selection_flags, num_items);
+}
+
+// Overwrites each processed item's flag with (item <= compare) by running
+// BlockPred with a LessThanEq<T> functor.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredLTE(
+    T  (&items)[ITEMS_PER_THREAD], T compare,
+    int  (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+  typedef LessThanEq<T> Op;
+  BlockPred<T, Op, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, Op(compare), selection_flags, num_items);
+}
+
+// ANDs each processed item's flag with (item <= compare) by running
+// BlockPredAnd with a LessThanEq<T> functor.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredAndLTE(
+    T  (&items)[ITEMS_PER_THREAD], T compare,
+    int  (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+  typedef LessThanEq<T> Op;
+  BlockPredAnd<T, Op, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, Op(compare), selection_flags, num_items);
+}
+
+// Overwrites each processed item's flag with (item >= compare) by running
+// BlockPred with a GreaterThanEq<T> functor.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredGTE(
+    T  (&items)[ITEMS_PER_THREAD], T compare,
+    int  (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+  typedef GreaterThanEq<T> Op;
+  BlockPred<T, Op, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, Op(compare), selection_flags, num_items);
+}
+
+// ANDs each processed item's flag with (item >= compare) by running
+// BlockPredAnd with a GreaterThanEq<T> functor.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredAndGTE(
+    T  (&items)[ITEMS_PER_THREAD], T compare,
+    int  (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+  typedef GreaterThanEq<T> Op;
+  BlockPredAnd<T, Op, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, Op(compare), selection_flags, num_items);
+}
+
+// Overwrites each processed item's flag with (item == compare) by running
+// BlockPred with an Eq<T> functor.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredEQ(
+    T  (&items)[ITEMS_PER_THREAD], T compare,
+    int  (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+  typedef Eq<T> Op;
+  BlockPred<T, Op, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, Op(compare), selection_flags, num_items);
+}
+
+// ANDs each processed item's flag with (item == compare) by running
+// BlockPredAnd with an Eq<T> functor.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredAndEQ(
+    T  (&items)[ITEMS_PER_THREAD], T compare,
+    int  (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+  typedef Eq<T> Op;
+  BlockPredAnd<T, Op, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, Op(compare), selection_flags, num_items);
+}
+
+// ORs each processed item's flag with (item == compare) by running
+// BlockPredOr with an Eq<T> functor.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockPredOrEQ(
+    T  (&items)[ITEMS_PER_THREAD], T compare,
+    int  (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
+  typedef Eq<T> Op;
+  BlockPredOr<T, Op, BLOCK_THREADS, ITEMS_PER_THREAD>(
+      items, Op(compare), selection_flags, num_items);
+}
+
diff --git a/tile_based/src/include/crystal/reduce.cuh b/tile_based/src/include/crystal/reduce.cuh
new file mode 100644
index 0000000..66de689
--- /dev/null
+++ b/tile_based/src/include/crystal/reduce.cuh
@@ -0,0 +1,45 @@
+#pragma once
+
+template <typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ T BlockSum(T item, T* shared) { // Block-wide sum of one value per thread.
+	__syncthreads(); // barrier before `shared` is written below
+	// `shared` receives one partial sum per warp, indexed by warp id.
+	T         val       = item;
+	const int warp_size = 32;
+	int       lane      = threadIdx.x % warp_size;
+	int       wid       = threadIdx.x / warp_size;
+
+	// Sum across the warp with shuffles; lane 0 ends up holding the warp total
+	for (int offset = 16; offset > 0; offset /= 2) {
+		val += __shfl_down_sync(0xffffffff, val, offset);
+	}
+
+	// Lane 0 of each warp publishes its warp total
+	if (lane == 0) { shared[wid] = val; }
+
+	__syncthreads();
+
+	// Threads with index below the warp count pick up one warp total each; all others take 0
+	val = (threadIdx.x < blockDim.x / warp_size) ? shared[lane] : 0;
+
+	// First warp reduces the warp totals the same way
+	if (wid == 0) {
+		for (int offset = 16; offset > 0; offset /= 2) {
+			val += __shfl_down_sync(0xffffffff, val, offset);
+		}
+	}
+	// Thread 0 returns the block total; other threads return partial sums or 0 (assumes blockDim.x is a multiple of 32 - TODO confirm)
+	return val;
+}
+
+template <typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ T BlockSum(T (&items)[ITEMS_PER_THREAD], T* shared) {
+	// Sum this thread's items first, then reduce the per-thread sums block-wide.
+	T thread_sum = 0;
+#pragma unroll
+	for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+		thread_sum += items[ITEM];
+	}
+	// Template arguments are spelled out: only T is deducible from this call.
+	return BlockSum<T, BLOCK_THREADS, ITEMS_PER_THREAD>(thread_sum, shared);
+}
diff --git a/tile_based/src/include/crystal/store.cuh b/tile_based/src/include/crystal/store.cuh
new file mode 100644
index 0000000..a99d5b4
--- /dev/null
+++ b/tile_based/src/include/crystal/store.cuh
@@ -0,0 +1,98 @@
+#pragma once
+
+// Stores ITEMS_PER_THREAD elements for one thread without bounds checks:
+// block_itr[tid + k * BLOCK_THREADS] = items[k] for every k.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockStoreDirect(
+    int tid, T* block_itr, T  (&items)[ITEMS_PER_THREAD]) {
+  // First element written by this thread; later ones are BLOCK_THREADS apart.
+  T* const first = block_itr + tid;
+
+  #pragma unroll
+  for (int k = 0; k < ITEMS_PER_THREAD; k++) {
+    *(first + k * BLOCK_THREADS) = items[k];
+  }
+}
+
+// Bounds-checked store for a partially filled tile: items[k] is written to
+// block_itr[tid + k * BLOCK_THREADS] only when that position is below
+// num_items; nothing is written for other positions.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockStoreDirect(
+    int tid, T* block_itr, T  (&items)[ITEMS_PER_THREAD], int num_items) {
+  T* const first = block_itr + tid;
+
+  #pragma unroll
+  for (int k = 0; k < ITEMS_PER_THREAD; k++) {
+    const int position = tid + (k * BLOCK_THREADS);
+    if (position < num_items) {
+      first[k * BLOCK_THREADS] = items[k];
+    }
+  }
+}
+
+// Block-level store of one tile starting at out. A full tile
+// (num_items == BLOCK_THREADS * ITEMS_PER_THREAD) uses the unchecked
+// BlockStoreDirect overload; any other count uses the bounds-checked one.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockStore(
+    T* out, T  (&items)[ITEMS_PER_THREAD], int num_items) {
+  const bool full_tile = (BLOCK_THREADS * ITEMS_PER_THREAD) == num_items;
+
+  if (!full_tile) {
+    BlockStoreDirect<T, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, out, items, num_items);
+    return;
+  }
+  BlockStoreDirect<T, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, out, items);
+}
+
+#if 0  // Disabled earlier variants; excluded from compilation.
+// NOTE(review): bodies below read into items (load logic), and two parameter lists lack a comma.
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockStoreDirect(
+    int tid,
+    T* block_itr,
+    T  (&items)[ITEMS_PER_THREAD]
+    ) {
+  T* thread_itr = block_itr + tid;
+
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+    items[ITEM] = thread_itr[ITEM * BLOCK_THREADS];
+  }
+}
+
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockStoreDirect(
+    int tid,
+    T* block_itr,
+    T  (&items)[ITEMS_PER_THREAD]
+    int num_items
+    ) {
+  T* thread_itr = block_itr + tid;
+
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
+    if (tid + (ITEM * BLOCK_THREADS) < num_items) {
+      items[ITEM] = thread_itr[ITEM * BLOCK_THREADS];
+    }
+  }
+}
+
+template<typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void BlockStore(
+    T* inp,
+    T  (&items)[ITEMS_PER_THREAD]
+    int num_items
+    ) {
+  T* block_itr = inp + blockIdx.x * blockDim.x;
+
+  if ((BLOCK_THREADS * ITEMS_PER_THREAD) == num_items) {
+    BlockStoreDirect(threadIdx.x, block_itr, items);
+  } else {
+    BlockStoreDirect(threadIdx.x, block_itr, items, num_items);
+  }
+}
+
+#endif
+
diff --git a/tile_based/src/include/deltabinpack_kernel.cuh b/tile_based/src/include/deltabinpack_kernel.cuh
new file mode 100644
index 0000000..bf1c60c
--- /dev/null
+++ b/tile_based/src/include/deltabinpack_kernel.cuh
@@ -0,0 +1,103 @@
+#pragma once
+#include <cub/cub.cuh>
+using namespace cub;
+
+__forceinline__ __device__ int decodeElementDBin(int i, uint* data_block) { // Decodes element i of one bit-packed block.
+	// Word 0 of the block, read as a signed int: the reference added to every decoded element
+	int reference = reinterpret_cast<int*>(data_block)[0];
+
+	// Index of the 32-element miniblock containing i
+	uint miniblock_index = i/32;
+
+	// Word 1 packs one 8-bit bit width per miniblock, miniblock 0 in the lowest byte
+	uint miniblock_bitwidths = data_block[1];
+
+	// Word offset of the miniblock: 32 elements of width w occupy w words, so sum the preceding widths
+	uint miniblock_offset = 0;
+	for (int j=0; j<miniblock_index; j++) {
+		miniblock_offset += (miniblock_bitwidths & 255);
+		miniblock_bitwidths >>= 8;
+	}
+
+	// Bit width of the miniblock that holds element i (lowest byte after the shifts above)
+	uint bitwidth = miniblock_bitwidths & 255;
+
+	// Entry index in the miniblock
+	uint index_into_miniblock = i & (32 - 1);
+	// Bit position inside the miniblock and the word holding its first bit (+2 skips words 0 and 1)
+	uint start_bitindex = (bitwidth * index_into_miniblock);
+	uint start_intindex = 2 + start_bitindex/32;
+	// Combine two adjacent words so an element that crosses a word boundary is fully covered
+	unsigned long long element_block = (((unsigned long long)data_block[miniblock_offset + start_intindex + 1]) << 32) | data_block[miniblock_offset + start_intindex];
+	start_bitindex = start_bitindex & (32-1);
+	// Keep `bitwidth` bits starting at the in-word bit offset
+	uint element = (element_block & (((1LL<<bitwidth) - 1LL) << start_bitindex)) >> start_bitindex;
+
+	return reference + element;
+}
+
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD> // NOTE(review): neither parameter is used; sizes are hard-coded as 128 threads x 4 items
+__forceinline__ __device__ void LoadDBinPack(uint* block_start,
+                                             uint* data, uint* shared_buffer, int (&items)[4], bool is_last_tile, int num_tile_items) {
+	typedef cub::BlockExchange<int, 128, 4> BlockExchange;
+	// Decodes this tile's 4 blocks with decodeElementDBin, then replaces items with their block-wide inclusive prefix sum.
+	// Specialize BlockScan for a 1D block of 128 threads on type int
+	typedef cub::BlockScan<int, 128> BlockScan;
+
+	int tile_idx = blockIdx.x;
+
+	// Copy the 5 consecutive block-start offsets of this tile (4 blocks plus the end of the last one) into shared memory.
+	uint *block_starts = &shared_buffer[0];
+	if (threadIdx.x < 5) {
+		block_starts[threadIdx.x] = block_start[tile_idx * 4 + threadIdx.x];
+	}
+	__syncthreads();
+
+	// Staging area for the tile's encoded words, placed right after the 5 offsets
+	uint* data_block = &shared_buffer[5];
+
+	// Copy the words from one before the first block start up to the last block end; each thread copies up to 4 words
+	uint start_offset = block_starts[0] - 1;
+	uint end_offset = block_starts[4];
+	for (int i=0; i<4; i++) {
+		uint index = start_offset + threadIdx.x + i*128;
+		if (index < end_offset)
+			data_block[threadIdx.x + i*128] = data[index];
+	}
+	__syncthreads();
+	// The extra leading word is kept as first_value; data_block then points at the first block
+	int first_value = data_block[0];
+	data_block = data_block + 1;
+	// Thread t decodes element t of each of the 4 blocks; on the last tile only positions below num_tile_items
+	for (int i=0; i<4; i++) {
+		if (is_last_tile) {
+			if (threadIdx.x + i*128 < num_tile_items) {
+				items[i] = decodeElementDBin(threadIdx.x, data_block + block_starts[i] - block_starts[0]);
+			}
+		}
+		else {
+			items[i] = decodeElementDBin(threadIdx.x, data_block + block_starts[i] - block_starts[0]);
+		}
+	}
+	// Thread 0 replaces its first decoded item with first_value before the prefix sum
+	if (threadIdx.x == 0) {
+		items[0] = first_value;
+	}
+
+	__syncthreads();
+	// Both CUB temp storages alias shared_buffer, i.e. the memory used for block_starts/data_block above
+	typename BlockScan::TempStorage *temp_storage_scan = reinterpret_cast<typename BlockScan::TempStorage*>(shared_buffer);
+
+	typename BlockExchange::TempStorage *temp_storage_exchange = reinterpret_cast<typename BlockExchange::TempStorage*>(shared_buffer);
+
+	BlockExchange(*temp_storage_exchange).StripedToBlocked(items);
+
+	__syncthreads();
+
+	// Also accepts an initial value
+	BlockScan(*temp_storage_scan).InclusiveSum(items, items);
+
+	__syncthreads();
+
+	BlockExchange(*temp_storage_exchange).BlockedToStriped(items);
+}
\ No newline at end of file
diff --git a/tile_based/src/include/econfig.h b/tile_based/src/include/econfig.h
new file mode 100644
index 0000000..0952dff
--- /dev/null
+++ b/tile_based/src/include/econfig.h
@@ -0,0 +1,8 @@
+// Encoding Configuration
+#include "kernel.cuh"
+#define ENCODING "dbin"
+#define ENCODINGKERNEL LoadDBinPack
+/*#define ENCODING "bin"*/
+/*#define ENCODINGKERNEL LoadBinPack*/
+/*#define ENCODING "rbin"*/
+/*#define ENCODINGKERNEL LoadRBinPack*/
\ No newline at end of file
diff --git a/tile_based/src/include/kernel.cuh b/tile_based/src/include/kernel.cuh
new file mode 100644
index 0000000..5034749
--- /dev/null
+++ b/tile_based/src/include/kernel.cuh
@@ -0,0 +1,5 @@
+#include "deltabinpack_kernel.cuh"
+#include "binpack_kernel.cuh"
+// #include "rlebinpack_kernel.cuh"
+// #include "simplebinpack_kernel.cuh"
+// #include "simpledeltabinpack_kernel.cuh"
\ No newline at end of file
diff --git a/tile_based/src/include/rlebinpack_kernel.cuh b/tile_based/src/include/rlebinpack_kernel.cuh
new file mode 100644
index 0000000..73abcc7
--- /dev/null
+++ b/tile_based/src/include/rlebinpack_kernel.cuh
@@ -0,0 +1,146 @@
+#pragma once
+#include <cub/cub.cuh>
+using namespace cub;
+
+__forceinline__ __device__ int decodeElementRBin(int i, uint* data_block, uint reference, uint bitwidth) {
+  // Locate element i inside the packed stream: which 32-bit word it starts
+  // in, and at which bit of that word.
+  const uint bit_pos = bitwidth * i;
+  const uint word_idx = bit_pos / 32;
+  const uint bit_in_word = bit_pos % 32;
+
+  // Join two consecutive words into one 64-bit window so that an element
+  // straddling a word boundary can be extracted with a single shift.
+  // The word after the one holding the element is always read.
+  const unsigned long long low_word = data_block[word_idx];
+  const unsigned long long high_word = data_block[word_idx + 1];
+  const unsigned long long window = (high_word << 32) | low_word;
+
+  // Keep the low `bitwidth` bits of the shifted window: the stored delta.
+  const unsigned long long mask = (1LL << bitwidth) - 1LL;
+  const uint delta = (window >> bit_in_word) & mask;
+
+  return reference + delta;
+}
+
+/**
+ * Decodes the RLE + bit-packed tile of the calling thread block (tile index
+ * = blockIdx.x) and expands it into one value per tile position.
+ *
+ * val_block_start, rl_block_start: per-tile offset tables; entries
+ *     [blockIdx.x] and [blockIdx.x + 1] delimit, as word indices, the
+ *     tile's slice of value / run_length.
+ * value, run_length: encoded streams. Within a tile's slice, word 0 is read
+ *     as the reference, the low 8 bits of word 1 as the bit width, word 2
+ *     (of the value stream) as the number of runs, and the packed deltas
+ *     start at word 3.
+ * shared_buffer: scratch space indexed as two halves of
+ *     BLOCK_THREADS * ITEMS_PER_THREAD words each (values first, run
+ *     lengths second). The second half is also reinterpreted as CUB temp
+ *     storage.
+ * items_value: output; item i of a thread receives the value for tile
+ *     position threadIdx.x + i * BLOCK_THREADS (striped arrangement).
+ * items_run_length: scratch; overwritten with intermediate scan results.
+ * is_last_tile, num_tile_items: accepted but never read in this body.
+ *
+ * Contains __syncthreads() and CUB block-wide collectives, so every thread
+ * of the block has to execute the call.
+ */
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__forceinline__ __device__ void LoadRBinPack(uint* val_block_start, uint* rl_block_start,
+    uint* value, uint* run_length, uint* shared_buffer, int (&items_value)[ITEMS_PER_THREAD],
+    int (&items_run_length)[ITEMS_PER_THREAD], bool is_last_tile, int num_tile_items) {
+
+  typedef cub::BlockExchange<int, BLOCK_THREADS, ITEMS_PER_THREAD> BlockExchange;
+
+  // Specialize BlockScan for a 1D block of BLOCK_THREADS threads on type int
+  typedef cub::BlockScan<int, BLOCK_THREADS> BlockScan;
+
+  uint num_decode;
+
+  int tile_idx = blockIdx.x;
+
+  /* Split the scratch buffer into a value half and a run-length half. The
+   * first two words of each half temporarily hold the tile's [start, end)
+   * offsets, fetched by threads 0 and 1.
+   */
+  uint *val_block_starts = &shared_buffer[0];
+  uint *rl_block_starts = &shared_buffer[BLOCK_THREADS * ITEMS_PER_THREAD];
+  if (threadIdx.x  < 2) {
+    val_block_starts[threadIdx.x] = val_block_start[tile_idx + threadIdx.x];
+    rl_block_starts[threadIdx.x] = rl_block_start[tile_idx + threadIdx.x];
+  }
+
+  __syncthreads();
+
+  /* Staging areas for the encoded words. They alias the offset slots written
+   * above (same addresses), so the offsets are copied into locals first.
+   */
+  uint* val_data_block = &val_block_starts[0];
+  uint* rl_data_block = &rl_block_starts[0];
+
+  // Copy the tile's [start, end) word offsets out of the shared slots.
+  uint start_offset_val = val_block_starts[0];
+  uint end_offset_val = val_block_starts[1];
+  uint start_offset_rl = rl_block_starts[0];
+  uint end_offset_rl = rl_block_starts[1];
+
+  __syncthreads(); // all threads have read the offsets; the slots may now be overwritten
+
+  for (int i=0; i<ITEMS_PER_THREAD; i++) {
+    uint index = start_offset_val + threadIdx.x + (i * BLOCK_THREADS); // word of the value stream staged by this thread
+    if (index < end_offset_val) {
+      val_data_block[threadIdx.x + (i * BLOCK_THREADS)] = value[index];
+    }
+    index = start_offset_rl + threadIdx.x + (i * BLOCK_THREADS); // word of the run-length stream staged by this thread
+    if (index < end_offset_rl) {
+      rl_data_block[threadIdx.x + (i * BLOCK_THREADS)] = run_length[index];
+    }
+  }
+  // NOTE(review): at most BLOCK_THREADS * ITEMS_PER_THREAD words per stream are staged; a longer slice would be cut off - confirm the encoder guarantees this.
+  __syncthreads();
+
+  uint count = val_data_block[2]; // number of runs; presumably equal to rl_data_block[2] - TODO confirm
+  uint offset = 0;
+  num_decode = ((count + ITEMS_PER_THREAD - 1) / ITEMS_PER_THREAD); // runs decoded per pass
+  // In pass i, thread t decodes run t + i * num_decode; slots without a run are zero-filled.
+  for (int i=0; i<ITEMS_PER_THREAD; i++) {
+
+    uint* val_ptr = val_data_block + 3;
+    uint* rl_ptr = rl_data_block + 3;
+
+    uint reference, bitwidth;
+    if (threadIdx.x < num_decode && threadIdx.x + offset < count) {
+      reference = val_data_block[0];
+      bitwidth = val_data_block[1] & 255;
+      items_value[i] = decodeElementRBin(threadIdx.x + offset, val_ptr, reference, bitwidth);
+      reference = rl_data_block[0];
+      bitwidth = rl_data_block[1] & 255;
+      items_run_length[i] = decodeElementRBin(threadIdx.x + offset, rl_ptr, reference, bitwidth);
+    } else {
+      items_value[i] = 0;
+      items_run_length[i] = 0;
+    }
+
+    offset += num_decode;
+
+  }
+
+  __syncthreads();
+  // The run-length staging area is reused as CUB temp storage. NOTE(review): nothing here checks that either TempStorage type fits in the remaining buffer.
+  typename BlockScan::TempStorage *temp_storage_scan = reinterpret_cast<typename BlockScan::TempStorage*>(rl_data_block);
+  typename BlockExchange::TempStorage *temp_storage_exchange = reinterpret_cast<typename BlockExchange::TempStorage*>(rl_data_block);
+
+  BlockExchange(*temp_storage_exchange).StripedToBlocked(items_run_length);
+
+  __syncthreads();
+
+  /* Inclusive prefix sum of the run lengths: each run's entry becomes the
+   * tile position at which the following run starts.
+   */
+  BlockScan(*temp_storage_scan).InclusiveSum(items_run_length, items_run_length);
+  // Clear the value half so it can hold one flag per tile position.
+  for (int i = 0; i < ITEMS_PER_THREAD; i++) {
+    val_data_block[threadIdx.x * ITEMS_PER_THREAD + i] = 0;
+  }
+
+  __syncthreads();
+  // Flag every position where a new run begins. NOTE(review): a prefix sum equal to BLOCK_THREADS * ITEMS_PER_THREAD writes one word past the value half, into the first word of the run-length half - confirm this is harmless.
+  for (int i = 0; i < ITEMS_PER_THREAD; i++) {
+    val_data_block[items_run_length[i]] = 1;
+  }
+
+  __syncthreads();
+  // Read the flags back, ITEMS_PER_THREAD consecutive positions per thread.
+  for (int i = 0; i < ITEMS_PER_THREAD; i++) {
+    items_run_length[i] = val_data_block[threadIdx.x * ITEMS_PER_THREAD + i];
+  }
+
+  __syncthreads();
+  // Prefix sum of the flags: the number of run starts at or before each position, used below as the index of the run covering it.
+  BlockScan(*temp_storage_scan).InclusiveSum(items_run_length, items_run_length);
+
+  __syncthreads();
+
+  BlockExchange(*temp_storage_exchange).BlockedToStriped(items_run_length);
+
+  __syncthreads();
+
+  // Publish the decoded run values to shared memory, indexed by run number.
+  offset = 0;
+  for (int i = 0; i < ITEMS_PER_THREAD; i++) {
+    if (threadIdx.x < num_decode) val_data_block[threadIdx.x + offset] = items_value[i];
+    offset += num_decode;
+  }
+
+  __syncthreads();
+
+  // Gather: every position fetches the value of the run that covers it.
+  for (int i = 0; i < ITEMS_PER_THREAD; i++) {
+    items_value[i] = val_data_block[items_run_length[i]];
+  }
+}
\ No newline at end of file
diff --git a/tile_based/src/include/ssb_gpu_utils.h b/tile_based/src/include/ssb_gpu_utils.h
new file mode 100644
index 0000000..511d0a9
--- /dev/null
+++ b/tile_based/src/include/ssb_gpu_utils.h
@@ -0,0 +1,67 @@
+#pragma once
+
+#include "ssb_utils.h"
+//#include "cub/test/test_util.h"
+using namespace cub;
+
+// Allocates a device buffer for `len` elements of type T through g_allocator
+// and copies `src` (host memory) into it. Returns the device pointer; the
+// process exits via CubDebugExit if the allocation or the copy fails.
+template<typename T>
+T* loadColumnToGPU(T* src, int len, CachingDeviceAllocator& g_allocator) {
+  T* device_col = NULL;
+
+  // Reserve the device buffer first ...
+  CubDebugExit(g_allocator.DeviceAllocate((void**) &device_col, sizeof(T) * len));
+
+  // ... then upload the host column into it.
+  CubDebugExit(cudaMemcpy(device_col, src, sizeof(T) * len, cudaMemcpyHostToDevice));
+
+  return device_col;
+}
+
+/**
+ * Loads a bin / dbin / pbin encoded column from disk (via loadEncodedColumn)
+ * and uploads both its block-offset table and its encoded words to the GPU.
+ *
+ * col_name:    column name understood by lookup().
+ * encoding:    "bin", "dbin" or "pbin"; anything else terminates the process.
+ * len:         number of entries of the unencoded column.
+ * g_allocator: allocator used for the device buffers.
+ *
+ * Returns an encoded_column whose block_start and data members are DEVICE
+ * pointers (owned by the caller, release through g_allocator) and whose
+ * data_size is the size of the encoded data in bytes.
+ */
+encoded_column loadEncodedColumnToGPU(string col_name, string encoding, int len, CachingDeviceAllocator& g_allocator) {
+  if (!(encoding == "bin" || encoding == "dbin" || encoding == "pbin")) {
+    cout << "Encoding has to be bin, dbin or pbin" << endl;
+    exit(1);
+  }
+
+  encoded_column h_col = loadEncodedColumn(col_name, encoding, len);
+
+  // Same tiling as loadEncodedColumn: the offset table has one entry per
+  // block of 128 values (after rounding len up to whole tiles) plus a
+  // terminating entry.
+  int block_size = 128;
+  int elem_per_thread = 4;
+  int tile_size = block_size * elem_per_thread;
+  int adjusted_len = ((len + tile_size - 1)/tile_size) * tile_size;
+  int num_blocks = adjusted_len / block_size;
+
+  uint* d_col_block_start = loadColumnToGPU<uint>(h_col.block_start, num_blocks + 1, g_allocator);
+  uint* d_col_data = loadColumnToGPU<uint>(h_col.data, h_col.data_size/4, g_allocator);
+
+  cout << "Encoded Col Size: " << h_col.data_size << " " << num_blocks + 1 << endl;
+
+  encoded_column d_col;
+  d_col.block_start = d_col_block_start;
+  d_col.data = d_col_data;
+  // Previously left uninitialized in the returned struct.
+  d_col.data_size = h_col.data_size;
+
+  // The host staging arrays were allocated with new[] by loadEncodedColumn
+  // and are no longer needed: cudaMemcpy has completed by the time
+  // loadColumnToGPU returns.
+  delete[] h_col.block_start;
+  delete[] h_col.data;
+
+  return d_col;
+}
+
+/**
+ * Loads one stream of an RLE-encoded column ("valbin" = run values,
+ * "rlbin" = run lengths) from disk via loadEncodedColumnRLE and uploads its
+ * offset table and encoded words to the GPU.
+ *
+ * col_name:    column name understood by lookup().
+ * encoding:    "valbin" or "rlbin"; anything else terminates the process.
+ * len:         number of entries of the unencoded column.
+ * g_allocator: allocator used for the device buffers.
+ *
+ * Returns an encoded_column whose block_start and data members are DEVICE
+ * pointers (owned by the caller, release through g_allocator) and whose
+ * data_size is the size of the encoded data in bytes.
+ */
+encoded_column loadEncodedColumnToGPURLE(string col_name, string encoding, int len, CachingDeviceAllocator& g_allocator) {
+  if (!(encoding == "valbin" || encoding == "rlbin")) {
+    cout << "Encoding has to be valbin or rlbin" << endl;
+    exit(1);
+  }
+
+  encoded_column h_col = loadEncodedColumnRLE(col_name, encoding, len);
+
+  // RLE uses one block of 512 values per tile, so the offset table has one
+  // entry per tile plus a terminating entry.
+  int block_size = 512;
+  int elem_per_thread = 1; //the only difference for RLE
+  int tile_size = block_size * elem_per_thread;
+  int adjusted_len = ((len + tile_size - 1)/tile_size) * tile_size;
+  int num_blocks = adjusted_len / block_size;
+
+  uint* d_col_block_start = loadColumnToGPU<uint>(h_col.block_start, num_blocks + 1, g_allocator);
+  uint* d_col_data = loadColumnToGPU<uint>(h_col.data, h_col.data_size/4, g_allocator);
+
+  cout << "Encoded Col Size: " << h_col.data_size << " " << num_blocks + 1 << " " << h_col.data_size + num_blocks + 1 << endl;
+
+  encoded_column d_col;
+  d_col.block_start = d_col_block_start;
+  d_col.data = d_col_data;
+  // Previously left uninitialized in the returned struct.
+  d_col.data_size = h_col.data_size;
+
+  // The host staging arrays were allocated with new[] by
+  // loadEncodedColumnRLE and are no longer needed after the upload.
+  delete[] h_col.block_start;
+  delete[] h_col.data;
+
+  return d_col;
+}
\ No newline at end of file
diff --git a/tile_based/src/include/ssb_utils.h b/tile_based/src/include/ssb_utils.h
new file mode 100644
index 0000000..dbf1744
--- /dev/null
+++ b/tile_based/src/include/ssb_utils.h
@@ -0,0 +1,239 @@
+#pragma once
+
+#include <iostream>
+#include <fstream>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <assert.h>
+#include <string>
+
+/*#include <cuda.h>*/
+/*#include <cub/util_allocator.cuh>*/
+
+using namespace std;
+//using namespace cub;
+
+#define SF 10 // FLS_CHG
+
+#define BASE_PATH "/home/ubuntu/fff/gpu/data/ssb/data/" // FLS_CHG
+
+#if SF == 1
+#define DATA_DIR BASE_PATH "s1_columnar/"
+#define LO_LEN 6001171
+#define P_LEN 200000
+#define S_LEN 2000
+#define C_LEN 30000
+#define D_LEN 2556
+#elif SF == 10
+#define DATA_DIR BASE_PATH "s10_columnar/"
+#define LO_LEN 59986214
+#define P_LEN 800000
+#define S_LEN 20000
+#define C_LEN 300000
+#define D_LEN 2556
+#else // 20
+#define DATA_DIR BASE_PATH "s20_columnar/"
+#define LO_LEN 119994368
+//#define LO_LEN 119994746
+#define P_LEN 1000000
+#define S_LEN 40000
+#define C_LEN 600000
+#define D_LEN 2556
+#endif
+
+// Returns the position of the first element of arr[0..len) equal to val,
+// or -1 when val does not occur.
+int index_of(string* arr, int len, string val) {
+  int pos = 0;
+  while (pos < len) {
+    if (arr[pos] == val) {
+      return pos;
+    }
+    ++pos;
+  }
+
+  // Scanned the whole range without a match.
+  return -1;
+}
+
+/**
+ * Maps an SSB column name to the name of its data file relative to DATA_DIR:
+ * the table name in upper case followed by the column's position in the
+ * table (e.g. "lo_custkey" -> "LINEORDER2"). The table is selected by the
+ * first character of col_name. Names starting with 't' are test columns and
+ * map to "../../../bench/data/<col_name>".
+ *
+ * An unknown column - including one whose prefix matches a table but whose
+ * name is not in that table - is reported on stdout and terminates the
+ * process with exit(1).
+ *
+ * (Original note kept from the source: 16 / 6 / 7 / 8 - not integer columns.)
+ */
+string lookup(string col_name) {
+  string lineorder[] = { "lo_orderkey", "lo_linenumber", "lo_custkey", "lo_partkey", "lo_suppkey", "lo_orderdate", "lo_orderpriority", "lo_shippriority", "lo_quantity", "lo_extendedprice", "lo_ordtotalprice", "lo_discount", "lo_revenue", "lo_supplycost", "lo_tax", "lo_commitdate", "lo_shipmode"};
+  string part[] = {"p_partkey", "p_name", "p_mfgr", "p_category", "p_brand1", "p_color", "p_type", "p_size", "p_container"};
+  string supplier[] = {"s_suppkey", "s_name", "s_address", "s_city", "s_nation", "s_region", "s_phone"};
+  string customer[] = {"c_custkey", "c_name", "c_address", "c_city", "c_nation", "c_region", "c_phone", "c_mktsegment"};
+  string date[] = {"d_datekey", "d_date", "d_dayofweek", "d_month", "d_year", "d_yearmonthnum", "d_yearmonth", "d_daynuminweek", "d_daynuminmonth", "d_daynuminyear", "d_sellingseason", "d_lastdayinweekfl", "d_lastdayinmonthfl", "d_holidayfl", "d_weekdayfl"};
+
+  // An empty name selects no table and falls through to the error path.
+  const char table_tag = col_name.empty() ? '\0' : col_name[0];
+
+  string table_name;
+  int index = -1;
+  if (table_tag == 'l') {
+    table_name = "LINEORDER";
+    index = index_of(lineorder, 17, col_name);
+  } else if (table_tag == 's') {
+    table_name = "SUPPLIER";
+    index = index_of(supplier, 7, col_name);
+  } else if (table_tag == 'c') {
+    table_name = "CUSTOMER";
+    index = index_of(customer, 8, col_name);
+  } else if (table_tag == 'p') {
+    table_name = "PART";
+    index = index_of(part, 9, col_name);
+  } else if (table_tag == 'd') {
+    table_name = "DDATE";
+    index = index_of(date, 15, col_name);
+  } else if (table_tag == 't') {
+    // test columns
+    return "../../../bench/data/" + col_name;
+  }
+
+  // Either no table matched the prefix or the column is not part of the
+  // matched table. The latter used to yield a bogus name like "LINEORDER-1".
+  if (index < 0) {
+    cout << "Unknown column " << col_name << endl;
+    exit(1);
+  }
+
+  return table_name + to_string(index);
+}
+
+/**
+ * Reads the first num_entries elements of type T from the binary file of
+ * column col_name (DATA_DIR + lookup(col_name)).
+ *
+ * Returns a new[]-allocated array of num_entries elements; the caller owns
+ * it and must delete[] it.
+ * Throws std::runtime_error when the file cannot be opened or holds fewer
+ * than num_entries elements.
+ */
+template<typename T>
+T* loadColumn(string col_name, int num_entries) {
+  string filename = DATA_DIR + lookup(col_name);
+  ifstream colData (filename.c_str(), ios::in | ios::binary);
+  if (!colData) {
+    throw std::runtime_error(filename.c_str()); // FLS_CHG
+  }
+
+  // Allocate only after the file is known to be open, so the failure path
+  // above cannot leak the buffer.
+  T* h_col = new T[num_entries];
+
+  const std::streamsize num_bytes = (std::streamsize) (num_entries * sizeof(T));
+  colData.read((char*)h_col, num_bytes);
+  if (colData.gcount() != num_bytes) {
+    // A short file would otherwise hand uninitialized elements to the caller.
+    delete[] h_col;
+    throw std::runtime_error("Short read on column file " + filename);
+  }
+
+  return h_col;
+}
+
+/**
+ * Writes num_entries elements of h_col as raw binary to the file of column
+ * col_name (DATA_DIR + lookup(col_name)), replacing any previous content.
+ *
+ * Returns 0 on success and -1 when the file cannot be opened or the data
+ * cannot be written completely.
+ */
+template<typename T>
+int storeColumn(string col_name, int num_entries, T* h_col) {
+  string filename = DATA_DIR + lookup(col_name);
+  ofstream colData (filename.c_str(), ios::out | ios::binary);
+  if (!colData) {
+    return -1;
+  }
+
+  colData.write((char*)h_col, num_entries * sizeof(T));
+  // Close explicitly so that errors surfacing on flush are seen before the
+  // status is reported.
+  colData.close();
+
+  return colData.fail() ? -1 : 0;
+}
+
+struct encoded_column { // handle to one encoded column: two raw pointers plus a size, no ownership management
+  /* block_start[i] = offset at which the i-th block starts inside data; the
+   * loaders in this header allocate one entry per block plus one trailing
+   * entry.
+   * NOTE(review): this comment used to say "byte", but the RLE encoder and
+   * kernel elsewhere in this patch use these offsets as indices of 32-bit
+   * words - confirm the unit for the other encodings.
+   */
+  uint* block_start;
+  // raw encoded data, stored as 32-bit words
+  uint* data;
+  // number of bytes of raw data (the loaders in this header set it to the size of the data file)
+  int data_size;
+};
+
+/***
+ * Loads an encoded column from disk into host memory.
+ *
+ * Reads the encoded words from DATA_DIR + lookup(col_name) + "." + encoding
+ * and the block-offset table from the same name with "off" appended.
+ *
+ * col_name:    column name understood by lookup().
+ * encoding:    bin | dbin | pbin; anything else terminates the process.
+ * num_entries: number of entries of the unencoded column; determines how
+ *              many offsets are read (one per block of 128 values after
+ *              rounding up to whole 512-value tiles, plus one).
+ *
+ * Returns a struct whose data and block_start arrays are allocated with
+ * new[] (the caller must delete[] them) and whose data_size is the size of
+ * the data file in bytes. Any I/O failure terminates the process.
+ * The file size is stored in an int, so files of 2 GiB or more are not
+ * supported.
+ **/
+encoded_column loadEncodedColumn(string col_name, string encoding, int num_entries) {
+  if (!(encoding == "bin" || encoding == "dbin" || encoding == "pbin")) {
+    cout << "Encoding has to be bin, dbin or pbin" << endl;
+    exit(1);
+  }
+
+  // Open file
+  string filename = DATA_DIR + lookup(col_name) + "." + encoding;
+  string offsets_filename = DATA_DIR + lookup(col_name) + "." + encoding + "off";
+
+  // Get size of file. The stat buffer used to be read without ever being
+  // filled in, which made filesize an indeterminate value.
+  struct stat s;
+  if (stat(filename.c_str(), &s) != 0) {
+    cout << "Unable to open encoded column file" << filename << endl;
+    exit(1);
+  }
+  int filesize = (int) s.st_size;
+
+  encoded_column col;
+
+  ifstream colData (filename.c_str(), ios::in | ios::binary);
+  if (!colData) {
+    cout << "Unable to open encoded column file" << filename << endl;
+    exit(1);
+  }
+
+  // Round the word count up so that a size that is not a multiple of 4
+  // cannot overflow the buffer; the padding bytes stay zero.
+  col.data = new uint[(filesize + 3) / 4]();
+  colData.read((char*)col.data, filesize);
+  if (colData.gcount() != filesize) {
+    cout << "Unable to read encoded column file" << filename << endl;
+    exit(1);
+  }
+  colData.close();
+
+  col.data_size = filesize;
+
+  int block_size = 128;
+  int elem_per_thread = 4;
+  int tile_size = block_size * elem_per_thread;
+  int adjusted_len = ((num_entries + tile_size - 1)/tile_size) * tile_size;
+  int num_blocks = adjusted_len / block_size;
+
+  // Zero-initialized so that a short offsets file cannot leave
+  // indeterminate entries behind.
+  col.block_start = new uint[num_blocks + 1]();
+
+  ifstream offsetsData (offsets_filename.c_str(), ios::in | ios::binary);
+  if (!offsetsData) {
+    cout << "Unable to open encoded column file" << offsets_filename << endl;
+    exit(1);
+  }
+
+  offsetsData.read((char*)col.block_start, (num_blocks + 1) * sizeof(int));
+  offsetsData.close();
+
+  return col;
+}
+
+/***
+ * Loads one stream of an RLE-encoded column from disk into host memory.
+ *
+ * Reads the encoded words from DATA_DIR + lookup(col_name) + "." + encoding
+ * and the offset table from the same name with "off" appended.
+ *
+ * col_name:    column name understood by lookup().
+ * encoding:    valbin (run values) | rlbin (run lengths); anything else
+ *              terminates the process.
+ * num_entries: number of entries of the unencoded column; determines how
+ *              many offsets are read (one per 512-value tile, plus one).
+ *
+ * Returns a struct whose data and block_start arrays are allocated with
+ * new[] (the caller must delete[] them) and whose data_size is the size of
+ * the data file in bytes. Any I/O failure terminates the process.
+ * The file size is stored in an int, so files of 2 GiB or more are not
+ * supported.
+ **/
+encoded_column loadEncodedColumnRLE(string col_name, string encoding, int num_entries) {
+  if (!(encoding == "valbin" || encoding == "rlbin")) {
+    cout << "Encoding has to be valbin or rlbin" << endl;
+    exit(1);
+  }
+
+  // Open file
+  string filename = DATA_DIR + lookup(col_name) + "." + encoding;
+  string offsets_filename = DATA_DIR + lookup(col_name) + "." + encoding + "off";
+
+  // Get size of file. The stat buffer used to be read without ever being
+  // filled in, which made filesize an indeterminate value.
+  struct stat s;
+  if (stat(filename.c_str(), &s) != 0) {
+    cout << "Unable to open encoded column file" << filename << endl;
+    exit(1);
+  }
+  int filesize = (int) s.st_size;
+
+  encoded_column col;
+
+  ifstream colData (filename.c_str(), ios::in | ios::binary);
+  if (!colData) {
+    cout << "Unable to open encoded column file" << filename << endl;
+    exit(1);
+  }
+
+  // Round the word count up so that a size that is not a multiple of 4
+  // cannot overflow the buffer; the padding bytes stay zero.
+  col.data = new uint[(filesize + 3) / 4]();
+  colData.read((char*)col.data, filesize);
+  if (colData.gcount() != filesize) {
+    cout << "Unable to read encoded column file" << filename << endl;
+    exit(1);
+  }
+  colData.close();
+
+  col.data_size = filesize;
+
+  int block_size = 512;
+  int elem_per_thread = 1; //the only difference for RLE
+  int tile_size = block_size * elem_per_thread;
+  int adjusted_len = ((num_entries + tile_size - 1)/tile_size) * tile_size;
+  int num_blocks = adjusted_len / block_size;
+
+  // Zero-initialized so that a short offsets file cannot leave
+  // indeterminate entries behind.
+  col.block_start = new uint[num_blocks + 1]();
+
+  ifstream offsetsData (offsets_filename.c_str(), ios::in | ios::binary);
+  if (!offsetsData) {
+    cout << "Unable to open encoded column file" << offsets_filename << endl;
+    exit(1);
+  }
+
+  offsetsData.read((char*)col.block_start, (num_blocks + 1) * sizeof(int));
+  offsetsData.close();
+
+  return col;
+}
+
+/*int main() {*/
+  //int *h_col = new int[10];
+  //for (int i=0; i<10; i++) h_col[i] = i;
+  //storeColumn<int>("test", 10, h_col);
+  //int *l_col = loadColumn<int>("test", 10);
+  //for (int i=0; i<10; i++) cout << l_col[i] << " ";
+  //cout << endl;
+  //return 0;
+/*}*/
+
diff --git a/tile_based/src/include/utils/gpu_utils.h b/tile_based/src/include/utils/gpu_utils.h
new file mode 100644
index 0000000..3347717
--- /dev/null
+++ b/tile_based/src/include/utils/gpu_utils.h
@@ -0,0 +1,17 @@
+#pragma once
+
+
+// Declares two CUDA events named `start` and `stop` in the enclosing scope and creates them. No macro in this header destroys them.
+#define SETUP_TIMING() cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop);
+// Runs statement f between records of `start` and `stop` (both must be in scope, see SETUP_TIMING), waits for `stop`, and stores the elapsed time in t.
+#define TIME_FUNC(f,t) { \
+cudaEventRecord(start, 0); \
+f; \
+cudaEventRecord(stop, 0); \
+cudaEventSynchronize(stop); \
+cudaEventElapsedTime(&t, start,stop); \
+}
+/* Frees vec through a `g_allocator` that must be visible where the macro is
+ * expanded; does nothing when vec is null.
+ * NOTE(review): expands to an unbraced `if`, so using it as the body of an
+ * outer if/else can bind the `else` to the wrong `if` - confirm call sites.
+ */
+#define CLEANUP(vec) if(vec)CubDebugExit(g_allocator.DeviceFree(vec))
+// Allocates `size` bytes into pointer variable vec through a `g_allocator` that must be visible where the macro is expanded.
+#define ALLOCATE(vec,size) CubDebugExit(g_allocator.DeviceAllocate((void**)&vec, size))
\ No newline at end of file
diff --git a/tile_based/src/rlebinpack.cpp b/tile_based/src/rlebinpack.cpp
new file mode 100644
index 0000000..8c098aa
--- /dev/null
+++ b/tile_based/src/rlebinpack.cpp
@@ -0,0 +1,235 @@
+#include "ssb_utils.h"
+#include <stdio.h>
+#include <cstdlib>
+#include <cmath>
+#include <cstring>
+
+using namespace std;
+
+// Returns the number of bits needed to represent v (0 when v == 0).
+// Integer replacement for ceil(log2(v + 1)), which overflows for UINT_MAX.
+static uint rleBitsNeeded(uint v) {
+  uint bits = 0;
+  while (v != 0) {
+    bits++;
+    v >>= 1;
+  }
+  return bits;
+}
+
+// Appends n values of `bitwidth` bits each, least significant bits first, to
+// out, starting at bit 0 of out[offset]. The first word is expected to be
+// zero on entry. Returns the index of the first word after the packed data
+// (at least offset + 1, even when nothing was written).
+static uint rlePackBits(const uint* src, uint n, uint bitwidth, uint* out, uint offset) {
+  uint shift = 0;
+  for (uint i = 0; i < n; i++) {
+    if (shift == 32) {
+      // Current word is exactly full: continue in a fresh word. Handling
+      // this up front keeps every shift below strictly less than 32.
+      offset++;
+      out[offset] = 0;
+      shift = 0;
+    }
+    if (shift + bitwidth > 32) {
+      // The value straddles two words: low part here, high part in the next.
+      out[offset] += src[i] << shift;
+      offset++;
+      shift = (shift + bitwidth) & (32-1);
+      out[offset] = src[i] >> (bitwidth - shift);
+    } else {
+      out[offset] += src[i] << shift;
+      shift += bitwidth;
+    }
+  }
+  return offset + 1;
+}
+
+/**
+ * Run-length encodes `in` tile by tile (512 values per tile) and bit-packs
+ * the run values and run lengths into two separate streams.
+ *
+ * Stream layout (identical for value and run_length):
+ *   words 0..3 : block size, miniblock count (4), total count, first value
+ *   per tile   : minimum (reference), bit width replicated in all four
+ *                bytes, number of runs, then the packed (x - minimum)
+ *                deltas.
+ *
+ * in:          input values; num_entries must be a multiple of 512. The
+ *              pointer is passed by reference and is advanced past the
+ *              consumed data, so on return it points at in + num_entries.
+ * value / run_length: output streams. They must be zero-initialized and
+ *              large enough for the worst case of 4 + tiles * 516 words.
+ * val_offsets / rl_offsets: receive the word offset of every tile plus a
+ *              final end offset (tiles + 1 entries each).
+ * num_entries: number of input values.
+ *
+ * Prints the two stream lengths and returns them (in words) as
+ * (value words, run_length words).
+ */
+pair<uint, uint> rleBinPack(uint*&in, uint*& value, uint*& run_length, uint*& val_offsets, uint*& rl_offsets, uint num_entries) {
+  uint val_offset = 0;
+  uint rl_offset = 0;
+
+  uint block_size = 512;
+  uint elem_per_thread = 1;
+  uint tile_size = block_size * elem_per_thread;
+
+  //nonblock
+  block_size = tile_size;
+
+  uint miniblock_count = 4;
+  uint total_count = num_entries;
+  uint first_val = in[0];
+
+  value[0] = block_size;
+  value[1] = miniblock_count;
+  value[2] = total_count;
+  value[3] = first_val;
+
+  run_length[0] = block_size;
+  run_length[1] = miniblock_count;
+  run_length[2] = total_count;
+  run_length[3] = first_val;
+
+  val_offset += 4;
+  rl_offset += 4;
+
+  // Per-tile scratch: run values and run lengths before packing.
+  uint* val = new uint[tile_size]();
+  uint* rl = new uint[tile_size]();
+
+  for (uint tile_start=0; tile_start<num_entries; tile_start += tile_size) {
+    uint block_index = tile_start / block_size;
+
+    // 1. Collapse the tile into (value, run length) pairs.
+    uint count = 0;
+    val[count] = in[0];
+    uint run = 1;
+    for (uint i = 1; i < tile_size; i++) {
+      if (in[i] != in[i-1]) {
+        rl[count] = run;
+        count++;
+        val[count] = in[i];
+        run = 1;
+      } else {
+        run++;
+      }
+    }
+    rl[count] = run;
+    count++;
+
+    rl_offsets[block_index] = rl_offset;
+    val_offsets[block_index] = val_offset;
+
+    // 2. Frame of reference: the minimum of each stream.
+    uint min_val = val[0];
+    uint min_rl = rl[0];
+    for (uint i = 1; i < count; i++) {
+      if (val[i] < min_val) min_val = val[i];
+      if (rl[i] < min_rl) min_rl = rl[i];
+    }
+
+    // 3. Subtract the reference and find the widest delta of each stream.
+    uint val_bitwidth = 0;
+    uint rl_bitwidth = 0;
+    for (uint i = 0; i < count; i++) {
+      val[i] = val[i] - min_val;
+      rl[i] = rl[i] - min_rl;
+      val_bitwidth = max(val_bitwidth, rleBitsNeeded(val[i]));
+      rl_bitwidth = max(rl_bitwidth, rleBitsNeeded(rl[i]));
+    }
+
+    // 4. Tile header: reference, bit width (one copy per byte), run count.
+    value[val_offset] = min_val;
+    run_length[rl_offset] = min_rl;
+    val_offset++; rl_offset++;
+
+    value[val_offset] = val_bitwidth + (val_bitwidth << 8) +
+      (val_bitwidth << 16) + (val_bitwidth << 24);
+    run_length[rl_offset] = rl_bitwidth + (rl_bitwidth << 8) +
+      (rl_bitwidth << 16) + (rl_bitwidth << 24);
+    val_offset++; rl_offset++;
+
+    value[val_offset] = count;
+    run_length[rl_offset] = count;
+    val_offset++; rl_offset++;
+
+    // 5. Packed deltas.
+    val_offset = rlePackBits(val, count, val_bitwidth, value, val_offset);
+    rl_offset = rlePackBits(rl, count, rl_bitwidth, run_length, rl_offset);
+
+    in += tile_size;
+
+  }
+
+  val_offsets[num_entries / block_size] = val_offset;
+  rl_offsets[num_entries / block_size] = rl_offset;
+
+  // The scratch arrays used to be leaked.
+  delete[] val;
+  delete[] rl;
+
+  cout << val_offset << " " << rl_offset << endl;
+
+  return make_pair(val_offset, rl_offset);
+}
+
+/**
+ * Writes the packed run-value stream of column col_name to
+ * DATA_DIR + lookup(col_name) + ".valbin" and its offset table to the same
+ * name with "off" appended.
+ *
+ * value:         encoded words; arr_byte_size bytes are written.
+ * val_offsets:   offset table; num_blocks + 1 entries are written.
+ *
+ * Returns 0 on success, -1 if either file cannot be opened or written.
+ */
+int storeEncodedValueColumn(string col_name, uint* value, uint arr_byte_size, uint* val_offsets, uint num_blocks) {
+  string filename = DATA_DIR + lookup(col_name) + ".valbin";
+  ofstream colData (filename.c_str(), ios::out | ios::binary);
+  if (!colData) {
+    cout << "Unable to write column" << endl;
+    return -1;
+  }
+
+  colData.write((char*)value, arr_byte_size);
+  colData.close();
+  // A failed write or flush (e.g. disk full) used to go unnoticed.
+  if (colData.fail()) {
+    cout << "Unable to write column" << endl;
+    return -1;
+  }
+
+  string offsets_filename = DATA_DIR + lookup(col_name) + ".valbinoff";
+  ofstream offsetsData (offsets_filename.c_str(), ios::out | ios::binary);
+  if (!offsetsData) {
+    cout << "Unable to write offsets" << endl;
+    return -1;
+  }
+
+  offsetsData.write((char*)val_offsets, (num_blocks + 1) * sizeof(int));
+  offsetsData.close();
+  if (offsetsData.fail()) {
+    cout << "Unable to write offsets" << endl;
+    return -1;
+  }
+
+  return 0;
+}
+
+/**
+ * Writes the packed run-length stream of column col_name to
+ * DATA_DIR + lookup(col_name) + ".rlbin" and its offset table to the same
+ * name with "off" appended.
+ *
+ * run_length:    encoded words; arr_byte_size bytes are written.
+ * rl_offsets:    offset table; num_blocks + 1 entries are written.
+ *
+ * Returns 0 on success, -1 if either file cannot be opened or written.
+ */
+int storeEncodedRunLengthColumn(string col_name, uint* run_length, uint arr_byte_size, uint* rl_offsets, uint num_blocks) {
+  string filename = DATA_DIR + lookup(col_name) + ".rlbin";
+  ofstream colData (filename.c_str(), ios::out | ios::binary);
+  if (!colData) {
+    cout << "Unable to write column" << endl;
+    return -1;
+  }
+
+  colData.write((char*)run_length, arr_byte_size);
+  colData.close();
+  // A failed write or flush (e.g. disk full) used to go unnoticed.
+  if (colData.fail()) {
+    cout << "Unable to write column" << endl;
+    return -1;
+  }
+
+  string offsets_filename = DATA_DIR + lookup(col_name) + ".rlbinoff";
+  ofstream offsetsData (offsets_filename.c_str(), ios::out | ios::binary);
+  if (!offsetsData) {
+    cout << "Unable to write offsets" << endl;
+    return -1;
+  }
+
+  offsetsData.write((char*)rl_offsets, (num_blocks + 1) * sizeof(int));
+  offsetsData.close();
+  if (offsetsData.fail()) {
+    cout << "Unable to write offsets" << endl;
+    return -1;
+  }
+
+  return 0;
+}
+
+/**
+ * Encodes the lo_orderkey column with rleBinPack and stores the resulting
+ * value / run-length streams and their offset tables next to the raw
+ * column. Returns 0 on success and 1 if a file could not be written.
+ */
+int main(int argc, char** argv) {
+  string col_name = "lo_orderkey";
+  int len = LO_LEN;
+
+  uint *raw = loadColumn<uint>(col_name, len);
+
+  cout << "Loaded Column" << endl;
+
+  int block_size = 512;
+  int elem_per_thread = 1; //the only difference for RLE
+  int tile_size = block_size * elem_per_thread;
+  int adjusted_len = ((len + tile_size - 1)/tile_size) * tile_size;
+  int num_blocks = adjusted_len / block_size;
+
+  uint *col = new uint[adjusted_len]();
+  memcpy(col, raw, len * sizeof(uint));
+
+  // extend with the last value to make the length a multiple of the tile size
+  for (int i = len; i < adjusted_len ;i++) col[i] = raw[len-1];
+
+  // Worst case per stream: 4 header words plus, for every tile, 3 header
+  // words and up to tile_size packed words (bit width 32), with one word of
+  // slack. Sizing the buffers by adjusted_len alone could overflow them on
+  // poorly compressible data. rleBinPack requires zero-initialized output.
+  size_t max_words = 4 + (size_t) num_blocks * (tile_size + 4);
+  uint *value = new uint[max_words]();
+  uint *run_length = new uint[max_words]();
+  uint *val_offsets = new uint[num_blocks + 1]();
+  uint *rl_offsets = new uint[num_blocks + 1]();
+
+  // rleBinPack advances the input pointer it is given, so hand it a copy
+  // and keep col for the delete[] below.
+  uint *cursor = col;
+  pair<uint, uint> ret = rleBinPack(cursor, value, run_length, val_offsets, rl_offsets, adjusted_len);
+  cout << "Num Elements " << len << endl;
+  cout << "Input: ArrSize " << len * 4 << endl;
+  cout << "Output: ArrSize " << (ret.first + ret.second) * 4 << " Offsets " << num_blocks + 1 << endl;
+
+  // Report a failed store through the exit status instead of ignoring it.
+  int status = 0;
+  if (storeEncodedValueColumn(col_name, value, ret.first * 4, val_offsets, num_blocks) != 0) status = 1;
+  if (storeEncodedRunLengthColumn(col_name, run_length, ret.second * 4, rl_offsets, num_blocks) != 0) status = 1;
+
+  delete[] rl_offsets;
+  delete[] val_offsets;
+  delete[] run_length;
+  delete[] value;
+  delete[] col;
+  delete[] raw;
+
+  return status;
+}
diff --git a/tile_based/src/rlebinpack_kernel.cuh b/tile_based/src/rlebinpack_kernel.cuh
new file mode 100644
index 0000000..73abcc7
--- /dev/null
+++ b/tile_based/src/rlebinpack_kernel.cuh
@@ -0,0 +1,146 @@
+#pragma once
+#include <cub/cub.cuh>
+using namespace cub;
+
+__forceinline__ __device__ int decodeElementRBin(int i, uint* data_block, uint reference, uint bitwidth) {
+  // Locate element i inside the packed stream: which 32-bit word it starts
+  // in, and at which bit of that word.
+  const uint bit_pos = bitwidth * i;
+  const uint word_idx = bit_pos / 32;
+  const uint bit_in_word = bit_pos % 32;
+
+  // Join two consecutive words into one 64-bit window so that an element
+  // straddling a word boundary can be extracted with a single shift.
+  // The word after the one holding the element is always read.
+  const unsigned long long low_word = data_block[word_idx];
+  const unsigned long long high_word = data_block[word_idx + 1];
+  const unsigned long long window = (high_word << 32) | low_word;
+
+  // Keep the low `bitwidth` bits of the shifted window: the stored delta.
+  const unsigned long long mask = (1LL << bitwidth) - 1LL;
+  const uint delta = (window >> bit_in_word) & mask;
+
+  return reference + delta;
+}
+
+/**
+ * Decodes the RLE + bit-packed tile of the calling thread block (tile index
+ * = blockIdx.x) and expands it into one value per tile position.
+ *
+ * val_block_start, rl_block_start: per-tile offset tables; entries
+ *     [blockIdx.x] and [blockIdx.x + 1] delimit, as word indices, the
+ *     tile's slice of value / run_length.
+ * value, run_length: encoded streams. Within a tile's slice, word 0 is read
+ *     as the reference, the low 8 bits of word 1 as the bit width, word 2
+ *     (of the value stream) as the number of runs, and the packed deltas
+ *     start at word 3.
+ * shared_buffer: scratch space indexed as two halves of
+ *     BLOCK_THREADS * ITEMS_PER_THREAD words each (values first, run
+ *     lengths second). The second half is also reinterpreted as CUB temp
+ *     storage.
+ * items_value: output; item i of a thread receives the value for tile
+ *     position threadIdx.x + i * BLOCK_THREADS (striped arrangement).
+ * items_run_length: scratch; overwritten with intermediate scan results.
+ * is_last_tile, num_tile_items: accepted but never read in this body.
+ *
+ * Contains __syncthreads() and CUB block-wide collectives, so every thread
+ * of the block has to execute the call.
+ */
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__forceinline__ __device__ void LoadRBinPack(uint* val_block_start, uint* rl_block_start,
+    uint* value, uint* run_length, uint* shared_buffer, int (&items_value)[ITEMS_PER_THREAD],
+    int (&items_run_length)[ITEMS_PER_THREAD], bool is_last_tile, int num_tile_items) {
+
+  typedef cub::BlockExchange<int, BLOCK_THREADS, ITEMS_PER_THREAD> BlockExchange;
+
+  // Specialize BlockScan for a 1D block of BLOCK_THREADS threads on type int
+  typedef cub::BlockScan<int, BLOCK_THREADS> BlockScan;
+
+  uint num_decode;
+
+  int tile_idx = blockIdx.x;
+
+  /* Split the scratch buffer into a value half and a run-length half. The
+   * first two words of each half temporarily hold the tile's [start, end)
+   * offsets, fetched by threads 0 and 1.
+   */
+  uint *val_block_starts = &shared_buffer[0];
+  uint *rl_block_starts = &shared_buffer[BLOCK_THREADS * ITEMS_PER_THREAD];
+  if (threadIdx.x  < 2) {
+    val_block_starts[threadIdx.x] = val_block_start[tile_idx + threadIdx.x];
+    rl_block_starts[threadIdx.x] = rl_block_start[tile_idx + threadIdx.x];
+  }
+
+  __syncthreads();
+
+  /* Staging areas for the encoded words. They alias the offset slots written
+   * above (same addresses), so the offsets are copied into locals first.
+   */
+  uint* val_data_block = &val_block_starts[0];
+  uint* rl_data_block = &rl_block_starts[0];
+
+  // Copy the tile's [start, end) word offsets out of the shared slots.
+  uint start_offset_val = val_block_starts[0];
+  uint end_offset_val = val_block_starts[1];
+  uint start_offset_rl = rl_block_starts[0];
+  uint end_offset_rl = rl_block_starts[1];
+
+  __syncthreads(); // all threads have read the offsets; the slots may now be overwritten
+
+  for (int i=0; i<ITEMS_PER_THREAD; i++) {
+    uint index = start_offset_val + threadIdx.x + (i * BLOCK_THREADS); // word of the value stream staged by this thread
+    if (index < end_offset_val) {
+      val_data_block[threadIdx.x + (i * BLOCK_THREADS)] = value[index];
+    }
+    index = start_offset_rl + threadIdx.x + (i * BLOCK_THREADS); // word of the run-length stream staged by this thread
+    if (index < end_offset_rl) {
+      rl_data_block[threadIdx.x + (i * BLOCK_THREADS)] = run_length[index];
+    }
+  }
+  // NOTE(review): at most BLOCK_THREADS * ITEMS_PER_THREAD words per stream are staged; a longer slice would be cut off - confirm the encoder guarantees this.
+  __syncthreads();
+
+  uint count = val_data_block[2]; // number of runs; presumably equal to rl_data_block[2] - TODO confirm
+  uint offset = 0;
+  num_decode = ((count + ITEMS_PER_THREAD - 1) / ITEMS_PER_THREAD); // runs decoded per pass
+  // In pass i, thread t decodes run t + i * num_decode; slots without a run are zero-filled.
+  for (int i=0; i<ITEMS_PER_THREAD; i++) {
+
+    uint* val_ptr = val_data_block + 3;
+    uint* rl_ptr = rl_data_block + 3;
+
+    uint reference, bitwidth;
+    if (threadIdx.x < num_decode && threadIdx.x + offset < count) {
+      reference = val_data_block[0];
+      bitwidth = val_data_block[1] & 255;
+      items_value[i] = decodeElementRBin(threadIdx.x + offset, val_ptr, reference, bitwidth);
+      reference = rl_data_block[0];
+      bitwidth = rl_data_block[1] & 255;
+      items_run_length[i] = decodeElementRBin(threadIdx.x + offset, rl_ptr, reference, bitwidth);
+    } else {
+      items_value[i] = 0;
+      items_run_length[i] = 0;
+    }
+
+    offset += num_decode;
+
+  }
+
+  __syncthreads();
+  // The run-length staging area is reused as CUB temp storage. NOTE(review): nothing here checks that either TempStorage type fits in the remaining buffer.
+  typename BlockScan::TempStorage *temp_storage_scan = reinterpret_cast<typename BlockScan::TempStorage*>(rl_data_block);
+  typename BlockExchange::TempStorage *temp_storage_exchange = reinterpret_cast<typename BlockExchange::TempStorage*>(rl_data_block);
+
+  BlockExchange(*temp_storage_exchange).StripedToBlocked(items_run_length);
+
+  __syncthreads();
+
+  /* Inclusive prefix sum of the run lengths: each run's entry becomes the
+   * tile position at which the following run starts.
+   */
+  BlockScan(*temp_storage_scan).InclusiveSum(items_run_length, items_run_length);
+  // Clear the value half so it can hold one flag per tile position.
+  for (int i = 0; i < ITEMS_PER_THREAD; i++) {
+    val_data_block[threadIdx.x * ITEMS_PER_THREAD + i] = 0;
+  }
+
+  __syncthreads();
+  // Flag every position where a new run begins. NOTE(review): a prefix sum equal to BLOCK_THREADS * ITEMS_PER_THREAD writes one word past the value half, into the first word of the run-length half - confirm this is harmless.
+  for (int i = 0; i < ITEMS_PER_THREAD; i++) {
+    val_data_block[items_run_length[i]] = 1;
+  }
+
+  __syncthreads();
+  // Read the flags back, ITEMS_PER_THREAD consecutive positions per thread.
+  for (int i = 0; i < ITEMS_PER_THREAD; i++) {
+    items_run_length[i] = val_data_block[threadIdx.x * ITEMS_PER_THREAD + i];
+  }
+
+  __syncthreads();
+  // Prefix sum of the flags: the number of run starts at or before each position, used below as the index of the run covering it.
+  BlockScan(*temp_storage_scan).InclusiveSum(items_run_length, items_run_length);
+
+  __syncthreads();
+
+  BlockExchange(*temp_storage_exchange).BlockedToStriped(items_run_length);
+
+  __syncthreads();
+
+  // Publish the decoded run values to shared memory, indexed by run number.
+  offset = 0;
+  for (int i = 0; i < ITEMS_PER_THREAD; i++) {
+    if (threadIdx.x < num_decode) val_data_block[threadIdx.x + offset] = items_value[i];
+    offset += num_decode;
+  }
+
+  __syncthreads();
+
+  // Gather: every position fetches the value of the run that covers it.
+  for (int i = 0; i < ITEMS_PER_THREAD; i++) {
+    items_value[i] = val_data_block[items_run_length[i]];
+  }
+}
\ No newline at end of file
diff --git a/tile_based/src/test_match_rle.cu b/tile_based/src/test_match_rle.cu
new file mode 100644
index 0000000..293871f
--- /dev/null
+++ b/tile_based/src/test_match_rle.cu
@@ -0,0 +1,137 @@
+#define CUB_STDERR
+
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <chrono>
+#include <bitset>
+
+#include <cuda.h>
+#include <cub/util_allocator.cuh>
+#include <cub/cub.cuh>
+
+#include "kernel.cuh" // FLS_CHG
+#include "utils/gpu_utils.h"
+#include "ssb_gpu_utils.h"
+#include "rlebinpack_kernel.cuh"
+
+using namespace std;
+using namespace cub;
+
+CachingDeviceAllocator  g_allocator(true);
+
+
+/**
+ * Decodes one RLE-encoded tile per thread block and writes the decoded
+ * values to col.
+ *
+ * col:         output array with room for num_entries values.
+ * val_block_start / val_data: offset table and packed run values.
+ * rl_block_start / rl_data:   offset table and packed run lengths.
+ * num_entries: number of values in the decoded column.
+ *
+ * Launch with one block of BLOCK_THREADS threads per tile of
+ * BLOCK_THREADS * ITEMS_PER_THREAD values. The dynamic shared memory must
+ * cover what LoadRBinPack indexes, i.e. 2 * BLOCK_THREADS *
+ * ITEMS_PER_THREAD words.
+ */
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void runRBinKernel(
+    int* col, 
+    uint* val_block_start, uint* val_data, uint* rl_block_start, uint* rl_data,
+    int num_entries) {
+  int tile_size = BLOCK_THREADS * ITEMS_PER_THREAD;
+  int tile_idx = blockIdx.x;
+  int tile_offset = tile_idx * tile_size;
+
+  // Load a segment of consecutive items that are blocked across threads
+  int val_block[ITEMS_PER_THREAD];
+  int rl_block[ITEMS_PER_THREAD];
+
+  int num_tiles = (num_entries + tile_size - 1) / tile_size;
+  int num_tile_items = tile_size;
+  bool is_last_tile = false;
+  if (tile_idx == num_tiles - 1) {
+    num_tile_items = num_entries - tile_offset;
+    is_last_tile = true;
+  }
+
+  extern __shared__ uint shared_buffer[];
+  LoadRBinPack<BLOCK_THREADS, ITEMS_PER_THREAD>(val_block_start, rl_block_start,
+    val_data, rl_data, shared_buffer, val_block, rl_block, is_last_tile, num_tile_items);
+
+  __syncthreads();
+
+  // Striped store. The last tile may be partial: without the bound check
+  // the padding positions were written past the end of col.
+  for (int i=0; i<ITEMS_PER_THREAD; i++) {
+    int item = i * BLOCK_THREADS + threadIdx.x;
+    if (item < num_tile_items) {
+      col[tile_offset + item] = val_block[i];
+    }
+  }
+}
+
+/**
+ * Decodes the RLE-encoded column on the GPU, copies the result back and
+ * compares it element by element with h_col_orig.
+ *
+ * val_col / rl_col: device-resident value and run-length streams.
+ * num_items:        number of values in the decoded column.
+ * encoding:         only "rbin" launches the kernel.
+ * g_allocator:      allocator used for the temporary device buffer.
+ * h_col_orig:       host array with the expected num_items values.
+ *
+ * Returns the kernel time reported by TIME_FUNC, or -1 after printing the
+ * first mismatch. All temporary host/device resources are released on both
+ * paths.
+ */
+float runSinglePass(
+    encoded_column val_col, encoded_column rl_col,
+    int num_items, string encoding,
+    CachingDeviceAllocator&  g_allocator, int* h_col_orig) {
+  // Kernel timing
+  float time_query = 0.0f;
+  SETUP_TIMING();
+
+  int* col = NULL;
+  CubDebugExit(g_allocator.DeviceAllocate((void**) &col, num_items * sizeof(int)));
+
+  const int num_threads = 128;
+  const int items_per_thread = 4; //the only difference for RLE
+  int tile_size = num_threads * items_per_thread;
+
+  // Run kernel. 4096 bytes of dynamic shared memory = 2 * tile_size words,
+  // the range LoadRBinPack indexes.
+  if (encoding == "rbin") {
+    TIME_FUNC((runRBinKernel<num_threads, items_per_thread><<<(num_items + tile_size - 1)/tile_size, num_threads, 4096>>>(
+      col, 
+      val_col.block_start, val_col.data, rl_col.block_start, rl_col.data,
+      num_items 
+    )), time_query);
+  }
+
+  CubDebugExit(cudaPeekAtLastError());
+  CubDebugExit(cudaDeviceSynchronize());
+
+  // Copy the decoded column from device to host
+  int* h_col = new int[num_items];
+  CubDebugExit(cudaMemcpy(h_col, col, sizeof(int) * num_items, cudaMemcpyDeviceToHost));
+
+  bool match = true;
+  for (int i=0; i<num_items; i++) {
+    if (h_col_orig[i] != h_col[i]) {
+      cout << "ERROR:" << i << " " << h_col_orig[i] << " " << h_col[i] << endl;
+      match = false;
+      break;
+    }
+  }
+  if (match) {
+    cout << "Inputs match ! " << endl;
+  }
+
+  // Release everything acquired above; previously all of it leaked.
+  delete[] h_col;
+  CubDebugExit(g_allocator.DeviceFree(col));
+  cudaEventDestroy(start);
+  cudaEventDestroy(stop);
+
+  return match ? time_query : -1;
+}
+
+/***
+  * Tests that the RLE encoding of a column can be decoded on the GPU and
+  * that the result is identical to the original array.
+  * Exits with status 0 when the arrays match and 1 when they differ.
+  */
+int main(int argc, char** argv) {
+  //./bin/ssb/test_match_rle lo_orderkey
+  string column_name = "lo_orderkey";
+  string encoding = "rbin";
+
+  int len = LO_LEN;
+
+  int *h_col_orig = loadColumn<int>(column_name, len);
+
+  encoded_column val_col = loadEncodedColumnToGPURLE(column_name, "valbin", len, g_allocator);
+  encoded_column rl_col = loadEncodedColumnToGPURLE(column_name, "rlbin", len, g_allocator);
+
+  cudaDeviceSynchronize();
+
+  float time_query;
+  time_query = runSinglePass(val_col, rl_col,
+                     len, encoding,
+                     g_allocator, h_col_orig);
+
+  cout << "{" << "\"query\":6" << ",\"time_query\":" << time_query << "}" << endl;
+
+  cudaDeviceSynchronize(); 
+
+  // Release the device columns and the host reference copy.
+  CubDebugExit(g_allocator.DeviceFree(val_col.block_start));
+  CubDebugExit(g_allocator.DeviceFree(val_col.data));
+  CubDebugExit(g_allocator.DeviceFree(rl_col.block_start));
+  CubDebugExit(g_allocator.DeviceFree(rl_col.data));
+  delete[] h_col_orig;
+
+  // runSinglePass returns -1 on a mismatch; surface that in the exit status
+  // so scripts can detect a failed check.
+  return time_query < 0 ? 1 : 0;
+}
\ No newline at end of file
diff --git a/tile_based/src/test_perf_rle.cu b/tile_based/src/test_perf_rle.cu
new file mode 100644
index 0000000..fb29ef6
--- /dev/null
+++ b/tile_based/src/test_perf_rle.cu
@@ -0,0 +1,126 @@
+#define CUB_STDERR
+
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <chrono>
+#include <bitset>
+
+#include <cuda.h>
+#include <cub/util_allocator.cuh>
+#include <cub/cub.cuh>
+
+#include "kernel.cuh"
+#include "utils/gpu_utils.h"
+#include "ssb_gpu_utils.h"
+#include "rlebinpack_kernel.cuh"
+
+using namespace std;
+using namespace cub;
+
+CachingDeviceAllocator  g_allocator(true);
+
+
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void runRBinKernel(
+    int* col,
+    uint* val_block_start, uint* val_data, uint* rl_block_start, uint* rl_data,
+    int num_entries) {
+  // One thread block per tile: LoadRBinPack fills val_block, which is then copied to col.
+  int tile_size = BLOCK_THREADS * ITEMS_PER_THREAD;
+  int tile_idx = blockIdx.x;
+  int tile_offset = tile_idx * tile_size;
+  // Load a segment of consecutive items that are blocked across threads
+  int val_block[ITEMS_PER_THREAD];
+  int rl_block[ITEMS_PER_THREAD];
+
+  int num_tiles = (num_entries + tile_size - 1) / tile_size;
+  int num_tile_items = tile_size;
+  bool is_last_tile = false;
+  if (tile_idx == num_tiles - 1) {
+    num_tile_items = num_entries - tile_offset;
+    is_last_tile = true;
+  }
+
+  extern __shared__ uint shared_buffer[];
+  LoadRBinPack<BLOCK_THREADS, ITEMS_PER_THREAD>(val_block_start, rl_block_start,
+    val_data, rl_data, shared_buffer, val_block, rl_block, is_last_tile, num_tile_items);
+  __syncthreads();
+  // The last tile may be partial, so never store at or beyond col[num_entries].
+  for (int i=0; i<ITEMS_PER_THREAD; i++) {
+    int item = i * BLOCK_THREADS + threadIdx.x;
+    if (item < num_tile_items) { col[tile_offset + item] = val_block[i]; }
+  }
+}
+
+
+float runSinglePass(
+    encoded_column val_col, encoded_column rl_col,
+    int num_items, string encoding,
+    CachingDeviceAllocator&  g_allocator, int* col) {
+  // Kernel timing
+  float time_query;
+  SETUP_TIMING();
+
+  // Returns the time cudaEventElapsedTime reports (milliseconds) for the start/stop
+  // events recorded around the launch. The kernel writes straight into col, so no
+  // per-call scratch buffers are allocated; g_allocator is kept for the callers' sake.
+
+  const int num_threads = 128;
+  const int items_per_thread = 4; //the only difference for RLE
+  int tile_size = num_threads * items_per_thread;
+
+  // Run kernel
+  cudaEventRecord(start, 0);
+  if (encoding == "rbin") {
+    TIME_FUNC((runRBinKernel<num_threads, items_per_thread><<<(num_items + tile_size - 1)/tile_size, num_threads, 4096>>>(
+      col,
+      val_col.block_start, val_col.data, rl_col.block_start, rl_col.data,
+      num_items
+    )), time_query);
+
+  }
+  cudaEventRecord(stop, 0);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&time_query, start,stop);
+
+  CubDebugExit(cudaPeekAtLastError());
+  CubDebugExit(cudaDeviceSynchronize());
+
+  return time_query;
+}
+
+/***
+  * The goal is to time the decoding of the encoded column; unlike the match test, the decoded output is not compared with the original array.
+  */
+int main(int argc, char** argv) {
+  // Runs the "rbin" decode of lo_orderkey several times and prints one JSON timing record per run.
+  const int trial_count = 3;
+  int row_count = LO_LEN;
+
+  //./bin/ssb/test_match_rle lo_orderkey
+  string column = "lo_orderkey";
+  string codec = "rbin";
+
+  // The "valbin" and "rlbin" encodings of the column, as loaded by loadEncodedColumnToGPURLE.
+  encoded_column values = loadEncodedColumnToGPURLE(column, "valbin", row_count, g_allocator);
+  encoded_column run_lengths = loadEncodedColumnToGPURLE(column, "rlbin", row_count, g_allocator);
+
+  // Device buffer handed to runSinglePass as the kernel's output column.
+  int *decoded;
+  CubDebugExit(g_allocator.DeviceAllocate((void**) &decoded, row_count * sizeof(int)));
+
+  cudaDeviceSynchronize();
+
+  int trial = 0;
+  while (trial < trial_count) {
+    float elapsed = runSinglePass(values, run_lengths, row_count, codec, g_allocator, decoded);
+
+    cout << "{" << "\"query\":6" << ",\"time_query\":" << elapsed << "}" << endl;
+
+    cudaDeviceSynchronize();
+    ++trial;
+  }
+
+  return 0;
+}
\ No newline at end of file
diff --git a/tile_based/src/tile_based.cu b/tile_based/src/tile_based.cu
new file mode 100644
index 0000000..907fd43
--- /dev/null
+++ b/tile_based/src/tile_based.cu
@@ -0,0 +1,3 @@
+//
+// Created by Azim Afroozeh on 17/03/2024.
+//
diff --git a/tile_based/src/tile_based_bench_bitpack.cu b/tile_based/src/tile_based_bench_bitpack.cu
new file mode 100644
index 0000000..137b429
--- /dev/null
+++ b/tile_based/src/tile_based_bench_bitpack.cu
@@ -0,0 +1,182 @@
+#include "config.hpp"
+#include "cub/util_debug.cuh"
+#include "tile_based/kernel.cuh"
+#include <cuda_profiler_api.h>
+
+uint32_t bin_pack(uint32_t*& in, uint32_t*& out, uint32_t*& block_offsets, uint32_t tup_c) {
+	// Packs tup_c values into out: a four-word header (block size, miniblock count, value count,
+	// first value), then per 128-value block its minimum, one word holding four bit widths and
+	// the (value - minimum) residuals bit-packed per miniblock. block_offsets[b] receives the
+	// word offset of block b, followed by one final end offset. in is rewritten in place and
+	// advanced; out must start zeroed because words are built with +=. Returns words written.
+	uint32_t out_ofs = 0;
+	uint32_t block_size      = 128;
+	uint32_t miniblock_count = 4;
+	uint32_t miniblock_size  = block_size / miniblock_count;
+	uint32_t total_count     = tup_c;
+	uint32_t first_val       = in[0];
+
+	out[0] = block_size;
+	out[1] = miniblock_count;
+	out[2] = total_count;
+	out[3] = first_val;
+	out_ofs += 4;
+
+	for (uint32_t idx = 0; idx < tup_c; idx += block_size) {
+		uint32_t blk_idx       = idx / block_size;
+		block_offsets[blk_idx] = out_ofs;
+
+		// Find min val
+		uint32_t min_val = in[0];
+		for (int i = 1; i < block_size; i++) {
+			if (in[i] < min_val) { min_val = in[i]; }
+		}
+
+		for (int i = 0; i < block_size; i++) {
+			in[i] = in[i] - min_val;
+		}
+
+		// Kept on the stack so nothing is heap-allocated (and leaked) for every block; the
+		// header word written below packs exactly four widths, one byte each.
+		uint32_t miniblock_bitwidths[4] = {0, 0, 0, 0};
+
+		for (uint32_t miniblock = 0; miniblock < miniblock_count; miniblock++) {
+			for (uint32_t i = 0; i < miniblock_size; i++) {
+				uint32_t bitwidth = uint32_t(ceil(log2(in[miniblock * miniblock_size + i] + 1)));
+				if (bitwidth > miniblock_bitwidths[miniblock]) { miniblock_bitwidths[miniblock] = bitwidth; }
+			}
+		}
+
+		// Extra for Simple BinPack
+		uint32_t max_bitwidth = miniblock_bitwidths[0];
+		for (int i = 1; i < miniblock_count; i++) {
+			max_bitwidth = std::max(max_bitwidth, miniblock_bitwidths[i]);
+		}
+		for (int i = 0; i < miniblock_count; i++) {
+			miniblock_bitwidths[i] = max_bitwidth;
+		}
+
+		out[out_ofs] = min_val;
+		out_ofs++;
+
+		out[out_ofs] = miniblock_bitwidths[0] + (miniblock_bitwidths[1] << 8) + (miniblock_bitwidths[2] << 16) +
+		               (miniblock_bitwidths[3] << 24);
+		out_ofs++;
+
+		for (int miniblock = 0; miniblock < miniblock_count; miniblock++) {
+			uint32_t bitwidth = miniblock_bitwidths[miniblock];
+			uint32_t shift    = 0;
+			for (int i = 0; i < miniblock_size; i++) {
+				if (shift + bitwidth > 32) {
+					if (shift != 32) { out[out_ofs] += in[miniblock * miniblock_size + i] << shift; }
+					out_ofs++;
+					shift        = (shift + bitwidth) & (32 - 1);
+					out[out_ofs] = in[miniblock * miniblock_size + i] >> (bitwidth - shift);
+				} else {
+					out[out_ofs] += in[miniblock * miniblock_size + i] << shift;
+					shift += bitwidth;
+				}
+			}
+			out_ofs++;
+		}
+
+		// Increment the input pointer by block size
+		in += block_size;
+	}
+
+	block_offsets[tup_c / block_size] = out_ofs;
+	return out_ofs;
+}
+
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void run_bin_kernel(int* col, uint* col_block_start, uint* col_data) {
+	// One thread block per tile: load_bin_pack fills col_block, which is then written to col.
+	int tile_size = BLOCK_THREADS * ITEMS_PER_THREAD;
+	int tile_idx  = blockIdx.x;
+
+	// Load a segment of consecutive items that are blocked across threads
+	int col_block[ITEMS_PER_THREAD];
+
+	extern __shared__ uint shared_buffer[];
+
+	load_bin_pack<BLOCK_THREADS, ITEMS_PER_THREAD>(col_block_start, col_data, shared_buffer, col_block);
+
+	// Write unpacked values directly to global memory; one thread's items are BLOCK_THREADS apart.
+	for (int i = 0; i < ITEMS_PER_THREAD; i++) {
+		col[tile_size * tile_idx + i * BLOCK_THREADS + threadIdx.x] = col_block[i];
+	}
+}
+
+namespace tile_based {
+// Allocates len elements of T on the device, copies src (host memory) into them and returns the device pointer.
+template <typename T>
+T* loadColumnToGPU(T* src, int len) {
+	T* dest = nullptr;
+	CubDebugExit(cudaMalloc((void**)&dest, sizeof(T) * len));
+	CubDebugExit(cudaMemcpy(dest, src, sizeof(T) * len, cudaMemcpyHostToDevice));
+	return dest;
+}
+} // namespace tile_based
+
+int main(int argc, char** argv) {
+	// Round-trip check: bit-pack random values on the host, unpack them on the GPU, compare.
+	cudaSetDevice(0);
+	int bitwidth = 3;
+
+	if (argc > 1) { bitwidth = atoi(argv[1]); }
+	if (bitwidth < 0 || bitwidth > 31) { std::cerr << "Bitwidth must be in [0, 31]" << std::endl; return 1; }
+	std::cout << "Bitwidth set to " << bitwidth << std::endl;
+
+	uint64_t tup_c         = 1 << 28;
+	auto*    original_data = new uint32_t[tup_c];
+	uint32_t mask          = (uint32_t {1} << bitwidth) - 1; // unsigned shift, defined for 0..31
+
+	/* generate random numbers. */
+	for (int i = 0; i < tup_c; i++) {
+		original_data[i] = rand() & mask;
+	}
+
+	int      block_size      = 128;
+	int      elem_per_thread = 4;
+	int      tile_size       = block_size * elem_per_thread;
+	int      num_blocks      = tup_c / block_size;
+	auto*    encoded_data    = new uint32_t[tup_c]();
+	uint64_t ofs_c           = num_blocks + 1;
+	auto*    ofs_arr         = new uint32_t[ofs_c]();
+	auto*    copy_data       = new uint32_t[tup_c];
+
+	/* Data needs to be copied. the encoding change the original data. */
+	memcpy(copy_data, original_data, tup_c * sizeof(int));
+
+	// tup_c is a multiple of the 128-value block, so no padding is needed before encoding
+	uint32_t encoded_data_bsz = bin_pack(copy_data, encoded_data, ofs_arr, tup_c);
+
+	tile_based::encoded_column h_col {ofs_arr, encoded_data, tup_c * 4};
+
+	uint* d_col_block_start = tile_based::loadColumnToGPU<uint>(h_col.block_start, num_blocks + 1);
+	uint* d_col_data        = tile_based::loadColumnToGPU<uint>(h_col.data, h_col.data_size / 4);
+
+	tile_based::encoded_column d_col {d_col_block_start, d_col_data};
+
+	cudaDeviceSynchronize();
+
+	const int num_threads      = 128;
+	const int items_per_thread = 4;
+	int*      col              = nullptr;
+	CubDebugExit(cudaMalloc((void**)&col, tup_c * sizeof(int)));
+	size_t Dg = (tup_c + tile_size - 1) / tile_size;
+	size_t Db = num_threads;
+	size_t Ns = 3000;
+
+	run_bin_kernel<num_threads, items_per_thread><<<Dg, Db, Ns>>>(col, d_col.block_start, d_col.data);
+	CubDebugExit(cudaPeekAtLastError()); // report a failed launch instead of comparing garbage
+	int* temp = new int[tup_c];
+	CubDebugExit(cudaMemcpy(temp, col, sizeof(int) * tup_c, cudaMemcpyDeviceToHost));
+
+	for (int i = 0; i < tup_c; i++) {
+		if (original_data[i] != temp[i]) {
+			std::cout << "ERROR:" << i << " " << original_data[i] << " " << temp[i] << '\n';
+			return -1;
+		}
+	}
+	std::cout << "-- Inputs match! " << '\n';
+}
\ No newline at end of file
diff --git a/tile_based/src/tile_based_bench_bp_sum.cu b/tile_based/src/tile_based_bench_bp_sum.cu
new file mode 100644
index 0000000..b3e0bb1
--- /dev/null
+++ b/tile_based/src/tile_based_bench_bp_sum.cu
@@ -0,0 +1,225 @@
+// #include "config.hpp"
+// #include "crystal/crystal.cuh"
+// #include "cub/util_debug.cuh"
+// #include "kernel.cuh"
+// #include <cuda_profiler_api.h>
+// #include <gpu_utils.h>
+//
+// using namespace std;
+// using namespace fastlanes::gpu;
+//
+// struct QueryMtd {
+// 	fastlanes::n_t     n_vec;
+// 	uint8_t            bw;
+// 	fastlanes::n_t     n_tup;
+// 	unsigned long long result;
+// };
+//
+// uint32_t bin_pack(uint32_t*& in, uint32_t*& out, uint32_t*& block_offsets, uint32_t tup_c) {
+// 	uint32_t out_ofs = 0;
+//
+// 	uint32_t block_size      = 128;
+// 	uint32_t miniblock_count = 4;
+// 	uint32_t miniblock_size  = block_size / miniblock_count;
+// 	uint32_t total_count     = tup_c;
+// 	uint32_t first_val       = in[0];
+//
+// 	out[0] = block_size;
+// 	out[1] = miniblock_count;
+// 	out[2] = total_count;
+// 	out[3] = first_val;
+//
+// 	out_ofs += 4;
+//
+// 	for (uint32_t idx = 0; idx < tup_c; idx += block_size) {
+// 		uint32_t blk_idx       = idx / block_size;
+// 		block_offsets[blk_idx] = out_ofs;
+//
+// 		// Find min val
+// 		uint32_t min_val = in[0];
+// 		for (int i = 1; i < block_size; i++) {
+// 			if (in[i] < min_val) { min_val = in[i]; }
+// 		}
+//
+// 		for (int i = 0; i < block_size; i++) {
+// 			in[i] = in[i] - min_val;
+// 		}
+//
+// 		uint32_t* miniblock_bitwidths = new uint32_t[miniblock_count];
+// 		for (int i = 0; i < miniblock_count; i++) {
+// 			miniblock_bitwidths[i] = 0;
+// 		}
+//
+// 		for (uint32_t miniblock = 0; miniblock < miniblock_count; miniblock++) {
+// 			for (uint32_t i = 0; i < miniblock_size; i++) {
+// 				uint32_t bitwidth = uint32_t(ceil(log2(in[miniblock * miniblock_size + i] + 1)));
+// 				if (bitwidth > miniblock_bitwidths[miniblock]) { miniblock_bitwidths[miniblock] = bitwidth; }
+// 			}
+// 		}
+//
+// 		// Extra for Simple BinPack
+// 		uint32_t max_bitwidth = miniblock_bitwidths[0];
+// 		for (int i = 1; i < miniblock_count; i++) {
+// 			max_bitwidth = std::max(max_bitwidth, miniblock_bitwidths[i]);
+// 		}
+// 		for (int i = 0; i < miniblock_count; i++) {
+// 			miniblock_bitwidths[i] = max_bitwidth;
+// 		}
+//
+// 		out[out_ofs] = min_val;
+// 		out_ofs++;
+//
+// 		out[out_ofs] = miniblock_bitwidths[0] + (miniblock_bitwidths[1] << 8) + (miniblock_bitwidths[2] << 16) +
+// 		               (miniblock_bitwidths[3] << 24);
+// 		out_ofs++;
+//
+// 		for (int miniblock = 0; miniblock < miniblock_count; miniblock++) {
+// 			uint32_t bitwidth = miniblock_bitwidths[miniblock];
+// 			uint32_t shift    = 0;
+// 			for (int i = 0; i < miniblock_size; i++) {
+// 				if (shift + bitwidth > 32) {
+// 					if (shift != 32) { out[out_ofs] += in[miniblock * miniblock_size + i] << shift; }
+// 					out_ofs++;
+// 					shift        = (shift + bitwidth) & (32 - 1);
+// 					out[out_ofs] = in[miniblock * miniblock_size + i] >> (bitwidth - shift);
+// 				} else {
+// 					out[out_ofs] += in[miniblock * miniblock_size + i] << shift;
+// 					shift += bitwidth;
+// 				}
+// 			}
+// 			out_ofs++;
+// 		}
+//
+// 		// Increment the input pointer by block size
+// 		in += block_size;
+// 	}
+//
+// 	block_offsets[tup_c / block_size] = out_ofs;
+//
+// 	return out_ofs;
+// }
+//
+// template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+// __global__ void run_bin_kernel(uint* col_block_start, uint* col_data, unsigned long long* revenue) {
+// 	uint32_t               items[ITEMS_PER_THREAD];
+// 	extern __shared__ uint shared_buffer[];
+//
+// 	unsigned long long sum = 0;
+// 	LoadBinPack<BLOCK_THREADS, ITEMS_PER_THREAD>(col_block_start, col_data, shared_buffer, items);
+//
+// #pragma unroll
+// 	for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
+// 		sum += items[ITEM];
+// 	}
+//
+// 	__syncthreads();
+//
+// 	static __shared__ long long buffer[32];
+// 	unsigned long long aggregate = BlockSum<long long, BLOCK_THREADS, ITEMS_PER_THREAD>(sum, (long long*)buffer);
+// 	__syncthreads();
+//
+// 	if (threadIdx.x == 0) { atomicAdd(revenue, aggregate); }
+// }
+//
+// template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+// float query_aggregate(uint*                        col_block_start,
+//                       uint*                        col_data,
+//                       QueryMtd                     hardcoded,
+//                       cub::CachingDeviceAllocator& g_allocator) {
+// 	int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
+//
+// 	SETUP_TIMING();
+// 	float                                     time_query;
+// 	chrono::high_resolution_clock::time_point st, finish;
+// 	st = chrono::high_resolution_clock::now();
+// 	cudaEventRecord(start, 0);
+// 	unsigned long long* d_sum = NULL;
+// 	CubDebugExit(g_allocator.DeviceAllocate((void**)&d_sum, sizeof(long long)));
+//
+// 	cudaMemset(d_sum, 0, sizeof(long long));
+//
+// 	// Run
+// 	run_bin_kernel<BLOCK_THREADS, ITEMS_PER_THREAD>
+// 	    <<<hardcoded.n_vec, BLOCK_THREADS, 3000>>>(col_block_start, col_data, d_sum);
+//
+// 	cudaEventRecord(stop, 0);
+// 	cudaEventSynchronize(stop);
+// 	cudaEventElapsedTime(&time_query, start, stop);
+//
+// 	unsigned long long revenue;
+// 	CubDebugExit(cudaMemcpy(&revenue, d_sum, sizeof(uint64_t), cudaMemcpyDeviceToHost));
+//
+// 	finish                             = chrono::high_resolution_clock::now();
+// 	std::chrono::duration<double> diff = finish - st;
+//
+// 	double total_time_taken {diff.count() * 1000};
+// 	FLS_SHOW(total_time_taken)
+//
+// 	/*Check the result*/
+// 	FLS_SHOW(revenue)
+// 	if (revenue != hardcoded.result) { throw std::runtime_error("RESULT INCOREECT!"); }
+//
+// 	CLEANUP(d_sum);
+//
+// 	return time_query;
+// }
+
+int main() {
+	// NOTE: the benchmark body below is entirely commented out, so this program currently does nothing.
+	// cudaSetDevice(0);
+	// constexpr uint64_t n_tile        = 2 * 256 * 1024;
+	// constexpr uint64_t tile_sz       = 512;
+	// constexpr uint64_t n_tup         = n_tile * tile_sz;
+	// constexpr uint64_t block_sz      = 128;
+	// int                n_block       = n_tup / block_sz;
+	// uint64_t           n_ofc         = n_block + 1;
+	// auto*              original_data = new uint32_t[n_tup];
+	// auto*              encoded_data  = new uint32_t[n_tup];
+	// auto*              copy_data     = new uint32_t[n_tup];
+	// auto*              ofs_arr       = new uint32_t[n_ofc];
+	// for (uint8_t bitwidth {0}; bitwidth < 33; bitwidth++) {
+	//
+	// 	uint32_t mask = (1 << bitwidth) - 1;
+	// 	uint32_t bw   = bitwidth;
+	//
+	// 	FLS_SHOW(bw)
+	// 	uint64_t sum {0};
+	// 	/* generate random numbers. */
+	// 	for (int i = 0; i < n_tup; i++) {
+	// 		original_data[i] = i & mask;
+	// 		sum += original_data[i];
+	// 	}
+	// 	FLS_SHOW(sum)
+	//
+	//
+	// 	/* Data needs to be copied. the encoding change the original data. */
+	// 	memcpy(copy_data, original_data, n_tup * sizeof(int));
+	//
+	// 	// extend with the last value to make it multiple of 128
+	//
+	// 	auto copy_data_als = copy_data;
+	// 	auto size = bin_pack(copy_data_als, encoded_data, ofs_arr, n_tup);
+	// 	double real_bw = size * 32 / double(n_tup);
+	// 	FLS_SHOW(real_bw)
+	//
+	// 	uint* d_col_block_start = load_to_gpu<uint>(ofs_arr, (n_block + 1) * 4, g_allocator);
+	// 	CUDA_SAFE_CALL(cudaDeviceSynchronize());
+	// 	uint* d_col_data = load_to_gpu<uint>(encoded_data, n_tup * 4, g_allocator);
+	// 	CUDA_SAFE_CALL(cudaDeviceSynchronize());
+	//
+	// 	QueryMtd query_mtd {n_tile, bitwidth, n_tup, sum};
+	// 	for (int i {0}; i < 1; ++i) {
+	// 		auto time = query_aggregate<128, 4>(d_col_block_start, d_col_data, query_mtd, g_allocator);
+	// 		FLS_SHOW(time)
+	// 	}
+	//
+	// 	CLEANUP(d_col_block_start)
+	// 	CLEANUP(d_col_data)
+	//
+	// }
+	//
+	// delete[] original_data;
+	// delete[] encoded_data;
+	// delete[] copy_data;
+	// delete[] ofs_arr;
+}
\ No newline at end of file
diff --git a/tile_based/src/tile_based_bench_delta.cu b/tile_based/src/tile_based_bench_delta.cu
new file mode 100644
index 0000000..98f97ce
--- /dev/null
+++ b/tile_based/src/tile_based_bench_delta.cu
@@ -0,0 +1,254 @@
+#include "config.hpp"
+#include "cub/util_debug.cuh"
+#include "kernel.cuh"
+#include "binpack_kernel.cuh"
+#include <cuda.h>
+#include <cuda_profiler_api.h>
+
+uint deltaBinPack(int*& in, int*& out, uint*& block_offsets, uint num_entries) {
+	// Delta + bit-packs num_entries ints: after a four-word header, each tile of 512 values stores
+	// its first value, then four 128-value blocks of deltas (min word, bit-width word, packed bits).
+	// in is rewritten in place and advanced; out must start zeroed (words are built with +=).
+	uint offset = 0;
+	uint block_size      = 128;
+	uint elem_per_thread = 4;
+	uint tile_size       = block_size * elem_per_thread;
+
+	uint miniblock_count = 4;
+	uint total_count     = num_entries;
+	uint first_val       = in[0];
+
+	out[0] = block_size;
+	out[1] = miniblock_count;
+	out[2] = total_count;
+	out[3] = first_val;
+	offset += 4;
+
+	for (uint tile_start = 0; tile_start < num_entries; tile_start += tile_size) {
+		uint block_index   = tile_start / block_size;
+		int  tmp_first_val = in[0];
+
+		out[offset] = tmp_first_val;
+		offset++;
+
+		// Compute the deltas
+		for (int i = tile_size - 1; i > 0; i--) {
+			in[i] = in[i] - in[i - 1];
+		}
+		in[0] = 0;
+
+		for (int block_start = 0; block_start < block_size * 4; block_start += block_size, block_index += 1) {
+			block_offsets[block_index] = offset;
+
+			// For FOR - Find min val
+			int min_val = in[0];
+			for (int i = 1; i < block_size; i++) {
+				if (in[i] < min_val) { min_val = in[i]; }
+			}
+
+			min_val = 0; /* HACK */
+			for (int i = 0; i < block_size; i++) {
+				in[i] = in[i] - min_val;
+			}
+
+			out[offset] = min_val;
+			offset++;
+			// NOTE(review): with min_val forced to 0 above, this only holds if no delta is negative.
+			// Subtracting min_val ensures that all input vals are >= 0
+			// Going forward in and out will both be treated as unsigned integers.
+			uint* inp  = (uint*)in;
+			uint* outp = (uint*)out;
+
+			uint  miniblock_size      = block_size / miniblock_count;
+			// Kept on the stack so nothing is heap-allocated (and leaked) for every block; the
+			// header word written below packs exactly four widths, one byte each.
+			uint  miniblock_bitwidths[4] = {0, 0, 0, 0};
+
+			for (uint miniblock = 0; miniblock < miniblock_count; miniblock++) {
+				for (uint i = 0; i < miniblock_size; i++) {
+					uint bitwidth = uint(ceil(log2(inp[miniblock * miniblock_size + i] + 1)));
+					if (bitwidth > miniblock_bitwidths[miniblock]) { miniblock_bitwidths[miniblock] = bitwidth; }
+				}
+			}
+
+			// Extra for Simple BinPack
+			uint max_bitwidth = miniblock_bitwidths[0];
+			for (int i = 1; i < miniblock_count; i++) {
+				max_bitwidth = max(max_bitwidth, miniblock_bitwidths[i]);
+			}
+			for (int i = 0; i < miniblock_count; i++) {
+				miniblock_bitwidths[i] = max_bitwidth;
+			}
+			outp[offset] = miniblock_bitwidths[0] + (miniblock_bitwidths[1] << 8) + (miniblock_bitwidths[2] << 16) +
+			               (miniblock_bitwidths[3] << 24);
+			offset++;
+
+			for (int miniblock = 0; miniblock < miniblock_count; miniblock++) {
+				uint bitwidth = miniblock_bitwidths[miniblock];
+				uint shift    = 0;
+				for (int i = 0; i < miniblock_size; i++) {
+					if (shift + bitwidth > 32) {
+						if (shift != 32) { outp[offset] += inp[miniblock * miniblock_size + i] << shift; }
+						offset++;
+						shift        = (shift + bitwidth) & (32 - 1);
+						outp[offset] = inp[miniblock * miniblock_size + i] >> (bitwidth - shift);
+					} else {
+						outp[offset] += inp[miniblock * miniblock_size + i] << shift;
+						shift += bitwidth;
+					}
+				}
+				offset++;
+			}
+
+			// Increment the input pointer by block size
+			in += block_size;
+		}
+	}
+
+	block_offsets[num_entries / block_size] = offset;
+	// Number of 32-bit words written to out.
+	return offset;
+}
+
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void runDBinKernel(int* col, uint* col_block_start, uint* col_data, int num_entries) {
+	// One thread block per tile: LoadDBinPack fills col_block, which is then written to col.
+	int tile_size   = BLOCK_THREADS * ITEMS_PER_THREAD;
+	int tile_idx    = blockIdx.x;
+	int tile_offset = tile_idx * tile_size;
+	// Load a segment of consecutive items that are blocked across threads
+	int col_block[ITEMS_PER_THREAD];
+
+	int  num_tiles      = (num_entries + tile_size - 1) / tile_size;
+	int  num_tile_items = tile_size;
+	bool is_last_tile   = false;
+	if (tile_idx == num_tiles - 1) {
+		num_tile_items = num_entries - tile_offset;
+		is_last_tile   = true;
+	}
+
+	extern __shared__ uint shared_buffer[];
+	LoadDBinPack<BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    col_block_start, col_data, shared_buffer, col_block, is_last_tile, num_tile_items);
+
+	__syncthreads();
+	for (int i = 0; i < ITEMS_PER_THREAD; i++) {
+		int item = i * BLOCK_THREADS + threadIdx.x; // stride follows the block size, not a fixed 128
+		if (item < num_tile_items) { col[tile_offset + item] = col_block[i]; } // last tile may be partial
+	}
+}
+
+namespace tile_based {
+// Allocates len elements of T on the device, copies src (host memory) into them and returns the device pointer.
+template <typename T>
+T* loadColumnToGPU(T* src, int len) {
+	T* dest = nullptr;
+	CubDebugExit(cudaMalloc((void**)&dest, sizeof(T) * len));
+	CubDebugExit(cudaMemcpy(dest, src, sizeof(T) * len, cudaMemcpyHostToDevice));
+	return dest;
+}
+} // namespace tile_based
+
+int main() {
+	// Round-trip check: delta-encode a linear sequence on the host, decode it on the GPU, compare.
+	cudaSetDevice(0);
+
+	/* Init */
+	std::cout << "------------------------------------ \n";
+	std::cout << "-- Init :  \n";
+
+	uint64_t  n_tup            = 1 << 28;
+	auto*     h_org_arr        = new uint32_t[n_tup];
+	int       block_size       = 128;
+	int       elem_per_thread  = 4;
+	int       tile_size        = block_size * elem_per_thread;
+	int       num_blocks       = n_tup / block_size;
+	auto*     encoded_data     = new int[n_tup]();
+	uint64_t  ofs_c            = num_blocks + 1;
+	auto*     ofs_arr          = new uint[ofs_c]();
+	auto*     copy_data        = new int[n_tup];
+	const int num_threads      = 128;
+	const int items_per_thread = 4;
+	int*      col              = nullptr;
+	size_t    dg               = (n_tup + tile_size - 1) / tile_size;
+	size_t    db               = num_threads;
+	size_t    ns               = 3000;
+	int*      temp             = new int[n_tup];
+	// int       num_trials       = 10;
+	CubDebugExit(cudaMalloc((void**)&col, n_tup * sizeof(int)));
+
+	std::cout << "------------------------------------ \n";
+	std::cout << "-- Generate :  \n";
+	std::cout << "-- delta " << tile_based::delta << '\n';
+
+	/* generate 0, 5, 10. */
+	for (int i = 0; i < n_tup; i++) {
+		h_org_arr[i] = i * tile_based::delta;
+	}
+
+	std::cout << "------------------------------------ \n";
+	std::cout << "-- Encode : \n";
+
+	/* Data needs to be copied. the encoding change the original data. */
+	memcpy(copy_data, h_org_arr, n_tup * sizeof(int));
+
+	// n_tup is a multiple of the 512-value tile, so no padding is needed before encoding
+	deltaBinPack(copy_data, encoded_data, ofs_arr, n_tup);
+
+	std::cout << "------------------------------------ \n";
+	std::cout << "-- Load encoded data into GPU : \n";
+
+	tile_based::encoded_column h_col {ofs_arr, reinterpret_cast<uint*>(encoded_data), n_tup * 4};
+	uint*                      d_col_block_start = tile_based::loadColumnToGPU<uint>(h_col.block_start, num_blocks + 1);
+	uint*                      d_col_data        = tile_based::loadColumnToGPU<uint>(h_col.data, h_col.data_size / 4);
+
+	tile_based::encoded_column d_col {d_col_block_start, d_col_data};
+
+	cudaDeviceSynchronize();
+	std::cout << "------------------------------------ \n";
+	std::cout << "-- Decode :  \n";
+
+	runDBinKernel<num_threads, items_per_thread><<<dg, db, ns>>>(col, d_col.block_start, d_col.data, n_tup);
+	CubDebugExit(cudaPeekAtLastError()); // report a failed launch instead of comparing garbage
+	std::cout << "------------------------------------ \n";
+	std::cout << "-- Copy data to host :  \n";
+
+	CubDebugExit(cudaMemcpy(temp, col, sizeof(int) * n_tup, cudaMemcpyDeviceToHost));
+
+	std::cout << "------------------------------------ \n";
+	std::cout << "-- Test :  \n";
+	for (int i = 0; i < n_tup; i++) {
+		if (h_org_arr[i] != temp[i]) {
+			std::cout << "ERROR:" << i << " " << h_org_arr[i] << " " << temp[i] << '\n';
+			return -1;
+		}
+	}
+
+	std::cout << "-- Inputs match ! " << '\n';
+	std::cout << "------------------------------------ \n";
+	std::cout << "-- Bench :  \n";
+
+#if 0
+	// Run trials
+	for (int t = 0; t < num_trials; t++) {
+		// Kernel timing
+		float query_time;
+		SETUP_TIMING();
+
+		cudaEventRecord(start, nullptr);
+		runDBinKernel<num_threads, items_per_thread><<<dg, db, ns>>>(col, d_col.block_start, d_col.data, n_tup);
+		cudaEventRecord(stop, nullptr);
+
+		cudaEventSynchronize(stop);
+		cudaEventElapsedTime(&query_time, start, stop);
+
+		CubDebugExit(cudaPeekAtLastError());
+		CubDebugExit(cudaDeviceSynchronize());
+
+		std::cout << "-- Query-time: " << std::to_string(t) << " : " << query_time << " ms " << '\n';
+		cudaDeviceSynchronize();
+	}
+
+	return 2;
+#endif
+}
\ No newline at end of file
diff --git a/tile_based/src/tile_based_bench_delta_sum.cu b/tile_based/src/tile_based_bench_delta_sum.cu
new file mode 100644
index 0000000..5a4b134
--- /dev/null
+++ b/tile_based/src/tile_based_bench_delta_sum.cu
@@ -0,0 +1,270 @@
+#include "config.hpp"
+#include "cub/util_debug.cuh"
+#include "tile_based/kernel.cuh"
+#include "tile_based/gpu_utils.h"
+#include "tile_based/binpack_kernel.cuh"
+#include <cuda.h>
+#include <cuda_profiler_api.h>
+
+uint deltaBinPack(int*& in, int*& out, uint*& block_offsets, uint num_entries) {
+	// Delta + bit-packs num_entries ints: after a four-word header, each tile of 512 values stores
+	// its first value, then four 128-value blocks of deltas (min word, bit-width word, packed bits).
+	// in is rewritten in place and advanced; out must start zeroed (words are built with +=).
+	uint offset = 0;
+	uint block_size      = 128;
+	uint elem_per_thread = 4;
+	uint tile_size       = block_size * elem_per_thread;
+
+	uint miniblock_count = 4;
+	uint total_count     = num_entries;
+	uint first_val       = in[0];
+
+	out[0] = block_size;
+	out[1] = miniblock_count;
+	out[2] = total_count;
+	out[3] = first_val;
+	offset += 4;
+
+	for (uint tile_start = 0; tile_start < num_entries; tile_start += tile_size) {
+		uint block_index   = tile_start / block_size;
+		int  tmp_first_val = in[0];
+
+		out[offset] = tmp_first_val;
+		offset++;
+
+		// Compute the deltas
+		for (int i = tile_size - 1; i > 0; i--) {
+			in[i] = in[i] - in[i - 1];
+		}
+		in[0] = 0;
+
+		for (int block_start = 0; block_start < block_size * 4; block_start += block_size, block_index += 1) {
+			block_offsets[block_index] = offset;
+
+			// For FOR - Find min val
+			int min_val = in[0];
+			for (int i = 1; i < block_size; i++) {
+				if (in[i] < min_val) { min_val = in[i]; }
+			}
+
+			min_val = 0; /* HACK */
+			for (int i = 0; i < block_size; i++) {
+				in[i] = in[i] - min_val;
+			}
+
+			out[offset] = min_val;
+			offset++;
+			// NOTE(review): with min_val forced to 0 above, this only holds if no delta is negative.
+			// Subtracting min_val ensures that all input vals are >= 0
+			// Going forward in and out will both be treated as unsigned integers.
+			uint* inp  = (uint*)in;
+			uint* outp = (uint*)out;
+
+			uint  miniblock_size      = block_size / miniblock_count;
+			// Kept on the stack so nothing is heap-allocated (and leaked) for every block; the
+			// header word written below packs exactly four widths, one byte each.
+			uint  miniblock_bitwidths[4] = {0, 0, 0, 0};
+
+			for (uint miniblock = 0; miniblock < miniblock_count; miniblock++) {
+				for (uint i = 0; i < miniblock_size; i++) {
+					uint bitwidth = uint(ceil(log2(inp[miniblock * miniblock_size + i] + 1)));
+					if (bitwidth > miniblock_bitwidths[miniblock]) { miniblock_bitwidths[miniblock] = bitwidth; }
+				}
+			}
+
+			// Extra for Simple BinPack
+			uint max_bitwidth = miniblock_bitwidths[0];
+			for (int i = 1; i < miniblock_count; i++) {
+				max_bitwidth = max(max_bitwidth, miniblock_bitwidths[i]);
+			}
+			for (int i = 0; i < miniblock_count; i++) {
+				miniblock_bitwidths[i] = max_bitwidth;
+			}
+			outp[offset] = miniblock_bitwidths[0] + (miniblock_bitwidths[1] << 8) + (miniblock_bitwidths[2] << 16) +
+			               (miniblock_bitwidths[3] << 24);
+			offset++;
+
+			for (int miniblock = 0; miniblock < miniblock_count; miniblock++) {
+				uint bitwidth = miniblock_bitwidths[miniblock];
+				uint shift    = 0;
+				for (int i = 0; i < miniblock_size; i++) {
+					if (shift + bitwidth > 32) {
+						if (shift != 32) { outp[offset] += inp[miniblock * miniblock_size + i] << shift; }
+						offset++;
+						shift        = (shift + bitwidth) & (32 - 1);
+						outp[offset] = inp[miniblock * miniblock_size + i] >> (bitwidth - shift);
+					} else {
+						outp[offset] += inp[miniblock * miniblock_size + i] << shift;
+						shift += bitwidth;
+					}
+				}
+				offset++;
+			}
+
+			// Increment the input pointer by block size
+			in += block_size;
+		}
+	}
+
+	block_offsets[num_entries / block_size] = offset;
+	// Number of 32-bit words written to out.
+	return offset;
+}
+
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void runDBinKernel(unsigned long long* col, uint* col_block_start, uint* col_data, int num_entries) {
+	// Each thread block loads one tile through LoadDBinPack, adds up what its threads
+	// received and has thread 0 atomically add the BlockSum result to *col.
+	const int items_per_tile = BLOCK_THREADS * ITEMS_PER_THREAD;
+	const int this_tile      = blockIdx.x;
+	const int first_item     = this_tile * items_per_tile;
+	const int tile_count     = (num_entries + items_per_tile - 1) / items_per_tile;
+
+	// Only the final tile can hold fewer than items_per_tile entries.
+	bool is_last_tile   = (this_tile == tile_count - 1);
+	int  num_tile_items = is_last_tile ? num_entries - first_item : items_per_tile;
+
+	// Per-thread items, filled in by LoadDBinPack.
+	int col_block[ITEMS_PER_THREAD];
+
+	extern __shared__ uint shared_buffer[];
+	LoadDBinPack<BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    col_block_start, col_data, shared_buffer, col_block, is_last_tile, num_tile_items);
+
+	__syncthreads();
+
+	// Thread-local partial sum; each int item is converted to unsigned long long when added.
+	unsigned long long sum = 0;
+
+#pragma unroll
+	for (int item = 0; item < ITEMS_PER_THREAD; ++item) {
+		sum += col_block[item];
+	}
+
+	// Scratch space handed to BlockSum.
+	static __shared__ unsigned long long buffer[32];
+	unsigned long long aggregate =
+	    BlockSum<unsigned long long, BLOCK_THREADS, ITEMS_PER_THREAD>(sum, (unsigned long long*)buffer);
+
+	__syncthreads();
+
+	// A single thread per block publishes the block's total.
+	if (threadIdx.x != 0) { return; }
+
+	atomicAdd(col, aggregate);
+}
+
+namespace tile_based {
+// Allocates len elements of T on the device, copies src (host memory) into them and returns the device pointer.
+template <typename T>
+T* loadColumnToGPU(T* src, int len) {
+	T* dest = nullptr;
+	CubDebugExit(cudaMalloc((void**)&dest, sizeof(T) * len));
+	CubDebugExit(cudaMemcpy(dest, src, sizeof(T) * len, cudaMemcpyHostToDevice));
+	return dest;
+}
+} // namespace tile_based
+
+int main() {
+	// Delta-encodes a linear sequence on the host, sums the decoded values on the GPU and compares the totals.
+	cudaSetDevice(0);
+
+	/* Init */
+	std::cout << "------------------------------------ \n";
+	std::cout << "-- Init :  \n";
+
+	uint64_t  n_tup            = 1 << 28;
+	auto*     h_org_arr        = new uint32_t[n_tup];
+	int       block_size       = 128;
+	int       elem_per_thread  = 4;
+	int       tile_size        = block_size * elem_per_thread;
+	int       num_blocks       = n_tup / block_size;
+	auto*     encoded_data     = new int[n_tup]();
+	uint64_t  ofs_c            = num_blocks + 1;
+	auto*     ofs_arr          = new uint[ofs_c]();
+	auto*     copy_data        = new int[n_tup];
+	const int num_threads      = 128;
+	const int items_per_thread = 4;
+	size_t    dg               = (n_tup + tile_size - 1) / tile_size;
+	size_t    db               = num_threads;
+	size_t    ns               = 3000;
+	int*      temp             = new int[n_tup];
+	int       num_trials       = 10;
+
+	std::cout << "------------------------------------ \n";
+	std::cout << "-- Generate :  \n";
+	std::cout << "-- delta " << tile_based::delta << '\n';
+
+	unsigned long long sum = 0; // host-side reference total; must start at zero
+
+	/* generate 0, 5, 10. */
+	for (int i = 0; i < n_tup; i++) {
+		h_org_arr[i] = i * tile_based::delta;
+		sum += h_org_arr[i];
+	}
+
+	std::cout << "------------------------------------ \n";
+	std::cout << "-- Encode : \n";
+
+	/* Data needs to be copied. the encoding change the original data. */
+	memcpy(copy_data, h_org_arr, n_tup * sizeof(int));
+
+	// n_tup is a multiple of the 512-value tile, so no padding is needed before encoding
+	deltaBinPack(copy_data, encoded_data, ofs_arr, n_tup);
+
+	std::cout << "------------------------------------ \n";
+	std::cout << "-- Load encoded data into GPU : \n";
+
+	tile_based::encoded_column h_col {ofs_arr, reinterpret_cast<uint*>(encoded_data), n_tup * 4};
+	uint*                      d_col_block_start = tile_based::loadColumnToGPU<uint>(h_col.block_start, num_blocks + 1);
+	uint*                      d_col_data        = tile_based::loadColumnToGPU<uint>(h_col.data, h_col.data_size / 4);
+
+	tile_based::encoded_column d_col {d_col_block_start, d_col_data};
+
+	cudaDeviceSynchronize();
+	std::cout << "------------------------------------ \n";
+	std::cout << "-- Decode :  \n";
+
+	unsigned long long*      col = nullptr;
+	CubDebugExit(cudaMalloc((void**)&col, sizeof(unsigned long long) * 1));
+	// The kernel only ever atomicAdds into *col, so the accumulator has to start at zero.
+	CubDebugExit(cudaMemset(col, 0, sizeof(unsigned long long)));
+	runDBinKernel<num_threads, items_per_thread><<<dg, db, ns>>>(col, d_col.block_start, d_col.data, n_tup);
+	CubDebugExit(cudaPeekAtLastError()); // report a failed launch instead of comparing garbage
+	std::cout << "------------------------------------ \n";
+	std::cout << "-- Copy data to host :  \n";
+
+	unsigned long long* result = new unsigned long long[1];
+	CubDebugExit(cudaMemcpy(result, col, sizeof(unsigned long long) * 1, cudaMemcpyDeviceToHost));
+
+	std::cout << "[Sum, Result]: [" << sum << ", " << result[0] << "]" << std::endl;
+
+	if (sum != result[0]){
+		std::cout << "RESULT INCORRECT!" << std::endl;
+	}
+
+	// Run trials
+
+#if 0
+	for (int t = 0; t < num_trials; t++) {
+		// Kernel timing
+		float query_time;
+		SETUP_TIMING();
+
+		cudaEventRecord(start, nullptr);
+		runDBinKernel<num_threads, items_per_thread><<<dg, db, ns>>>(col, d_col.block_start, d_col.data, n_tup);
+		cudaEventRecord(stop, nullptr);
+
+		cudaEventSynchronize(stop);
+		cudaEventElapsedTime(&query_time, start, stop);
+
+		CubDebugExit(cudaPeekAtLastError());
+		CubDebugExit(cudaDeviceSynchronize());
+
+		std::cout << "-- Query-time: " << std::to_string(t) << " : " << query_time << " ms " << '\n';
+		cudaDeviceSynchronize();
+	}
+
+	return 2;
+#endif
+}
\ No newline at end of file
diff --git a/tile_based/src/tile_based_bench_rle.cu b/tile_based/src/tile_based_bench_rle.cu
new file mode 100644
index 0000000..a32e02a
--- /dev/null
+++ b/tile_based/src/tile_based_bench_rle.cu
@@ -0,0 +1,138 @@
+#define CUB_STDERR
+
+#include "tile_based/gpu_utils.h"
+#include "tile_based/kernel.cuh"
+#include "tile_based/ssb_gpu_utils.h"
+#include <bitset>
+#include <chrono>
+#include <cub/cub.cuh>
+#include <cub/util_allocator.cuh>
+#include <cuda.h>
+#include <fstream>
+#include <iostream>
+#include <string>
+
+using namespace std;
+using namespace cub;
+
+CachingDeviceAllocator g_allocator(true);
+
+/**
+ * Decodes one RLE + bin-packed tile per thread block and stores the decoded values in `col`.
+ * Block b handles entries [b * tile_size, (b + 1) * tile_size) with
+ * tile_size = BLOCK_THREADS * ITEMS_PER_THREAD; `shared_buffer` is the launch's dynamic shared memory.
+ */
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void
+runRBinKernel(int* col, uint* val_block_start, uint* val_data, uint* rl_block_start, uint* rl_data, int num_entries) {
+	int tile_size   = BLOCK_THREADS * ITEMS_PER_THREAD;
+	int tile_idx    = blockIdx.x;
+	int tile_offset = tile_idx * tile_size;
+
+	// Per-thread arrays handed to LoadRBinPack; val_block is what gets stored below.
+	int val_block[ITEMS_PER_THREAD];
+	int rl_block[ITEMS_PER_THREAD];
+
+	// Only the final tile can hold fewer than tile_size entries.
+	int  num_tiles      = (num_entries + tile_size - 1) / tile_size;
+	int  num_tile_items = tile_size;
+	bool is_last_tile   = false;
+	if (tile_idx == num_tiles - 1) {
+		num_tile_items = num_entries - tile_offset;
+		is_last_tile   = true;
+	}
+
+	extern __shared__ uint shared_buffer[];
+	LoadRBinPack<BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    val_block_start, rl_block_start, val_data, rl_data, shared_buffer, val_block, rl_block, is_last_tile, num_tile_items);
+
+	__syncthreads();
+
+	for (int i = 0; i < ITEMS_PER_THREAD; i++) {
+		// Slots past the end of a partial last tile are skipped so no store lands beyond num_entries.
+		int item = i * BLOCK_THREADS + threadIdx.x;
+		if (item < num_tile_items) { col[tile_offset + item] = val_block[i]; }
+	}
+}
+
+// Decodes the RLE column once on the GPU, compares it with h_col_orig and returns the time between
+// the two CUDA events in milliseconds, or -1 when the decoded data differs from h_col_orig.
+float runSinglePass(encoded_column          val_col,
+                    encoded_column          rl_col,
+                    int                     num_items,
+                    string                  encoding,
+                    CachingDeviceAllocator& g_allocator,
+                    int*                    h_col_orig) {
+	// Kernel timing
+	float time_query;
+	SETUP_TIMING();
+
+	int* col = NULL;
+	CubDebugExit(g_allocator.DeviceAllocate((void**)&col, num_items * sizeof(int)));
+
+	const int num_threads      = 128;
+	const int items_per_thread = 4; // the only difference for RLE
+	int       tile_size        = num_threads * items_per_thread;
+
+	// Run kernel
+	cudaEventRecord(start, 0);
+	if (encoding == "rbin") {
+		TIME_FUNC((runRBinKernel<num_threads, items_per_thread>
+		           <<<(num_items + tile_size - 1) / tile_size, num_threads, 4096>>>(
+		               col, val_col.block_start, val_col.data, rl_col.block_start, rl_col.data, num_items)),
+		          time_query);
+	}
+	cudaEventRecord(stop, 0);
+	cudaEventSynchronize(stop);
+	cudaEventElapsedTime(&time_query, start, stop);
+
+	CubDebugExit(cudaPeekAtLastError());
+	CubDebugExit(cudaDeviceSynchronize());
+
+	// Copy the decoded column back to the host and look for the first mismatch.
+	int* h_col = new int[num_items];
+	CubDebugExit(cudaMemcpy(h_col, col, sizeof(int) * num_items, cudaMemcpyDeviceToHost));
+	int first_bad = -1;
+	for (int i = 0; i < num_items && first_bad < 0; i++) {
+		if (h_col_orig[i] != h_col[i]) { first_bad = i; }
+	}
+	if (first_bad >= 0) { cout << "ERROR:" << first_bad << " " << h_col_orig[first_bad] << " " << h_col[first_bad] << endl; }
+	else { cout << "Inputs match ! " << endl; }
+	// Release the host and device copies on every path, including the mismatch path.
+	delete[] h_col;
+	CubDebugExit(g_allocator.DeviceFree(col));
+	return first_bad >= 0 ? -1 : time_query;
+}
+
+/***
+ * The goal is to test whether the encoded column can be decoded and whether the result is the
+ * same as the original array. Usage: ./bin/ssb/test_match_rle lo_orderkey
+ */
+int main(int argc, char** argv) {
+	if (argc != 2) {
+		cerr << "usage: test_match_rle <column_name>" << endl;
+		return 1;
+	}
+
+	string column_name = argv[1];
+	string encoding    = "rbin";
+	int    len         = LO_LEN;
+
+	int* h_col_orig = loadColumn<int>(column_name, len);
+
+	encoded_column val_col = loadEncodedColumnToGPURLE(column_name, "valbin", len, g_allocator);
+	encoded_column rl_col  = loadEncodedColumnToGPURLE(column_name, "rlbin", len, g_allocator);
+
+	cudaDeviceSynchronize();
+
+	// runSinglePass returns -1 instead of a time when the decoded column differs from the original.
+	float time_query = runSinglePass(val_col, rl_col, len, encoding, g_allocator, h_col_orig);
+
+	cout << "{"
+	     << "\"query\":6"
+	     << ",\"time_query\":" << time_query << "}" << endl;
+
+	cudaDeviceSynchronize();
+
+	return time_query < 0 ? 1 : 0;
+}
\ No newline at end of file
diff --git a/tile_based/src/tile_based_bench_rle_all_memory.cu b/tile_based/src/tile_based_bench_rle_all_memory.cu
new file mode 100644
index 0000000..e20990b
--- /dev/null
+++ b/tile_based/src/tile_based_bench_rle_all_memory.cu
@@ -0,0 +1,327 @@
+#include "config/tile_based/config.hpp"
+#include "debug/pretty_print.hpp"
+#include "gpu/helper.hpp"
+#include "mixbench-cuda/lcutil.h"
+#include "tile_based/rlebinpack_kernel.cuh"
+#include <cuda.h>
+#include <cuda_profiler_api.h>
+#include <iostream>
+#include <string>
+
+/**
+ * Appends `count` values from `src`, `bitwidth` bits each, to `out` starting at word `ofs` and
+ * leaves `ofs` on the first word after them. A value that does not fit into the current word is
+ * split across that word and the next one.
+ * NOTE(review): for bitwidth == 32 the split path shifts by 32, which is undefined; confirm the
+ * encoded data never needs the full width.
+ */
+static void packRuns(uint* out, uint& ofs, const uint* src, uint count, uint bitwidth) {
+	out[ofs]   = 0; // the first word is built with +=, so it must not start with stale contents
+	uint shift = 0;
+	for (uint i = 0; i < count; i++) {
+		if (shift + bitwidth > 32) {
+			if (shift != 32) out[ofs] += src[i] << shift;
+			ofs++;
+			shift    = (shift + bitwidth) & (32 - 1);
+			out[ofs] = src[i] >> (bitwidth - shift);
+		} else {
+			out[ofs] += src[i] << shift;
+			shift += bitwidth;
+		}
+	}
+	ofs++;
+}
+
+/**
+ * Run-length encodes `in` tile by tile (512 entries per tile) and bin-packs the run values into
+ * `value` and the run lengths into `run_length`.
+ *
+ * Both outputs begin with a 4-word header {block size, miniblock count, entry count, first input
+ * value}. Each tile then adds {minimum, bit width repeated in all four bytes, number of runs}
+ * followed by the packed (entry - minimum) words. `val_offsets[t]` and `rl_offsets[t]` receive the
+ * word offset at which tile t starts; one extra entry after the last tile holds the end offset.
+ *
+ * `in` is only read, and the caller's pointer is left where it was. A final tile shorter than 512
+ * entries is encoded with its real length. The output buffers are written without bounds
+ * checks, so the caller has to size them. `in` must hold at least one entry.
+ *
+ * @return the number of words written to `value` (first) and to `run_length` (second).
+ */
+std::pair<uint, uint>
+rleBinPack(uint*& in, uint*& value, uint*& run_length, uint*& val_offsets, uint*& rl_offsets, uint num_entries) {
+	uint val_offset = 4; // both outputs continue right after their 4-word header
+	uint rl_offset  = 4;
+
+	uint tile_size       = 512; // tile and block coincide here ("nonblock" layout)
+	uint block_size      = tile_size;
+	uint miniblock_count = 4;
+
+	value[0] = block_size;
+	value[1] = miniblock_count;
+	value[2] = num_entries;
+	value[3] = in[0];
+
+	run_length[0] = block_size;
+	run_length[1] = miniblock_count;
+	run_length[2] = num_entries;
+	run_length[3] = in[0];
+
+	uint num_tiles = (num_entries + tile_size - 1) / tile_size;
+
+	// Scratch for the runs of one tile: n entries produce at most n runs.
+	uint* val = new uint[tile_size]();
+	uint* rl  = new uint[tile_size]();
+
+	for (uint tile = 0; tile < num_tiles; tile++) {
+		const uint* src      = in + tile * tile_size;
+		uint        tile_len = std::min(tile_size, num_entries - tile * tile_size);
+
+		// Collapse the tile into (value, run length) pairs.
+		uint count = 0;
+		val[count] = src[0];
+		uint run   = 1;
+		for (uint i = 1; i < tile_len; i++) {
+			if (src[i] != src[i - 1]) {
+				rl[count] = run;
+				count++;
+				val[count] = src[i];
+				run        = 1;
+			} else {
+				run++;
+			}
+		}
+		rl[count] = run;
+		count++;
+
+		rl_offsets[tile]  = rl_offset;
+		val_offsets[tile] = val_offset;
+
+		// Frame of reference: find the smallest value and the smallest run length of the tile.
+		uint min_val = val[0];
+		uint min_rl  = rl[0];
+		for (uint i = 1; i < count; i++) {
+			if (val[i] < min_val) min_val = val[i];
+			if (rl[i] < min_rl) min_rl = rl[i];
+		}
+
+		// Subtract the minima and find the widest remaining value of each stream.
+		uint val_bitwidth = 0;
+		uint rl_bitwidth  = 0;
+		for (uint i = 0; i < count; i++) {
+			val[i]        = val[i] - min_val;
+			rl[i]         = rl[i] - min_rl;
+			uint bitwidth = uint(ceil(log2(val[i] + 1)));
+			val_bitwidth  = std::max(val_bitwidth, bitwidth);
+			bitwidth      = uint(ceil(log2(rl[i] + 1)));
+			rl_bitwidth   = std::max(rl_bitwidth, bitwidth);
+		}
+
+		// Per-tile header: minimum, bit width (same value in all four bytes), number of runs.
+		value[val_offset]     = min_val;
+		run_length[rl_offset] = min_rl;
+		val_offset++;
+		rl_offset++;
+
+		value[val_offset]     = val_bitwidth + (val_bitwidth << 8) + (val_bitwidth << 16) + (val_bitwidth << 24);
+		run_length[rl_offset] = rl_bitwidth + (rl_bitwidth << 8) + (rl_bitwidth << 16) + (rl_bitwidth << 24);
+		val_offset++;
+		rl_offset++;
+
+		value[val_offset]     = count;
+		run_length[rl_offset] = count;
+		val_offset++;
+		rl_offset++;
+
+		packRuns(value, val_offset, val, count, val_bitwidth);
+		packRuns(run_length, rl_offset, rl, count, rl_bitwidth);
+	}
+
+	delete[] val;
+	delete[] rl;
+
+	// One entry past the last tile marks where the encoded data ends.
+	val_offsets[num_tiles] = val_offset;
+	rl_offsets[num_tiles]  = rl_offset;
+
+	return std::make_pair(val_offset, rl_offset);
+}
+
+/**
+ * Decodes one RLE + bin-packed tile per thread block and stores the decoded values in `col`.
+ * Block b handles entries [b * tile_size, (b + 1) * tile_size) with
+ * tile_size = BLOCK_THREADS * ITEMS_PER_THREAD; `shared_buffer` is the launch's dynamic shared memory.
+ */
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void
+runRBinKernel(int* col, uint* val_block_start, uint* val_data, uint* rl_block_start, uint* rl_data, int num_entries) {
+	int tile_size   = BLOCK_THREADS * ITEMS_PER_THREAD;
+	int tile_idx    = blockIdx.x;
+	int tile_offset = tile_idx * tile_size;
+
+	// Per-thread arrays handed to LoadRBinPack; val_block is what gets stored below.
+	int val_block[ITEMS_PER_THREAD];
+	int rl_block[ITEMS_PER_THREAD];
+
+	int  num_tiles      = (num_entries + tile_size - 1) / tile_size;
+	int  num_tile_items = tile_size;
+	bool is_last_tile   = false;
+	if (tile_idx == num_tiles - 1) {
+		num_tile_items = num_entries - tile_offset;
+		is_last_tile   = true;
+	}
+
+	extern __shared__ uint shared_buffer[];
+	LoadRBinPack<BLOCK_THREADS, ITEMS_PER_THREAD>(
+	    val_block_start, rl_block_start, val_data, rl_data, shared_buffer, val_block, rl_block, is_last_tile, num_tile_items);
+
+	__syncthreads();
+
+	for (int i = 0; i < ITEMS_PER_THREAD; i++) {
+		// Slots past the end of a partial last tile are skipped so no store lands beyond num_entries.
+		int item = i * BLOCK_THREADS + threadIdx.x;
+		if (item < num_tile_items) { col[tile_offset + item] = val_block[i]; }
+	}
+}
+
+namespace tile_based {
+// Copies `len` elements from host array `src` into a newly cudaMalloc'ed device buffer and returns it.
+template <typename T>
+T* loadColumnToGPU(T* src, int len) {
+	T* dest = nullptr;
+	CUDA_SAFE_CALL(cudaMalloc((void**)&dest, sizeof(T) * len)); // checked the same way as the copy below
+	CUDA_SAFE_CALL(cudaMemcpy(dest, src, sizeof(T) * len, cudaMemcpyHostToDevice));
+	return dest;
+}
+} // namespace tile_based
+
+/**
+ * Round-trip check (plus optional benchmark) for the tile-based RLE + bin-pack decoder: generates
+ * runs of eight equal values, encodes them on the host with rleBinPack, decodes them on the GPU
+ * with runRBinKernel and compares the result with the generated input.
+ * The benchmark loop at the end is controlled by num_trials.
+ * Returns -1 at the first mismatch and 2 otherwise.
+ */
+int main() {
+	fastlanes::gpu::helper::print_cuda_info();
+
+	cudaSetDevice(0);
+	StoreDeviceInfo(stdout);
+	/* Init */
+	std::cout << "------------------------------------ \n";
+	std::cout << "-- Init :  \n";
+
+	uint64_t  n_tup            = 1 << 28;
+	auto*     h_org_arr        = new uint32_t[n_tup];
+	int       block_size       = 128;
+	int       elem_per_thread  = 4;
+	int       tile_size        = block_size * elem_per_thread;
+	int       n_blocks         = n_tup / block_size;
+	uint64_t  n_ofs            = n_blocks + 1;
+	auto*     h_val_arr        = new uint32_t[n_tup]();
+	auto*     h_len_arr        = new uint32_t[n_tup]();
+	auto*     h_val_ofs_arr    = new uint32_t[n_ofs]();
+	auto*     h_len_ofs_arr    = new uint32_t[n_ofs]();
+	auto*     h_copy_data      = new uint32_t[n_tup]();
+	const int num_threads      = 128;
+	const int items_per_thread = 4;
+	int*      d_decoded_arr    = nullptr;
+	size_t    dg               = (n_tup + tile_size - 1) / tile_size;
+	size_t    db               = num_threads;
+	auto*     h_decoded_arr    = new uint32_t[n_tup];
+	int       num_trials       = 0;
+	cudaMalloc((void**)&d_decoded_arr, n_tup * sizeof(uint32_t));
+
+	std::cout << "------------------------------------ \n";
+	std::cout << "-- Generate :  \n";
+	std::cout << "-- delta " << tile_based::delta << '\n';
+	SHOW(n_tup)
+
+	/* Generate alternating runs of eight 200s and eight 300s. Every element is written exactly
+	 * once; h_val_arr and h_len_arr stay zeroed because rleBinPack accumulates into them. */
+	for (size_t idx = 0; idx < n_tup; ++idx) {
+		h_org_arr[idx] = ((idx / 8) % 2 == 0) ? 200 : 300;
+	}
+
+	std::cout << "------------------------------------ \n";
+	std::cout << "-- Encode : \n";
+
+	/* The encoder works on its own copy of the input. */
+	memcpy(h_copy_data, h_org_arr, n_tup * sizeof(int));
+
+	debug::pretty::print_table<uint32_t, 128, 1>(h_copy_data);
+
+	/* rleBinPack takes the input pointer by reference and may move it, so it gets a scratch
+	 * pointer and h_copy_data keeps pointing at the start of the buffer for the print below. */
+	uint32_t* pack_in = h_copy_data;
+	auto      pair    = rleBinPack(pack_in, h_val_arr, h_len_arr, h_val_ofs_arr, h_len_ofs_arr, n_tup);
+	SHOW(pair.first)
+	SHOW(pair.second)
+
+	debug::pretty::print_table<uint32_t, 128, 1>(h_org_arr);
+	debug::pretty::print_table<uint32_t, 128, 1>(h_copy_data);
+	debug::pretty::print_table<uint32_t, 128, 1>(h_val_arr);
+	debug::pretty::print_table<uint32_t, 128, 1>(h_len_arr);
+
+	std::cout << "------------------------------------ \n";
+	std::cout << "-- Load encoded data into GPU : \n";
+
+	uint* d_val_ofs_arr = tile_based::loadColumnToGPU<uint>(h_val_ofs_arr, n_ofs);
+	uint* d_val_arr     = tile_based::loadColumnToGPU<uint>(h_val_arr, n_tup);
+	uint* d_len_ofs_arr = tile_based::loadColumnToGPU<uint>(h_len_ofs_arr, n_ofs);
+	uint* d_len_arr     = tile_based::loadColumnToGPU<uint>(h_len_arr, n_tup);
+
+	cudaDeviceSynchronize();
+	std::cout << "------------------------------------ \n";
+	std::cout << "-- Decode :  \n";
+
+	// The third launch parameter (4096 bytes) sizes the kernel's dynamic shared_buffer.
+	runRBinKernel<num_threads, items_per_thread>
+	    <<<dg, db, 4096>>>(d_decoded_arr, d_val_ofs_arr, d_val_arr, d_len_ofs_arr, d_len_arr, n_tup);
+
+	std::cout << "------------------------------------ \n";
+	std::cout << "-- Copy data to host :  \n";
+
+	cudaDeviceSynchronize();
+
+	CUDA_SAFE_CALL(cudaMemcpy(h_decoded_arr, d_decoded_arr, sizeof(int) * n_tup, cudaMemcpyDeviceToHost));
+	cudaDeviceSynchronize();
+
+	std::cout << "------------------------------------ \n";
+	std::cout << "-- Test :  \n";
+	for (int i = 0; i < n_tup; i++) {
+		if (h_org_arr[i] != h_decoded_arr[i]) {
+			std::cout << "ERROR:" << i << " " << h_org_arr[i] << " " << h_decoded_arr[i] << '\n';
+			return -1;
+		}
+	}
+
+	std::cout << "-- Inputs match ! " << '\n';
+	std::cout << "------------------------------------ \n";
+	std::cout << "-- Bench :  \n";
+
+	// Run trials (num_trials is 0 above, so this loop only runs once that constant is raised).
+	// The launch mirrors the verified decode above: d_len_arr, not d_val_arr, is the run-length data.
+	for (int t = 0; t < num_trials; t++) {
+		// Kernel timing
+		float query_time;
+		SETUP_TIMING();
+
+		cudaEventRecord(start, nullptr);
+		runRBinKernel<num_threads, items_per_thread>
+		    <<<dg, db, 4096>>>(d_decoded_arr, d_val_ofs_arr, d_val_arr, d_len_ofs_arr, d_len_arr, n_tup);
+		cudaEventRecord(stop, nullptr);
+
+		cudaEventSynchronize(stop);
+		cudaEventElapsedTime(&query_time, start, stop);
+
+		CubDebugExit(cudaPeekAtLastError());
+		CubDebugExit(cudaDeviceSynchronize());
+
+		std::cout << "-- Query-time: " << std::to_string(t) << " : " << query_time << " ms " << '\n';
+		std::cout << "-- Effective-memory-bandwidth: " << std::to_string(t) << " : "
+		          << fastlanes::gpu::helper::BWEffective(3 * n_tup / 8, n_tup * 4, query_time) << " GB/s" << '\n';
+		cudaDeviceSynchronize();
+	}
+
+	return 2;
+}
\ No newline at end of file
diff --git a/tile_based/src/tile_based_binpack_query_11.cu b/tile_based/src/tile_based_binpack_query_11.cu
new file mode 100644
index 0000000..2845377
--- /dev/null
+++ b/tile_based/src/tile_based_binpack_query_11.cu
@@ -0,0 +1,281 @@
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <iostream>
+#include <stdio.h>
+#include <curand.h>
+
+#include <cuda.h>
+#include <cub/util_allocator.cuh>
+#include <cub/cub.cuh>
+
+// #include "cub/test/test_util.h"
+#include "utils/gpu_utils.h"
+#include "ssb_gpu_utils.h"
+#include "econfig.h"
+
+using namespace std;
+using namespace cub;
+
+/**
+ * Globals, constants and typedefs
+ */
+bool                    g_verbose = false;  // Whether to display input/output to console
+CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory
+
+template<typename T>
+T* loadToGPU(T* src, int numEntries, CachingDeviceAllocator& g_allocator) {
+  T* device_copy = NULL;  // numEntries elements obtained from g_allocator, filled from src below
+  CubDebugExit(g_allocator.DeviceAllocate(reinterpret_cast<void**>(&device_copy), sizeof(T) * numEntries));
+  CubDebugExit(cudaMemcpy(device_copy, src, sizeof(T) * numEntries, cudaMemcpyHostToDevice));
+  return device_copy;
+}
+
+/**
+ * Revenue query over four bin-packed lineorder columns.
+ *
+ * Each thread block decodes one tile (BLOCK_THREADS * ITEMS_PER_THREAD rows) of every column with
+ * LoadBinPack, keeps the rows with 19930000 < lo_orderdate < 19940000, lo_quantity < 25 and
+ * 1 <= lo_discount <= 3, sums lo_discount * lo_extendedprice over them and adds the block total
+ * to *revenue with one atomicAdd.
+ *
+ * Every column is passed as a (block_start, data) pair that is forwarded to LoadBinPack unchanged;
+ * lo_num_entries is the number of rows and determines which block handles the partial last tile.
+ *
+ * @tparam BLOCK_THREADS     threads per block
+ * @tparam ITEMS_PER_THREAD  rows handled by each thread
+ * NOTE(review): the tail guards index rows as threadIdx.x * ITEMS_PER_THREAD + ITEM; confirm that
+ * this matches the order in which LoadBinPack hands items to a thread.
+ */
+template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void QueryKernel(
+    uint* lo_orderdate_block_start, uint* lo_orderdate_data,
+    uint* lo_discount_block_start, uint* lo_discount_data,
+    uint* lo_quantity_block_start, uint* lo_quantity_data,
+    uint* lo_extendedprice_block_start, uint* lo_extendedprice_data,
+    int lo_num_entries, unsigned long long* revenue) {
+  // The per-thread sums are long long, so the block reduction has to be long long as well;
+  // reducing them as int would truncate every partial sum.
+  typedef cub::BlockReduce<long long, BLOCK_THREADS> BlockReduceSum;
+
+  int tile_size = BLOCK_THREADS * ITEMS_PER_THREAD;
+  int tile_idx = blockIdx.x;    // Current tile index
+  int tile_offset = tile_idx * tile_size;
+
+  // Shared memory: the decode scratch and the reduction storage share one union, hence the
+  // barriers between their uses.
+  __shared__ union TempStorage
+  {
+    typename BlockReduceSum::TempStorage reduce;
+    uint shared_buffer[BLOCK_THREADS * ITEMS_PER_THREAD];
+  } temp_storage;
+
+  // Per-thread items of the column being filtered, their selection flags, and the price column
+  int items[ITEMS_PER_THREAD];
+  int selection_flags[ITEMS_PER_THREAD];
+  int items2[ITEMS_PER_THREAD];
+
+  long long sum = 0;
+
+  int num_tiles = (lo_num_entries + tile_size - 1) / tile_size;
+  int num_tile_items = tile_size;
+  bool is_last_tile = false;
+  if (tile_idx == num_tiles - 1) {
+    num_tile_items = lo_num_entries - tile_offset;
+    is_last_tile = true;
+  }
+
+  // Predicate 1: 19930000 < lo_orderdate < 19940000.
+  LoadBinPack<BLOCK_THREADS,ITEMS_PER_THREAD>(lo_orderdate_block_start, lo_orderdate_data, temp_storage.shared_buffer, items, is_last_tile, num_tile_items);
+
+  // Barrier for smem reuse
+  __syncthreads();
+
+
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+  {
+    // Rows beyond the end of the last tile keep this initial flag; the guarded sum below skips them
+    selection_flags[ITEM] = 1;
+
+    if (!is_last_tile || (int(threadIdx.x * ITEMS_PER_THREAD) + ITEM < num_tile_items))
+      selection_flags[ITEM] = (items[ITEM] > 19930000 && items[ITEM] < 19940000);
+  }
+
+  __syncthreads();
+
+  // Predicate 2: lo_quantity < 25.
+  LoadBinPack<BLOCK_THREADS,ITEMS_PER_THREAD>(lo_quantity_block_start, lo_quantity_data, temp_storage.shared_buffer, items, is_last_tile, num_tile_items);
+
+  // Barrier for smem reuse
+  __syncthreads();
+
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+  {
+    if (!is_last_tile || (int(threadIdx.x * ITEMS_PER_THREAD) + ITEM < num_tile_items))
+      selection_flags[ITEM] = selection_flags[ITEM] && items[ITEM] < 25;
+  }
+
+  __syncthreads();
+
+  // Predicate 3: 1 <= lo_discount <= 3. `items` keeps the discount for the aggregation below.
+  LoadBinPack<BLOCK_THREADS,ITEMS_PER_THREAD>(lo_discount_block_start, lo_discount_data, temp_storage.shared_buffer, items, is_last_tile, num_tile_items);
+
+  // Barrier for smem reuse
+  __syncthreads();
+
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+  {
+    if (!is_last_tile || (int(threadIdx.x * ITEMS_PER_THREAD) + ITEM < num_tile_items))
+      selection_flags[ITEM] = selection_flags[ITEM] && items[ITEM] >= 1 && items[ITEM ] <= 3;
+  }
+
+  __syncthreads();
+
+  // lo_extendedprice goes into items2 so that it does not overwrite the discount in `items`.
+  LoadBinPack<BLOCK_THREADS,ITEMS_PER_THREAD>(lo_extendedprice_block_start, lo_extendedprice_data, temp_storage.shared_buffer, items2, is_last_tile, num_tile_items);
+
+  __syncthreads();
+
+
+  // Accumulate discount * extendedprice of the selected rows; the product is widened first so it
+  // cannot overflow int.
+  #pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+  {
+    if (!is_last_tile || (int(threadIdx.x * ITEMS_PER_THREAD) + ITEM < num_tile_items))
+      if (selection_flags[ITEM])
+        sum += static_cast<long long>(items[ITEM]) * items2[ITEM];
+  }
+
+  // Barrier before the reduction reuses the union that backed the decode scratch
+  __syncthreads();
+
+  // Block-wide reduction; cub::BlockReduce only defines the returned aggregate for thread 0.
+  unsigned long long aggregate = BlockReduceSum(temp_storage.reduce).Sum(sum);
+
+  __syncthreads();
+
+  // Thread 0 publishes the block total.
+  if (threadIdx.x == 0) {
+    atomicAdd(revenue, aggregate);
+  }
+}
+
+/**
+ * Runs QueryKernel once over the four encoded columns and prints the revenue and the wall-clock
+ * time of the call.
+ *
+ * @param lo_num_entries number of rows; it fixes the number of thread blocks launched.
+ * @return the time between the two CUDA events in milliseconds; the interval starts before the
+ *         result slot is allocated and zeroed, so it covers that setup as well as the kernel.
+ */
+float runQuery(encoded_column lo_orderdate, encoded_column lo_discount, encoded_column lo_quantity,
+    encoded_column lo_extendedprice,
+    int lo_num_entries, CachingDeviceAllocator&  g_allocator) {
+  SETUP_TIMING();
+
+  float time_query;
+  // Host-side stopwatch for the whole call.
+  chrono::high_resolution_clock::time_point st, finish;
+  st = chrono::high_resolution_clock::now();
+
+  cudaEventRecord(start, 0);
+
+  // Device-side accumulator that the kernel adds its block totals to.
+  unsigned long long* d_sum = NULL;
+  CubDebugExit(g_allocator.DeviceAllocate((void**)&d_sum, sizeof(long long)));
+  CubDebugExit(cudaMemset(d_sum, 0, sizeof(long long)));
+
+  // Run: one thread block per tile. The block size comes from num_threads so that it cannot drift
+  // away from the kernel's BLOCK_THREADS template argument.
+  const int num_threads = 128;
+  const int items_per_thread = 4;
+  int tile_size = num_threads * items_per_thread;
+  int num_tiles = (lo_num_entries + tile_size - 1)/tile_size;
+  QueryKernel<num_threads, items_per_thread><<<num_tiles, num_threads>>>(
+          lo_orderdate.block_start, lo_orderdate.data,
+          lo_discount.block_start, lo_discount.data,
+          lo_quantity.block_start, lo_quantity.data,
+          lo_extendedprice.block_start, lo_extendedprice.data,
+          lo_num_entries, d_sum);
+
+  cudaEventRecord(stop, 0);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&time_query, start,stop);
+
+  // Report a failed launch or kernel instead of printing whatever d_sum happens to hold.
+  CubDebugExit(cudaPeekAtLastError());
+
+  unsigned long long revenue;
+  CubDebugExit(cudaMemcpy(&revenue, d_sum, sizeof(long long), cudaMemcpyDeviceToHost));
+
+  finish = chrono::high_resolution_clock::now();
+  std::chrono::duration<double> diff = finish - st;
+
+  // The wall-clock figure spans everything since `st`, including the device-to-host copy.
+  cout << "Revenue: " << revenue << endl;
+  cout << "Time Taken Total: " << diff.count() * 1000 << endl;
+
+  CLEANUP(d_sum);
+
+  return time_query;
+}
+
+/**
+ * Main: loads four encoded columns with loadEncodedColumn; the GPU upload and the query trials are commented out.
+ */
+int main(int argc, char** argv)
+{
+  int num_trials  = 3;  // only referenced by the commented-out trial loop below
+  string encoding = ENCODING;
+
+  // Initialize command line
+  // CommandLineArgs args(argc, argv);
+  // args.GetCmdLineArgument("t", num_trials);
+  // args.GetCmdLineArgument("e", encoding);
+
+  // // Print usage
+  // if (args.CheckCmdLineFlag("help"))
+  // {
+  //     printf("%s "
+  //         "[--t=<num trials>] "
+  //         "[--v] "
+  //         "\n", argv[0]);
+  //     exit(0);
+  // }
+
+  // // Initialize device
+  // CubDebugExit(args.DeviceInit());
+
+  encoded_column h_lo_extendedprice = loadEncodedColumn("lo_extendedprice", encoding, LO_LEN);
+  encoded_column h_lo_discount = loadEncodedColumn("lo_discount", encoding, LO_LEN);
+  encoded_column h_lo_quantity = loadEncodedColumn("lo_quantity", encoding, LO_LEN);
+  encoded_column h_lo_orderdate = loadEncodedColumn("lo_orderdate", encoding, LO_LEN);
+
+  float transfer = 0;  // only referenced by the commented-out report below
+
+	// encoded_column d_lo_extendedprice = loadEncodedColumnToGPU(h_lo_extendedprice, LO_LEN, g_allocator);
+	// encoded_column d_lo_discount = loadEncodedColumnToGPU(h_lo_discount, LO_LEN, g_allocator);
+	// encoded_column d_lo_quantity = loadEncodedColumnToGPU(h_lo_quantity, LO_LEN, g_allocator);
+	// encoded_column d_lo_orderdate = loadEncodedColumnToGPU(h_lo_orderdate, LO_LEN, g_allocator);
+ //
+ //  cout << "** LOADED DATA TO GPU **" << endl;
+ //  cout << "Encoding: " << encoding << endl;
+ //
+ //  for (int t = 0; t < num_trials; t++) {
+ //    float time_query;
+ //    time_query = runQuery(d_lo_orderdate, d_lo_discount, d_lo_quantity,
+ //            d_lo_extendedprice,
+ //            LO_LEN, g_allocator);
+ //    cout<< "{"
+ //        << "\"query\":11"
+ //        << ",\"time_query\":" << time_query
+ //        << ",\"time_transfer_query\":" << time_query + transfer
+ //        << "}" << endl;
+ //  }
+
+  return 0;
+}
\ No newline at end of file
diff --git a/tile_based/src/tile_based_bitpack_shared_memory.cu b/tile_based/src/tile_based_bitpack_shared_memory.cu
new file mode 100644
index 0000000..a98f786
--- /dev/null
+++ b/tile_based/src/tile_based_bitpack_shared_memory.cu
@@ -0,0 +1,256 @@
+#include "config.hpp"
+#include "crystal/crystal.cuh"
+#include "cub/test/test_util.h"
+#include "data/footer/ssb/ssb.hpp"
+#include "fls_gen/unpack/unpack.cuh"
+#include "gpu_utils.h"
+#include "ssb_utils.h"
+#include "tile_based/kernel.cuh"
+#include <fls_gen/pack/pack.hpp>
+#include <iostream>
+#include <stdio.h>
+
+using namespace std;
+using namespace fastlanes::gpu;
+using namespace fastlanes;
+
+/**
+ * Bin-packs `tup_c` values from `in` into `out` in blocks of 128 values.
+ *
+ * `out` starts with a 4-word header {block size, miniblock count, value count, first value}. Each
+ * block then stores its minimum, one word holding the bit width of each of its 4 miniblocks (one
+ * byte apiece; all four are set to the block maximum), and the packed (value - minimum) words of
+ * the 4 miniblocks of 32 values. `block_offsets[b]` receives the word offset of block b, and
+ * one final entry holds the end offset.
+ *
+ * The values behind `in` are overwritten with (value - block minimum); the caller's `in` pointer
+ * itself is not moved. `tup_c` is expected to be a multiple of 128: a trailing partial block
+ * is read as a full one.
+ * NOTE(review): a bit width of 32 makes the word-split shift undefined; confirm inputs stay below it.
+ *
+ * @return the number of words written to `out`.
+ */
+uint32_t bin_pack(uint32_t*& in, uint32_t*& out, uint32_t*& block_offsets, uint32_t tup_c) {
+	const uint32_t block_size      = 128;
+	const uint32_t miniblock_count = 4;
+	const uint32_t miniblock_size  = block_size / miniblock_count;
+
+	// Global header.
+	out[0] = block_size;
+	out[1] = miniblock_count;
+	out[2] = tup_c;
+	out[3] = in[0];
+
+	uint32_t out_ofs = 4;
+
+	for (uint32_t idx = 0; idx < tup_c; idx += block_size) {
+		uint32_t* block                 = in + idx; // local cursor; `in` refers to the caller's pointer
+		block_offsets[idx / block_size] = out_ofs;
+
+		// Frame of reference: subtract the block minimum from every value.
+		uint32_t min_val = block[0];
+		for (uint32_t i = 1; i < block_size; i++) {
+			if (block[i] < min_val) { min_val = block[i]; }
+		}
+		for (uint32_t i = 0; i < block_size; i++) {
+			block[i] -= min_val;
+		}
+
+		// Simple BinPack: every miniblock uses the widest bit width found in the block.
+		uint32_t bitwidth = 0;
+		for (uint32_t i = 0; i < block_size; i++) {
+			bitwidth = std::max(bitwidth, uint32_t(ceil(log2(block[i] + 1))));
+		}
+
+		// Block header: minimum, then the four (identical) miniblock bit widths.
+		out[out_ofs] = min_val;
+		out_ofs++;
+
+		out[out_ofs] = bitwidth + (bitwidth << 8) + (bitwidth << 16) + (bitwidth << 24);
+		out_ofs++;
+
+		// Each miniblock starts on a fresh word. That word is built with +=, so it is cleared
+		// first in case `out` still holds data from an earlier use.
+		for (uint32_t miniblock = 0; miniblock < miniblock_count; miniblock++) {
+			const uint32_t* mini = block + miniblock * miniblock_size;
+			out[out_ofs]         = 0;
+			uint32_t shift       = 0;
+			for (uint32_t i = 0; i < miniblock_size; i++) {
+				if (shift + bitwidth > 32) {
+					// The value does not fit: its low bits finish the current word (unless that
+					// word is already full) and its high bits start the next one.
+					if (shift != 32) { out[out_ofs] += mini[i] << shift; }
+					out_ofs++;
+					shift        = (shift + bitwidth) & (32 - 1);
+					out[out_ofs] = mini[i] >> (bitwidth - shift);
+				} else {
+					out[out_ofs] += mini[i] << shift;
+					shift += bitwidth;
+				}
+			}
+			out_ofs++;
+		}
+	}
+
+	// One entry past the last block marks where the encoded data ends.
+	block_offsets[tup_c / block_size] = out_ofs;
+
+	return out_ofs;
+}
+
+struct QueryMtd {
+	n_t      n_vec;  // grid size: query_aggregate launches this many thread blocks
+	uint     bw;     // bit width the caller generated the data with
+	n_t      n_tup;  // total number of tuples in the column
+	uint64_t result; // expected sum; query_aggregate compares the kernel's total against it
+};
+
+/**
+ * Sums every value of a bin-packed column: each thread block decodes one tile with LoadBinPack,
+ * reduces its per-thread sums with BlockSum, and thread 0 adds the block total to *revenue.
+ * The kernel does not consult query_mtd; there is no tail handling, so every launched block is
+ * decoded as a full tile.
+ */
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void
+QueryKernel(const uint* col_block_start, const uint* col_data, QueryMtd query_mtd, unsigned long long* revenue) {
+	// Decoded items owned by this thread.
+	uint32_t items[ITEMS_PER_THREAD];
+
+	// Scratch handed to LoadBinPack.
+	static __shared__ uint32_t unpacked[512];
+
+	LoadBinPack<BLOCK_THREADS, ITEMS_PER_THREAD>(col_block_start, col_data, unpacked, items);
+
+	long long sum = 0;
+#pragma unroll
+	for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
+		sum += items[ITEM];
+	}
+
+	__syncthreads();
+
+	static __shared__ long long buffer[32];
+	unsigned long long aggregate = BlockSum<long long, BLOCK_THREADS, ITEMS_PER_THREAD>(sum, (long long*)buffer);
+	__syncthreads();
+
+	if (threadIdx.x == 0) { atomicAdd(revenue, aggregate); }
+}
+
+// Launches QueryKernel with hardcoded.n_vec blocks, compares the summed column with hardcoded.result
+// (throws std::runtime_error on a mismatch) and returns the time between the CUDA events in ms.
+template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
+float query_aggregate(const uint*                  col_block_start,
+                      const uint*                  col_data,
+                      QueryMtd                     hardcoded,
+                      cub::CachingDeviceAllocator& g_allocator) {
+	SETUP_TIMING();
+	float                                     time_query;
+	chrono::high_resolution_clock::time_point st, finish;
+	st = chrono::high_resolution_clock::now();
+	cudaEventRecord(start, 0);
+	unsigned long long* d_sum = NULL;
+	CubDebugExit(g_allocator.DeviceAllocate((void**)&d_sum, sizeof(long long)));
+	CubDebugExit(cudaMemset(d_sum, 0, sizeof(long long)));
+
+	// Run
+	QueryKernel<BLOCK_THREADS, ITEMS_PER_THREAD>
+	    <<<hardcoded.n_vec, BLOCK_THREADS>>>(col_block_start, col_data, hardcoded, d_sum);
+
+	cudaEventRecord(stop, 0);
+	cudaEventSynchronize(stop);
+	cudaEventElapsedTime(&time_query, start, stop);
+
+	unsigned long long revenue;
+	CubDebugExit(cudaMemcpy(&revenue, d_sum, sizeof(uint64_t), cudaMemcpyDeviceToHost));
+
+	finish                             = chrono::high_resolution_clock::now();
+	std::chrono::duration<double> diff = finish - st;
+
+	double total_time_taken {diff.count() * 1000};
+	FLS_SHOW(total_time_taken)
+
+	// The result is on the host now; release the accumulator before the check below can throw.
+	CLEANUP(d_sum);
+
+	/*Check the result*/
+	FLS_SHOW(revenue)
+	if (revenue != hardcoded.result) { throw std::runtime_error("RESULT INCORRECT!"); }
+
+	return time_query;
+}
+
+n_t bitpacked_vec_n_tup(uint bitdwith) {
+	/* Returns the given bit width multiplied by 32. */
+	return 32 * bitdwith;
+}
+namespace tile_based {
+// Copies `len` elements from host array `src` into a newly cudaMalloc'ed device buffer and returns it.
+template <typename T>
+T* loadColumnToGPU(const T* src, int len) {
+	T* dest = nullptr;
+	CubDebugExit(cudaMalloc((void**)&dest, sizeof(T) * len)); // checked the same way as the copy below
+	CubDebugExit(cudaMemcpy(dest, src, sizeof(T) * len, cudaMemcpyHostToDevice));
+	return dest;
+}
+} // namespace tile_based
+
+// For every bit width 0..32: fills a column with the constant (5 & mask), bin-packs it, uploads it
+// and runs the aggregation kernel `repeat` times; query_aggregate checks the sum on every run.
+void shared_memory_bitpacking_with_aggregation() {
+	constexpr uint64_t n_vec        = 2 * 256 * 1024;
+	constexpr uint64_t vec_sz       = 512;
+	constexpr uint64_t n_tup        = vec_sz * n_vec;
+	auto*              h_org_arr    = new uint32_t[n_tup];
+	size_t             repeat       = 3;
+	int                block_size   = 128;
+	int                num_blocks   = n_tup / block_size;
+	auto*              encoded_data = new uint32_t[n_tup]();
+	uint64_t           ofs_c        = num_blocks + 1;
+	auto*              ofs_arr      = new uint32_t[ofs_c]();
+	auto*              copy_data    = new uint32_t[n_tup];
+
+	for (uint bitwidth {0}; bitwidth < 33; bitwidth++) {
+		uint32_t bw   = bitwidth;
+		uint32_t mask = bitwidth >= 32 ? 0xFFFFFFFFu : ((1u << bitwidth) - 1); // 1 << 32 is undefined
+
+		FLS_SHOW(bw)
+		uint64_t sum {0};
+		/* generate the column: the constant 5 clipped to the bit width. */
+		for (uint64_t i = 0; i < n_tup; i++) {
+			h_org_arr[i] = 5 & mask;
+			sum += h_org_arr[i];
+		}
+		FLS_SHOW(sum)
+
+		/* Data needs to be copied: the encoding changes the data it is given. */
+		memcpy(copy_data, h_org_arr, n_tup * sizeof(int));
+
+		/* bin_pack takes the input pointer by reference and may move it, so it gets a scratch
+		 * pointer and copy_data keeps pointing at the start of the buffer for the next round. */
+		uint32_t* pack_in = copy_data;
+		bin_pack(pack_in, encoded_data, ofs_arr, n_tup);
+
+		uint* d_col_block_start = tile_based::loadColumnToGPU<uint>(reinterpret_cast<uint*>(ofs_arr), num_blocks + 1);
+		uint* d_col_data        = tile_based::loadColumnToGPU<uint>(reinterpret_cast<uint*>(encoded_data), n_tup);
+
+		QueryMtd query_mtd {n_vec, bitwidth, n_tup, sum};
+		for (int i {0}; i < repeat; ++i) {
+			auto time = query_aggregate<128, 4>(d_col_block_start, d_col_data, query_mtd, g_allocator);
+			FLS_SHOW(time)
+		}
+
+		CLEANUP(d_col_block_start)
+		CLEANUP(d_col_data)
+	}
+
+	delete[] h_org_arr;
+	delete[] encoded_data;
+	delete[] ofs_arr;
+	delete[] copy_data;
+}
+
+int main() {
+	/* Entry point: runs the shared-memory bit-packing aggregation benchmark. */
+	shared_memory_bitpacking_with_aggregation();
+}
\ No newline at end of file
diff --git a/tool/CMakeLists.txt b/tool/CMakeLists.txt
new file mode 100644
index 0000000..7470dd1
--- /dev/null
+++ b/tool/CMakeLists.txt
@@ -0,0 +1 @@
+add_executable(device_query device_query.cu) # standalone tool: prints the properties of each CUDA device
diff --git a/tool/device_query.cu b/tool/device_query.cu
new file mode 100644
index 0000000..8297d3a
--- /dev/null
+++ b/tool/device_query.cu
@@ -0,0 +1,87 @@
+#include <cstdio>
+#include <cuda_runtime_api.h>
+#include <driver_types.h>
+#include <stdio.h>
+
+// Checks for a pending CUDA runtime error at the call site; msg labels the failing step.
+#define HANDLER_ERROR_MSG(msg) (cudaError((msg), __FILE__, __LINE__))
+// Exits with a diagnostic on stderr if the CUDA runtime has a pending error; else a no-op.
+void cudaError(const char* msg, const char* file, int line) {
+	const cudaError_t status = cudaGetLastError();
+	if (status == cudaSuccess) { return; }
+	fprintf(stderr, "%s: %s in %s at line %d\n", msg, cudaGetErrorString(status), file, line);
+	exit(EXIT_FAILURE);
+}
+
+// Prints general, memory and multiprocessor properties of every CUDA device to stdout.
+// The device-count and device-property queries are each followed by HANDLER_ERROR_MSG,
+// which reports a pending CUDA error to stderr and exits the process.
+void deviceProperties() {
+	cudaDeviceProp prop;
+	int            count = 0, driverVersion = 0, runtimeVersion = 0;
+
+	cudaGetDeviceCount(&count);
+	HANDLER_ERROR_MSG("device count");
+	// The version queries do not depend on the device index, so run them once up front.
+	cudaDriverGetVersion(&driverVersion);
+	cudaRuntimeGetVersion(&runtimeVersion);
+
+	for (int i = 0; i < count; i++) {
+		cudaGetDeviceProperties(&prop, i);
+		HANDLER_ERROR_MSG("device prop");
+
+		printf("   --- General Information for device %d ---\n", i);
+		printf("Name:  %s\n", prop.name);
+		printf("CUDA Driver Version  %d.%d\n", driverVersion / 1000, (driverVersion % 100) / 10);
+		printf("Runtime Version %d.%d\n", runtimeVersion / 1000, (runtimeVersion % 100) / 10);
+		printf("Compute capability:  %d.%d\n", prop.major, prop.minor);
+		printf("\n");
+		// clockRate is in kHz; two decimals keep the GHz value informative.
+		printf("Clock rate:  %.0f MHz (%.2f GHz)\n", prop.clockRate * 1e-3f, prop.clockRate * 1e-6f);
+		printf("\n");
+
+		printf("Concurrent kernels:  %s \n", prop.concurrentKernels ? "Enabled" : "Disabled");
+#if CUDART_VERSION >= 5000
+		// Whether the device can copy memory and execute a kernel concurrently.
+		printf("Concurrent copy and kernel execution %s with %d copy engine(s)\n",
+		       (prop.deviceOverlap ? "Enabled" : "Disabled"),
+		       prop.asyncEngineCount);
+#endif
+		// Whether there is a run time limit on kernels.
+		printf("Kernel execution timeout :  %s \n", prop.kernelExecTimeoutEnabled ? "Enabled" : "Disabled");
+		// Integrated (host-memory-sharing) GPU and mapped page-locked host memory support.
+		printf("Integrated GPU sharing Host Memory: %s\n", prop.integrated ? "Enabled" : "Disabled");
+		printf("Support host page-locked memory mapping: %s\n", prop.canMapHostMemory ? "Enabled" : "Disabled");
+
+		printf("\n   --- Memory Information for device %d ---\n", i);
+#if CUDART_VERSION >= 5000
+		// memoryClockRate is in kHz, hence the 1e-6 factor to get GHz.
+		printf("Memory Clock rate: %f GHz\n", prop.memoryClockRate * 1e-6);
+		// Bits of memory transferred per memory clock cycle.
+		printf("Memory Bus Width:  %d-bit\n", prop.memoryBusWidth);
+#endif
+
+		// The byte counts below are size_t fields; %zu is the matching conversion specifier.
+		printf("Total global mem:  %lf Mbytes (%zu bytes) \n", prop.totalGlobalMem / 1048576.0, prop.totalGlobalMem);
+		printf("Total constant Mem:  %zu bytes\n", prop.totalConstMem);
+		printf("Max mem pitch:  %zu bytes\n", prop.memPitch);
+
+		printf("\n   --- MP Information for device %d ---\n", i);
+		printf("Multiprocessor count:  %d\n", prop.multiProcessorCount);
+		printf("Shared mem per block:  %zu bytes \n", prop.sharedMemPerBlock);
+		printf("Registers per block:  %d\n", prop.regsPerBlock);
+		printf("Threads in warp:  %d\n", prop.warpSize);
+#if CUDART_VERSION >= 5000
+		printf("Max threads per multiprocessor: %d\n", prop.maxThreadsPerMultiProcessor);
+#endif
+		printf("Max threads per block:  %d\n", prop.maxThreadsPerBlock);
+		printf("Max thread dimensions:  (%d, %d, %d)\n",
+		       prop.maxThreadsDim[0],
+		       prop.maxThreadsDim[1],
+		       prop.maxThreadsDim[2]);
+		printf("Max grid dimensions:  (%d, %d, %d)\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
+		printf("\n");
+	}
+}
+
+int main(void) { deviceProperties(); return 0; } // entry point: report the properties of each CUDA device
\ No newline at end of file
diff --git a/toolchains/T4.cmake b/toolchains/T4.cmake
new file mode 100644
index 0000000..8fdddce
--- /dev/null
+++ b/toolchains/T4.cmake
@@ -0,0 +1,7 @@
+set(CMAKE_SYSTEM_NAME Linux)
+set(CMAKE_SYSTEM_PROCESSOR x86)
+# Compilers for this toolchain: clang-15 for C/C++, nvcc from /usr/local/cuda for CUDA.
+set(CMAKE_C_COMPILER /usr/bin/clang-15)
+set(CMAKE_CXX_COMPILER /usr/bin/clang++-15)
+set(CMAKE_CUDA_COMPILER /usr/local/cuda/bin/nvcc)
+
diff --git a/toolchains/gtx1080.cmake b/toolchains/gtx1080.cmake
new file mode 100644
index 0000000..fac6805
--- /dev/null
+++ b/toolchains/gtx1080.cmake
@@ -0,0 +1,7 @@
+set(CMAKE_SYSTEM_NAME Linux)
+set(CMAKE_SYSTEM_PROCESSOR x86)
+# Compilers for this toolchain: system clang for C/C++, nvcc from /usr/local/cuda-12.4 for CUDA.
+set(CMAKE_C_COMPILER /usr/bin/clang)
+set(CMAKE_CXX_COMPILER /usr/bin/clang++)
+set(CMAKE_CUDA_COMPILER /usr/local/cuda-12.4/bin/nvcc)
+