diff --git a/CMakeLists.txt b/CMakeLists.txt index 68a7b26..6c6b113 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -106,6 +106,20 @@ if(BUILD_BENCHMARKS) ${Iridescence_LIBRARIES} ) + # KdTree construction benchmark + add_executable(kdtree_benchmark + src/kdtree_benchmark.cpp + ) + target_include_directories(kdtree_benchmark PUBLIC + include + ${TBB_INCLUDE_DIRS} + ${EIGEN3_INCLUDE_DIR} + ) + target_link_libraries(kdtree_benchmark + fmt::fmt + ${TBB_LIBRARIES} + ) + if(BUILD_WITH_PCL) # Downsampling benchmark add_executable(downsampling_benchmark diff --git a/README.md b/README.md index f0b59a9..587fda2 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ # small_gicp (fast_gicp2) -**small_gicp** is a header-only C++ library that provides efficient and parallelized fine point cloud registration algorithms (ICP, Point-to-Plane ICP, GICP, VGICP, etc.). It is a regined and optimized version of its predecessor, [fast_gicp](https://github.com/SMRT-AIST/fast_gicp), with the following features. +**small_gicp** is a header-only C++ library that provides efficient and parallelized fine point cloud registration algorithms (ICP, Point-to-Plane ICP, GICP, VGICP, etc.). It is a refined and optimized version of its predecessor, [fast_gicp](https://github.com/SMRT-AIST/fast_gicp), with the following features. -- **Highly optimized** : The implementation of the core registration algorithm is further optimized from that in fast_gicp. It can provide up to 2x speed up compared to fast_gicp. +- **Highly ptimized** : The implementation of the core registration algorithm is further optimized from that in fast_gicp. It can provide up to 2x speed up compared to fast_gicp. - **All parallerized** : small_gicp provides parallelized implementations of several algorithms in the point cloud registration process (Downsampling, KdTree construction, Normal/covariance estimation). As a parallelism backend, either (or both) of [OpenMP](https://www.openmp.org/) and [Intel TBB](https://github.com/oneapi-src/oneTBB) can be used. - **Minimum dependency** : Only [Eigen](https://eigen.tuxfamily.org/) (and bundled [nanoflann](https://github.com/jlblancoc/nanoflann) and [Sophus](https://github.com/strasdat/Sophus)) are required at a minimum. Optionally, it provides the [PCL](https://pointclouds.org/) registration interface so that it can be used as a drop-in replacement in many systems. - **Customizable** : small_gicp is implemented with the trait mechanism that allows feeding any custom point cloud class to the registration algorithm. Furthermore, the template-based implementation allows customizing the regisration process with your original correspondence estimator and registration factors. @@ -241,13 +241,21 @@ Coming soon. ### Downsampling -- Single-thread `small_gicp::voxelgrid_sampling` is about 1.3x faster than `pcl::VoxelGrid`. -- Multi-thread `small_gicp::voxelgrid_sampling_tbb` (6 threads) is about 3.2x faster than `pcl::VoxelGrid`. +- Single-threaded `small_gicp::voxelgrid_sampling` is about 1.3x faster than `pcl::VoxelGrid`. +- Multi-threaded `small_gicp::voxelgrid_sampling_tbb` (6 threads) is about 3.2x faster than `pcl::VoxelGrid`. - `small_gicp::voxelgrid_sampling` gives accurate downsampling results (almost identical to those of `pcl::VoxelGrid`) while `pcl::ApproximateVoxelGrid` yields spurious points (up to 2x points). -- `small_gicp::voxelgrid_sampling` can process a larger point cloud with a fine voxel resolution compared to `pcl::VoxelGrid`. +- `small_gicp::voxelgrid_sampling` can process a larger point cloud with a fine voxel resolution compared to `pcl::VoxelGrid` (for a point cloud of 150m width, minimum voxel resolution can be 0.07 mm). ![downsampling_comp](docs/assets/downsampling_comp.png) +### KdTree construction + +- Multi-threaded implementation (TBB and OMP) can be up to 4x faster than the single-threaded one (All the implementations are based on nanoflann). +- Basically the processing speed get faster as the number of threads increases, but the speed gain is not monotonic sometimes (because of the scheduling algorithm or some CPU(AMD 5995WX)-specific issues?). +- This benchmark only compares the construction time (query time is not included). + +![kdtree_time](docs/assets/kdtree_time.png) + ### Odometry estimation - Single-thread `small_gicp::GICP` is about 2.4x and 1.9x faster than `pcl::GICP` and `fast_gicp::GICP`, respectively. @@ -259,6 +267,8 @@ Coming soon. ## License This package is released under the MIT license. +If you find this package useful for your project, please consider leaving a comment here. It would help the author gain internal recognition in his organization and keep working on this project. + ## Papers - Kenji Koide, Masashi Yokozuka, Shuji Oishi, and Atsuhiko Banno, Voxelized GICP for Fast and Accurate 3D Point Cloud Registration, ICRA2021 diff --git a/docker/Dockerfile.gcc b/docker/Dockerfile.gcc index f683b0e..ad6e271 100644 --- a/docker/Dockerfile.gcc +++ b/docker/Dockerfile.gcc @@ -18,7 +18,7 @@ RUN rm -rf ./* RUN cmake .. -DBUILD_WITH_TBB=ON RUN cmake --build . -j$(nproc) -RUN cmake .. -DBUILD_TESTS=ON -DBUILD_WITH_TBB=ON -DBUILD_BENCHMARKS=ON -DBUILD_WITH_PCL=ON +RUN cmake .. -DBUILD_TESTS=ON -DBUILD_EXAMPLES=ON -DBUILD_BENCHMARKS=ON -DBUILD_WITH_TBB=ON -DBUILD_WITH_PCL=ON RUN cmake --build . -j$(nproc) RUN ctest -j$(nproc) diff --git a/docs/assets/kdtree_time.png b/docs/assets/kdtree_time.png new file mode 100644 index 0000000..9a8b6a7 Binary files /dev/null and b/docs/assets/kdtree_time.png differ diff --git a/include/small_gicp/util/downsampling.hpp b/include/small_gicp/util/downsampling.hpp index 2bf88b7..294d230 100644 --- a/include/small_gicp/util/downsampling.hpp +++ b/include/small_gicp/util/downsampling.hpp @@ -29,8 +29,12 @@ std::shared_ptr voxelgrid_sampling(const InputPointCloud& poin std::vector> coord_pt(points.size()); for (size_t i = 0; i < traits::size(points); i++) { - // TODO: Check if coord is within 21bit range const Eigen::Array4i coord = fast_floor(traits::point(points, i) * inv_leaf_size) + coord_offset; + if ((coord < 0).any() || (coord > coord_bit_mask).any()) { + std::cerr << "warning: voxel coord is out of range!!" << std::endl; + coord_pt[i] = {0, i}; + continue; + } // Compute voxel coord bits (0|1bit, z|21bit, y|21bit, x|21bit) const std::uint64_t bits = // diff --git a/include/small_gicp/util/downsampling_omp.hpp b/include/small_gicp/util/downsampling_omp.hpp index d802b7f..c1f98bf 100644 --- a/include/small_gicp/util/downsampling_omp.hpp +++ b/include/small_gicp/util/downsampling_omp.hpp @@ -30,9 +30,12 @@ std::shared_ptr voxelgrid_sampling_omp(const InputPointCloud& std::vector> coord_pt(points.size()); #pragma omp parallel for num_threads(num_threads) schedule(guided, 32) for (size_t i = 0; i < traits::size(points); i++) { - // TODO: Check if coord is within 21bit range const Eigen::Array4i coord = fast_floor(traits::point(points, i) * inv_leaf_size) + coord_offset; - + if ((coord < 0).any() || (coord > coord_bit_mask).any()) { + std::cerr << "warning: voxel coord is out of range!!" << std::endl; + coord_pt[i] = {0, i}; + continue; + } // Compute voxel coord bits (0|1bit, z|21bit, y|21bit, x|21bit) const std::uint64_t bits = // ((coord[0] & coord_bit_mask) << (coord_bit_size * 0)) | // diff --git a/include/small_gicp/util/downsampling_tbb.hpp b/include/small_gicp/util/downsampling_tbb.hpp index 8c1e6b0..10d190e 100644 --- a/include/small_gicp/util/downsampling_tbb.hpp +++ b/include/small_gicp/util/downsampling_tbb.hpp @@ -30,8 +30,12 @@ std::shared_ptr voxelgrid_sampling_tbb(const InputPointCloud& std::vector> coord_pt(points.size()); tbb::parallel_for(tbb::blocked_range(0, traits::size(points), 64), [&](const tbb::blocked_range& range) { for (size_t i = range.begin(); i != range.end(); i++) { - // TODO: Check if coord is within 21bit range const Eigen::Array4i coord = fast_floor(traits::point(points, i) * inv_leaf_size) + coord_offset; + if ((coord < 0).any() || (coord > coord_bit_mask).any()) { + std::cerr << "warning: voxel coord is out of range!!" << std::endl; + coord_pt[i] = {0, i}; + continue; + } // Compute voxel coord bits (0|1bit, z|21bit, y|21bit, x|21bit) const std::uint64_t bits = // diff --git a/kdtree_time.svg b/kdtree_time.svg new file mode 100644 index 0000000..513916c --- /dev/null +++ b/kdtree_time.svg @@ -0,0 +1,2532 @@ + + + + + + + + 2024-03-30T19:07:54.181443 + image/svg+xml + + + Matplotlib v3.5.1, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/scripts/plot_kdtree.py b/scripts/plot_kdtree.py new file mode 100644 index 0000000..01f4091 --- /dev/null +++ b/scripts/plot_kdtree.py @@ -0,0 +1,88 @@ +#!/usr/bin/python3 +import re +import os +import numpy +from matplotlib import pyplot +from collections import namedtuple + +def parse_result(filename): + num_points = [] + results = {} + + with open(filename, 'r') as f: + for line in f.readlines(): + matched = re.match(r'num_threads=(\d+)', line) + if matched: + num_threads = int(matched.group(1)) + continue + + matched = re.match(r'num_points=(\d+)', line) + if matched: + num_points.append(int(matched.group(1))) + continue + + matched = re.match(r'(\S+)_times=(\S+) \+\- (\S+)', line) + if matched: + method = matched.group(1) + mean = float(matched.group(2)) + std = float(matched.group(3)) + + if method not in results: + results[method] = [] + + results[method].append(mean) + continue + + return (num_threads, num_points, results) + +def main(): + results_path = os.path.dirname(__file__) + '/results' + + results = [] + for filename in os.listdir(results_path): + matched = re.match(r'kdtree_benchmark_(\d+).txt', filename) + if not matched: + continue + + results.append(parse_result(results_path + '/' + filename)) + + results = sorted(results, key=lambda x: x[0]) + num_threads = [x[0] for x in results] + num_points = results[0][1] + + fig, axes = pyplot.subplots(1, 2, figsize=(12, 2)) + + axes[0].plot(num_points, results[0][2]['kdtree'], label='kdtree (nanoflann)', marker='o', linestyle='--') + for idx in [1, 3, 5, 7, 8]: + N = num_threads[idx] + axes[0].plot(num_points, results[idx][2]['kdtree_omp'], label='kdtree_omp (%d threads)' % N, marker='s') + axes[0].plot(num_points, results[idx][2]['kdtree_tbb'], label='kdtree_tbb (%d threads)' % N, marker='^') + + baseline = numpy.array(results[0][2]['kdtree']) + axes[1].plot([num_threads[0], num_threads[-1]], [1.0, 1.0], label='kdtree (nanoflann)', linestyle='--') + for idx in [5]: + N = num_points[idx] + axes[1].plot(num_threads, baseline[idx] / [x[2]['kdtree_omp'][idx] for x in results], label='omp (num_points=%d)' % N, marker='s') + axes[1].plot(num_threads, baseline[idx] / [x[2]['kdtree_tbb'][idx] for x in results], label='tbb (num_points=%d)' % N, marker='^') + + axes[1].set_xscale('log') + + axes[0].set_xlabel('Number of points') + axes[0].set_ylabel('KdTree construction time [msec/scan]') + axes[1].set_xlabel('Number of threads = [1, 2, 4, ..., 128]') + axes[1].set_ylabel('Processing speed ratio (single-thread = 1.0)') + + axes[0].grid() + axes[1].grid() + + axes[0].legend() + axes[1].legend() + + fig.savefig('kdtree_time.svg') + + pyplot.show() + + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/scripts/run_kdtree_benchmark.sh b/scripts/run_kdtree_benchmark.sh new file mode 100755 index 0000000..e60dce3 --- /dev/null +++ b/scripts/run_kdtree_benchmark.sh @@ -0,0 +1,12 @@ +#!/bin/bash +dataset_path=$1 +exe_path=../build/kdtree_benchmark + +mkdir results +num_threads=(1 2 3 4 5 6 7 8 16 32 64 128) + +for N in ${num_threads[@]}; do + sleep 1 + echo $exe_path $dataset_path --num_threads $N | tee results/kdtree_benchmark_$N.txt + $exe_path $dataset_path --num_threads $N --num_trials 1000 | tee results/kdtree_benchmark_$N.txt +done diff --git a/src/kdtree_benchmark.cpp b/src/kdtree_benchmark.cpp new file mode 100644 index 0000000..e003685 --- /dev/null +++ b/src/kdtree_benchmark.cpp @@ -0,0 +1,148 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +int main(int argc, char** argv) { + using namespace small_gicp; + + if (argc < 2) { + std::cout << "USAGE: kdtree_benchmark " << std::endl; + std::cout << "OPTIONS:" << std::endl; + std::cout << " --num_threads (default: 4)" << std::endl; + std::cout << " --downsampling_resolution (default: 0.25)" << std::endl; + return 0; + } + + const std::string dataset_path = argv[1]; + + int num_threads = 4; + int num_trials = 100; + + for (int i = 0; i < argc; i++) { + const std::string arg = argv[i]; + if (arg == "--num_threads") { + num_threads = std::stoi(argv[i + 1]); + } else if (arg == "--num_trials") { + num_trials = std::stoi(argv[i + 1]); + } else if (arg.size() >= 2 && arg.substr(0, 2) == "--") { + std::cerr << "unknown option: " << arg << std::endl; + return 1; + } + } + + std::cout << "dataset_path=" << dataset_path << std::endl; + std::cout << "num_threads=" << num_threads << std::endl; + std::cout << "num_trials=" << num_trials << std::endl; + + tbb::global_control tbb_control(tbb::global_control::max_allowed_parallelism, num_threads); + + KittiDataset kitti(dataset_path, 1); + auto raw_points = std::make_shared(kitti.points[0]); + std::cout << "num_raw_points=" << raw_points->size() << std::endl; + + const auto search_voxel_resolution = [&](size_t target_num_points) { + std::pair left = {0.1, voxelgrid_sampling_tbb(*raw_points, 0.1)->size()}; + std::pair right = {1.0, voxelgrid_sampling_tbb(*raw_points, 1.0)->size()}; + + for (int i = 0; i < 20; i++) { + if (left.second < target_num_points) { + left.first *= 0.1; + left.second = voxelgrid_sampling_tbb(*raw_points, left.first)->size(); + continue; + } + if (right.second > target_num_points) { + right.first *= 10.0; + right.second = voxelgrid_sampling_tbb(*raw_points, right.first)->size(); + continue; + } + + const double mid = (left.first + right.first) * 0.5; + const size_t mid_num_points = voxelgrid_sampling_tbb(*raw_points, mid)->size(); + + if (std::abs(1.0 - static_cast(mid_num_points) / target_num_points) < 0.001) { + return mid; + } + + if (mid_num_points > target_num_points) { + left = {mid, mid_num_points}; + } else { + right = {mid, mid_num_points}; + } + } + + return (left.first + right.first) * 0.5; + }; + + std::cout << "---" << std::endl; + + std::vector downsampling_resolutions; + std::vector downsampled; + for (double target = 1.0; target > 0.05; target -= 0.1) { + const double downsampling_resolution = search_voxel_resolution(raw_points->size() * target); + downsampling_resolutions.emplace_back(downsampling_resolution); + downsampled.emplace_back(voxelgrid_sampling_tbb(*raw_points, downsampling_resolution)); + std::cout << "downsampling_resolution=" << downsampling_resolution << std::endl; + std::cout << "num_points=" << downsampled.back()->size() << std::endl; + } + + std::cout << "---" << std::endl; + + // warmup + for (int i = 0; i < 10; i++) { + auto downsampled = voxelgrid_sampling(*raw_points, 0.1); + UnsafeKdTree tree(*downsampled); + UnsafeKdTreeTBB tree_tbb(*downsampled); + UnsafeKdTreeOMP tree_omp(*downsampled, num_threads); + } + + Stopwatch sw; + for (size_t i = 0; i < downsampling_resolutions.size(); i++) { + if (num_threads != 1) { + break; + } + + Summarizer kdtree_times; + sw.start(); + for (size_t j = 0; j < num_trials; j++) { + UnsafeKdTree tree(*downsampled[i]); + sw.lap(); + kdtree_times.push(sw.msec()); + } + std::cout << "kdtree_times=" << kdtree_times.str() << " [msec]" << std::endl; + } + + std::cout << "---" << std::endl; + + for (size_t i = 0; i < downsampling_resolutions.size(); i++) { + Summarizer kdtree_tbb_times; + sw.start(); + for (size_t j = 0; j < num_trials; j++) { + UnsafeKdTreeTBB tree(*downsampled[i]); + sw.lap(); + kdtree_tbb_times.push(sw.msec()); + } + + std::cout << "kdtree_tbb_times=" << kdtree_tbb_times.str() << " [msec]" << std::endl; + } + + std::cout << "---" << std::endl; + + for (size_t i = 0; i < downsampling_resolutions.size(); i++) { + Summarizer kdtree_omp_times; + sw.start(); + for (size_t j = 0; j < num_trials; j++) { + UnsafeKdTreeOMP tree(*downsampled[i], num_threads); + sw.lap(); + kdtree_omp_times.push(sw.msec()); + } + + std::cout << "kdtree_omp_times=" << kdtree_omp_times.str() << " [msec]" << std::endl; + } + + return 0; +}