diff --git a/octreelib/grid/grid.py b/octreelib/grid/grid.py index e0f9da5..81b2ad6 100644 --- a/octreelib/grid/grid.py +++ b/octreelib/grid/grid.py @@ -154,16 +154,8 @@ def map_leaf_points_cuda_ransac( for i in range(0, len(self.__pose_voxel_coordinates), poses_per_batch) ] - # find the maximum number of leaf voxels across all batches # this is needed to initialize the random number generators on the GPU - max_leaf_voxels = max( - [ - sum([self.n_leaves(pose_number) for pose_number in batch_pose_numbers]) - for batch_pose_numbers in batches - ] - ) ransac = CudaRansac( - max_blocks_number=max_leaf_voxels, hypotheses_number=hypotheses_number, threshold=threshold, ) diff --git a/octreelib/ransac/cuda_ransac.py b/octreelib/ransac/cuda_ransac.py index 5f8b6e4..f803559 100644 --- a/octreelib/ransac/cuda_ransac.py +++ b/octreelib/ransac/cuda_ransac.py @@ -2,14 +2,11 @@ import numpy.typing as npt import numba as nb from numba import cuda -from numba.cuda.random import create_xoroshiro128p_states from octreelib.internal import PointCloud from octreelib.ransac.util import ( - generate_random_indices, get_plane_from_points, measure_distance, - INITIAL_POINTS_NUMBER, ) __all__ = ["CudaRansac"] @@ -25,22 +22,22 @@ class CudaRansac: def __init__( self, - max_blocks_number: int, threshold: float = 0.01, hypotheses_number: int = CUDA_THREADS, + initial_points_number: int = 6, ): """ Initialize the RANSAC parameters. :param threshold: Distance threshold. :param hypotheses_number: Number of RANSAC hypotheses. (<= 1024) - :param max_blocks_number: Maximum number of blocks. + :param initial_points_number: Number of initial points to use in RANSAC. """ self.__threshold: float = threshold self.__threads_per_block = min(hypotheses_number, CUDA_THREADS) - # create random number generator states - self.__rng_states = create_xoroshiro128p_states( - self.__threads_per_block * max_blocks_number, seed=0 + self.__kernel = self.__get_kernel(initial_points_number) + self.__random_hypotheses_cuda = cuda.to_device( + np.random.random((self.__threads_per_block, initial_points_number)) ) def evaluate( @@ -78,12 +75,12 @@ def evaluate( mask_mutex = cuda.to_device(np.zeros(blocks_number, dtype=np.int32)) # call the kernel - self.__ransac_kernel[blocks_number, self.__threads_per_block]( + self.__kernel[blocks_number, self.__threads_per_block]( point_cloud_cuda, block_sizes_cuda, block_start_indices_cuda, + self.__random_hypotheses_cuda, self.__threshold, - self.__rng_states, result_mask_cuda, max_inliers_number_cuda, mask_mutex, @@ -94,65 +91,63 @@ def evaluate( return result_mask @staticmethod - @cuda.jit - def __ransac_kernel( - point_cloud: PointCloud, - block_sizes: npt.NDArray, - block_start_indices: npt.NDArray, - threshold: float, - rng_states, - result_mask: npt.NDArray, - max_inliers_number: npt.NDArray, - mask_mutex: npt.NDArray, - ): - thread_id, block_id = cuda.threadIdx.x, cuda.blockIdx.x + def __get_kernel(initial_points_number): + @cuda.jit + def kernel( + point_cloud: PointCloud, + block_sizes: npt.NDArray, + block_start_indices: npt.NDArray, + random_hypotheses: npt.NDArray, + threshold: float, + result_mask: npt.NDArray, + max_inliers_number: npt.NDArray, + mask_mutex: npt.NDArray, + ): + thread_id, block_id = cuda.threadIdx.x, cuda.blockIdx.x - if block_sizes[block_id] < INITIAL_POINTS_NUMBER: - return + if block_sizes[block_id] < initial_points_number: + return - # select random points as inliers - initial_point_indices = cuda.local.array( - shape=INITIAL_POINTS_NUMBER, dtype=nb.size_t - ) - initial_point_indices = generate_random_indices( - initial_point_indices, - rng_states, - block_sizes[block_id], - INITIAL_POINTS_NUMBER, - ) - for i in range(INITIAL_POINTS_NUMBER): - initial_point_indices[i] = ( - block_start_indices[block_id] + initial_point_indices[i] + # select random points as inliers + initial_point_indices = cuda.local.array( + shape=initial_points_number, dtype=nb.size_t + ) + for i in range(initial_points_number): + initial_point_indices[i] = nb.int32( + random_hypotheses[thread_id][i] * block_sizes[block_id] + + block_start_indices[block_id] + ) + + # calculate the plane coefficients + plane = cuda.local.array(shape=4, dtype=nb.float32) + plane[0], plane[1], plane[2], plane[3] = get_plane_from_points( + point_cloud, initial_point_indices ) - # calculate the plane coefficients - plane = cuda.local.array(shape=4, dtype=nb.float32) - plane[0], plane[1], plane[2], plane[3] = get_plane_from_points( - point_cloud, initial_point_indices - ) - - # for each point in the block check if it is an inlier - inliers_number_local = 0 - for i in range(block_sizes[block_id]): - point = point_cloud[block_start_indices[block_id] + i] - distance = measure_distance(plane, point) - if distance < threshold: - inliers_number_local += 1 - - # replace the maximum number of inliers if the current number is greater - cuda.atomic.max(max_inliers_number, block_id, inliers_number_local) - cuda.syncthreads() - # set the best mask index for this block - # if this thread has the maximum number of inliers - if ( - inliers_number_local == max_inliers_number[block_id] - and cuda.atomic.cas(mask_mutex, block_id, 0, 1) == 0 - ): + # for each point in the block check if it is an inlier + inliers_number_local = 0 for i in range(block_sizes[block_id]): - if ( - measure_distance( - plane, point_cloud[block_start_indices[block_id] + i] - ) - < threshold - ): - result_mask[block_start_indices[block_id] + i] = True + point = point_cloud[block_start_indices[block_id] + i] + distance = measure_distance(plane, point) + if distance < threshold: + inliers_number_local += 1 + + # replace the maximum number of inliers if the current number is greater + cuda.atomic.max(max_inliers_number, block_id, inliers_number_local) + cuda.syncthreads() + # set the best mask index for this block + # if this thread has the maximum number of inliers + if ( + inliers_number_local == max_inliers_number[block_id] + and cuda.atomic.cas(mask_mutex, block_id, 0, 1) == 0 + ): + for i in range(block_sizes[block_id]): + if ( + measure_distance( + plane, point_cloud[block_start_indices[block_id] + i] + ) + < threshold + ): + result_mask[block_start_indices[block_id] + i] = True + + return kernel diff --git a/octreelib/ransac/util.py b/octreelib/ransac/util.py index 4649fb0..5890a62 100644 --- a/octreelib/ransac/util.py +++ b/octreelib/ransac/util.py @@ -2,18 +2,11 @@ These functions are auxiliary functions for the RANSAC algorithm. They cannot be defined inside the `CudaRansac` class because `CudaRansac.__ransac_kernel` would not be able to access them. -This file also contains the `INITIAL_POINTS_NUMBER` constant which -can be used to configure the number of initial points to be used in the RANSAC algorithm. """ import math -import numba as nb from numba import cuda -from numba.cuda.random import xoroshiro128p_uniform_float32 - -# This constant configures the number of initial points to be used in the RANSAC algorithm. -INITIAL_POINTS_NUMBER = 6 @cuda.jit( @@ -34,36 +27,6 @@ def measure_distance(plane, point): ) -@cuda.jit(device=True, inline=True) -def generate_random_int(rng_states, lower_bound, upper_bound): - """ - Generate a random number between a and b. - :param rng_states: Random number generator states. - :param lower_bound: Lower bound. - :param upper_bound: Upper bound. - """ - thread_id = cuda.grid(1) - x = xoroshiro128p_uniform_float32(rng_states, thread_id) - return nb.int32(x * (upper_bound - lower_bound) + lower_bound) - - -@cuda.jit(device=True, inline=True) -def generate_random_indices( - initial_point_indices, rng_states, block_size, points_number -): - """ - Generate random points from the given block. - :param initial_point_indices: Array to store the initial point indices. - :param rng_states: Random number generator states. - :param block_size: Size of the block. - :param points_number: Number of points to generate. - """ - - for i in range(points_number): - initial_point_indices[i] = generate_random_int(rng_states, 0, block_size) - return initial_point_indices - - @cuda.jit(device=True, inline=True) def get_plane_from_points(points, initial_point_indices): """ @@ -79,9 +42,9 @@ def get_plane_from_points(points, initial_point_indices): centroid_y += points[idx][1] centroid_z += points[idx][2] - centroid_x /= INITIAL_POINTS_NUMBER - centroid_y /= INITIAL_POINTS_NUMBER - centroid_z /= INITIAL_POINTS_NUMBER + centroid_x /= initial_point_indices.shape[0] + centroid_y /= initial_point_indices.shape[0] + centroid_z /= initial_point_indices.shape[0] xx, xy, xz, yy, yz, zz = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0