Skip to content

Commit

Permalink
use shared variables for max_inliers_number and mutex
Browse files Browse the repository at this point in the history
  • Loading branch information
true-real-michael committed Apr 5, 2024
1 parent af36b99 commit 0f094cb
Showing 1 changed file with 16 additions and 18 deletions.
34 changes: 16 additions & 18 deletions octreelib/ransac/cuda_ransac.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,11 +56,6 @@ def evaluate(
# create result mask and copy it to the device
result_mask_cuda = cuda.to_device(np.zeros((len(point_cloud)), dtype=np.bool_))

# create arrays to store the maximum number of inliers and the best mask indices
max_inliers_number_cuda = cuda.to_device(
np.zeros(blocks_number, dtype=np.int32)
)

# copy point_cloud, block_sizes and block_start_indices to the device
point_cloud_cuda = cuda.to_device(point_cloud)
block_sizes_cuda = cuda.to_device(block_sizes)
Expand All @@ -71,9 +66,6 @@ def evaluate(
np.cumsum(np.concatenate(([0], block_sizes[:-1])))
)

# this mutex is needed to make sure that only one thread writes to the mask
mask_mutex = cuda.to_device(np.zeros(blocks_number, dtype=np.int32))

# call the kernel
self.__kernel[blocks_number, self.__threads_per_block](
point_cloud_cuda,
Expand All @@ -82,8 +74,6 @@ def evaluate(
self.__random_hypotheses_cuda,
self.__threshold,
result_mask_cuda,
max_inliers_number_cuda,
mask_mutex,
)

# copy result mask back to the host
Expand All @@ -100,8 +90,6 @@ def kernel(
random_hypotheses: npt.NDArray,
threshold: float,
result_mask: npt.NDArray,
max_inliers_number: npt.NDArray,
mask_mutex: npt.NDArray,
):
thread_id, block_id = cuda.threadIdx.x, cuda.blockIdx.x

Expand Down Expand Up @@ -132,22 +120,32 @@ def kernel(
if distance < threshold:
inliers_number_local += 1

# replace the maximum number of inliers if the current number is greater
cuda.atomic.max(max_inliers_number, block_id, inliers_number_local)
# shared memory to store the best plane and the maximum number of inliers
# for all hypotheses
best_plane = cuda.shared.array(shape=4, dtype=nb.float32)
max_inliers_number = cuda.shared.array(shape=1, dtype=nb.int32)
# this mutex is needed to make sure that only one thread writes to the mask
mutex = cuda.shared.array(shape=1, dtype=nb.int32)
if thread_id == 0:
max_inliers_number[0] = 0
mutex[0] = 0
cuda.syncthreads()

# replace the maximum number of inliers if the current number is greater
cuda.atomic.max(max_inliers_number, 0, inliers_number_local)

# if this thread has the maximum number of inliers
# write this thread's plane to the shared memory
best_plane = cuda.shared.array(shape=4, dtype=nb.float32)
cuda.syncthreads()
if (
inliers_number_local == max_inliers_number[block_id]
and cuda.atomic.cas(mask_mutex, block_id, 0, 1) == 0
inliers_number_local == max_inliers_number[0]
and cuda.atomic.compare_and_swap(mutex, 0, 1) == 0
):
for i in range(4):
best_plane[i] = plane[i]
cuda.syncthreads()

# parallelize mask computation among threads in the block
# parallelize final mask computation among threads in the block
for i in range(
block_start_indices[block_id] + thread_id,
block_start_indices[block_id] + block_sizes[block_id],
Expand Down

0 comments on commit 0f094cb

Please sign in to comment.