From 9d77fafcc5b6d0975af9ee938acf21597b7ab129 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Thu, 9 Jan 2025 14:56:02 -0600 Subject: [PATCH] Change to device_allocate_and_pack to return unique_ptr The unique_ptr owns the allocation ensuring no leaks during exception handling. This also allows async_smart_free to be used to schedule asynchronous deallocation of USM temporaries. --- .../libtensor/include/utils/offset_utils.hpp | 22 +- .../tensor/libtensor/source/accumulators.cpp | 35 ++-- .../accumulators/accumulate_over_axis.hpp | 48 ++--- .../source/boolean_advanced_indexing.cpp | 141 +++++-------- dpctl/tensor/libtensor/source/clip.cpp | 19 +- .../source/copy_and_cast_usm_to_usm.cpp | 19 +- .../libtensor/source/copy_as_contig.cpp | 84 +++----- .../libtensor/source/copy_for_reshape.cpp | 20 +- .../tensor/libtensor/source/copy_for_roll.cpp | 41 ++-- .../copy_numpy_ndarray_into_usm_ndarray.cpp | 12 +- .../source/elementwise_functions/add.cpp | 8 +- .../elementwise_functions.hpp | 64 ++---- .../elementwise_functions/true_divide.cpp | 23 +-- dpctl/tensor/libtensor/source/full_ctor.cpp | 21 +- .../source/integer_advanced_indexing.cpp | 189 +++++------------- .../libtensor/source/linalg_functions/dot.cpp | 95 ++++----- .../source/reductions/reduction_over_axis.hpp | 95 +++------ dpctl/tensor/libtensor/source/repeat.cpp | 143 +++++-------- .../libtensor/source/sorting/searchsorted.cpp | 20 +- dpctl/tensor/libtensor/source/triul_ctor.cpp | 13 +- dpctl/tensor/libtensor/source/where.cpp | 25 +-- 21 files changed, 392 insertions(+), 745 deletions(-) diff --git a/dpctl/tensor/libtensor/include/utils/offset_utils.hpp b/dpctl/tensor/libtensor/include/utils/offset_utils.hpp index bd2b67afcb..1438def12a 100644 --- a/dpctl/tensor/libtensor/include/utils/offset_utils.hpp +++ b/dpctl/tensor/libtensor/include/utils/offset_utils.hpp @@ -28,10 +28,13 @@ #include #include -#include +#include // for std::make_shared, std::unique_ptr #include +#include // for std::move, std::forward #include +#include + #include "kernels/dpctl_tensor_types.hpp" #include "utils/strided_iters.hpp" #include "utils/sycl_alloc_utils.hpp" @@ -84,7 +87,9 @@ std::vector concat(std::vector lhs, Vs &&...vs) } // namespace detail template -std::tuple +std::tuple, + std::size_t, + sycl::event> device_allocate_and_pack(sycl::queue &q, std::vector &host_task_events, Vs &&...vs) @@ -105,25 +110,24 @@ device_allocate_and_pack(sycl::queue &q, std::make_shared(std::move(packed_shape_strides)); auto sz = packed_shape_strides_owner->size(); - indT *shape_strides = sycl::malloc_device(sz, q); - - if (shape_strides == nullptr) { - return std::make_tuple(shape_strides, 0, sycl::event()); - } + auto shape_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(sz, q); + indT *shape_strides = shape_strides_owner.get(); sycl::event copy_ev = q.copy(packed_shape_strides_owner->data(), shape_strides, sz); sycl::event cleanup_host_task_ev = q.submit([&](sycl::handler &cgh) { cgh.depends_on(copy_ev); - cgh.host_task([packed_shape_strides_owner] { + cgh.host_task([packed_shape_strides_owner = + std::move(packed_shape_strides_owner)] { // increment shared pointer ref-count to keep it alive // till copy operation completes; }); }); host_task_events.push_back(cleanup_host_task_ev); - return std::make_tuple(shape_strides, sz, copy_ev); + return std::make_tuple(std::move(shape_strides_owner), sz, copy_ev); } struct NoOpIndexer diff --git a/dpctl/tensor/libtensor/source/accumulators.cpp 
b/dpctl/tensor/libtensor/source/accumulators.cpp
index 5430fcc58c..9ab2b3c659 100644
--- a/dpctl/tensor/libtensor/source/accumulators.cpp
+++ b/dpctl/tensor/libtensor/source/accumulators.cpp
@@ -196,14 +196,11 @@ std::size_t py_mask_positions(const dpctl::tensor::usm_ndarray &mask,
               : mask_positions_strided_i64_dispatch_vector[mask_typeid];
 
     using dpctl::tensor::offset_utils::device_allocate_and_pack;
-    const auto &ptr_size_event_tuple = device_allocate_and_pack(
+    auto ptr_size_event_tuple = device_allocate_and_pack(
         exec_q, host_task_events, compact_shape, compact_strides);
-    py::ssize_t *shape_strides = std::get<0>(ptr_size_event_tuple);
-    if (shape_strides == nullptr) {
-        sycl::event::wait(host_task_events);
-        throw std::runtime_error("Unexpected error");
-    }
+    auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple));
     sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple);
+    const py::ssize_t *shape_strides = shape_strides_owner.get();
 
     if (2 * static_cast(nd) != std::get<1>(ptr_size_event_tuple)) {
         {
@@ -212,8 +209,8 @@ std::size_t py_mask_positions(const dpctl::tensor::usm_ndarray &mask,
             copy_shape_ev.wait();
             sycl::event::wait(host_task_events);
 
-            using dpctl::tensor::alloc_utils::sycl_free_noexcept;
-            sycl_free_noexcept(shape_strides, exec_q);
+            // ensure deleter of smart pointer is invoked with GIL released
+            shape_strides_owner.reset();
         }
         throw std::runtime_error("Unexpected error");
     }
@@ -233,8 +230,8 @@ std::size_t py_mask_positions(const dpctl::tensor::usm_ndarray &mask,
                        cumsum_data, host_task_events, dependent_events);
 
         sycl::event::wait(host_task_events);
-        using dpctl::tensor::alloc_utils::sycl_free_noexcept;
-        sycl_free_noexcept(shape_strides, exec_q);
+        // ensure deleter of smart pointer is invoked with GIL released
+        shape_strides_owner.reset();
     }
 
     return total_set;
@@ -356,14 +353,11 @@ std::size_t py_cumsum_1d(const dpctl::tensor::usm_ndarray &src,
     }
 
     using dpctl::tensor::offset_utils::device_allocate_and_pack;
-    const auto &ptr_size_event_tuple = device_allocate_and_pack(
+    auto ptr_size_event_tuple = device_allocate_and_pack(
         exec_q, host_task_events, compact_shape, compact_strides);
-    py::ssize_t *shape_strides = std::get<0>(ptr_size_event_tuple);
-    if (shape_strides == nullptr) {
-        sycl::event::wait(host_task_events);
-        throw std::runtime_error("Unexpected error");
-    }
+    auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple));
     sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple);
+    const py::ssize_t *shape_strides = shape_strides_owner.get();
 
     if (2 * static_cast(nd) != std::get<1>(ptr_size_event_tuple)) {
         {
@@ -371,9 +365,10 @@ std::size_t py_cumsum_1d(const dpctl::tensor::usm_ndarray &src,
 
             copy_shape_ev.wait();
             sycl::event::wait(host_task_events);
+
+            // ensure USM deleter is called with GIL released
+            shape_strides_owner.reset();
         }
-        using dpctl::tensor::alloc_utils::sycl_free_noexcept;
-        sycl_free_noexcept(shape_strides, exec_q);
 
         throw std::runtime_error("Unexpected error");
     }
@@ -391,8 +386,8 @@ std::size_t py_cumsum_1d(const dpctl::tensor::usm_ndarray &src,
         py::gil_scoped_release release;
 
         sycl::event::wait(host_task_events);
-        using dpctl::tensor::alloc_utils::sycl_free_noexcept;
-        sycl_free_noexcept(shape_strides, exec_q);
+        // ensure USM deleter is called with GIL released
+        shape_strides_owner.reset();
     }
 
     return total;
diff --git a/dpctl/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp b/dpctl/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp
index bc8306a5a4..2352b6ab30 100644
---
a/dpctl/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp +++ b/dpctl/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp @@ -200,18 +200,18 @@ py_accumulate_over_axis(const dpctl::tensor::usm_ndarray &src, } using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &ptr_size_event_tuple = device_allocate_and_pack( + auto ptr_size_event_tuple = device_allocate_and_pack( exec_q, host_task_events, simplified_iter_shape, simplified_iter_src_strides, simplified_iter_dst_strides, acc_shape, acc_src_strides, acc_dst_strides); - py::ssize_t *packed_shapes_and_strides = std::get<0>(ptr_size_event_tuple); - if (packed_shapes_and_strides == nullptr) { - throw std::runtime_error("Unexpected error"); - } + auto packed_shapes_and_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple)); const auto ©_shapes_strides_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *packed_shapes_and_strides = + packed_shapes_and_strides_owner.get(); - py::ssize_t *iter_shape_and_strides = packed_shapes_and_strides; - py::ssize_t *acc_shapes_and_strides = + const py::ssize_t *iter_shape_and_strides = packed_shapes_and_strides; + const py::ssize_t *acc_shapes_and_strides = packed_shapes_and_strides + 3 * simplified_iter_shape.size(); std::vector all_deps; @@ -224,14 +224,8 @@ py_accumulate_over_axis(const dpctl::tensor::usm_ndarray &src, iter_shape_and_strides, iter_src_offset, iter_dst_offset, acc_nd, acc_shapes_and_strides, dst_data, host_task_events, all_deps); - sycl::event temp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(acc_ev); - const auto &ctx = exec_q.get_context(); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, packed_shapes_and_strides] { - sycl_free_noexcept(packed_shapes_and_strides, ctx); - }); - }); + sycl::event temp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {acc_ev}, packed_shapes_and_strides_owner); host_task_events.push_back(temp_cleanup_ev); return std::make_pair( @@ -384,18 +378,18 @@ std::pair py_accumulate_final_axis_include_initial( } using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &ptr_size_event_tuple = device_allocate_and_pack( + auto ptr_size_event_tuple = device_allocate_and_pack( exec_q, host_task_events, simplified_iter_shape, simplified_iter_src_strides, simplified_iter_dst_strides, acc_shape, acc_src_strides, acc_dst_strides); - py::ssize_t *packed_shapes_and_strides = std::get<0>(ptr_size_event_tuple); - if (packed_shapes_and_strides == nullptr) { - throw std::runtime_error("Unexpected error"); - } + auto packed_shapes_and_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple)); const auto ©_shapes_strides_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *packed_shapes_and_strides = + packed_shapes_and_strides_owner.get(); - py::ssize_t *iter_shape_and_strides = packed_shapes_and_strides; - py::ssize_t *acc_shapes_and_strides = + const py::ssize_t *iter_shape_and_strides = packed_shapes_and_strides; + const py::ssize_t *acc_shapes_and_strides = packed_shapes_and_strides + 3 * simplified_iter_shape.size(); std::vector all_deps; @@ -408,14 +402,8 @@ std::pair py_accumulate_final_axis_include_initial( iter_shape_and_strides, iter_src_offset, iter_dst_offset, acc_nd, acc_shapes_and_strides, dst_data, host_task_events, all_deps); - sycl::event temp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(acc_ev); - const auto &ctx = exec_q.get_context(); - using 
dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, packed_shapes_and_strides] { - sycl_free_noexcept(packed_shapes_and_strides, ctx); - }); - }); + sycl::event temp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {acc_ev}, packed_shapes_and_strides_owner); host_task_events.push_back(temp_cleanup_ev); return std::make_pair( diff --git a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp index 843474a265..b4ec15c96f 100644 --- a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp +++ b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp @@ -35,8 +35,6 @@ #include #include -#include "boolean_advanced_indexing.hpp" -#include "kernels/boolean_advanced_indexing.hpp" #include "simplify_iteration_space.hpp" #include "utils/memory_overlap.hpp" #include "utils/offset_utils.hpp" @@ -44,6 +42,9 @@ #include "utils/sycl_alloc_utils.hpp" #include "utils/type_dispatch.hpp" +#include "boolean_advanced_indexing.hpp" +#include "kernels/boolean_advanced_indexing.hpp" + namespace dpctl { namespace tensor @@ -278,16 +279,14 @@ py_extract(const dpctl::tensor::usm_ndarray &src, [src_typeid]; using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &ptr_size_event_tuple1 = - device_allocate_and_pack( - exec_q, host_task_events, src_shape_vec, src_strides_vec); - py::ssize_t *packed_src_shape_strides = - std::get<0>(ptr_size_event_tuple1); - if (packed_src_shape_strides == nullptr) { - throw std::runtime_error("Unable to allocated device memory"); - } + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, src_shape_vec, src_strides_vec); + auto packed_src_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); sycl::event copy_src_shape_strides_ev = std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_src_shape_strides = + packed_src_shape_strides_owner.get(); std::vector all_deps; all_deps.reserve(depends.size() + 1); @@ -301,14 +300,8 @@ py_extract(const dpctl::tensor::usm_ndarray &src, dst_shape_vec[0], dst_strides_vec[0], all_deps); sycl::event cleanup_tmp_allocations_ev = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(extract_ev); - const auto &ctx = exec_q.get_context(); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, packed_src_shape_strides] { - sycl_free_noexcept(packed_src_shape_strides, ctx); - }); - }); + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {extract_ev}, packed_src_shape_strides_owner); host_task_events.push_back(cleanup_tmp_allocations_ev); } } @@ -370,19 +363,19 @@ py_extract(const dpctl::tensor::usm_ndarray &src, assert(masked_dst_strides.size() == 1); using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &ptr_size_event_tuple1 = - device_allocate_and_pack( - exec_q, host_task_events, simplified_ortho_shape, - simplified_ortho_src_strides, simplified_ortho_dst_strides, - masked_src_shape, masked_src_strides); - py::ssize_t *packed_shapes_strides = std::get<0>(ptr_size_event_tuple1); - if (packed_shapes_strides == nullptr) { - throw std::runtime_error("Unable to allocate device memory"); - } + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, simplified_ortho_shape, + simplified_ortho_src_strides, simplified_ortho_dst_strides, + masked_src_shape, masked_src_strides); + auto packed_shapes_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); sycl::event copy_shapes_strides_ev = 
std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_shapes_strides = + packed_shapes_strides_owner.get(); - py::ssize_t *packed_ortho_src_dst_shape_strides = packed_shapes_strides; - py::ssize_t *packed_masked_src_shape_strides = + const py::ssize_t *packed_ortho_src_dst_shape_strides = + packed_shapes_strides; + const py::ssize_t *packed_masked_src_shape_strides = packed_shapes_strides + (3 * ortho_nd); std::vector all_deps; @@ -405,14 +398,8 @@ py_extract(const dpctl::tensor::usm_ndarray &src, masked_dst_shape[0], masked_dst_strides[0], all_deps); sycl::event cleanup_tmp_allocations_ev = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(extract_ev); - const auto &ctx = exec_q.get_context(); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, packed_shapes_strides] { - sycl_free_noexcept(packed_shapes_strides, ctx); - }); - }); + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {extract_ev}, packed_shapes_strides_owner); host_task_events.push_back(cleanup_tmp_allocations_ev); } @@ -601,16 +588,14 @@ py_place(const dpctl::tensor::usm_ndarray &dst, assert(rhs_strides_vec.size() == 1); using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &ptr_size_event_tuple1 = - device_allocate_and_pack( - exec_q, host_task_events, dst_shape_vec, dst_strides_vec); - py::ssize_t *packed_dst_shape_strides = - std::get<0>(ptr_size_event_tuple1); - if (packed_dst_shape_strides == nullptr) { - throw std::runtime_error("Unable to allocate device memory"); - } + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, dst_shape_vec, dst_strides_vec); + auto packed_dst_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); sycl::event copy_dst_shape_strides_ev = std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_dst_shape_strides = + packed_dst_shape_strides_owner.get(); std::vector all_deps; all_deps.reserve(depends.size() + 1); @@ -624,14 +609,8 @@ py_place(const dpctl::tensor::usm_ndarray &dst, rhs_strides_vec[0], all_deps); sycl::event cleanup_tmp_allocations_ev = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(place_ev); - const auto &ctx = exec_q.get_context(); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, packed_dst_shape_strides] { - sycl_free_noexcept(packed_dst_shape_strides, ctx); - }); - }); + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {place_ev}, packed_dst_shape_strides_owner); host_task_events.push_back(cleanup_tmp_allocations_ev); } else { @@ -691,19 +670,19 @@ py_place(const dpctl::tensor::usm_ndarray &dst, assert(masked_rhs_strides.size() == 1); using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &ptr_size_event_tuple1 = - device_allocate_and_pack( - exec_q, host_task_events, simplified_ortho_shape, - simplified_ortho_dst_strides, simplified_ortho_rhs_strides, - masked_dst_shape, masked_dst_strides); - py::ssize_t *packed_shapes_strides = std::get<0>(ptr_size_event_tuple1); - if (packed_shapes_strides == nullptr) { - throw std::runtime_error("Unable to allocate device memory"); - } + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, simplified_ortho_shape, + simplified_ortho_dst_strides, simplified_ortho_rhs_strides, + masked_dst_shape, masked_dst_strides); + auto packed_shapes_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_shapes_strides = + 
packed_shapes_strides_owner.get(); - py::ssize_t *packed_ortho_dst_rhs_shape_strides = packed_shapes_strides; - py::ssize_t *packed_masked_dst_shape_strides = + const py::ssize_t *packed_ortho_dst_rhs_shape_strides = + packed_shapes_strides; + const py::ssize_t *packed_masked_dst_shape_strides = packed_shapes_strides + (3 * ortho_nd); std::vector all_deps; @@ -724,14 +703,8 @@ py_place(const dpctl::tensor::usm_ndarray &dst, masked_rhs_shape[0], masked_rhs_strides[0], all_deps); sycl::event cleanup_tmp_allocations_ev = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(place_ev); - const auto &ctx = exec_q.get_context(); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, packed_shapes_strides] { - sycl_free_noexcept(packed_shapes_strides, ctx); - }); - }); + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {place_ev}, packed_shapes_strides_owner); host_task_events.push_back(cleanup_tmp_allocations_ev); } @@ -828,15 +801,12 @@ py_nonzero(const dpctl::tensor::usm_ndarray host_task_events.reserve(2); using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &mask_shape_copying_tuple = - device_allocate_and_pack(exec_q, host_task_events, - mask_shape); - py::ssize_t *src_shape_device_ptr = std::get<0>(mask_shape_copying_tuple); - if (src_shape_device_ptr == nullptr) { - sycl::event::wait(host_task_events); - throw std::runtime_error("Device allocation failed"); - } + auto mask_shape_copying_tuple = device_allocate_and_pack( + exec_q, host_task_events, mask_shape); + auto src_shape_device_owner = + std::move(std::get<0>(mask_shape_copying_tuple)); sycl::event copy_ev = std::get<2>(mask_shape_copying_tuple); + const py::ssize_t *src_shape_device_ptr = src_shape_device_owner.get(); std::vector all_deps; all_deps.reserve(depends.size() + 1); @@ -860,14 +830,9 @@ py_nonzero(const dpctl::tensor::usm_ndarray fn(exec_q, cumsum_sz, nz_elems, ndim, cumsum.get_data(), indexes.get_data(), src_shape_device_ptr, all_deps); - sycl::event temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(non_zero_indexes_ev); - const auto &ctx = exec_q.get_context(); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, src_shape_device_ptr] { - sycl_free_noexcept(src_shape_device_ptr, ctx); - }); - }); + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {non_zero_indexes_ev}, src_shape_device_owner); host_task_events.push_back(temporaries_cleanup_ev); sycl::event py_obj_management_host_task_ev = dpctl::utils::keep_args_alive( diff --git a/dpctl/tensor/libtensor/source/clip.cpp b/dpctl/tensor/libtensor/source/clip.cpp index e381ea7b96..1149e26bd1 100644 --- a/dpctl/tensor/libtensor/source/clip.cpp +++ b/dpctl/tensor/libtensor/source/clip.cpp @@ -228,11 +228,10 @@ py_clip(const dpctl::tensor::usm_ndarray &src, // common shape and strides simplified_shape, simplified_src_strides, simplified_min_strides, simplified_max_strides, simplified_dst_strides); - py::ssize_t *packed_shape_strides = std::get<0>(ptr_size_event_tuple); - if (!packed_shape_strides) { - throw std::runtime_error("USM-host memory allocation failure"); - } + auto packed_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple)); sycl::event copy_shape_strides_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *packed_shape_strides = packed_shape_strides_owner.get(); std::vector all_deps; all_deps.reserve(depends.size() + 1); @@ -246,15 +245,9 @@ py_clip(const dpctl::tensor::usm_ndarray 
&src, min_offset, max_offset, dst_offset, all_deps); // free packed temporaries - sycl::event temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(clip_ev); - const auto &ctx = exec_q.get_context(); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([packed_shape_strides, ctx]() { - sycl_free_noexcept(packed_shape_strides, ctx); - }); - }); - + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {clip_ev}, packed_shape_strides_owner); host_task_events.push_back(temporaries_cleanup_ev); sycl::event arg_cleanup_ev = diff --git a/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp b/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp index 86bb0ac064..4c6946505b 100644 --- a/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp +++ b/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp @@ -251,28 +251,21 @@ copy_usm_ndarray_into_usm_ndarray(const dpctl::tensor::usm_ndarray &src, host_task_events.reserve(2); using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &ptr_size_event_tuple = device_allocate_and_pack( + auto ptr_size_event_tuple = device_allocate_and_pack( exec_q, host_task_events, simplified_shape, simplified_src_strides, simplified_dst_strides); - py::ssize_t *shape_strides = std::get<0>(ptr_size_event_tuple); - if (shape_strides == nullptr) { - throw std::runtime_error("Unable to allocate device memory"); - } + auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple)); const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *shape_strides = shape_strides_owner.get(); const sycl::event ©_and_cast_generic_ev = copy_and_cast_fn( exec_q, src_nelems, nd, shape_strides, src_data, src_offset, dst_data, dst_offset, depends, {copy_shape_ev}); // async free of shape_strides temporary - const auto &ctx = exec_q.get_context(); - const auto &temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(copy_and_cast_generic_ev); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task( - [ctx, shape_strides]() { sycl_free_noexcept(shape_strides, ctx); }); - }); - + const auto &temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {copy_and_cast_generic_ev}, shape_strides_owner); host_task_events.push_back(temporaries_cleanup_ev); return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), diff --git a/dpctl/tensor/libtensor/source/copy_as_contig.cpp b/dpctl/tensor/libtensor/source/copy_as_contig.cpp index 3eba902e14..04ddef3495 100644 --- a/dpctl/tensor/libtensor/source/copy_as_contig.cpp +++ b/dpctl/tensor/libtensor/source/copy_as_contig.cpp @@ -222,15 +222,12 @@ py_as_c_contig(const dpctl::tensor::usm_ndarray &src, } std::vector host_task_events{}; - const auto &ptr_size_event_tuple = + auto ptr_size_event_tuple = dpctl::tensor::offset_utils::device_allocate_and_pack( exec_q, host_task_events, simplified_shape, simplified_src_strides); - - py::ssize_t *shape_stride = std::get<0>(ptr_size_event_tuple); - if (shape_stride == nullptr) { - throw std::runtime_error("Unable to allocate device memory"); - } + auto shape_stride_owner = std::move(std::get<0>(ptr_size_event_tuple)); const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *shape_stride = shape_stride_owner.get(); auto ascontig_fn = as_c_contig_array_dispatch_vector[src_type_id]; @@ -244,14 +241,9 @@ py_as_c_contig(const dpctl::tensor::usm_ndarray 
&src, ascontig_fn(exec_q, nelems, nd, shape_stride, src.get_data(), dst.get_data(), all_depends); - const auto &ctx = exec_q.get_context(); - const auto &temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(ascontig_ev); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task( - [ctx, shape_stride]() { sycl_free_noexcept(shape_stride, ctx); }); - }); - + const auto &temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {ascontig_ev}, + shape_stride_owner); host_task_events.push_back(temporaries_cleanup_ev); return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), @@ -358,15 +350,12 @@ py_as_f_contig(const dpctl::tensor::usm_ndarray &src, } std::vector host_task_events{}; - const auto &ptr_size_event_tuple = + auto ptr_size_event_tuple = dpctl::tensor::offset_utils::device_allocate_and_pack( exec_q, host_task_events, simplified_shape, simplified_src_strides); - - py::ssize_t *shape_stride = std::get<0>(ptr_size_event_tuple); - if (shape_stride == nullptr) { - throw std::runtime_error("Unable to allocate device memory"); - } + auto shape_stride_owner = std::move(std::get<0>(ptr_size_event_tuple)); const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *shape_stride = shape_stride_owner.get(); auto ascontig_fn = as_c_contig_array_dispatch_vector[src_type_id]; @@ -380,14 +369,9 @@ py_as_f_contig(const dpctl::tensor::usm_ndarray &src, ascontig_fn(exec_q, nelems, nd, shape_stride, src.get_data(), dst.get_data(), all_depends); - const auto &ctx = exec_q.get_context(); - const auto &temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(ascontig_ev); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task( - [ctx, shape_stride]() { sycl_free_noexcept(shape_stride, ctx); }); - }); - + const auto &temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {ascontig_ev}, + shape_stride_owner); host_task_events.push_back(temporaries_cleanup_ev); return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), @@ -551,13 +535,12 @@ py_as_c_contig_f2c(const dpctl::tensor::usm_ndarray &src, host_task_events.reserve(2); using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &ptr_size_event_tuple = device_allocate_and_pack( + auto ptr_size_event_tuple = device_allocate_and_pack( exec_q, host_task_events, simplified_shape, simplified_src_strides); - py::ssize_t *packed_shape_strides = std::get<0>(ptr_size_event_tuple); - if (nullptr == packed_shape_strides) { - throw std::runtime_error("Unable to allocate device memory"); - } + auto packed_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple)); const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *packed_shape_strides = packed_shape_strides_owner.get(); std::vector all_depends; all_depends.reserve(depends.size() + 1); @@ -571,15 +554,9 @@ py_as_c_contig_f2c(const dpctl::tensor::usm_ndarray &src, dst_strides_vec[src_nd - 2], all_depends); // async free of shape_strides temporary - const auto &ctx = exec_q.get_context(); - const auto &temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(ascontig_ev); - - cgh.host_task([ctx, packed_shape_strides]() { - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - sycl_free_noexcept(packed_shape_strides, ctx); - }); - }); + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, 
{ascontig_ev}, packed_shape_strides_owner); host_task_events.push_back(temporaries_cleanup_ev); return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), @@ -737,13 +714,12 @@ py_as_f_contig_c2f(const dpctl::tensor::usm_ndarray &src, host_task_events.reserve(2); using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &ptr_size_event_tuple = device_allocate_and_pack( + auto ptr_size_event_tuple = device_allocate_and_pack( exec_q, host_task_events, simplified_shape, simplified_src_strides); - py::ssize_t *packed_shape_strides = std::get<0>(ptr_size_event_tuple); - if (nullptr == packed_shape_strides) { - throw std::runtime_error("Unable to allocate device memory"); - } + auto packed_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple)); const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *packed_shape_strides = packed_shape_strides_owner.get(); std::vector all_depends; all_depends.reserve(depends.size() + 1); @@ -756,16 +732,10 @@ py_as_f_contig_c2f(const dpctl::tensor::usm_ndarray &src, n, src.get_data(), src_strides_vec.front(), dst.get_data(), dst_strides_vec[1], all_depends); - // async free of shape_strides temporary - const auto &ctx = exec_q.get_context(); - const auto &temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(ascontig_ev); - - cgh.host_task([ctx, packed_shape_strides]() { - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - sycl_free_noexcept(packed_shape_strides, ctx); - }); - }); + // async free of shape_strides + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {ascontig_ev}, packed_shape_strides_owner); host_task_events.push_back(temporaries_cleanup_ev); return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), diff --git a/dpctl/tensor/libtensor/source/copy_for_reshape.cpp b/dpctl/tensor/libtensor/source/copy_for_reshape.cpp index 8fee94dcb0..eb404a4543 100644 --- a/dpctl/tensor/libtensor/source/copy_for_reshape.cpp +++ b/dpctl/tensor/libtensor/source/copy_for_reshape.cpp @@ -133,14 +133,12 @@ copy_usm_ndarray_for_reshape(const dpctl::tensor::usm_ndarray &src, // shape_strides = [src_shape, src_strides, dst_shape, dst_strides] using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &ptr_size_event_tuple = device_allocate_and_pack( + auto ptr_size_event_tuple = device_allocate_and_pack( exec_q, host_task_events, src_shape, src_strides, dst_shape, dst_strides); - py::ssize_t *shape_strides = std::get<0>(ptr_size_event_tuple); - if (shape_strides == nullptr) { - throw std::runtime_error("Unable to allocate device memory"); - } - sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple); + auto copy_shape_ev = std::get<2>(ptr_size_event_tuple); + auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple)); + const py::ssize_t *shape_strides = shape_strides_owner.get(); const char *src_data = src.get_data(); char *dst_data = dst.get_data(); @@ -153,13 +151,9 @@ copy_usm_ndarray_for_reshape(const dpctl::tensor::usm_ndarray &src, fn(exec_q, src_nelems, src_nd, dst_nd, shape_strides, src_data, dst_data, all_deps); - auto temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(copy_for_reshape_event); - const auto &ctx = exec_q.get_context(); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task( - [shape_strides, ctx]() { sycl_free_noexcept(shape_strides, ctx); }); - }); + sycl::event temporaries_cleanup_ev = + 
dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {copy_for_reshape_event}, shape_strides_owner); host_task_events.push_back(temporaries_cleanup_ev); diff --git a/dpctl/tensor/libtensor/source/copy_for_roll.cpp b/dpctl/tensor/libtensor/source/copy_for_roll.cpp index 4d72df907b..ef63060b4f 100644 --- a/dpctl/tensor/libtensor/source/copy_for_roll.cpp +++ b/dpctl/tensor/libtensor/source/copy_for_roll.cpp @@ -218,15 +218,12 @@ copy_usm_ndarray_for_roll_1d(const dpctl::tensor::usm_ndarray &src, // shape_strides = [src_shape, src_strides, dst_strides] using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &ptr_size_event_tuple = device_allocate_and_pack( + auto ptr_size_event_tuple = device_allocate_and_pack( exec_q, host_task_events, simplified_shape, simplified_src_strides, simplified_dst_strides); - - py::ssize_t *shape_strides = std::get<0>(ptr_size_event_tuple); - if (shape_strides == nullptr) { - throw std::runtime_error("Unable to allocate device memory"); - } + auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple)); sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *shape_strides = shape_strides_owner.get(); std::vector all_deps(depends.size() + 1); all_deps.push_back(copy_shape_ev); @@ -236,14 +233,9 @@ copy_usm_ndarray_for_roll_1d(const dpctl::tensor::usm_ndarray &src, fn(exec_q, offset, src_nelems, src_nd, shape_strides, src_data, src_offset, dst_data, dst_offset, all_deps); - auto temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(copy_for_roll_event); - const auto &ctx = exec_q.get_context(); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task( - [shape_strides, ctx]() { sycl_free_noexcept(shape_strides, ctx); }); - }); - + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {copy_for_roll_event}, shape_strides_owner); host_task_events.push_back(temporaries_cleanup_ev); return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), @@ -349,15 +341,13 @@ copy_usm_ndarray_for_roll_nd(const dpctl::tensor::usm_ndarray &src, // shape_strides = [src_shape, src_strides, dst_strides] using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &ptr_size_event_tuple = device_allocate_and_pack( + auto ptr_size_event_tuple = device_allocate_and_pack( exec_q, host_task_events, common_shape, src_strides, dst_strides, normalized_shifts); - - py::ssize_t *shape_strides_shifts = std::get<0>(ptr_size_event_tuple); - if (shape_strides_shifts == nullptr) { - throw std::runtime_error("Unable to allocate device memory"); - } + auto shape_strides_shifts_owner = + std::move(std::get<0>(ptr_size_event_tuple)); sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *shape_strides_shifts = shape_strides_shifts_owner.get(); std::vector all_deps(depends.size() + 1); all_deps.push_back(copy_shape_ev); @@ -367,15 +357,8 @@ copy_usm_ndarray_for_roll_nd(const dpctl::tensor::usm_ndarray &src, fn(exec_q, src_nelems, src_nd, shape_strides_shifts, src_data, src_offset, dst_data, dst_offset, all_deps); - auto temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(copy_for_roll_event); - const auto &ctx = exec_q.get_context(); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([shape_strides_shifts, ctx]() { - sycl_free_noexcept(shape_strides_shifts, ctx); - }); - }); - + auto temporaries_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + 
         exec_q, {copy_for_roll_event}, shape_strides_shifts_owner);
     host_task_events.push_back(temporaries_cleanup_ev);
 
     return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events),
diff --git a/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp b/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp
index 23a392397f..9b7894eb4c 100644
--- a/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp
+++ b/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp
@@ -303,14 +303,12 @@ void copy_numpy_ndarray_into_usm_ndarray(
 
     // Copy shape strides into device memory
     using dpctl::tensor::offset_utils::device_allocate_and_pack;
-    const auto &ptr_size_event_tuple = device_allocate_and_pack(
+    auto ptr_size_event_tuple = device_allocate_and_pack(
         exec_q, host_task_events, simplified_shape, simplified_src_strides,
         simplified_dst_strides);
-    py::ssize_t *shape_strides = std::get<0>(ptr_size_event_tuple);
-    if (shape_strides == nullptr) {
-        throw std::runtime_error("Unable to allocate device memory");
-    }
+    auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple));
     const sycl::event &copy_shape_ev = std::get<2>(ptr_size_event_tuple);
+    const py::ssize_t *shape_strides = shape_strides_owner.get();
 
     {
         // release GIL for the blocking call
@@ -326,8 +324,8 @@ void copy_numpy_ndarray_into_usm_ndarray(
             npy_src_min_nelem_offset, npy_src_max_nelem_offset, dst_data,
             dst_offset, depends, {copy_shape_ev});
 
-        using dpctl::tensor::alloc_utils::sycl_free_noexcept;
-        sycl_free_noexcept(shape_strides, exec_q);
+        // invoke USM deleter in smart pointer while GIL is released
+        shape_strides_owner.reset();
     }
 
     return;
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/add.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/add.cpp
index 9133b2bc26..31a0b7f053 100644
--- a/dpctl/tensor/libtensor/source/elementwise_functions/add.cpp
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/add.cpp
@@ -30,13 +30,13 @@
 #include
 #include
 
-#include "add.hpp"
-#include "elementwise_functions.hpp"
-#include "utils/type_dispatch.hpp"
-
 #include "kernels/elementwise_functions/add.hpp"
 #include "kernels/elementwise_functions/common.hpp"
 #include "kernels/elementwise_functions/common_inplace.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "add.hpp"
+#include "elementwise_functions.hpp"
 
 namespace py = pybind11;
 
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp
index 7339597d73..c046321006 100644
--- a/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp
@@ -223,28 +223,21 @@ py_unary_ufunc(const dpctl::tensor::usm_ndarray &src,
     std::vector host_tasks{};
     host_tasks.reserve(2);
 
-    const auto &ptr_size_event_triple_ = device_allocate_and_pack(
+    auto ptr_size_event_triple_ = device_allocate_and_pack(
         q, host_tasks, simplified_shape, simplified_src_strides,
         simplified_dst_strides);
-    py::ssize_t *shape_strides = std::get<0>(ptr_size_event_triple_);
-    const sycl::event &copy_shape_ev = std::get<2>(ptr_size_event_triple_);
-
-    if (shape_strides == nullptr) {
-        throw std::runtime_error("Device memory allocation failed");
-    }
+    auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_triple_));
+    const auto &copy_shape_ev = std::get<2>(ptr_size_event_triple_);
+    const py::ssize_t *shape_strides =
shape_strides_owner.get(); sycl::event strided_fn_ev = strided_fn(q, src_nelems, nd, shape_strides, src_data, src_offset, dst_data, dst_offset, depends, {copy_shape_ev}); // async free of shape_strides temporary - auto ctx = q.get_context(); - sycl::event tmp_cleanup_ev = q.submit([&](sycl::handler &cgh) { - cgh.depends_on(strided_fn_ev); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task( - [ctx, shape_strides]() { sycl_free_noexcept(shape_strides, ctx); }); - }); + sycl::event tmp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + q, {strided_fn_ev}, shape_strides_owner); + host_tasks.push_back(tmp_cleanup_ev); return std::make_pair( @@ -548,31 +541,21 @@ std::pair py_binary_ufunc( } using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &ptr_sz_event_triple_ = device_allocate_and_pack( + auto ptr_sz_event_triple_ = device_allocate_and_pack( exec_q, host_tasks, simplified_shape, simplified_src1_strides, simplified_src2_strides, simplified_dst_strides); + auto shape_strides_owner = std::move(std::get<0>(ptr_sz_event_triple_)); + auto ©_shape_ev = std::get<2>(ptr_sz_event_triple_); - py::ssize_t *shape_strides = std::get<0>(ptr_sz_event_triple_); - const sycl::event ©_shape_ev = std::get<2>(ptr_sz_event_triple_); - - if (shape_strides == nullptr) { - throw std::runtime_error("Unabled to allocate device memory"); - } + const py::ssize_t *shape_strides = shape_strides_owner.get(); sycl::event strided_fn_ev = strided_fn( exec_q, src_nelems, nd, shape_strides, src1_data, src1_offset, src2_data, src2_offset, dst_data, dst_offset, depends, {copy_shape_ev}); // async free of shape_strides temporary - auto ctx = exec_q.get_context(); - - sycl::event tmp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(strided_fn_ev); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task( - [ctx, shape_strides]() { sycl_free_noexcept(shape_strides, ctx); }); - }); - + sycl::event tmp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {strided_fn_ev}, shape_strides_owner); host_tasks.push_back(tmp_cleanup_ev); return std::make_pair( @@ -802,30 +785,21 @@ py_binary_inplace_ufunc(const dpctl::tensor::usm_ndarray &lhs, } using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &ptr_sz_event_triple_ = device_allocate_and_pack( + auto ptr_sz_event_triple_ = device_allocate_and_pack( exec_q, host_tasks, simplified_shape, simplified_rhs_strides, simplified_lhs_strides); + auto shape_strides_owner = std::move(std::get<0>(ptr_sz_event_triple_)); + auto copy_shape_ev = std::get<2>(ptr_sz_event_triple_); - py::ssize_t *shape_strides = std::get<0>(ptr_sz_event_triple_); - const sycl::event ©_shape_ev = std::get<2>(ptr_sz_event_triple_); - - if (shape_strides == nullptr) { - throw std::runtime_error("Unabled to allocate device memory"); - } + const py::ssize_t *shape_strides = shape_strides_owner.get(); sycl::event strided_fn_ev = strided_fn(exec_q, rhs_nelems, nd, shape_strides, rhs_data, rhs_offset, lhs_data, lhs_offset, depends, {copy_shape_ev}); // async free of shape_strides temporary - auto ctx = exec_q.get_context(); - - sycl::event tmp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(strided_fn_ev); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task( - [ctx, shape_strides]() { sycl_free_noexcept(shape_strides, ctx); }); - }); + sycl::event tmp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {strided_fn_ev}, shape_strides_owner); 
host_tasks.push_back(tmp_cleanup_ev); diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/true_divide.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/true_divide.cpp index 9b11aa022e..0e3fb38015 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/true_divide.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/true_divide.cpp @@ -28,7 +28,7 @@ #include #include #include -#include +#include // for std::ignore #include #include "dpctl4pybind11.hpp" @@ -379,12 +379,13 @@ py_divide_by_scalar(const dpctl::tensor::usm_ndarray &src, } using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &ptr_sz_event_triple_ = device_allocate_and_pack( + auto ptr_sz_event_triple_ = device_allocate_and_pack( exec_q, host_tasks, simplified_shape, simplified_src_strides, simplified_dst_strides); + auto shape_strides_owner = std::move(std::get<0>(ptr_sz_event_triple_)); + auto ©_metadata_ev = std::get<2>(ptr_sz_event_triple_); - py::ssize_t *shape_strides = std::get<0>(ptr_sz_event_triple_); - const sycl::event ©_metadata_ev = std::get<2>(ptr_sz_event_triple_); + const py::ssize_t *shape_strides = shape_strides_owner.get(); std::vector all_deps; all_deps.reserve(depends.size() + 1); @@ -392,23 +393,13 @@ py_divide_by_scalar(const dpctl::tensor::usm_ndarray &src, std::copy(depends.begin(), depends.end(), all_deps.begin()); all_deps.push_back(copy_metadata_ev); - if (shape_strides == nullptr) { - throw std::runtime_error("Unable to allocate device memory"); - } - sycl::event div_ev = fn(exec_q, src_nelems, nd, shape_strides, src_data, src_offset, scalar_alloc, dst_data, dst_offset, all_deps); // async free of shape_strides temporary - auto ctx = exec_q.get_context(); - - sycl::event tmp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(div_ev); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task( - [ctx, shape_strides]() { sycl_free_noexcept(shape_strides, ctx); }); - }); + sycl::event tmp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {div_ev}, shape_strides_owner); host_tasks.push_back(tmp_cleanup_ev); diff --git a/dpctl/tensor/libtensor/source/full_ctor.cpp b/dpctl/tensor/libtensor/source/full_ctor.cpp index 393455b23f..fe668422a0 100644 --- a/dpctl/tensor/libtensor/source/full_ctor.cpp +++ b/dpctl/tensor/libtensor/source/full_ctor.cpp @@ -261,29 +261,20 @@ usm_ndarray_full(const py::object &py_value, std::vector host_task_events; host_task_events.reserve(2); using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &ptr_size_event_tuple = - device_allocate_and_pack(exec_q, host_task_events, - dst_shape, dst_strides); - py::ssize_t *shape_strides = std::get<0>(ptr_size_event_tuple); - if (shape_strides == nullptr) { - throw std::runtime_error("Unable to allocate device memory"); - } + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, dst_shape, dst_strides); + auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple)); const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + py::ssize_t *shape_strides = shape_strides_owner.get(); const sycl::event &full_strided_ev = fn(exec_q, nd, dst_nelems, shape_strides, py_value, dst_data, {copy_shape_ev}); // free shape_strides - const auto &ctx = exec_q.get_context(); const auto &temporaries_cleanup_ev = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(full_strided_ev); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, shape_strides]() { - 
sycl_free_noexcept(shape_strides, ctx); - }); - }); + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {full_strided_ev}, shape_strides_owner); host_task_events.push_back(temporaries_cleanup_ev); return std::make_pair(keep_args_alive(exec_q, {dst}, host_task_events), diff --git a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp index f1790ec6be..5eb54bbe70 100644 --- a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp +++ b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp @@ -193,9 +193,12 @@ _populate_kernel_params(sycl::queue &exec_q, device_orthog_sh_st_copy_ev, device_ind_offsets_copy_ev, device_ind_sh_st_copy_ev, device_ind_ptrs_copy_ev}); - cgh.host_task([host_ind_offsets_shp, host_ind_sh_st_shp, - host_ind_ptrs_shp, host_orthog_sh_st_shp, - host_along_sh_st_shp]() {}); + cgh.host_task( + [host_ind_offsets_shp = std::move(host_ind_offsets_shp), + host_ind_sh_st_shp = std::move(host_ind_sh_st_shp), + host_ind_ptrs_shp = std::move(host_ind_ptrs_shp), + host_orthog_sh_st_shp = std::move(host_orthog_sh_st_shp), + host_along_sh_st_shp = std::move(host_along_sh_st_shp)] {}); }); host_task_events.push_back(shared_ptr_cleanup_ev); @@ -424,38 +427,24 @@ usm_ndarray_take(const dpctl::tensor::usm_ndarray &src, return std::make_pair(sycl::event{}, sycl::event{}); } - char **packed_ind_ptrs = sycl::malloc_device(k, exec_q); - - if (packed_ind_ptrs == nullptr) { - throw std::runtime_error( - "Unable to allocate packed_ind_ptrs device memory"); - } + auto packed_ind_ptrs_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(k, exec_q); + char **packed_ind_ptrs = packed_ind_ptrs_owner.get(); // rearrange to past where indices shapes are checked // packed_ind_shapes_strides = [ind_shape, // ind[0] strides, // ..., // ind[k] strides] + auto packed_ind_shapes_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + (k + 1) * ind_sh_elems, exec_q); py::ssize_t *packed_ind_shapes_strides = - sycl::malloc_device((k + 1) * ind_sh_elems, exec_q); - - if (packed_ind_shapes_strides == nullptr) { - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - sycl_free_noexcept(packed_ind_ptrs, exec_q); - throw std::runtime_error( - "Unable to allocate packed_ind_shapes_strides device memory"); - } - - py::ssize_t *packed_ind_offsets = - sycl::malloc_device(k, exec_q); + packed_ind_shapes_strides_owner.get(); - if (packed_ind_offsets == nullptr) { - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - sycl_free_noexcept(packed_ind_ptrs, exec_q); - sycl_free_noexcept(packed_ind_shapes_strides, exec_q); - throw std::runtime_error( - "Unable to allocate packed_ind_offsets device memory"); - } + auto packed_ind_offsets_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(k, exec_q); + py::ssize_t *packed_ind_offsets = packed_ind_offsets_owner.get(); int orthog_sh_elems = std::max(src_nd - k, 1); @@ -463,34 +452,20 @@ usm_ndarray_take(const dpctl::tensor::usm_ndarray &src, // src_strides[:axis] + src_strides[axis+k:], // dst_strides[:axis] + // dst_strides[axis+ind.ndim:]] - py::ssize_t *packed_shapes_strides = - sycl::malloc_device(3 * orthog_sh_elems, exec_q); - - if (packed_shapes_strides == nullptr) { - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - sycl_free_noexcept(packed_ind_ptrs, exec_q); - sycl_free_noexcept(packed_ind_shapes_strides, exec_q); - sycl_free_noexcept(packed_ind_offsets, exec_q); - throw std::runtime_error( - "Unable to allocate packed_shapes_strides device memory"); 
- } + auto packed_shapes_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + 3 * orthog_sh_elems, exec_q); + py::ssize_t *packed_shapes_strides = packed_shapes_strides_owner.get(); // packed_axes_shapes_strides = [src_shape[axis:axis+k], // src_strides[axis:axis+k], // dst_shape[axis:axis+ind.ndim], // dst_strides[axis:axis+ind.ndim]] + auto packed_axes_shapes_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + 2 * (k + ind_sh_elems), exec_q); py::ssize_t *packed_axes_shapes_strides = - sycl::malloc_device(2 * (k + ind_sh_elems), exec_q); - - if (packed_axes_shapes_strides == nullptr) { - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - sycl_free_noexcept(packed_ind_ptrs, exec_q); - sycl_free_noexcept(packed_ind_shapes_strides, exec_q); - sycl_free_noexcept(packed_ind_offsets, exec_q); - sycl_free_noexcept(packed_shapes_strides, exec_q); - throw std::runtime_error( - "Unable to allocate packed_axes_shapes_strides device memory"); - } + packed_axes_shapes_strides_owner.get(); auto src_strides = src.get_strides_vector(); auto dst_strides = dst.get_strides_vector(); @@ -515,12 +490,6 @@ usm_ndarray_take(const dpctl::tensor::usm_ndarray &src, if (fn == nullptr) { sycl::event::wait(host_task_events); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - sycl_free_noexcept(packed_ind_ptrs, exec_q); - sycl_free_noexcept(packed_ind_shapes_strides, exec_q); - sycl_free_noexcept(packed_ind_offsets, exec_q); - sycl_free_noexcept(packed_shapes_strides, exec_q); - sycl_free_noexcept(packed_axes_shapes_strides, exec_q); throw std::runtime_error("Indices must be integer type, got " + std::to_string(ind_type_id)); } @@ -532,21 +501,11 @@ usm_ndarray_take(const dpctl::tensor::usm_ndarray &src, src_offset, dst_offset, packed_ind_offsets, all_deps); // free packed temporaries - sycl::event temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(take_generic_ev); - const auto &ctx = exec_q.get_context(); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([packed_shapes_strides, packed_axes_shapes_strides, - packed_ind_shapes_strides, packed_ind_ptrs, - packed_ind_offsets, ctx]() { - sycl_free_noexcept(packed_shapes_strides, ctx); - sycl_free_noexcept(packed_axes_shapes_strides, ctx); - sycl_free_noexcept(packed_ind_shapes_strides, ctx); - sycl_free_noexcept(packed_ind_ptrs, ctx); - sycl_free_noexcept(packed_ind_offsets, ctx); - }); - }); - + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {take_generic_ev}, packed_shapes_strides_owner, + packed_axes_shapes_strides_owner, packed_ind_shapes_strides_owner, + packed_ind_ptrs_owner, packed_ind_offsets_owner); host_task_events.push_back(temporaries_cleanup_ev); sycl::event arg_cleanup_ev = @@ -738,37 +697,23 @@ usm_ndarray_put(const dpctl::tensor::usm_ndarray &dst, return std::make_pair(sycl::event{}, sycl::event{}); } - char **packed_ind_ptrs = sycl::malloc_device(k, exec_q); - - if (packed_ind_ptrs == nullptr) { - throw std::runtime_error( - "Unable to allocate packed_ind_ptrs device memory"); - } + auto packed_ind_ptrs_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(k, exec_q); + char **packed_ind_ptrs = packed_ind_ptrs_owner.get(); // packed_ind_shapes_strides = [ind_shape, // ind[0] strides, // ..., // ind[k] strides] + auto packed_ind_shapes_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + (k + 1) * ind_sh_elems, exec_q); py::ssize_t *packed_ind_shapes_strides = - sycl::malloc_device((k + 1) * 
ind_sh_elems, exec_q); - - if (packed_ind_shapes_strides == nullptr) { - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - sycl_free_noexcept(packed_ind_ptrs, exec_q); - throw std::runtime_error( - "Unable to allocate packed_ind_shapes_strides device memory"); - } - - py::ssize_t *packed_ind_offsets = - sycl::malloc_device(k, exec_q); + packed_ind_shapes_strides_owner.get(); - if (packed_ind_offsets == nullptr) { - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - sycl_free_noexcept(packed_ind_ptrs, exec_q); - sycl_free_noexcept(packed_ind_shapes_strides, exec_q); - throw std::runtime_error( - "Unable to allocate packed_ind_offsets device memory"); - } + auto packed_ind_offsets_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(k, exec_q); + py::ssize_t *packed_ind_offsets = packed_ind_offsets_owner.get(); int orthog_sh_elems = std::max(dst_nd - k, 1); @@ -776,34 +721,20 @@ usm_ndarray_put(const dpctl::tensor::usm_ndarray &dst, // dst_strides[:axis] + dst_strides[axis+k:], // val_strides[:axis] + // val_strides[axis+ind.ndim:]] - py::ssize_t *packed_shapes_strides = - sycl::malloc_device(3 * orthog_sh_elems, exec_q); - - if (packed_shapes_strides == nullptr) { - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - sycl_free_noexcept(packed_ind_ptrs, exec_q); - sycl_free_noexcept(packed_ind_shapes_strides, exec_q); - sycl_free_noexcept(packed_ind_offsets, exec_q); - throw std::runtime_error( - "Unable to allocate packed_shapes_strides device memory"); - } + auto packed_shapes_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + 3 * orthog_sh_elems, exec_q); + py::ssize_t *packed_shapes_strides = packed_shapes_strides_owner.get(); // packed_axes_shapes_strides = [dst_shape[axis:axis+k], // dst_strides[axis:axis+k], // val_shape[axis:axis+ind.ndim], // val_strides[axis:axis+ind.ndim]] + auto packed_axes_shapes_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + 2 * (k + ind_sh_elems), exec_q); py::ssize_t *packed_axes_shapes_strides = - sycl::malloc_device(2 * (k + ind_sh_elems), exec_q); - - if (packed_axes_shapes_strides == nullptr) { - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - sycl_free_noexcept(packed_ind_ptrs, exec_q); - sycl_free_noexcept(packed_ind_shapes_strides, exec_q); - sycl_free_noexcept(packed_ind_offsets, exec_q); - sycl_free_noexcept(packed_shapes_strides, exec_q); - throw std::runtime_error( - "Unable to allocate packed_axes_shapes_strides device memory"); - } + packed_axes_shapes_strides_owner.get(); auto dst_strides = dst.get_strides_vector(); auto val_strides = val.get_strides_vector(); @@ -828,12 +759,6 @@ usm_ndarray_put(const dpctl::tensor::usm_ndarray &dst, if (fn == nullptr) { sycl::event::wait(host_task_events); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - sycl_free_noexcept(packed_ind_ptrs, exec_q); - sycl_free_noexcept(packed_ind_shapes_strides, exec_q); - sycl_free_noexcept(packed_ind_offsets, exec_q); - sycl_free_noexcept(packed_shapes_strides, exec_q); - sycl_free_noexcept(packed_axes_shapes_strides, exec_q); throw std::runtime_error("Indices must be integer type, got " + std::to_string(ind_type_id)); } @@ -845,21 +770,11 @@ usm_ndarray_put(const dpctl::tensor::usm_ndarray &dst, dst_offset, val_offset, packed_ind_offsets, all_deps); // free packed temporaries - sycl::event temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(put_generic_ev); - const auto &ctx = exec_q.get_context(); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - 
cgh.host_task([packed_shapes_strides, packed_axes_shapes_strides, - packed_ind_shapes_strides, packed_ind_ptrs, - packed_ind_offsets, ctx]() { - sycl_free_noexcept(packed_shapes_strides, ctx); - sycl_free_noexcept(packed_axes_shapes_strides, ctx); - sycl_free_noexcept(packed_ind_shapes_strides, ctx); - sycl_free_noexcept(packed_ind_ptrs, ctx); - sycl_free_noexcept(packed_ind_offsets, ctx); - }); - }); - + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {put_generic_ev}, packed_shapes_strides_owner, + packed_axes_shapes_strides_owner, packed_ind_shapes_strides_owner, + packed_ind_ptrs_owner, packed_ind_offsets_owner); host_task_events.push_back(temporaries_cleanup_ev); sycl::event arg_cleanup_ev = diff --git a/dpctl/tensor/libtensor/source/linalg_functions/dot.cpp b/dpctl/tensor/libtensor/source/linalg_functions/dot.cpp index bb79a19789..ce267baa1b 100644 --- a/dpctl/tensor/libtensor/source/linalg_functions/dot.cpp +++ b/dpctl/tensor/libtensor/source/linalg_functions/dot.cpp @@ -477,7 +477,7 @@ py_dot(const dpctl::tensor::usm_ndarray &x1, } using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &arrays_metainfo_packing_triple_ = + auto arrays_metainfo_packing_triple_ = device_allocate_and_pack( exec_q, host_task_events, // iteration metadata @@ -486,16 +486,14 @@ py_dot(const dpctl::tensor::usm_ndarray &x1, // reduction metadata simplified_inner_shape, simplified_inner_x1_strides, simplified_inner_x2_strides); - py::ssize_t *temp_allocation_ptr = - std::get<0>(arrays_metainfo_packing_triple_); - if (temp_allocation_ptr == nullptr) { - throw std::runtime_error("Unable to allocate memory on device"); - } + auto tmp_alloc_owner = + std::move(std::get<0>(arrays_metainfo_packing_triple_)); const auto ©_metadata_ev = std::get<2>(arrays_metainfo_packing_triple_); + const py::ssize_t *temp_allocation_ptr = tmp_alloc_owner.get(); - py::ssize_t *iter_shape_and_strides = temp_allocation_ptr; - py::ssize_t *inner_shape_stride = + const py::ssize_t *iter_shape_and_strides = temp_allocation_ptr; + const py::ssize_t *inner_shape_stride = temp_allocation_ptr + 4 * simplified_batch_shape.size(); std::vector all_deps; @@ -511,14 +509,9 @@ py_dot(const dpctl::tensor::usm_ndarray &x1, inner_nd, // number dimensions being reduced inner_shape_stride, inner_x1_offset, inner_x2_offset, all_deps); - sycl::event temp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(dot_ev); - const auto &ctx = exec_q.get_context(); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, temp_allocation_ptr] { - sycl_free_noexcept(temp_allocation_ptr, ctx); - }); - }); + sycl::event temp_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {dot_ev}, + tmp_alloc_owner); host_task_events.push_back(temp_cleanup_ev); } else { // if (!call_vecdot) @@ -557,18 +550,16 @@ py_dot(const dpctl::tensor::usm_ndarray &x1, } } using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &ptr_size_event_tuple1 = - device_allocate_and_pack( - exec_q, host_task_events, x1_shape_vec, x1_strides_vec, - x2_shape_vec, x2_strides_vec, dst_shape_vec, - dst_strides_vec); - py::ssize_t *packed_shapes_strides = - std::get<0>(ptr_size_event_tuple1); - if (packed_shapes_strides == nullptr) { - throw std::runtime_error("Unable to allocate device memory"); - } + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, x1_shape_vec, x1_strides_vec, + x2_shape_vec, x2_strides_vec, dst_shape_vec, 
dst_strides_vec); + auto packed_shapes_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_shapes_strides = + packed_shapes_strides_owner.get(); + const py::ssize_t *x1_shape_strides = packed_shapes_strides; const py::ssize_t *x2_shape_strides = packed_shapes_strides + 2 * (x1_nd); @@ -588,14 +579,8 @@ py_dot(const dpctl::tensor::usm_ndarray &x1, x1_outer_dims + x2_outer_dims, dst_shape_strides, all_deps); sycl::event cleanup_tmp_allocations_ev = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(dot_ev); - const auto &ctx = exec_q.get_context(); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, packed_shapes_strides] { - sycl_free_noexcept(packed_shapes_strides, ctx); - }); - }); + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {dot_ev}, packed_shapes_strides_owner); host_task_events.push_back(cleanup_tmp_allocations_ev); } else { // if (call_batched) @@ -751,25 +736,23 @@ py_dot(const dpctl::tensor::usm_ndarray &x1, } } using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &ptr_size_event_tuple1 = - device_allocate_and_pack( - exec_q, host_task_events, simplified_batch_shape, - simplified_batch_x1_strides, simplified_batch_x2_strides, - simplified_batch_dst_strides, outer_inner_x1_shape, - outer_inner_x1_strides, outer_inner_x2_shape, - outer_inner_x2_strides, outer_inner_dst_shape, - outer_inner_dst_strides, - // full shape and strides of the result array - // necessary for reduction and initialization - simplified_batch_shape, outer_inner_dst_shape, - simplified_batch_dst_strides, outer_inner_dst_strides); - py::ssize_t *packed_shapes_strides = - std::get<0>(ptr_size_event_tuple1); - if (packed_shapes_strides == nullptr) { - throw std::runtime_error("Unable to allocate device memory"); - } + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, simplified_batch_shape, + simplified_batch_x1_strides, simplified_batch_x2_strides, + simplified_batch_dst_strides, outer_inner_x1_shape, + outer_inner_x1_strides, outer_inner_x2_shape, + outer_inner_x2_strides, outer_inner_dst_shape, + outer_inner_dst_strides, + // full shape and strides of the result array + // necessary for reduction and initialization + simplified_batch_shape, outer_inner_dst_shape, + simplified_batch_dst_strides, outer_inner_dst_strides); + auto packed_shapes_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_shapes_strides = + packed_shapes_strides_owner.get(); const auto batch_shape_strides = packed_shapes_strides; const auto x1_outer_inner_shapes_strides = @@ -799,14 +782,8 @@ py_dot(const dpctl::tensor::usm_ndarray &x1, dst_outer_shapes_strides, dst_full_shape_strides, all_deps); sycl::event cleanup_tmp_allocations_ev = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(dot_ev); - const auto &ctx = exec_q.get_context(); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, packed_shapes_strides] { - sycl_free_noexcept(packed_shapes_strides, ctx); - }); - }); + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {dot_ev}, packed_shapes_strides_owner); host_task_events.push_back(cleanup_tmp_allocations_ev); } } diff --git a/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp b/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp index 
f5d70e4f2a..9458374482 100644 --- a/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp +++ b/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp @@ -459,7 +459,7 @@ std::pair py_reduction_over_axis( std::vector host_task_events{}; using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &arrays_metainfo_packing_triple_ = + auto arrays_metainfo_packing_triple_ = device_allocate_and_pack( exec_q, host_task_events, // iteration metadata @@ -467,15 +467,13 @@ std::pair py_reduction_over_axis( simplified_iteration_dst_strides, // reduction metadata simplified_reduction_shape, simplified_reduction_src_strides); - py::ssize_t *temp_allocation_ptr = - std::get<0>(arrays_metainfo_packing_triple_); - if (temp_allocation_ptr == nullptr) { - throw std::runtime_error("Unable to allocate memory on device"); - } + auto tmp_alloc_owner = + std::move(std::get<0>(arrays_metainfo_packing_triple_)); const auto ©_metadata_ev = std::get<2>(arrays_metainfo_packing_triple_); + const py::ssize_t *temp_allocation_ptr = tmp_alloc_owner.get(); - py::ssize_t *iter_shape_and_strides = temp_allocation_ptr; - py::ssize_t *reduction_shape_stride = + const py::ssize_t *iter_shape_and_strides = temp_allocation_ptr; + const py::ssize_t *reduction_shape_stride = temp_allocation_ptr + 3 * simplified_iteration_shape.size(); std::vector all_deps; @@ -491,14 +489,8 @@ std::pair py_reduction_over_axis( reduction_nd, // number dimensions being reduced reduction_shape_stride, reduction_src_offset, all_deps); - sycl::event temp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(reduction_ev); - const auto &ctx = exec_q.get_context(); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, temp_allocation_ptr] { - sycl_free_noexcept(temp_allocation_ptr, ctx); - }); - }); + sycl::event temp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {reduction_ev}, tmp_alloc_owner); host_task_events.push_back(temp_cleanup_ev); sycl::event keep_args_event = @@ -750,7 +742,7 @@ std::pair py_tree_reduction_over_axis( std::vector host_task_events{}; using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &arrays_metainfo_packing_triple_ = + auto arrays_metainfo_packing_triple_ = device_allocate_and_pack( exec_q, host_task_events, // iteration metadata @@ -758,15 +750,12 @@ std::pair py_tree_reduction_over_axis( simplified_iteration_dst_strides, // reduction metadata simplified_reduction_shape, simplified_reduction_src_strides); - py::ssize_t *temp_allocation_ptr = - std::get<0>(arrays_metainfo_packing_triple_); - if (temp_allocation_ptr == nullptr) { - throw std::runtime_error("Unable to allocate memory on device"); - } + auto tmp_owner = std::move(std::get<0>(arrays_metainfo_packing_triple_)); const auto ©_metadata_ev = std::get<2>(arrays_metainfo_packing_triple_); + const py::ssize_t *temp_allocation_ptr = tmp_owner.get(); - py::ssize_t *iter_shape_and_strides = temp_allocation_ptr; - py::ssize_t *reduction_shape_stride = + const py::ssize_t *iter_shape_and_strides = temp_allocation_ptr; + const py::ssize_t *reduction_shape_stride = temp_allocation_ptr + 3 * simplified_iteration_shape.size(); std::vector all_deps; @@ -782,14 +771,8 @@ std::pair py_tree_reduction_over_axis( reduction_nd, // number dimensions being reduced reduction_shape_stride, reduction_src_offset, all_deps); - sycl::event temp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(reduction_ev); - const auto &ctx = exec_q.get_context(); - using 
dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, temp_allocation_ptr] { - sycl_free_noexcept(temp_allocation_ptr, ctx); - }); - }); + sycl::event temp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {reduction_ev}, tmp_owner); host_task_events.push_back(temp_cleanup_ev); sycl::event keep_args_event = @@ -1032,7 +1015,7 @@ std::pair py_search_over_axis( using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &arrays_metainfo_packing_triple_ = + auto arrays_metainfo_packing_triple_ = device_allocate_and_pack( exec_q, host_task_events, // iteration metadata @@ -1040,15 +1023,12 @@ std::pair py_search_over_axis( simplified_iteration_dst_strides, // reduction metadata compact_reduction_shape, compact_reduction_src_strides); - py::ssize_t *temp_allocation_ptr = - std::get<0>(arrays_metainfo_packing_triple_); - if (temp_allocation_ptr == nullptr) { - throw std::runtime_error("Unable to allocate memory on device"); - } + auto tmp_owner = std::move(std::get<0>(arrays_metainfo_packing_triple_)); const auto ©_metadata_ev = std::get<2>(arrays_metainfo_packing_triple_); + const py::ssize_t *temp_allocation_ptr = tmp_owner.get(); - py::ssize_t *iter_shape_and_strides = temp_allocation_ptr; - py::ssize_t *reduction_shape_stride = + const py::ssize_t *iter_shape_and_strides = temp_allocation_ptr; + const py::ssize_t *reduction_shape_stride = temp_allocation_ptr + 3 * simplified_iteration_shape.size(); std::vector all_deps; @@ -1063,14 +1043,8 @@ std::pair py_search_over_axis( reduction_nd, // number dimensions being reduced reduction_shape_stride, reduction_src_offset, all_deps); - sycl::event temp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(comp_ev); - const auto &ctx = exec_q.get_context(); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, temp_allocation_ptr] { - sycl_free_noexcept(temp_allocation_ptr, ctx); - }); - }); + sycl::event temp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {comp_ev}, tmp_owner); host_task_events.push_back(temp_cleanup_ev); sycl::event keep_args_event = @@ -1301,21 +1275,20 @@ py_boolean_reduction(const dpctl::tensor::usm_ndarray &src, auto fn = strided_dispatch_vector[src_typeid]; std::vector host_task_events{}; - const auto &iter_red_metadata_packing_triple_ = + auto iter_red_metadata_packing_triple_ = dpctl::tensor::offset_utils::device_allocate_and_pack( exec_q, host_task_events, simplified_iter_shape, simplified_iter_src_strides, simplified_iter_dst_strides, simplified_red_shape, simplified_red_src_strides); - py::ssize_t *packed_shapes_and_strides = - std::get<0>(iter_red_metadata_packing_triple_); - if (packed_shapes_and_strides == nullptr) { - throw std::runtime_error("Unable to allocate memory on device"); - } + auto packed_shapes_strides_owner = + std::move(std::get<0>(iter_red_metadata_packing_triple_)); const auto ©_metadata_ev = std::get<2>(iter_red_metadata_packing_triple_); + const py::ssize_t *packed_shapes_and_strides = + packed_shapes_strides_owner.get(); - py::ssize_t *iter_shape_and_strides = packed_shapes_and_strides; - py::ssize_t *red_shape_stride = + const py::ssize_t *iter_shape_and_strides = packed_shapes_and_strides; + const py::ssize_t *red_shape_stride = packed_shapes_and_strides + 3 * simplified_iter_shape.size(); std::vector all_deps; @@ -1329,14 +1302,8 @@ py_boolean_reduction(const dpctl::tensor::usm_ndarray &src, iter_shape_and_strides, iter_src_offset, iter_dst_offset, simplified_red_nd, red_shape_stride, 
red_src_offset, all_deps); - sycl::event temp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(red_ev); - const auto &ctx = exec_q.get_context(); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, packed_shapes_and_strides] { - sycl_free_noexcept(packed_shapes_and_strides, ctx); - }); - }); + sycl::event temp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {red_ev}, packed_shapes_strides_owner); host_task_events.push_back(temp_cleanup_ev); sycl::event keep_args_event = diff --git a/dpctl/tensor/libtensor/source/repeat.cpp b/dpctl/tensor/libtensor/source/repeat.cpp index f0df192876..25146eac88 100644 --- a/dpctl/tensor/libtensor/source/repeat.cpp +++ b/dpctl/tensor/libtensor/source/repeat.cpp @@ -35,13 +35,14 @@ #include #include "kernels/repeat.hpp" -#include "simplify_iteration_space.hpp" #include "utils/memory_overlap.hpp" #include "utils/offset_utils.hpp" #include "utils/output_validation.hpp" #include "utils/sycl_alloc_utils.hpp" #include "utils/type_dispatch.hpp" +#include "simplify_iteration_space.hpp" + namespace dpctl { namespace tensor @@ -239,15 +240,13 @@ py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src, } using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &ptr_size_event_tuple1 = - device_allocate_and_pack( - exec_q, host_task_events, src_shape_vec, src_strides_vec); - py::ssize_t *packed_src_shape_strides = - std::get<0>(ptr_size_event_tuple1); - if (packed_src_shape_strides == nullptr) { - throw std::runtime_error("Unable to allocate device memory"); - } + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, src_shape_vec, src_strides_vec); + auto packed_src_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_src_shape_strides = + packed_src_shape_strides_owner.get(); std::vector all_deps; all_deps.reserve(depends.size() + 1); @@ -263,14 +262,8 @@ py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src, reps_strides_vec[0], all_deps); sycl::event cleanup_tmp_allocations_ev = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(repeat_ev); - const auto &ctx = exec_q.get_context(); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, packed_src_shape_strides] { - sycl_free_noexcept(packed_src_shape_strides, ctx); - }); - }); + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {repeat_ev}, packed_src_shape_strides_owner); host_task_events.push_back(cleanup_tmp_allocations_ev); } else { @@ -318,15 +311,14 @@ py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src, orthog_dst_offset); using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &ptr_size_event_tuple1 = - device_allocate_and_pack( - exec_q, host_task_events, simplified_orthog_shape, - simplified_orthog_src_strides, simplified_orthog_dst_strides); - py::ssize_t *packed_shapes_strides = std::get<0>(ptr_size_event_tuple1); - if (packed_shapes_strides == nullptr) { - throw std::runtime_error("Unable to allocate device memory"); - } + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, simplified_orthog_shape, + simplified_orthog_src_strides, simplified_orthog_dst_strides); + auto packed_shapes_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_shapes_strides = + 
packed_shapes_strides_owner.get(); std::vector all_deps; all_deps.reserve(depends.size() + 1); @@ -348,14 +340,8 @@ py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src, reps_shape_vec[0], reps_strides_vec[0], all_deps); sycl::event cleanup_tmp_allocations_ev = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(repeat_ev); - const auto &ctx = exec_q.get_context(); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, packed_shapes_strides] { - sycl_free_noexcept(packed_shapes_strides, ctx); - }); - }); + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {repeat_ev}, packed_shapes_strides_owner); host_task_events.push_back(cleanup_tmp_allocations_ev); } @@ -475,13 +461,13 @@ py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src, auto fn = repeat_by_sequence_1d_dispatch_vector[src_typeid]; using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &ptr_size_event_tuple1 = device_allocate_and_pack( + auto ptr_size_event_tuple1 = device_allocate_and_pack( exec_q, host_task_events, src_shape_vec, src_strides_vec); - py::ssize_t *packed_src_shapes_strides = std::get<0>(ptr_size_event_tuple1); - if (packed_src_shapes_strides == nullptr) { - throw std::runtime_error("Unable to allocate device memory"); - } + auto packed_src_shapes_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_src_shapes_strides = + packed_src_shapes_strides_owner.get(); std::vector all_deps; all_deps.reserve(depends.size() + 1); @@ -496,14 +482,8 @@ py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src, reps_shape_vec[0], reps_strides_vec[0], all_deps); sycl::event cleanup_tmp_allocations_ev = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(repeat_ev); - const auto &ctx = exec_q.get_context(); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, packed_src_shapes_strides] { - sycl_free_noexcept(packed_src_shapes_strides, ctx); - }); - }); + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {repeat_ev}, packed_src_shapes_strides_owner); host_task_events.push_back(cleanup_tmp_allocations_ev); sycl::event py_obj_management_host_task_ev = dpctl::utils::keep_args_alive( @@ -617,15 +597,13 @@ py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, } using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &ptr_size_event_tuple1 = - device_allocate_and_pack( - exec_q, host_task_events, src_shape_vec, src_strides_vec); - py::ssize_t *packed_src_shape_strides = - std::get<0>(ptr_size_event_tuple1); - if (packed_src_shape_strides == nullptr) { - throw std::runtime_error("Unable to allocate device memory"); - } + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, src_shape_vec, src_strides_vec); + auto packed_src_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_src_shape_strides = + packed_src_shape_strides_owner.get(); std::vector all_deps; all_deps.reserve(depends.size() + 1); @@ -639,14 +617,9 @@ py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, dst_strides_vec[0], all_deps); sycl::event cleanup_tmp_allocations_ev = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(repeat_ev); - const auto &ctx = exec_q.get_context(); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, packed_src_shape_strides] 
{ - sycl_free_noexcept(packed_src_shape_strides, ctx); - }); - }); + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {repeat_ev}, packed_src_shape_strides_owner); + host_task_events.push_back(cleanup_tmp_allocations_ev); } else { @@ -695,15 +668,14 @@ py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, orthog_dst_offset); using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &ptr_size_event_tuple1 = - device_allocate_and_pack( - exec_q, host_task_events, simplified_orthog_shape, - simplified_orthog_src_strides, simplified_orthog_dst_strides); - py::ssize_t *packed_shapes_strides = std::get<0>(ptr_size_event_tuple1); - if (packed_shapes_strides == nullptr) { - throw std::runtime_error("Unable to allocate device memory"); - } + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, simplified_orthog_shape, + simplified_orthog_src_strides, simplified_orthog_dst_strides); + auto packed_shapes_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_shapes_strides = + packed_shapes_strides_owner.get(); std::vector all_deps; all_deps.reserve(depends.size() + 1); @@ -723,14 +695,8 @@ py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, axis_dst_shape[0], axis_dst_stride[0], all_deps); sycl::event cleanup_tmp_allocations_ev = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(repeat_ev); - const auto &ctx = exec_q.get_context(); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, packed_shapes_strides] { - sycl_free_noexcept(packed_shapes_strides, ctx); - }); - }); + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {repeat_ev}, packed_shapes_strides_owner); host_task_events.push_back(cleanup_tmp_allocations_ev); } @@ -814,13 +780,13 @@ py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, auto fn = repeat_by_scalar_1d_dispatch_vector[src_typeid]; using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &ptr_size_event_tuple1 = device_allocate_and_pack( + auto ptr_size_event_tuple1 = device_allocate_and_pack( exec_q, host_task_events, src_shape_vec, src_strides_vec); - py::ssize_t *packed_src_shape_strides = std::get<0>(ptr_size_event_tuple1); - if (packed_src_shape_strides == nullptr) { - throw std::runtime_error("Unable to allocate device memory"); - } + auto packed_src_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_src_shape_strides = + packed_src_shape_strides_owner.get(); std::vector all_deps; all_deps.reserve(depends.size() + 1); @@ -834,15 +800,8 @@ py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, dst_shape_vec[0], dst_strides_vec[0], all_deps); sycl::event cleanup_tmp_allocations_ev = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(repeat_ev); - const auto &ctx = exec_q.get_context(); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, packed_src_shape_strides] { - sycl_free_noexcept(packed_src_shape_strides, ctx); - }); - }); - + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {repeat_ev}, packed_src_shape_strides_owner); host_task_events.push_back(cleanup_tmp_allocations_ev); sycl::event py_obj_management_host_task_ev = diff --git a/dpctl/tensor/libtensor/source/sorting/searchsorted.cpp b/dpctl/tensor/libtensor/source/sorting/searchsorted.cpp index 76cb41595d..174214e4c9 
100644 --- a/dpctl/tensor/libtensor/source/sorting/searchsorted.cpp +++ b/dpctl/tensor/libtensor/source/sorting/searchsorted.cpp @@ -376,14 +376,11 @@ py_searchsorted(const dpctl::tensor::usm_ndarray &hay, // vectors being packed simplified_common_shape, simplified_needles_strides, simplified_positions_strides); - - py::ssize_t *packed_shape_strides = std::get<0>(ptr_size_event_tuple); + auto packed_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple)); const sycl::event ©_shape_strides_ev = std::get<2>(ptr_size_event_tuple); - - if (!packed_shape_strides) { - throw std::runtime_error("USM-host allocation failure"); - } + const py::ssize_t *packed_shape_strides = packed_shape_strides_owner.get(); std::vector all_deps; all_deps.reserve(depends.size() + 1); @@ -411,14 +408,9 @@ py_searchsorted(const dpctl::tensor::usm_ndarray &hay, simplified_nd, packed_shape_strides, all_deps); // free packed temporaries - sycl::event temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(comp_ev); - const auto &ctx = exec_q.get_context(); - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([packed_shape_strides, ctx]() { - sycl_free_noexcept(packed_shape_strides, ctx); - }); - }); + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {comp_ev}, packed_shape_strides_owner); host_task_events.push_back(temporaries_cleanup_ev); const sycl::event &ht_ev = dpctl::utils::keep_args_alive( diff --git a/dpctl/tensor/libtensor/source/triul_ctor.cpp b/dpctl/tensor/libtensor/source/triul_ctor.cpp index 264de8f36d..24bf7b322f 100644 --- a/dpctl/tensor/libtensor/source/triul_ctor.cpp +++ b/dpctl/tensor/libtensor/source/triul_ctor.cpp @@ -176,11 +176,11 @@ usm_ndarray_triul(sycl::queue &exec_q, (*shp_host_shape_and_strides)[3 * nd - 2] = dst_strides[src_nd - 2]; (*shp_host_shape_and_strides)[3 * nd - 1] = dst_strides[src_nd - 1]; - py::ssize_t *dev_shape_and_strides = - sycl::malloc_device(3 * nd, exec_q); - if (dev_shape_and_strides == nullptr) { - throw std::runtime_error("Unabled to allocate device memory"); - } + auto dev_shape_and_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(3 * nd, + exec_q); + py::ssize_t *dev_shape_and_strides = dev_shape_and_strides_owner.get(); + const sycl::event ©_shape_and_strides = exec_q.copy( shp_host_shape_and_strides->data(), dev_shape_and_strides, 3 * nd); @@ -212,6 +212,9 @@ usm_ndarray_triul(sycl::queue &exec_q, sycl_free_noexcept(dev_shape_and_strides, ctx); }); }); + // since host_task now owns USM allocation, release ownership by smart + // pointer + dev_shape_and_strides_owner.release(); return std::make_pair( keep_args_alive(exec_q, {src, dst}, {temporaries_cleanup_ev}), tri_ev); diff --git a/dpctl/tensor/libtensor/source/where.cpp b/dpctl/tensor/libtensor/source/where.cpp index 3a50eb309c..9825b65901 100644 --- a/dpctl/tensor/libtensor/source/where.cpp +++ b/dpctl/tensor/libtensor/source/where.cpp @@ -27,21 +27,23 @@ #include #include #include -#include #include +#include + #include "dpctl4pybind11.hpp" #include #include #include #include "kernels/where.hpp" -#include "simplify_iteration_space.hpp" #include "utils/memory_overlap.hpp" #include "utils/offset_utils.hpp" #include "utils/output_validation.hpp" #include "utils/sycl_alloc_utils.hpp" #include "utils/type_dispatch.hpp" + +#include "simplify_iteration_space.hpp" #include "where.hpp" namespace dpctl @@ -211,11 +213,10 @@ py_where(const dpctl::tensor::usm_ndarray &condition, // common shape and 
strides
         simplified_shape, simplified_cond_strides, simplified_x1_strides,
         simplified_x2_strides, simplified_dst_strides);
-    py::ssize_t *packed_shape_strides = std::get<0>(ptr_size_event_tuple);
-    if (!packed_shape_strides) {
-        throw std::runtime_error("USM-host memory allocation failure");
-    }
+    auto packed_shape_strides_owner =
+        std::move(std::get<0>(ptr_size_event_tuple));
     sycl::event copy_shape_strides_ev = std::get<2>(ptr_size_event_tuple);
+    const py::ssize_t *packed_shape_strides = packed_shape_strides_owner.get();
 
     std::vector<sycl::event> all_deps;
     all_deps.reserve(depends.size() + 1);
@@ -229,15 +230,9 @@ py_where(const dpctl::tensor::usm_ndarray &condition,
                       x1_offset, x2_offset, dst_offset, all_deps);
 
     // free packed temporaries
-    sycl::event temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(where_ev);
-        const auto &ctx = exec_q.get_context();
-        using dpctl::tensor::alloc_utils::sycl_free_noexcept;
-        cgh.host_task([packed_shape_strides, ctx]() {
-            sycl_free_noexcept(packed_shape_strides, ctx);
-        });
-    });
-
+    sycl::event temporaries_cleanup_ev =
+        dpctl::tensor::alloc_utils::async_smart_free(
+            exec_q, {where_ev}, packed_shape_strides_owner);
     host_task_events.push_back(temporaries_cleanup_ev);
 
     sycl::event arg_cleanup_ev =
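
The hunks above converge on one call-site pattern, sketched below for illustration only. It relies on the dpctl-internal helpers visible in this patch (device_allocate_and_pack, smart_malloc_device, async_smart_free); the names shape_vec, strides_vec, and some_kernel_ev are placeholders and not identifiers from the repository, and the sketch assumes exec_q and host_task_events are set up as in the surrounding functions.

    using dpctl::tensor::offset_utils::device_allocate_and_pack;
    using dpctl::tensor::alloc_utils::async_smart_free;

    // Pack shape/stride metadata into a single USM-device allocation.
    // std::get<0> is now an owning smart pointer, so the allocation is
    // freed automatically if an exception propagates before cleanup is
    // scheduled.
    auto ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
        exec_q, host_task_events, shape_vec, strides_vec);
    auto owner = std::move(std::get<0>(ptr_size_event_tuple));
    const sycl::event &copy_ev = std::get<2>(ptr_size_event_tuple);
    const py::ssize_t *shape_strides = owner.get();

    // ... submit the kernel that reads shape_strides, depending on
    // copy_ev, and obtain some_kernel_ev ...

    // Replace the hand-written exec_q.submit/host_task cleanup with a
    // single call that schedules deallocation of the temporary after
    // the kernel event completes.
    sycl::event cleanup_ev =
        async_smart_free(exec_q, {some_kernel_ev}, owner);
    host_task_events.push_back(cleanup_ev);

The only place in this patch that departs from the sketch is triul_ctor.cpp, where the existing host_task still frees the raw pointer via sycl_free_noexcept, so the smart pointer calls release() to hand ownership to that task instead of using async_smart_free.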