Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MAINT] Use std::size_t from cstddef and use dpctl::tensor::ssize_t where ssize_t is used #1950

Merged
merged 10 commits into from
Jan 7, 2025
Merged
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
* Add support of CV-qualifiers in `is_complex<T>` helper [gh-1900](https://github.com/IntelPython/dpctl/pull/1900)
* Tuning work for elementwise functions with modest performance gains (under 10%) [gh-1889](https://github.com/IntelPython/dpctl/pull/1889)
* Support for Python 3.13 for `dpctl` [gh-1941](https://github.com/IntelPython/dpctl/pull/1941)
* Change libtensor to use `std::size_t` and `dpctl::tensor::ssize_t` throughout and fix missing includes for `std::size_t` and `size_t` [gh-1950](https://github.com/IntelPython/dpctl/pull/1950)

## [0.18.3] - Dec. 07, 2024

Expand Down
1 change: 1 addition & 0 deletions dpctl/_host_task_util.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@

#pragma once
#include <exception>
#include <stddef.h>
#include <sycl/sycl.hpp>

#include "Python.h"
Expand Down
6 changes: 4 additions & 2 deletions dpctl/apis/include/dpctl4pybind11.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,10 @@

#include "dpctl_capi.h"
#include <complex>
#include <cstddef> // for std::size_t for C++ linkage
#include <memory>
#include <pybind11/pybind11.h>
#include <stddef.h> // for size_t for C linkage
#include <stdexcept>
#include <sycl/sycl.hpp>
#include <utility>
Expand Down Expand Up @@ -759,7 +761,7 @@ class usm_memory : public py::object
* lifetime of the USM allocation.
*/
usm_memory(void *usm_ptr,
size_t nbytes,
std::size_t nbytes,
const sycl::queue &q,
std::shared_ptr<void> shptr)
{
Expand Down Expand Up @@ -819,7 +821,7 @@ class usm_memory : public py::object
return reinterpret_cast<char *>(MRef);
}

size_t get_nbytes() const
std::size_t get_nbytes() const
{
auto const &api = ::dpctl::detail::dpctl_capi::get();
Py_MemoryObject *mem_obj = reinterpret_cast<Py_MemoryObject *>(m_ptr);
Expand Down
2 changes: 2 additions & 0 deletions dpctl/tensor/libtensor/include/kernels/accumulators.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

#pragma once
#include <array>
#include <cstddef>
#include <cstdint>
#include <limits>
#include <new>
Expand All @@ -47,6 +48,7 @@ namespace kernels
namespace accumulators
{

using dpctl::tensor::ssize_t;
using namespace dpctl::tensor::offset_utils;

// Integer ceiling division: smallest q with q * m >= n (for positive n, m).
template <typename T> T ceiling_quotient(T n, T m)
{
    const T rounded_up = n + m - 1;
    return rounded_up / m;
}
Expand Down
2 changes: 1 addition & 1 deletion dpctl/tensor/libtensor/include/kernels/alignment.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ namespace kernels
namespace alignment_utils
{

static constexpr size_t required_alignment = 64UL;
static constexpr std::size_t required_alignment = 64UL;

template <std::uintptr_t alignment, typename Ptr> bool is_aligned(Ptr p)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
//===---------------------------------------------------------------------===//

#pragma once
#include <cstddef>
#include <cstdint>
#include <limits>
#include <sycl/sycl.hpp>
Expand All @@ -42,6 +43,7 @@ namespace kernels
namespace indexing
{

using dpctl::tensor::ssize_t;
using namespace dpctl::tensor::offset_utils;

template <typename OrthogIndexerT,
Expand All @@ -55,7 +57,7 @@ struct MaskedExtractStridedFunctor
MaskedExtractStridedFunctor(const dataT *src_data_p,
const indT *cumsum_data_p,
dataT *dst_data_p,
size_t masked_iter_size,
std::size_t masked_iter_size,
const OrthogIndexerT &orthog_src_dst_indexer_,
const MaskedSrcIndexerT &masked_src_indexer_,
const MaskedDstIndexerT &masked_dst_indexer_,
Expand All @@ -81,7 +83,7 @@ struct MaskedExtractStridedFunctor

const std::size_t max_offset = masked_nelems + 1;
for (std::uint32_t i = l_i; i < lacc.size(); i += lws) {
const size_t offset = masked_block_start + i;
const std::size_t offset = masked_block_start + i;
lacc[i] = (offset == 0) ? indT(0)
: (offset < max_offset) ? cumsum[offset - 1]
: cumsum[masked_nelems - 1] + 1;
Expand All @@ -99,9 +101,10 @@ struct MaskedExtractStridedFunctor
if (mask_set && (masked_i < masked_nelems)) {
const auto &orthog_offsets = orthog_src_dst_indexer(orthog_i);

const size_t total_src_offset = masked_src_indexer(masked_i) +
orthog_offsets.get_first_offset();
const size_t total_dst_offset =
const std::size_t total_src_offset =
masked_src_indexer(masked_i) +
orthog_offsets.get_first_offset();
const std::size_t total_dst_offset =
masked_dst_indexer(current_running_count - 1) +
orthog_offsets.get_second_offset();

Expand All @@ -113,7 +116,7 @@ struct MaskedExtractStridedFunctor
const dataT *src = nullptr;
const indT *cumsum = nullptr;
dataT *dst = nullptr;
const size_t masked_nelems = 0;
const std::size_t masked_nelems = 0;
// has nd, shape, src_strides, dst_strides for
// dimensions that ARE NOT masked
const OrthogIndexerT orthog_src_dst_indexer;
Expand All @@ -136,7 +139,7 @@ struct MaskedPlaceStridedFunctor
MaskedPlaceStridedFunctor(dataT *dst_data_p,
const indT *cumsum_data_p,
const dataT *rhs_data_p,
size_t masked_iter_size,
std::size_t masked_iter_size,
const OrthogIndexerT &orthog_dst_rhs_indexer_,
const MaskedDstIndexerT &masked_dst_indexer_,
const MaskedRhsIndexerT &masked_rhs_indexer_,
Expand All @@ -157,12 +160,12 @@ struct MaskedPlaceStridedFunctor
const std::uint32_t l_i = ndit.get_local_id(1);
const std::uint32_t lws = ndit.get_local_range(1);

const size_t masked_i = ndit.get_global_id(1);
const size_t masked_block_start = masked_i - l_i;
const std::size_t masked_i = ndit.get_global_id(1);
const std::size_t masked_block_start = masked_i - l_i;

const std::size_t max_offset = masked_nelems + 1;
for (std::uint32_t i = l_i; i < lacc.size(); i += lws) {
const size_t offset = masked_block_start + i;
const std::size_t offset = masked_block_start + i;
lacc[i] = (offset == 0) ? indT(0)
: (offset < max_offset) ? cumsum[offset - 1]
: cumsum[masked_nelems - 1] + 1;
Expand All @@ -180,9 +183,10 @@ struct MaskedPlaceStridedFunctor
if (mask_set && (masked_i < masked_nelems)) {
const auto &orthog_offsets = orthog_dst_rhs_indexer(orthog_i);

const size_t total_dst_offset = masked_dst_indexer(masked_i) +
orthog_offsets.get_first_offset();
const size_t total_rhs_offset =
const std::size_t total_dst_offset =
masked_dst_indexer(masked_i) +
orthog_offsets.get_first_offset();
const std::size_t total_rhs_offset =
masked_rhs_indexer(current_running_count - 1) +
orthog_offsets.get_second_offset();

Expand All @@ -194,7 +198,7 @@ struct MaskedPlaceStridedFunctor
dataT *dst = nullptr;
const indT *cumsum = nullptr;
const dataT *rhs = nullptr;
const size_t masked_nelems = 0;
const std::size_t masked_nelems = 0;
// has nd, shape, dst_strides, rhs_strides for
// dimensions that ARE NOT masked
const OrthogIndexerT orthog_dst_rhs_indexer;
Expand Down Expand Up @@ -450,8 +454,8 @@ sycl::event masked_extract_some_slices_strided_impl(

const std::size_t lws = get_lws(masked_extent);

const size_t n_groups = ((masked_extent + lws - 1) / lws);
const size_t orthog_extent = static_cast<size_t>(orthog_nelems);
const std::size_t n_groups = ((masked_extent + lws - 1) / lws);
const std::size_t orthog_extent = static_cast<std::size_t>(orthog_nelems);

sycl::range<2> gRange{orthog_extent, n_groups * lws};
sycl::range<2> lRange{1, lws};
Expand Down Expand Up @@ -809,7 +813,7 @@ sycl::event non_zero_indexes_impl(sycl::queue &exec_q,
const std::size_t masked_block_start = group_i * lws;

for (std::uint32_t i = l_i; i < lacc.size(); i += lws) {
const size_t offset = masked_block_start + i;
const std::size_t offset = masked_block_start + i;
lacc[i] = (offset == 0) ? indT1(0)
: (offset - 1 < masked_extent)
? cumsum_data[offset - 1]
Expand Down
38 changes: 20 additions & 18 deletions dpctl/tensor/libtensor/include/kernels/clip.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#pragma once
#include <algorithm>
#include <complex>
#include <cstddef>
#include <cstdint>
#include <sycl/sycl.hpp>
#include <type_traits>
Expand All @@ -45,6 +46,7 @@ namespace kernels
namespace clip
{

using dpctl::tensor::ssize_t;
using namespace dpctl::tensor::offset_utils;

using dpctl::tensor::kernels::alignment_utils::
Expand Down Expand Up @@ -85,14 +87,14 @@ template <typename T,
class ClipContigFunctor
{
private:
size_t nelems = 0;
std::size_t nelems = 0;
const T *x_p = nullptr;
const T *min_p = nullptr;
const T *max_p = nullptr;
T *dst_p = nullptr;

public:
ClipContigFunctor(size_t nelems_,
ClipContigFunctor(std::size_t nelems_,
const T *x_p_,
const T *min_p_,
const T *max_p_,
Expand All @@ -110,30 +112,30 @@ class ClipContigFunctor
if constexpr (is_complex<T>::value || !enable_sg_loadstore) {
const std::uint16_t sgSize =
ndit.get_sub_group().get_local_range()[0];
const size_t gid = ndit.get_global_linear_id();
const uint16_t nelems_per_sg = sgSize * nelems_per_wi;
const std::size_t gid = ndit.get_global_linear_id();
const std::uint16_t nelems_per_sg = sgSize * nelems_per_wi;

const size_t start =
const std::size_t start =
(gid / sgSize) * (nelems_per_sg - sgSize) + gid;
const size_t end = std::min(nelems, start + nelems_per_sg);
const std::size_t end = std::min(nelems, start + nelems_per_sg);

for (size_t offset = start; offset < end; offset += sgSize) {
for (std::size_t offset = start; offset < end; offset += sgSize) {
dst_p[offset] = clip(x_p[offset], min_p[offset], max_p[offset]);
}
}
else {
auto sg = ndit.get_sub_group();
const std::uint16_t sgSize = sg.get_max_local_range()[0];

const size_t base =
const std::size_t base =
nelems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) +
sg.get_group_id()[0] * sgSize);

if (base + nelems_per_wi * sgSize < nelems) {
sycl::vec<T, vec_sz> dst_vec;
#pragma unroll
for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) {
const size_t idx = base + it * sgSize;
const std::size_t idx = base + it * sgSize;
auto x_multi_ptr = sycl::address_space_cast<
sycl::access::address_space::global_space,
sycl::access::decorated::yes>(&x_p[idx]);
Expand Down Expand Up @@ -162,8 +164,8 @@ class ClipContigFunctor
}
}
else {
const size_t lane_id = sg.get_local_id()[0];
for (size_t k = base + lane_id; k < nelems; k += sgSize) {
const std::size_t lane_id = sg.get_local_id()[0];
for (std::size_t k = base + lane_id; k < nelems; k += sgSize) {
dst_p[k] = clip(x_p[k], min_p[k], max_p[k]);
}
}
Expand All @@ -175,7 +177,7 @@ template <typename T, int vec_sz, int n_vecs> class clip_contig_kernel;

typedef sycl::event (*clip_contig_impl_fn_ptr_t)(
sycl::queue &,
size_t,
std::size_t,
const char *,
const char *,
const char *,
Expand All @@ -184,7 +186,7 @@ typedef sycl::event (*clip_contig_impl_fn_ptr_t)(

template <typename T>
sycl::event clip_contig_impl(sycl::queue &q,
size_t nelems,
std::size_t nelems,
const char *x_cp,
const char *min_cp,
const char *max_cp,
Expand All @@ -199,10 +201,10 @@ sycl::event clip_contig_impl(sycl::queue &q,
sycl::event clip_ev = q.submit([&](sycl::handler &cgh) {
cgh.depends_on(depends);

size_t lws = 64;
std::size_t lws = 64;
constexpr std::uint8_t vec_sz = 4;
constexpr std::uint8_t n_vecs = 2;
const size_t n_groups =
const std::size_t n_groups =
((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz));
const auto gws_range = sycl::range<1>(n_groups * lws);
const auto lws_range = sycl::range<1>(lws);
Expand Down Expand Up @@ -258,7 +260,7 @@ template <typename T, typename IndexerT> class ClipStridedFunctor

void operator()(sycl::id<1> id) const
{
size_t gid = id[0];
std::size_t gid = id[0];
auto offsets = indexer(static_cast<ssize_t>(gid));
dst_p[offsets.get_fourth_offset()] = clip(
x_p[offsets.get_first_offset()], min_p[offsets.get_second_offset()],
Expand All @@ -270,7 +272,7 @@ template <typename T, typename IndexerT> class clip_strided_kernel;

typedef sycl::event (*clip_strided_impl_fn_ptr_t)(
sycl::queue &,
size_t,
std::size_t,
int,
const char *,
const char *,
Expand All @@ -285,7 +287,7 @@ typedef sycl::event (*clip_strided_impl_fn_ptr_t)(

template <typename T>
sycl::event clip_strided_impl(sycl::queue &q,
size_t nelems,
std::size_t nelems,
int nd,
const char *x_cp,
const char *min_cp,
Expand Down
Loading
Loading